def testDoSkippedModelCreation(self, mock_runner, mock_run_model_inference,
                               _):
  input_dict = {
      'examples': [self._examples],
      'model': [self._model],
      'model_blessing': [self._model_blessing],
  }
  output_dict = {
      'inference_result': [self._inference_result],
  }
  ai_platform_serving_args = {
      'model_name': 'model_name',
      'project_id': 'project_id'
  }
  # Create exe properties.
  exec_properties = {
      'data_spec':
          proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()),
      'custom_config':
          json_utils.dumps(
              {executor.SERVING_ARGS_KEY: ai_platform_serving_args}),
  }
  mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1')
  mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = False

  # Run executor.
  bulk_inferrer = executor.Executor(self._context)
  bulk_inferrer.Do(input_dict, output_dict, exec_properties)

  ai_platform_prediction_model_spec = (
      model_spec_pb2.AIPlatformPredictionModelSpec(
          project_id='project_id',
          model_name='model_name',
          version_name=self._model_version))
  ai_platform_prediction_model_spec.use_serialization_config = True
  inference_endpoint = model_spec_pb2.InferenceSpecType()
  inference_endpoint.ai_platform_prediction_model_spec.CopyFrom(
      ai_platform_prediction_model_spec)
  mock_run_model_inference.assert_called_once_with(mock.ANY, mock.ANY,
                                                   mock.ANY, mock.ANY,
                                                   mock.ANY,
                                                   inference_endpoint)
  executor_class_path = '%s.%s' % (bulk_inferrer.__class__.__module__,
                                   bulk_inferrer.__class__.__name__)
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    job_labels = telemetry_utils.make_labels_dict()
  mock_runner.deploy_model_for_aip_prediction.assert_called_once_with(
      serving_path=path_utils.serving_model_path(self._model.uri),
      model_version_name=mock.ANY,
      ai_platform_serving_args=ai_platform_serving_args,
      labels=job_labels,
      api=mock.ANY,
      skip_model_endpoint_creation=True,
      set_default=False)
  mock_runner.delete_model_from_aip_if_exists.assert_called_once_with(
      model_version_name=mock.ANY,
      ai_platform_serving_args=ai_platform_serving_args,
      api=mock.ANY,
      delete_model_endpoint=False)
def testDoWithOutputExamplesSpecifiedSplits(self):
  self._exec_properties['data_spec'] = proto_utils.proto_to_json(
      text_format.Parse(
          """
              example_splits: 'unlabelled'
          """, bulk_inferrer_pb2.DataSpec()))
  self._exec_properties['output_example_spec'] = proto_utils.proto_to_json(
      text_format.Parse(
          """
              output_columns_spec {
                classify_output {
                  label_column: 'classify_label'
                  score_column: 'classify_score'
                }
              }
          """, bulk_inferrer_pb2.OutputExampleSpec()))

  # Run executor.
  bulk_inferrer = executor.Executor(self._context)
  bulk_inferrer.Do(self._input_dict, self._output_dict_oe,
                   self._exec_properties)

  # Check outputs.
  self.assertTrue(fileio.exists(self._output_examples_dir))
  self._verify_example_split('unlabelled')
  self.assertFalse(
      fileio.exists(os.path.join(self._output_examples_dir, 'unlabelled2')))
def testDoWithBlessedModel(self):
  input_dict = {
      'examples': [self._examples],
      'model_export': [self._model_export],
      'model_blessing': [self._model_blessing],
  }
  output_dict = {
      'output': [self._inference_result],
  }
  # Create exe properties.
  exec_properties = {
      'data_spec': json_format.MessageToJson(bulk_inferrer_pb2.DataSpec()),
      'model_spec': json_format.MessageToJson(bulk_inferrer_pb2.ModelSpec()),
      'component_id': self.component_id,
  }

  # Run executor.
  bulk_inferrer = executor.Executor(self._context)
  bulk_inferrer.Do(input_dict, output_dict, exec_properties)

  # Check outputs.
  self.assertTrue(tf.io.gfile.exists(self._prediction_log_dir))
  results = self._get_results(self._prediction_log_dir)
  self.assertTrue(results)
  self.assertEqual(
      len(results[0].classify_log.response.result.classifications), 1)
  self.assertEqual(
      len(results[0].classify_log.response.result.classifications[0].classes),
      2)
def __init__(self,
             examples: types.Channel,
             model: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                       Dict[Text, Any]]] = None,
             output_example_spec: Optional[Union[
                 bulk_inferrer_pb2.OutputExampleSpec,
                 Dict[Text, Any]]] = None,
             custom_config: Optional[Dict[Text, Any]] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A Channel of type `standard_artifacts.Model`, usually produced by
      a Trainer component.
    model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection. If any field is provided as a RuntimeParameter, data_spec
      should be constructed as a dict with the same field names as the
      DataSpec proto message.
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      Specify it if you want BulkInferrer to output examples instead of an
      inference result. If any field is provided as a RuntimeParameter,
      output_example_spec should be constructed as a dict with the same field
      names as the OutputExampleSpec proto message.
    custom_config: A dict which contains the deployment job parameters to be
      passed to Google Cloud AI Platform.
      custom_config.ai_platform_serving_args needs to contain the serving job
      parameters. For the full set of parameters, refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

  Raises:
    ValueError: If inference_result or output_examples is specified
      inconsistently with whether output_example_spec is set.
  """
  if output_example_spec:
    output_examples = types.Channel(type=standard_artifacts.Examples)
    inference_result = None
  else:
    inference_result = types.Channel(type=standard_artifacts.InferenceResult)
    output_examples = None

  spec = CloudAIBulkInferrerComponentSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      output_example_spec=output_example_spec,
      custom_config=json_utils.dumps(custom_config),
      inference_result=inference_result,
      output_examples=output_examples)
  super(CloudAIBulkInferrerComponent, self).__init__(spec=spec)
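# A minimal usage sketch (assumption, not from the original source): how the
# Cloud AI Platform BulkInferrer above might be wired with custom_config.
# `example_gen`, `trainer`, and `evaluator` are hypothetical upstream
# components; 'ai_platform_serving_args' is the key the executor reads from
# custom_config (see the executor Do() implementation further below).
from tfx.proto import bulk_inferrer_pb2

cloud_bulk_inferrer = CloudAIBulkInferrerComponent(
    examples=example_gen.outputs['examples'],
    model=trainer.outputs['model'],
    model_blessing=evaluator.outputs['blessing'],
    data_spec=bulk_inferrer_pb2.DataSpec(),
    custom_config={
        'ai_platform_serving_args': {
            'model_name': 'model_name',
            'project_id': 'project_id',
        }
    })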
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs batch inference on a given model with given input examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - model: exported model.
      - model_blessing: model blessing result, optional.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: bulk inference results.
    exec_properties: A dict of execution properties.
      - model_spec: JSON string of bulk_inferrer_pb2.ModelSpec instance.
      - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'inference_result' not in output_dict:
    raise ValueError('\'inference_result\' is missing in output dict.')
  output = artifact_utils.get_single_instance(output_dict['inference_result'])
  if 'model' not in input_dict:
    raise ValueError('Input models are not valid, model '
                     'need to be specified.')
  if 'model_blessing' in input_dict:
    model_blessing = artifact_utils.get_single_instance(
        input_dict['model_blessing'])
    if not model_utils.is_model_blessed(model_blessing):
      output.set_int_custom_property('inferred', 0)
      logging.info('Model on %s was not blessed', model_blessing.uri)
      return
  else:
    logging.info('Model blessing is not provided, exported model will be '
                 'used.')

  model = artifact_utils.get_single_instance(input_dict['model'])
  model_path = path_utils.serving_model_path(model.uri)
  logging.info('Use exported model from %s.', model_path)

  data_spec = bulk_inferrer_pb2.DataSpec()
  json_format.Parse(exec_properties['data_spec'], data_spec)
  if self._run_model_inference(
      data_spec, input_dict['examples'], output.uri,
      self._get_inference_spec(model_path, exec_properties)):
    output.set_int_custom_property('inferred', 1)
  else:
    output.set_int_custom_property('inferred', 0)
def __init__(self,
             examples: types.Channel = None,
             model: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                       Dict[Text, Any]]] = None,
             model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                        Dict[Text, Any]]] = None,
             output_example_spec: Optional[Union[
                 bulk_inferrer_pb2.OutputExampleSpec,
                 Dict[Text, Any]]] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A Channel of type `standard_artifacts.Model`, usually produced by
      a Trainer component.
    model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection. If any field is provided as a RuntimeParameter, data_spec
      should be constructed as a dict with the same field names as the
      DataSpec proto message.
    model_spec: bulk_inferrer_pb2.ModelSpec instance that describes the model
      specification. If any field is provided as a RuntimeParameter,
      model_spec should be constructed as a dict with the same field names as
      the ModelSpec proto message.
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      Specify it if you want BulkInferrer to output examples instead of an
      inference result. If any field is provided as a RuntimeParameter,
      output_example_spec should be constructed as a dict with the same field
      names as the OutputExampleSpec proto message.
  """
  if output_example_spec:
    output_examples = types.Channel(type=standard_artifacts.Examples)
    inference_result = None
  else:
    inference_result = types.Channel(type=standard_artifacts.InferenceResult)
    output_examples = None

  spec = BulkInferrerSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
      output_example_spec=output_example_spec,
      inference_result=inference_result,
      output_examples=output_examples)
  super(BulkInferrer, self).__init__(spec=spec)
def __init__(self,
             examples: types.Channel = None,
             model: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                       Dict[Text, Any]]] = None,
             model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                        Dict[Text, Any]]] = None,
             inference_result: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None,
             enable_cache: Optional[bool] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A Channel of type `standard_artifacts.Model`, usually produced by
      a Trainer component.
    model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection. If any field is provided as a RuntimeParameter, data_spec
      should be constructed as a dict with the same field names as the
      DataSpec proto message.
    model_spec: bulk_inferrer_pb2.ModelSpec instance that describes the model
      specification. If any field is provided as a RuntimeParameter,
      model_spec should be constructed as a dict with the same field names as
      the ModelSpec proto message.
    inference_result: Channel of type `standard_artifacts.InferenceResult`
      to store the inference results.
    instance_name: Optional name assigned to this specific instance of
      BulkInferrer. Required only if multiple BulkInferrer components are
      declared in the same pipeline.
    enable_cache: Optional boolean to indicate whether caching is enabled for
      the BulkInferrer component. If not specified, defaults to the value
      specified for the pipeline's enable_cache parameter.
  """
  inference_result = inference_result or types.Channel(
      type=standard_artifacts.InferenceResult,
      artifacts=[standard_artifacts.InferenceResult()])
  spec = BulkInferrerSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
      inference_result=inference_result)
  super(BulkInferrer, self).__init__(
      spec=spec, instance_name=instance_name, enable_cache=enable_cache)
def __init__(
    self,
    examples: types.BaseChannel,
    model: Optional[types.BaseChannel] = None,
    model_blessing: Optional[types.BaseChannel] = None,
    data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                              data_types.RuntimeParameter]] = None,
    model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                               data_types.RuntimeParameter]] = None,
    output_example_spec: Optional[Union[bulk_inferrer_pb2.OutputExampleSpec,
                                        data_types.RuntimeParameter]] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A BaseChannel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A BaseChannel of type `standard_artifacts.Model`, usually produced
      by a Trainer component.
    model_blessing: A BaseChannel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection.
    model_spec: bulk_inferrer_pb2.ModelSpec instance that describes the model
      specification.
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      Specify it if you want BulkInferrer to output examples instead of an
      inference result.
  """
  if output_example_spec:
    output_examples = types.Channel(type=standard_artifacts.Examples)
    inference_result = None
  else:
    inference_result = types.Channel(type=standard_artifacts.InferenceResult)
    output_examples = None

  spec = standard_component_specs.BulkInferrerSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
      output_example_spec=output_example_spec,
      inference_result=inference_result,
      output_examples=output_examples)
  super().__init__(spec=spec)
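# A minimal usage sketch (assumption, not from the original source): wiring
# the BulkInferrer above into a pipeline, mirroring the pipeline examples
# later in this section. `example_gen` and `trainer` are hypothetical
# upstream components.
from tfx.proto import bulk_inferrer_pb2

bulk_inferrer = BulkInferrer(
    examples=example_gen.outputs['examples'],
    model=trainer.outputs['model'],
    # An empty example_splits list means all splits are used for inference.
    data_spec=bulk_inferrer_pb2.DataSpec(),
    model_spec=bulk_inferrer_pb2.ModelSpec())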
def __init__(self,
             examples: types.Channel = None,
             model: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                       Dict[Text, Any]]] = None,
             custom_config: Dict[Text, Any] = None,
             inference_result: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A Channel of type `standard_artifacts.Model`, usually produced by
      a Trainer component.
    model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection. If any field is provided as a RuntimeParameter, data_spec
      should be constructed as a dict with the same field names as the
      DataSpec proto message.
    custom_config: A dict which contains the deployment job parameters to be
      passed to Google Cloud AI Platform.
      custom_config.ai_platform_serving_args needs to contain the serving job
      parameters. For the full set of parameters, refer to
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.models
    inference_result: Channel of type `standard_artifacts.InferenceResult`
      to store the inference results.
    instance_name: Optional name assigned to this specific instance of
      BulkInferrer. Required only if multiple BulkInferrer components are
      declared in the same pipeline.
  """
  inference_result = inference_result or types.Channel(
      type=standard_artifacts.InferenceResult,
      artifacts=[standard_artifacts.InferenceResult()])
  spec = CloudAIBulkInferrerComponentSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      custom_config=json_utils.dumps(custom_config),
      inference_result=inference_result)
  super(CloudAIBulkInferrerComponent, self).__init__(
      spec=spec, instance_name=instance_name)
def __init__(self,
             examples: types.Channel = None,
             model_export: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             model_push: Optional[types.Channel] = None,
             data_spec: Optional[bulk_inferrer_pb2.DataSpec] = None,
             model_spec: Optional[bulk_inferrer_pb2.ModelSpec] = None,
             output: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of 'ExamplesPath' type, usually produced by an
      ExampleGen component. _required_
    model_export: A Channel of 'ModelExportPath' type, usually produced by a
      Trainer component.
    model_blessing: A Channel of 'ModelBlessingPath' type, usually produced
      by a Model Validator component.
    model_push: A Channel of 'PushedModel' type, usually produced by a Pusher
      component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection.
    model_spec: bulk_inferrer_pb2.ModelSpec instance that describes the model
      specification.
    output: Channel of `InferenceResult` to store the inference results.
    instance_name: Optional name assigned to this specific instance of
      BulkInferrer. Required only if multiple BulkInferrer components are
      declared in the same pipeline.
  """
  output = output or types.Channel(
      type=standard_artifacts.InferenceResult,
      artifacts=[standard_artifacts.InferenceResult()])
  spec = BulkInferrerSpec(
      examples=examples,
      model_export=model_export,
      model_blessing=model_blessing,
      model_push=model_push,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
      output=output)
  super(BulkInferrer, self).__init__(spec=spec, instance_name=instance_name)
def testDoFailedModelDeployment(self, mock_runner, mock_run_model_inference,
                                _):
  input_dict = {
      'examples': [self._examples],
      'model': [self._model],
      'model_blessing': [self._model_blessing],
  }
  output_dict = {
      'inference_result': [self._inference_result],
  }
  ai_platform_serving_args = {
      'model_name': 'model_name',
      'project_id': 'project_id'
  }
  # Create exe properties.
  exec_properties = {
      'data_spec':
          json_format.MessageToJson(
              bulk_inferrer_pb2.DataSpec(), preserving_proto_field_name=True),
      'custom_config':
          json_utils.dumps(
              {executor.SERVING_ARGS_KEY: ai_platform_serving_args}),
  }
  mock_runner.deploy_model_for_aip_prediction.side_effect = (
      Exception('Deployment failed'))
  mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1')
  mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = True

  bulk_inferrer = executor.Executor(self._context)
  with self.assertRaises(Exception):
    bulk_inferrer.Do(input_dict, output_dict, exec_properties)

  mock_runner.delete_model_version_from_aip_if_exists.assert_called_once_with(
      mock.ANY, mock.ANY, ai_platform_serving_args)
  mock_runner.delete_model_from_aip_if_exists.assert_called_once_with(
      mock.ANY, ai_platform_serving_args)
def setUp(self):
  super(ExecutorTest, self).setUp()
  self._source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')
  self._output_data_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  self.component_id = 'test_component'

  # Create input dict.
  self._examples = standard_artifacts.Examples()
  unlabelled_path = os.path.join(self._source_data_dir, 'csv_example_gen',
                                 'unlabelled')
  self._examples.uri = os.path.join(self._output_data_dir, 'csv_example_gen')
  io_utils.copy_dir(unlabelled_path,
                    os.path.join(self._examples.uri, 'unlabelled'))
  io_utils.copy_dir(unlabelled_path,
                    os.path.join(self._examples.uri, 'unlabelled2'))
  self._examples.split_names = artifact_utils.encode_split_names(
      ['unlabelled', 'unlabelled2'])
  self._model = standard_artifacts.Model()
  self._model.uri = os.path.join(self._source_data_dir, 'trainer/current')
  self._model_blessing = standard_artifacts.ModelBlessing()
  self._model_blessing.uri = os.path.join(self._source_data_dir,
                                          'model_validator/blessed')
  self._model_blessing.set_int_custom_property('blessed', 1)
  self._input_dict = {
      'examples': [self._examples],
      'model': [self._model],
      'model_blessing': [self._model_blessing],
  }

  # Create output dict.
  self._inference_result = standard_artifacts.InferenceResult()
  self._prediction_log_dir = os.path.join(self._output_data_dir,
                                          'prediction_logs')
  self._inference_result.uri = self._prediction_log_dir

  self._output_examples = standard_artifacts.Examples()
  self._output_examples_dir = os.path.join(self._output_data_dir,
                                           'output_examples')
  self._output_examples.uri = self._output_examples_dir

  self._output_dict_ir = {
      'inference_result': [self._inference_result],
  }
  self._output_dict_oe = {
      'output_examples': [self._output_examples],
  }

  # Create exe properties.
  self._exec_properties = {
      'data_spec': proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()),
      'model_spec': proto_utils.proto_to_json(bulk_inferrer_pb2.ModelSpec()),
      'component_id': self.component_id,
  }

  # Create context.
  self._tmp_dir = os.path.join(self._output_data_dir, '.temp')
  self._context = executor.Executor.Context(
      tmp_dir=self._tmp_dir, unique_id='2')
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs batch inference on a given model with given input examples.

  This function creates a new model (if necessary) and a new model version
  before inference, and cleans up resources after inference. It provides
  re-executability because it cleans up (only) the model resources that were
  created during the process, even if the inference job failed.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - model: exported model.
      - model_blessing: model blessing result.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: bulk inference results.
    exec_properties: A dict of execution properties.
      - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.
      - custom_config: custom_config.ai_platform_serving_args needs to
        contain the serving job parameters sent to Google Cloud AI Platform.
        For the full set of parameters, refer to
        https://cloud.google.com/ml-engine/reference/rest/v1/projects.models

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'inference_result' not in output_dict:
    raise ValueError('\'inference_result\' is missing in output dict.')
  output = artifact_utils.get_single_instance(output_dict['inference_result'])
  if 'model' not in input_dict:
    raise ValueError('Input models are not valid, model '
                     'need to be specified.')
  if 'model_blessing' in input_dict:
    model_blessing = artifact_utils.get_single_instance(
        input_dict['model_blessing'])
    if not model_utils.is_model_blessed(model_blessing):
      output.set_int_custom_property('inferred', 0)
      logging.info('Model on %s was not blessed', model_blessing.uri)
      return
  else:
    logging.info('Model blessing is not provided, exported model will be '
                 'used.')
  if _CUSTOM_CONFIG_KEY not in exec_properties:
    raise ValueError('Input exec properties are not valid, {} '
                     'need to be specified.'.format(_CUSTOM_CONFIG_KEY))

  custom_config = json_utils.loads(
      exec_properties.get(_CUSTOM_CONFIG_KEY, 'null'))
  if custom_config is not None and not isinstance(custom_config, Dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict.')
  ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY)
  if not ai_platform_serving_args:
    raise ValueError(
        '\'ai_platform_serving_args\' is missing in \'custom_config\'')
  service_name, api_version = runner.get_service_name_and_api_version(
      ai_platform_serving_args)
  executor_class_path = '%s.%s' % (self.__class__.__module__,
                                   self.__class__.__name__)
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    job_labels = telemetry_utils.get_labels_dict()
  model = artifact_utils.get_single_instance(input_dict['model'])
  model_path = path_utils.serving_model_path(model.uri)
  logging.info('Use exported model from %s.', model_path)
  # Use the model artifact uri to generate the model version, to guarantee a
  # 1:1 mapping from model version to model.
  model_version = 'version_' + hashlib.sha256(model.uri.encode()).hexdigest()
  inference_spec = self._get_inference_spec(model_path, model_version,
                                            ai_platform_serving_args)
  data_spec = bulk_inferrer_pb2.DataSpec()
  json_format.Parse(exec_properties['data_spec'], data_spec)
  api = discovery.build(service_name, api_version)
  new_model_created = False
  try:
    new_model_created = runner.create_model_for_aip_prediction_if_not_exist(
        api, job_labels, ai_platform_serving_args)
    runner.deploy_model_for_aip_prediction(
        api,
        model_path,
        model_version,
        ai_platform_serving_args,
        job_labels,
        skip_model_creation=True,
        set_default_version=False,
    )
    self._run_model_inference(data_spec, input_dict['examples'], output.uri,
                              inference_spec)
  except Exception as e:
    logging.error('Error in executing CloudAIBulkInferrerComponent: %s',
                  str(e))
    output.set_int_custom_property('inferred', 0)
    raise
  finally:
    # Guarantee newly created resources are cleaned up even if the inference
    # job failed.

    # Clean up the newly deployed model.
    runner.delete_model_version_from_aip_if_exists(api, model_version,
                                                   ai_platform_serving_args)
    if new_model_created:
      runner.delete_model_from_aip_if_exists(api, ai_platform_serving_args)
  # Mark the inference as successful after resources are cleaned up.
  output.set_int_custom_property('inferred', 1)
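# Illustration (assumption, not from the original source): the version name
# derived above is deterministic in the model artifact URI, so re-running the
# executor for the same model targets the same AI Platform version. The URI
# below is hypothetical.
import hashlib

model_uri = 'gs://bucket/pipeline/Trainer/model/123'
model_version = 'version_' + hashlib.sha256(model_uri.encode()).hexdigest()
# model_version is 'version_' followed by 64 hex characters.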
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     direct_num_workers: int) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  training_examples = external_input(training_data_root)

  # Brings training data into the pipeline or otherwise joins/converts
  # training data.
  training_example_gen = CsvExampleGen(
      input_base=training_examples, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  infer_schema = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  validate_stats = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=infer_schema.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=infer_schema.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Uses TFMA to compute evaluation statistics over features of a model.
  model_analyzer = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model_exports=trainer.outputs['model'],
      feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
          evaluator_pb2.SingleSlicingSpec(
              column_for_slicing=['trip_start_hour'])
      ]))

  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'])

  # Brings inference data into the pipeline.
  inference_examples = external_input(inference_data_root)
  inference_example_gen = CsvExampleGen(
      input_base=inference_examples,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)
          ])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=model_validator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          infer_schema, validate_stats, transform, trainer, model_analyzer,
          model_validator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      # TODO(b/141578059): The multi-processing API might change.
      beam_pipeline_args=['--direct_num_workers=%d' % direct_num_workers])
def __init__(self,
             examples: types.Channel = None,
             model: Optional[types.Channel] = None,
             model_blessing: Optional[types.Channel] = None,
             data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                       Dict[Text, Any]]] = None,
             model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                        Dict[Text, Any]]] = None,
             output_example_spec: Optional[Union[
                 bulk_inferrer_pb2.OutputExampleSpec,
                 Dict[Text, Any]]] = None,
             inference_result: Optional[types.Channel] = None,
             output_examples: Optional[types.Channel] = None,
             instance_name: Optional[Text] = None):
  """Construct a BulkInferrer component.

  Args:
    examples: A Channel of type `standard_artifacts.Examples`, usually
      produced by an ExampleGen component. _required_
    model: A Channel of type `standard_artifacts.Model`, usually produced by
      a Trainer component.
    model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
      usually produced by a ModelValidator component.
    data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
      selection. If any field is provided as a RuntimeParameter, data_spec
      should be constructed as a dict with the same field names as the
      DataSpec proto message.
    model_spec: bulk_inferrer_pb2.ModelSpec instance that describes the model
      specification. If any field is provided as a RuntimeParameter,
      model_spec should be constructed as a dict with the same field names as
      the ModelSpec proto message.
    output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance.
      Specify it if you want BulkInferrer to output examples instead of an
      inference result. If any field is provided as a RuntimeParameter,
      output_example_spec should be constructed as a dict with the same field
      names as the OutputExampleSpec proto message.
    inference_result: Channel of type `standard_artifacts.InferenceResult`
      to store the inference results; must not be specified when
      output_example_spec is set.
    output_examples: Channel of type `standard_artifacts.Examples` to store
      the output examples; must not be specified when output_example_spec is
      unset. Check output_example_spec for details.
    instance_name: Optional name assigned to this specific instance of
      BulkInferrer. Required only if multiple BulkInferrer components are
      declared in the same pipeline.

  Raises:
    ValueError: If inference_result is specified when output_example_spec is
      set, or output_examples is specified when output_example_spec is unset.
  """
  if output_example_spec:
    if inference_result:
      raise ValueError(
          'Must not specify inference_result when output_example_spec is set.'
      )
    output_examples = output_examples or types.Channel(
        type=standard_artifacts.Examples)
  else:
    if output_examples:
      raise ValueError(
          'Must not specify output_examples when output_example_spec is unset.'
      )
    inference_result = inference_result or types.Channel(
        type=standard_artifacts.InferenceResult)

  spec = BulkInferrerSpec(
      examples=examples,
      model=model,
      model_blessing=model_blessing,
      data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
      model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
      output_example_spec=output_example_spec,
      inference_result=inference_result,
      output_examples=output_examples)
  super(BulkInferrer, self).__init__(spec=spec, instance_name=instance_name)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs batch inference on a given model with given input examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - model: exported model.
      - model_blessing: model blessing result.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: bulk inference results.
    exec_properties: A dict of execution properties.
      - model_spec: JSON string of bulk_inferrer_pb2.ModelSpec instance.
      - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'inference_result' not in output_dict:
    raise ValueError('\'inference_result\' is missing in output dict.')
  output = artifact_utils.get_single_instance(output_dict['inference_result'])
  if 'model' not in input_dict:
    raise ValueError('Input models are not valid, model '
                     'need to be specified.')
  if 'model_blessing' in input_dict:
    model_blessing = artifact_utils.get_single_instance(
        input_dict['model_blessing'])
    if not model_utils.is_model_blessed(model_blessing):
      output.set_int_custom_property('inferred', 0)
      logging.info('Model on %s was not blessed', model_blessing.uri)
      return
  else:
    logging.info('Model blessing is not provided, exported model will be '
                 'used.')

  model = artifact_utils.get_single_instance(input_dict['model'])
  model_path = path_utils.serving_model_path(model.uri)
  logging.info('Use exported model from %s.', model_path)

  data_spec = bulk_inferrer_pb2.DataSpec()
  json_format.Parse(exec_properties['data_spec'], data_spec)
  example_uris = {}
  if data_spec.example_splits:
    for example in input_dict['examples']:
      for split in artifact_utils.decode_split_names(example.split_names):
        if split in data_spec.example_splits:
          example_uris[split] = os.path.join(example.uri, split)
  else:
    for example in input_dict['examples']:
      for split in artifact_utils.decode_split_names(example.split_names):
        example_uris[split] = os.path.join(example.uri, split)
  model_spec = bulk_inferrer_pb2.ModelSpec()
  json_format.Parse(exec_properties['model_spec'], model_spec)
  output_path = os.path.join(output.uri, _PREDICTION_LOGS_DIR_NAME)
  self._run_model_inference(model_path, example_uris, output_path, model_spec)
  logging.info('BulkInferrer generates prediction log to %s', output_path)
  output.set_int_custom_property('inferred', 1)
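# Worked illustration (assumption, not from the original source): with an
# Examples artifact whose split_names encode ['unlabelled', 'unlabelled2']
# (as in the test setUp above) and a DataSpec selecting only 'unlabelled',
# the split-selection loop above produces:
#   example_uris == {'unlabelled': os.path.join(example.uri, 'unlabelled')}
# With an empty example_splits, both splits would be included.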
def _create_pipeline(pipeline_name: Text, pipeline_root: Text,
                     training_data_root: Text, inference_data_root: Text,
                     module_file: Text, metadata_path: Text,
                     beam_pipeline_args: List[Text]) -> pipeline.Pipeline:
  """Implements the chicago taxi pipeline with TFX."""
  # Brings training data into the pipeline or otherwise joins/converts
  # training data.
  training_example_gen = CsvExampleGen(
      input_base=training_data_root, instance_name='training_example_gen')

  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(
      input_data=training_example_gen.outputs['examples'])

  # Generates schema based on statistics files.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=False)

  # Performs anomaly detection based on statistics and data schema.
  example_validator = ExampleValidator(
      statistics=statistics_gen.outputs['statistics'],
      schema=schema_gen.outputs['schema'])

  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=training_example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  # Uses user-provided Python function that implements a model using TF-Learn.
  trainer = Trainer(
      module_file=module_file,
      transformed_examples=transform.outputs['transformed_examples'],
      schema=schema_gen.outputs['schema'],
      transform_graph=transform.outputs['transform_graph'],
      train_args=trainer_pb2.TrainArgs(num_steps=10000),
      eval_args=trainer_pb2.EvalArgs(num_steps=5000))

  # Get the latest blessed model for model validation.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(signature_name='eval')],
      slicing_specs=[
          tfma.SlicingSpec(),
          tfma.SlicingSpec(feature_keys=['trip_start_hour'])
      ],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={
                  'accuracy':
                      tfma.config.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.6}),
                          # Change threshold will be ignored if there is no
                          # baseline model resolved from MLMD (first run).
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ])
  evaluator = Evaluator(
      examples=training_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      baseline_model=model_resolver.outputs['model'],
      eval_config=eval_config)

  # Brings inference data into the pipeline.
  inference_example_gen = CsvExampleGen(
      input_base=inference_data_root,
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name='unlabelled', hash_buckets=100)
          ])),
      instance_name='inference_example_gen')

  # Performs offline batch inference over inference examples.
  bulk_inferrer = BulkInferrer(
      examples=inference_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      # Empty data_spec.example_splits will result in using all splits.
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec())

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[
          training_example_gen, inference_example_gen, statistics_gen,
          schema_gen, example_validator, transform, trainer, model_resolver,
          evaluator, bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          metadata_path),
      beam_pipeline_args=beam_pipeline_args)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs batch inference on a given model with given input examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - model: exported model.
      - model_blessing: model blessing result, optional.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: bulk inference results.
    exec_properties: A dict of execution properties.
      - model_spec: JSON string of bulk_inferrer_pb2.ModelSpec instance.
      - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if output_dict.get(standard_component_specs.INFERENCE_RESULT_KEY):
    inference_result = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.INFERENCE_RESULT_KEY])
  else:
    inference_result = None
  if output_dict.get(standard_component_specs.OUTPUT_EXAMPLES_KEY):
    output_examples = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.OUTPUT_EXAMPLES_KEY])
  else:
    output_examples = None

  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'model' not in input_dict:
    raise ValueError('Input models are not valid, model '
                     'need to be specified.')
  if standard_component_specs.MODEL_BLESSING_KEY in input_dict:
    model_blessing = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.MODEL_BLESSING_KEY])
    if not model_utils.is_model_blessed(model_blessing):
      logging.info('Model on %s was not blessed', model_blessing.uri)
      return
  else:
    logging.info('Model blessing is not provided, exported model will be '
                 'used.')

  model = artifact_utils.get_single_instance(
      input_dict[standard_component_specs.MODEL_KEY])
  model_path = path_utils.serving_model_path(
      model.uri, path_utils.is_old_model_artifact(model))
  logging.info('Use exported model from %s.', model_path)

  data_spec = bulk_inferrer_pb2.DataSpec()
  proto_utils.json_to_proto(
      exec_properties[standard_component_specs.DATA_SPEC_KEY], data_spec)

  output_example_spec = bulk_inferrer_pb2.OutputExampleSpec()
  if exec_properties.get(standard_component_specs.OUTPUT_EXAMPLE_SPEC_KEY):
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.OUTPUT_EXAMPLE_SPEC_KEY],
        output_example_spec)

  self._run_model_inference(
      data_spec, output_example_spec,
      input_dict[standard_component_specs.EXAMPLES_KEY], output_examples,
      inference_result, self._get_inference_spec(model_path, exec_properties))
def generate_pipeline(pipeline_name, pipeline_root, train_data, test_data,
                      train_steps, eval_steps, pusher_target, runner):
  module_file = 'util.py'  # util.py is a file in the same folder.

  # RuntimeParameter is currently only supported on KubeflowDagRunner.
  if runner == 'kubeflow':
    pipeline_root_param = os.path.join('gs://{{kfp-default-bucket}}',
                                       pipeline_name, '{{workflow.uid}}')
    train_data_param = data_types.RuntimeParameter(
        name='train-data',
        default='gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/train',
        ptype=Text)
    test_data_param = data_types.RuntimeParameter(
        name='test-data',
        default='gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/test',
        ptype=Text)
    pusher_target_param = data_types.RuntimeParameter(
        name='pusher-destination',
        default='gs://renming-mlpipeline-kubeflowpipelines-default/kaggle/santander/serving',
        ptype=Text)
  else:
    pipeline_root_param = pipeline_root
    train_data_param = train_data
    test_data_param = test_data
    pusher_target_param = pusher_target

  examples = external_input(train_data_param)
  example_gen = CsvExampleGen(input=examples, instance_name="train")

  test_examples = external_input(test_data_param)
  test_example_gen = CsvExampleGen(
      input=test_examples,
      output_config={
          'split_config': {
              'splits': [{
                  'name': 'test',
                  'hash_buckets': 1
              }]
          }
      },
      instance_name="test")

  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  # infer_feature_shape controls sparse or dense features.
  schema_gen = SchemaGen(
      statistics=statistics_gen.outputs['statistics'],
      infer_feature_shape=True)

  # Transform is too slow on my side.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file)

  trainer = Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(GenericExecutor),
      examples=transform.outputs['transformed_examples'],
      transform_graph=transform.outputs['transform_graph'],
      schema=schema_gen.outputs['schema'],
      module_file=module_file,
      train_args=trainer_pb2.TrainArgs(num_steps=train_steps),
      eval_args=trainer_pb2.EvalArgs(num_steps=eval_steps),
      instance_name="train",
      enable_cache=False)

  # Get the latest blessed model for model validation.
  model_resolver = ResolverNode(
      instance_name='latest_blessed_model_resolver',
      resolver_class=latest_blessed_model_resolver.LatestBlessedModelResolver,
      model=Channel(type=Model),
      model_blessing=Channel(type=ModelBlessing))

  # Uses TFMA to compute evaluation statistics over features of a model and
  # perform quality validation of a candidate model (compared to a baseline).
  eval_config = tfma.EvalConfig(
      model_specs=[tfma.ModelSpec(label_key='target')],
      # With more slices added, e.g. tfma.SlicingSpec(feature_keys=['var_0',
      # 'var_1']), Evaluator can't output a BLESSED status; it appears to be
      # a bug in TFMA.
      slicing_specs=[tfma.SlicingSpec()],
      metrics_specs=[
          tfma.MetricsSpec(
              thresholds={
                  'binary_accuracy':
                      tfma.config.MetricThreshold(
                          value_threshold=tfma.GenericValueThreshold(
                              lower_bound={'value': 0.4}),
                          change_threshold=tfma.GenericChangeThreshold(
                              direction=tfma.MetricDirection.HIGHER_IS_BETTER,
                              absolute={'value': -1e-10}))
              })
      ])
  evaluator = Evaluator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      # baseline_model=model_resolver.outputs['model'],
      # Change threshold will be ignored if there is no baseline (first run).
      eval_config=eval_config,
      instance_name="eval5")

  # Checks whether the model passed the validation steps and pushes the model
  # to a file destination if the check passed.
  pusher = Pusher(
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      push_destination={
          'filesystem': {
              'base_directory': pusher_target_param
          }
      })

  bulk_inferrer = BulkInferrer(
      examples=test_example_gen.outputs['examples'],
      model=trainer.outputs['model'],
      # model_blessing=evaluator.outputs['blessing'],
      data_spec=bulk_inferrer_pb2.DataSpec(),
      model_spec=bulk_inferrer_pb2.ModelSpec(),
      instance_name="bulkInferrer")

  hello = component.HelloComponent(
      input_data=bulk_inferrer.outputs['inference_result'],
      instance_name='csvGen')

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root_param,
      components=[
          example_gen, statistics_gen, schema_gen, transform, trainer,
          model_resolver, evaluator, pusher, hello, test_example_gen,
          bulk_inferrer
      ],
      enable_cache=True,
      metadata_connection_config=metadata.sqlite_metadata_connection_config(
          os.path.join(pipeline_root, 'metadata.sqlite')),
      beam_pipeline_args=['--direct_num_workers=0'])
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Runs batch inference on a given model with given input examples.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - examples: examples for inference.
      - model_export: exported model.
      - model_blessing: model blessing result.
      - model_push: pushed model.
      Either model_push or (model_export and model_blessing) needs to be
      present.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: bulk inference results.
    exec_properties: A dict of execution properties.
      - model_spec: JSON string of bulk_inferrer_pb2.ModelSpec instance.
      - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  if 'examples' not in input_dict:
    raise ValueError('\'examples\' is missing in input dict.')
  if 'output' not in output_dict:
    raise ValueError('\'output\' is missing in output dict.')
  output = artifact_utils.get_single_instance(output_dict['output'])
  if 'model_push' in input_dict:
    model_push = artifact_utils.get_single_instance(input_dict['model_push'])
    model_path = io_utils.get_only_uri_in_dir(model_push.uri)
    logging.info('Use pushed model from %s.', model_path)
  elif 'model_blessing' in input_dict and 'model_export' in input_dict:
    model_blessing = artifact_utils.get_single_instance(
        input_dict['model_blessing'])
    if not model_utils.is_model_blessed(model_blessing):
      output.set_int_custom_property('inferred', 0)
      logging.info('Model on %s was not blessed', model_blessing.uri)
      return
    model_export = artifact_utils.get_single_instance(
        input_dict['model_export'])
    model_path = path_utils.serving_model_path(model_export.uri)
    logging.info('Use exported model from %s.', model_path)
  else:
    raise ValueError('Input models are not valid. Either model_push or '
                     '(model_blessing and model_export) need to be '
                     'specified.')

  data_spec = bulk_inferrer_pb2.DataSpec()
  json_format.Parse(exec_properties['data_spec'], data_spec)
  example_uris = {}
  if data_spec.example_splits:
    for example in input_dict['examples']:
      if example.split in data_spec.example_splits:
        example_uris[example.split] = example.uri
  else:
    for example in input_dict['examples']:
      example_uris[example.split] = example.uri
  model_spec = bulk_inferrer_pb2.ModelSpec()
  json_format.Parse(exec_properties['model_spec'], model_spec)
  output_path = os.path.join(output.uri, _PREDICTION_LOGS_DIR_NAME)
  self._run_model_inference(model_path, example_uris, output_path, model_spec)
  logging.info('BulkInferrer generates prediction log to %s', output_path)
  output.set_int_custom_property('inferred', 1)