def testScopedLabels(self): """Test for scoped_labels.""" orig_labels = telemetry_utils.get_labels_dict() with telemetry_utils.scoped_labels({'foo': 'bar'}): self.assertDictEqual(telemetry_utils.get_labels_dict(), dict({'foo': 'bar'}, **orig_labels)) with telemetry_utils.scoped_labels({'inner': 'baz'}): self.assertDictEqual( telemetry_utils.get_labels_dict(), dict({ 'foo': 'bar', 'inner': 'baz' }, **orig_labels))
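# Hedged usage sketch (not part of the test above): the production pattern this
# test exercises is to wrap job submission in scoped_labels so the executor
# label is merged into the active label set and dropped again on exit.
# Assumes tfx.utils.telemetry_utils is importable from the installed TFX.
from tfx.utils import telemetry_utils

def _job_labels_for(executor_class_path):
  # Returns the labels that would be attached to an external job (e.g. a
  # BigQuery or AI Platform job) launched on behalf of this executor.
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
    return telemetry_utils.get_labels_dict()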
def testDoBlessed(self, mock_runner, _): self._model_blessing.uri = os.path.join(self._source_data_dir, 'model_validator/blessed') self._model_blessing.set_int_custom_property('blessed', 1) mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1') self._executor.Do(self._input_dict, self._output_dict, self._serialize_custom_config_under_test()) executor_class_path = '%s.%s' % (self._executor.__class__.__module__, self._executor.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( mock.ANY, self._model_push.uri, mock.ANY, mock.ANY, job_labels, ) self.assertPushed() version = self._model_push.get_string_custom_property('pushed_version') self.assertEqual( self._model_push.get_string_custom_property('pushed_destination'), 'projects/project_id/models/model_name/versions/{}'.format( version))
def _ReadFromBigQueryImpl( # pylint: disable=invalid-name pipeline: beam.Pipeline, query: Text, use_bigquery_source: bool = False) -> beam.pvalue.PCollection: """Read from BigQuery. Args: pipeline: beam pipeline. query: a BigQuery sql string. use_bigquery_source: Whether to use BigQuerySource instead of experimental `ReadFromBigQuery` PTransform. Returns: PCollection of dict. """ # TODO(b/155441037): Consolidate to ReadFromBigQuery once its performance # on dataflow runner is on par with BigQuerySource. if use_bigquery_source: return ( pipeline | 'ReadFromBigQuerySource' >> beam.io.Read( beam.io.BigQuerySource(query=query, use_standard_sql=True))) return (pipeline | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery( query=query, use_standard_sql=True, bigquery_job_labels=telemetry_utils.get_labels_dict()))
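# Hedged usage sketch for the helper above: applying it inside a Beam pipeline
# and printing a row count. The query string is a placeholder, and actually
# running this requires BigQuery access and GCP credentials.
import apache_beam as beam

with beam.Pipeline() as p:
  rows = _ReadFromBigQueryImpl(
      pipeline=p,
      query='SELECT 1 AS f',  # placeholder query
      use_bigquery_source=False)
  _ = (rows
       | 'CountRows' >> beam.combiners.Count.Globally()
       | 'PrintCount' >> beam.Map(print))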
def setUp(self): super(RunnerTest, self).setUp() self._output_data_dir = os.path.join( os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), self._testMethodName) self._project_id = '12345' self._mock_api_client = mock.Mock() self._inputs = {} self._outputs = {} self._training_inputs = { 'project': self._project_id, } self._job_id = 'my_jobid' # Dict format of exec_properties. custom_config needs to be serialized # before being passed into start_aip_training function. self._exec_properties = { 'custom_config': { executor.TRAINING_ARGS_KEY: self._training_inputs, }, } self._model_name = 'model_name' self._ai_platform_serving_args = { 'model_name': self._model_name, 'project_id': self._project_id, } self._executor_class_path = 'my.executor.Executor' with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}): self._job_labels = telemetry_utils.get_labels_dict()
def _ReadFromBigQueryImpl( # pylint: disable=invalid-name pipeline: beam.Pipeline, query: Text, use_bigquery_source: bool = False) -> beam.pvalue.PCollection: """Read from BigQuery. Args: pipeline: beam pipeline. query: a BigQuery sql string. use_bigquery_source: Whether to use BigQuerySource instead of experimental `ReadFromBigQuery` PTransform. Returns: PCollection of dict. """ if use_bigquery_source: return ( pipeline | 'ReadFromBigQuerySource' >> beam.io.Read( beam.io.BigQuerySource(query=query, use_standard_sql=True))) return (pipeline | 'ReadFromBigQuery' >> ReadFromBigQuery( query=query, use_standard_sql=True, bigquery_job_labels=telemetry_utils.get_labels_dict()))
def testDoWithBlessedModel(self, mock_runner, mock_run_model_inference, _): input_dict = { 'examples': [self._examples], 'model': [self._model], 'model_blessing': [self._model_blessing], } output_dict = { 'inference_result': [self._inference_result], } ai_platform_serving_args = { 'model_name': 'model_name', 'project_id': 'project_id' } # Create exe properties. exec_properties = { 'data_spec': proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()), 'custom_config': json_utils.dumps( {executor.SERVING_ARGS_KEY: ai_platform_serving_args}), } mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1') mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = True # Run executor. bulk_inferrer = executor.Executor(self._context) bulk_inferrer.Do(input_dict, output_dict, exec_properties) ai_platform_prediction_model_spec = ( model_spec_pb2.AIPlatformPredictionModelSpec( project_id='project_id', model_name='model_name', version_name=self._model_version)) ai_platform_prediction_model_spec.use_serialization_config = True inference_endpoint = model_spec_pb2.InferenceSpecType() inference_endpoint.ai_platform_prediction_model_spec.CopyFrom( ai_platform_prediction_model_spec) mock_run_model_inference.assert_called_once_with( mock.ANY, mock.ANY, mock.ANY, mock.ANY, mock.ANY, inference_endpoint) executor_class_path = '%s.%s' % (bulk_inferrer.__class__.__module__, bulk_inferrer.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( mock.ANY, path_utils.serving_model_path(self._model.uri), mock.ANY, ai_platform_serving_args, job_labels, skip_model_creation=True, set_default_version=False, ) mock_runner.delete_model_version_from_aip_if_exists.assert_called_once_with( mock.ANY, mock.ANY, ai_platform_serving_args) mock_runner.delete_model_from_aip_if_exists.assert_called_once_with( mock.ANY, ai_platform_serving_args)
def run(self, pipeline: tfx_pipeline.Pipeline, parameter_values: Optional[Dict[Text, Any]] = None, write_out: Optional[bool] = True) -> Dict[Text, Any]: """Compiles a pipeline DSL object into pipeline file. Args: pipeline: TFX pipeline object. parameter_values: mapping from runtime parameter names to its values. write_out: set to True to actually write out the file to the place designated by output_dir and output_filename. Otherwise return the JSON-serialized pipeline job spec. Returns: Returns the JSON pipeline job spec. Raises: RuntimeError: if trying to write out to a place occupied by an existing file. """ # TODO(b/166343606): Support user-provided labels. # TODO(b/169095387): Deprecate .run() method in favor of the unified API # client. display_name = (self._config.display_name or pipeline.pipeline_info.pipeline_name) pipeline_spec = pipeline_builder.PipelineBuilder( tfx_pipeline=pipeline, default_image=self._config.default_image, default_commands=self._config.default_commands).build() pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__) pipeline_spec.schema_version = _SCHEMA_VERSION runtime_config = pipeline_builder.RuntimeConfigBuilder( pipeline_info=pipeline.pipeline_info, parameter_values=parameter_values).build() with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}): result = pipeline_spec_pb2.PipelineJob( display_name=display_name or pipeline.pipeline_info.pipeline_name, labels=telemetry_utils.get_labels_dict(), runtime_config=runtime_config) result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec)) pipeline_json_dict = json_format.MessageToDict(result) if write_out: if fileio.exists( self._output_dir) and not fileio.isdir(self._output_dir): raise RuntimeError('Output path: %s is pointed to a file.' % self._output_dir) if not fileio.exists(self._output_dir): fileio.makedirs(self._output_dir) with fileio.open( os.path.join(self._output_dir, self._output_filename), 'wb') as f: f.write(json.dumps(pipeline_json_dict, sort_keys=True)) return pipeline_json_dict
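# Hedged sketch of invoking the run() method above. The runner/config class
# names and all values are assumptions inferred from the attributes the method
# reads (display_name, default_image, output_dir, output_filename); they are
# not confirmed by this file. `my_tfx_pipeline` is a hypothetical pipeline.
runner = KubeflowV2DagRunner(
    config=KubeflowV2DagRunnerConfig(
        display_name='my-pipeline',
        default_image='gcr.io/my-project/my-tfx-image'),
    output_dir='gs://my-bucket/pipeline-spec',
    output_filename='pipeline.json')
job_spec_dict = runner.run(
    my_tfx_pipeline,
    parameter_values={'train_steps': 1000},
    write_out=True)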
def _assertDeployModelMockCalls(self, expected_models_create_body=None, expected_versions_create_body=None, expect_set_default=True): if not expected_models_create_body: expected_models_create_body = { 'name': self._model_name, 'regions': [], } if not expected_versions_create_body: with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path }): labels = telemetry_utils.get_labels_dict() expected_versions_create_body = { 'name': self._model_version, 'deployment_uri': self._serving_path, 'runtime_version': runner._get_tf_runtime_version(tf.__version__), 'python_version': runner._get_caip_python_version( runner._get_tf_runtime_version(tf.__version__)), 'labels': labels } self._mock_models_create.assert_called_with( body=mock.ANY, parent='projects/{}'.format(self._project_id), ) (_, models_create_kwargs) = self._mock_models_create.call_args self.assertDictEqual(expected_models_create_body, models_create_kwargs['body']) self._mock_versions_create.assert_called_with( body=mock.ANY, parent='projects/{}/models/{}'.format(self._project_id, self._model_name)) (_, versions_create_kwargs) = self._mock_versions_create.call_args self.assertDictEqual(expected_versions_create_body, versions_create_kwargs['body']) if not expect_set_default: return self._mock_set_default.assert_called_with( name='projects/{}/models/{}/versions/{}'.format( self._project_id, self._model_name, self._model_version)) self._mock_set_default_execute.assert_called_with()
def ReadFromBigQuery( pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection: """Read data from BigQuery. Args: pipeline: Beam pipeline. query: A BigQuery sql string. Returns: PCollection of dict. """ return (pipeline | 'ReadFromBigQuery' >> bigquery.ReadFromBigQuery( query=query, use_standard_sql=True, bigquery_job_labels=telemetry_utils.get_labels_dict()))
def _ReadFromBigQueryImpl( # pylint: disable=invalid-name pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection: """Read from BigQuery. Args: pipeline: beam pipeline. query: a BigQuery sql string. Returns: PCollection of dict. """ return (pipeline | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery( query=query, use_standard_sql=True, bigquery_job_labels=telemetry_utils.get_labels_dict()))
def testDeployModelForAIPPredictionWithCustomRegion(self, mock_discovery): mock_discovery.build.return_value = self._mock_api_client self._setUpPredictionMocks() self._ai_platform_serving_args['regions'] = ['custom-region'] runner.deploy_model_for_aip_prediction(self._serving_path, self._model_version, self._ai_platform_serving_args, self._executor_class_path) self._mock_models_create.assert_called_with( body=mock.ANY, parent='projects/{}'.format(self._project_id), ) (_, models_create_kwargs) = self._mock_models_create.call_args models_create_body = models_create_kwargs['body'] self.assertDictEqual( { 'name': 'model_name', 'regions': ['custom-region'] }, models_create_body) self._mock_versions_create.assert_called_with( body=mock.ANY, parent='projects/{}/models/{}'.format(self._project_id, 'model_name')) (_, versions_create_kwargs) = self._mock_versions_create.call_args versions_create_body = versions_create_kwargs['body'] with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}): labels = telemetry_utils.get_labels_dict() runtime_version = runner._get_tf_runtime_version(tf.__version__) self.assertDictEqual( { 'name': self._model_version, 'deployment_uri': self._serving_path, 'runtime_version': runtime_version, 'python_version': runner._get_caip_python_version(runtime_version), 'labels': labels, }, versions_create_body) self._mock_get.assert_called_with(name='op_name') self._mock_set_default.assert_called_with( name='projects/{}/models/{}/versions/{}'.format( self._project_id, 'model_name', self._model_version)) self._mock_set_default_execute.assert_called_with()
def testDeployModelForAIPPredictionWithCustomRuntime(self, mock_discovery): mock_discovery.build.return_value = self._mock_api_client self._setUpPredictionMocks() self._ai_platform_serving_args['runtime_version'] = '1.23.45' runner.deploy_model_for_aip_prediction(self._serving_path, self._model_version, self._ai_platform_serving_args, self._executor_class_path) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}): labels = telemetry_utils.get_labels_dict() expected_versions_create_body = { 'name': self._model_version, 'deployment_uri': self._serving_path, 'runtime_version': '1.23.45', 'python_version': runner._get_caip_python_version('1.23.45'), 'labels': labels, } self._assertDeployModelMockCalls( expected_versions_create_body=expected_versions_create_body)
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]) -> None: """Runs batch inference on a given model with given input examples. This function creates a new model (if necessary) and a new model version before inference, and cleans up resources after inference. It provides re-executability, as it cleans up (only) the model resources that were created during the process even if the inference job failed. Args: input_dict: Input dict from input key to a list of Artifacts. - examples: examples for inference. - model: exported model. - model_blessing: model blessing result. output_dict: Output dict from output key to a list of Artifacts. - output: bulk inference results. exec_properties: A dict of execution properties. - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance. - custom_config: custom_config.ai_platform_serving_args needs to contain the serving job parameters sent to Google Cloud AI Platform. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.models Returns: None """ self._log_startup(input_dict, output_dict, exec_properties) if 'examples' not in input_dict: raise ValueError('\'examples\' is missing in input dict.') if 'inference_result' not in output_dict: raise ValueError('\'inference_result\' is missing in output dict.') output = artifact_utils.get_single_instance(output_dict['inference_result']) if 'model' not in input_dict: raise ValueError('Input models are not valid, a model ' 'needs to be specified.') if 'model_blessing' in input_dict: model_blessing = artifact_utils.get_single_instance( input_dict['model_blessing']) if not model_utils.is_model_blessed(model_blessing): output.set_int_custom_property('inferred', 0) logging.info('Model on %s was not blessed', model_blessing.uri) return else: logging.info('Model blessing is not provided, exported model will be ' 'used.') if _CUSTOM_CONFIG_KEY not in exec_properties: raise ValueError('Input exec properties are not valid, {} ' 'needs to be specified.'.format(_CUSTOM_CONFIG_KEY)) custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError('custom_config in execution properties needs to be a ' 'dict.') ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY) if not ai_platform_serving_args: raise ValueError( '\'ai_platform_serving_args\' is missing in \'custom_config\'') service_name, api_version = runner.get_service_name_and_api_version( ai_platform_serving_args) executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() model = artifact_utils.get_single_instance(input_dict['model']) model_path = path_utils.serving_model_path(model.uri) logging.info('Use exported model from %s.', model_path) # Use model artifact uri to generate model version to guarantee the # 1:1 mapping from model version to model. 
model_version = 'version_' + hashlib.sha256(model.uri.encode()).hexdigest() inference_spec = self._get_inference_spec(model_path, model_version, ai_platform_serving_args) data_spec = bulk_inferrer_pb2.DataSpec() json_format.Parse(exec_properties['data_spec'], data_spec) api = discovery.build(service_name, api_version) new_model_created = False try: new_model_created = runner.create_model_for_aip_prediction_if_not_exist( api, job_labels, ai_platform_serving_args) runner.deploy_model_for_aip_prediction( api, model_path, model_version, ai_platform_serving_args, job_labels, skip_model_creation=True, set_default_version=False, ) self._run_model_inference(data_spec, input_dict['examples'], output.uri, inference_spec) except Exception as e: logging.error('Error in executing CloudAIBulkInferrerComponent: %s', str(e)) output.set_int_custom_property('inferred', 0) raise finally: # Guarantee newly created resources are cleaned up even if the inference # job failed. # Clean up the newly deployed model. runner.delete_model_version_from_aip_if_exists(api, model_version, ai_platform_serving_args) if new_model_created: runner.delete_model_from_aip_if_exists(api, ai_platform_serving_args) # Mark the inference as successful after resources are cleaned up. output.set_int_custom_property('inferred', 1)
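# Tiny self-contained illustration of the version naming scheme above: the
# AI Platform model version is derived deterministically from the model
# artifact URI, so the same URI always maps to the same version name.
# The URI below is a placeholder.
import hashlib

_uri = 'gs://my-bucket/pipeline/Trainer/model/7'
_version = 'version_' + hashlib.sha256(_uri.encode()).hexdigest()
print(_version)  # e.g. 'version_3d7c...'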
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from model_validator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. custom_config.bigquery_serving_args is consumed by this class. For the full set of parameters supported by Big Query ML, refer to https://cloud.google.com/bigquery-ml/ Returns: None Raises: ValueError: If bigquery_serving_args is not in exec_properties.custom_config. If pipeline_root is not 'gs://...' RuntimeError: if the Big Query job failed. """ self._log_startup(input_dict, output_dict, exec_properties) model_push = artifact_utils.get_single_instance( output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY]) if not self.CheckBlessing(input_dict): self._MarkNotPushed(model_push) return model_export = artifact_utils.get_single_instance( input_dict[tfx_pusher_executor.MODEL_KEY]) model_export_uri = model_export.uri custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError('custom_config in execution properties needs to be a ' 'dict.') bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY) # if configuration is missing error out if bigquery_serving_args is None: raise ValueError('Big Query ML configuration was not provided') bq_model_uri = '.'.join([ bigquery_serving_args[_PROJECT_ID_KEY], bigquery_serving_args[_BQ_DATASET_ID_KEY], bigquery_serving_args[_MODEL_NAME_KEY], ]) # Deploy the model. io_utils.copy_dir( src=path_utils.serving_model_path(model_export_uri), dst=model_push.uri) model_path = model_push.uri if not model_path.startswith(_GCS_PREFIX): raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.') logging.info('Deploying the model to BigQuery ML for serving: %s from %s', bigquery_serving_args, model_path) query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format( model_uri=bq_model_uri, model_path=model_path) # TODO(zhitaoli): Refactor the executor_class_path creation into a common # utility function. executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): default_query_job_config = bigquery.job.QueryJobConfig( labels=telemetry_utils.get_labels_dict()) client = bigquery.Client(default_query_job_config=default_query_job_config) try: query_job = client.query(query) query_job.result() # Waits for the query to finish except Exception as e: raise RuntimeError('BigQuery ML Push failed: {}'.format(e)) logging.info('Successfully deployed model %s serving from %s', bq_model_uri, model_path) # Setting the push_destination to bigquery uri self._MarkPushed(model_push, pushed_destination=bq_model_uri)
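# For orientation, a hedged sketch of the statement this pusher issues. The
# exact template constant is defined elsewhere, but the older executor version
# later in this file shows the same CREATE OR REPLACE MODEL statement inline.
# All identifiers and paths below are placeholders.
_bq_model_uri = 'my_project.my_dataset.my_model'
_model_path = 'gs://my-bucket/pipeline/Pusher/pushed_model/42'
_query = """
  CREATE OR REPLACE MODEL `{model_uri}`
  OPTIONS (model_type='tensorflow',
           model_path='{model_path}')""".format(
               model_uri=_bq_model_uri, model_path=_model_path + '/*')
print(_query)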
def deploy_model_for_aip_prediction( serving_path: Text, model_version: Text, ai_platform_serving_args: Dict[Text, Any], executor_class_path: Text, ): """Deploys a model for serving with AI Platform. Args: serving_path: The path to the model. Must be a GCS URI. model_version: Version of the model being deployed. Must be different from what is currently being served. ai_platform_serving_args: Dictionary containing arguments for pushing to AI Platform. For the full set of parameters supported, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version executor_class_path: class path of the invoking executor; used to attach telemetry labels to the created model version. Raises: RuntimeError: if an error is encountered when trying to push. """ logging.info( 'Deploying model version %s to AI Platform for serving: %s', model_version, ai_platform_serving_args) model_name = ai_platform_serving_args['model_name'] project_id = ai_platform_serving_args['project_id'] regions = ai_platform_serving_args.get('regions', []) default_runtime_version = _get_tf_runtime_version(tf.__version__) runtime_version = ai_platform_serving_args.get('runtime_version', default_runtime_version) python_version = _get_caip_python_version(runtime_version) api = discovery.build('ml', 'v1') body = {'name': model_name, 'regions': regions} parent = 'projects/{}'.format(project_id) try: api.projects().models().create(body=body, parent=parent).execute() except errors.HttpError as e: # If the error is to create an already existing model, it's ok to ignore. # TODO(b/135211463): Remove the disable once the pytype bug is fixed. if e.resp.status == 409: # pytype: disable=attribute-error logging.warning('Model %s already exists', model_name) else: raise RuntimeError('AI Platform Push failed: {}'.format(e)) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() body = { 'name': model_version, 'deployment_uri': serving_path, 'runtime_version': runtime_version, 'python_version': python_version, 'labels': job_labels, } # Push to AIP, and record the operation name so we can poll for its state. model_name = 'projects/{}/models/{}'.format(project_id, model_name) response = api.projects().models().versions().create( body=body, parent=model_name).execute() op_name = response['name'] deploy_status_resc = api.projects().operations().get(name=op_name) while not deploy_status_resc.execute().get('done'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) logging.info('Model still being deployed...') deploy_status = deploy_status_resc.execute() if deploy_status.get('error'): # The operation completed with an error. raise RuntimeError( 'Failed to deploy model to AI Platform for serving: {}'.format( deploy_status['error'])) # Set the new version as default. # By API specification, if Long-Running-Operation is done and there is # no error, 'response' is guaranteed to exist. api.projects().models().versions().setDefault(name='{}/versions/{}'.format( model_name, deploy_status['response']['name'])).execute() logging.info( 'Successfully deployed model %s with version %s, serving from %s', model_name, model_version, serving_path)
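# Hedged call sketch for deploy_model_for_aip_prediction as defined above; all
# values are placeholders for a real GCP project, model and GCS path.
deploy_model_for_aip_prediction(
    serving_path='gs://my-bucket/serving_model/1600000000',
    model_version='v1600000000',
    ai_platform_serving_args={
        'model_name': 'my_model',
        'project_id': 'my-gcp-project',
        'regions': ['us-central1'],
    },
    executor_class_path='my.pusher.Executor')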
def start_aip_training(input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any], executor_class_path: Text, training_inputs: Dict[Text, Any], job_id: Optional[Text]): """Start a trainer job on AI Platform (AIP). This is done by forwarding the inputs/outputs/exec_properties to the tfx.scripts.run_executor module on a AI Platform training job interpreter. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. training_inputs: Training input argument for AI Platform training job. 'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput job_id: Job ID for AI Platform Training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job Returns: None Raises: RuntimeError: if the Google Cloud AI Platform training job failed/cancelled. """ training_inputs = training_inputs.copy() json_inputs = artifact_utils.jsonify_artifact_dict(input_dict) logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = artifact_utils.jsonify_artifact_dict(output_dict) logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties, sort_keys=True) logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # Configure AI Platform training job api_client = discovery.build('ml', 'v1') # We use custom containers to launch training on AI Platform, which invokes # the specified image using the container's entrypoint. The default # entrypoint for TFX containers is to call scripts/run_executor.py. The # arguments below are passed to this run_executor entry to run the executor # specified in `executor_class_path`. job_args = [ '--executor_class_path', executor_class_path, '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties ] if not training_inputs.get('masterConfig'): training_inputs['masterConfig'] = { 'imageUri': _TFX_IMAGE, } training_inputs['args'] = job_args # Pop project_id so AIP doesn't complain about an unexpected parameter. # It's been a stowaway in aip_args and has finally reached its destination. project = training_inputs.pop('project') project_id = 'projects/{}'.format(project) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified. 
job_id = job_id or 'tfx_{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')) job_spec = { 'jobId': job_id, 'trainingInput': training_inputs, 'labels': job_labels, } # Submit job to AIP Training logging.info('Submitting job=\'%s\', project=\'%s\' to AI Platform.', job_id, project) request = api_client.projects().jobs().create( body=job_spec, parent=project_id) request.execute() # Wait for AIP Training job to finish job_name = '{}/jobs/{}'.format(project_id, job_id) request = api_client.projects().jobs().get(name=job_name) response = request.execute() retry_count = 0 # Monitors the long-running operation by polling the job state periodically, # and retries the polling when a transient connectivity issue is encountered. # # Long-running operation monitoring: # The possible states of "get job" response can be found at # https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#State # where SUCCEEDED/FAILED/CANCELLED are considered to be final states. # The following logic will keep polling the state of the job until the job # enters a final state. # # During the polling, if a connection error was encountered, the GET request # will be retried by recreating the Python API client to refresh the lifecycle # of the connection being used. See # https://github.com/googleapis/google-api-python-client/issues/218 # for a detailed description of the problem. If the error persists for # _CONNECTION_ERROR_RETRY_LIMIT consecutive attempts, the function will exit # with code 1. while response['state'] not in ('SUCCEEDED', 'FAILED', 'CANCELLED'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) try: response = request.execute() retry_count = 0 # Handle transient connection error. except ConnectionError as err: if retry_count < _CONNECTION_ERROR_RETRY_LIMIT: retry_count += 1 logging.warning( 'ConnectionError (%s) encountered when polling job: %s. Trying to ' 'recreate the API client.', err, job_id) # Recreate the Python API client. api_client = discovery.build('ml', 'v1') request = api_client.projects().jobs().get(name=job_name) else: # TODO(b/158433873): Consider raising the error instead of exit with # code 1 after CMLE supports configurable retry policy. # Currently CMLE will automatically retry the job unless return code # 1-128 is returned. logging.error('Request failed after %s retries.', _CONNECTION_ERROR_RETRY_LIMIT) sys.exit(1) if response['state'] in ('FAILED', 'CANCELLED'): err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format( job_name, response) logging.error(err_msg) raise RuntimeError(err_msg) # AIP training complete logging.info('Job \'%s\' successful.', job_name)
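# Hedged sketch of the training_inputs dict start_aip_training consumes. Only
# 'project' is strictly required by the code above (it is popped and used as
# the job parent); the remaining keys are ordinary CAIP TrainingInput fields
# and are placeholders here.
_training_inputs = {
    'project': 'my-gcp-project',
    'region': 'us-central1',
    'scaleTier': 'BASIC',
    # 'masterConfig' defaults to {'imageUri': _TFX_IMAGE} when absent, and
    # 'args' is always overwritten with the run_executor arguments.
}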
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from evaluator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. custom_config.ai_platform_serving_args is consumed by this class. For the full set of parameters supported by Google Cloud AI Platform, refer to https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version. Raises: ValueError: If ai_platform_serving_args is not in exec_properties.custom_config. If Serving model path does not start with gs://. RuntimeError: if the Google Cloud AI Platform training job failed. """ self._log_startup(input_dict, output_dict, exec_properties) model_push = artifact_utils.get_single_instance( output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY]) if not self.CheckBlessing(input_dict): self._MarkNotPushed(model_push) return model_export = artifact_utils.get_single_instance( input_dict[tfx_pusher_executor.MODEL_KEY]) custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError( 'custom_config in execution properties needs to be a ' 'dict.') ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY) if not ai_platform_serving_args: raise ValueError( '\'ai_platform_serving_args\' is missing in \'custom_config\'') service_name, api_version = runner.get_service_name_and_api_version( ai_platform_serving_args) # Deploy the model. io_utils.copy_dir(src=path_utils.serving_model_path(model_export.uri), dst=model_push.uri) model_path = model_push.uri # TODO(jjong): Introduce Versioning. # Note that we're adding "v" prefix as Cloud AI Prediction only allows the # version name that starts with letters, and contains letters, digits, # underscore only. model_version = 'v{}'.format(int(time.time())) executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() runner.deploy_model_for_aip_prediction( discovery.build(service_name, api_version), model_path, model_version, ai_platform_serving_args, job_labels, ) self._MarkPushed( model_push, pushed_destination=_CAIP_MODEL_VERSION_PATH_FORMAT.format( project_id=ai_platform_serving_args['project_id'], model=ai_platform_serving_args['model_name'], version=model_version), pushed_version=model_version)
def start_aip_training(input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any], executor_class_path: Text, training_inputs: Dict[Text, Any], job_id: Optional[Text]): """Start a trainer job on AI Platform (AIP). This is done by forwarding the inputs/outputs/exec_properties to the tfx.scripts.run_executor module on an AI Platform training job interpreter. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. training_inputs: Training input argument for AI Platform training job. 'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput job_id: Job ID for AI Platform Training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job Returns: None Raises: RuntimeError: if the Google Cloud AI Platform training job failed. """ training_inputs = training_inputs.copy() json_inputs = artifact_utils.jsonify_artifact_dict(input_dict) absl.logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = artifact_utils.jsonify_artifact_dict(output_dict) absl.logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties, sort_keys=True) absl.logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # Configure AI Platform training job api_client = discovery.build('ml', 'v1') # We use custom containers to launch training on AI Platform, which invokes # the specified image using the container's entrypoint. The default # entrypoint for TFX containers is to call scripts/run_executor.py. The # arguments below are passed to this run_executor entry to run the executor # specified in `executor_class_path`. job_args = [ '--executor_class_path', executor_class_path, '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties ] if not training_inputs.get('masterConfig'): training_inputs['masterConfig'] = { 'imageUri': _TFX_IMAGE, } training_inputs['args'] = job_args # Pop project_id so AIP doesn't complain about an unexpected parameter. # It's been a stowaway in aip_args and has finally reached its destination. project = training_inputs.pop('project') project_id = 'projects/{}'.format(project) job_labels = telemetry_utils.get_labels_dict( tfx_executor=executor_class_path) # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified. job_id = job_id or 'tfx_%s' % datetime.datetime.now().strftime( '%Y%m%d%H%M%S') job_spec = { 'jobId': job_id, 'trainingInput': training_inputs, 'labels': job_labels, } # Submit job to AIP Training absl.logging.info( 'Submitting job=\'{}\', project=\'{}\' to AI Platform.'.format( job_id, project)) request = api_client.projects().jobs().create(body=job_spec, parent=project_id) request.execute() # Wait for AIP Training job to finish job_name = '{}/jobs/{}'.format(project_id, job_id) request = api_client.projects().jobs().get(name=job_name) response = request.execute() while response['state'] not in ('SUCCEEDED', 'FAILED'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) response = request.execute() if response['state'] == 'FAILED': err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format( job_name, response) absl.logging.error(err_msg) raise RuntimeError(err_msg) # AIP training complete absl.logging.info('Job \'{}\' successful.'.format(job_name))
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from model_validator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. custom_config.bigquery_serving_args is consumed by this class. For the full set of parameters supported by Big Query ML, refer to https://cloud.google.com/bigquery-ml/ Returns: None Raises: ValueError: If bigquery_serving_args is not in exec_properties.custom_config. If pipeline_root is not 'gs://...' RuntimeError: if the Big Query job failed. """ self._log_startup(input_dict, output_dict, exec_properties) model_push = artifact_utils.get_single_instance(output_dict['model_push']) if not self.CheckBlessing(input_dict): model_push.set_int_custom_property('pushed', 0) return model_export = artifact_utils.get_single_instance( input_dict['model_export']) model_export_uri = model_export.uri custom_config = exec_properties.get('custom_config', {}) bigquery_serving_args = custom_config.get('bigquery_serving_args', None) # if configuration is missing error out if bigquery_serving_args is None: raise ValueError('Big Query ML configuration was not provided') bq_model_uri = '`{}`.`{}`.`{}`'.format( bigquery_serving_args['project_id'], bigquery_serving_args['bq_dataset_id'], bigquery_serving_args['model_name']) # Deploy the model. model_path = path_utils.serving_model_path(model_export_uri) if not model_path.startswith('gs://'): raise ValueError( 'pipeline_root must be gs:// for BigQuery ML Pusher.') absl.logging.info( 'Deploying the model to BigQuery ML for serving: {} from {}'.format( bigquery_serving_args, model_path)) query = (""" CREATE OR REPLACE MODEL {} OPTIONS (model_type='tensorflow', model_path='{}')""".format(bq_model_uri, os.path.join(model_path, '*'))) # TODO(zhitaoli): Refactor the executor_class_path creation into a common # utility function. executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) default_query_job_config = bigquery.job.QueryJobConfig( labels=telemetry_utils.get_labels_dict( tfx_executor=executor_class_path)) client = bigquery.Client(default_query_job_config=default_query_job_config) try: query_job = client.query(query) query_job.result() # Waits for the query to finish except Exception as e: raise RuntimeError('BigQuery ML Push failed: {}'.format(e)) absl.logging.info('Successfully deployed model {} serving from {}'.format( bq_model_uri, model_path)) # Setting the push_destination to bigquery uri model_push.set_int_custom_property('pushed', 1) model_push.set_string_custom_property('pushed_model', bq_model_uri)
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from evaluator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. custom_config.bigquery_serving_args is consumed by this class, including: - bq_dataset_id: ID of the dataset you're creating or replacing - model_name: name of the model you're creating or replacing - project_id: GCP project where the model will be stored. It is also the project where the query is executed unless a compute_project_id is provided. - compute_project_id: GCP project where the query is executed. If not provided, the query is executed in project_id. For the full set of parameters supported by Big Query ML, refer to https://cloud.google.com/bigquery-ml/ Returns: None Raises: ValueError: If bigquery_serving_args is not in exec_properties.custom_config. If pipeline_root is not 'gs://...' RuntimeError: if the Big Query job failed. Example usage: from tfx.extensions.google_cloud_big_query.pusher import executor pusher = Pusher( model=trainer.outputs['model'], model_blessing=evaluator.outputs['blessing'], custom_executor_spec=executor_spec.ExecutorClassSpec(executor.Executor), custom_config={ 'bigquery_serving_args': { 'model_name': 'your_model_name', 'project_id': 'your_gcp_storage_project', 'bq_dataset_id': 'your_dataset_id', 'compute_project_id': 'your_gcp_compute_project', }, }, ) """ self._log_startup(input_dict, output_dict, exec_properties) model_push = artifact_utils.get_single_instance( output_dict[standard_component_specs.PUSHED_MODEL_KEY]) if not self.CheckBlessing(input_dict): self._MarkNotPushed(model_push) return custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError('custom_config in execution properties needs to be a ' 'dict.') bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY) # if configuration is missing error out if bigquery_serving_args is None: raise ValueError('Big Query ML configuration was not provided') bq_model_uri = '.'.join([ bigquery_serving_args[_PROJECT_ID_KEY], bigquery_serving_args[_BQ_DATASET_ID_KEY], bigquery_serving_args[_MODEL_NAME_KEY], ]) # Deploy the model. io_utils.copy_dir(src=self.GetModelPath(input_dict), dst=model_push.uri) model_path = model_push.uri if not model_path.startswith(_GCS_PREFIX): raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.') logging.info('Deploying the model to BigQuery ML for serving: %s from %s', bigquery_serving_args, model_path) query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format( model_uri=bq_model_uri, model_path=model_path) # TODO(zhitaoli): Refactor the executor_class_path creation into a common # utility function. 
executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): default_query_job_config = bigquery.job.QueryJobConfig( labels=telemetry_utils.get_labels_dict()) # TODO(b/181368842) Add integration test for BQML Pusher + Managed Pipeline project_id = ( bigquery_serving_args.get(_COMPUTE_PROJECT_ID_KEY) or bigquery_serving_args[_PROJECT_ID_KEY]) client = bigquery.Client( default_query_job_config=default_query_job_config, project=project_id) try: query_job = client.query(query) query_job.result() # Waits for the query to finish except Exception as e: raise RuntimeError('BigQuery ML Push failed: {}'.format(e)) from e logging.info('Successfully deployed model %s serving from %s', bq_model_uri, model_path) # Setting the push_destination to bigquery uri self._MarkPushed(model_push, pushed_destination=bq_model_uri)
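# Small self-contained sketch of the compute-project fallback used above:
# the query runs in compute_project_id when it is provided, otherwise in the
# storage project. Values are placeholders.
_bigquery_serving_args = {
    'project_id': 'storage-project',
    'bq_dataset_id': 'my_dataset',
    'model_name': 'my_model',
    'compute_project_id': 'compute-project',  # optional
}
_query_project = (_bigquery_serving_args.get('compute_project_id') or
                  _bigquery_serving_args['project_id'])
print(_query_project)  # 'compute-project'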
def create_training_args(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any], executor_class_path: Text, training_inputs: Dict[Text, Any], job_id: Optional[Text]) -> Dict[Text, Any]: """Get training args for runner._launch_aip_training. The training args contain the inputs/outputs/exec_properties to the tfx.scripts.run_executor module. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. training_inputs: Training input argument for AI Platform training job. 'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput job_id: Job ID for AI Platform Training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job Returns: A dict containing the training arguments """ training_inputs = training_inputs.copy() json_inputs = artifact_utils.jsonify_artifact_dict(input_dict) logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = artifact_utils.jsonify_artifact_dict(output_dict) logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties, sort_keys=True) logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # We use custom containers to launch training on AI Platform, which invokes # the specified image using the container's entrypoint. The default # entrypoint for TFX containers is to call scripts/run_executor.py. The # arguments below are passed to this run_executor entry to run the executor # specified in `executor_class_path`. container_command = _CONTAINER_COMMAND + [ '--executor_class_path', executor_class_path, '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties, ] if not training_inputs.get('masterConfig'): training_inputs['masterConfig'] = { 'imageUri': _TFX_IMAGE, } # Always use our own entrypoint instead of relying on container default. if 'containerCommand' in training_inputs['masterConfig']: logging.warn('Overriding custom value of containerCommand') training_inputs['masterConfig']['containerCommand'] = container_command # Pop project_id so AIP doesn't complain about an unexpected parameter. # It's been a stowaway in aip_args and has finally reached its destination. project = training_inputs.pop('project') with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified. job_id = job_id or 'tfx_{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')) training_args = { 'job_id': job_id, 'project': project, 'training_input': training_inputs, 'job_labels': job_labels } return training_args
def create_training_args(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any], executor_class_path: Text, training_inputs: Dict[Text, Any], job_id: Optional[Text]) -> Dict[Text, Any]: """Get training args for runner._launch_aip_training. The training args contain the inputs/outputs/exec_properties to the tfx.scripts.run_executor module. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. training_inputs: Spec for CustomJob for AI Platform (Unified) custom training job. See https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1/CustomJobSpec for the detailed schema. job_id: Display name for AI Platform (Unified) custom training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1/projects.locations.customJobs Returns: A dict containing the training arguments """ training_inputs = training_inputs.copy() json_inputs = artifact_utils.jsonify_artifact_dict(input_dict) logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = artifact_utils.jsonify_artifact_dict(output_dict) logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties, sort_keys=True) logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # We use custom containers to launch training on AI Platform (unified), # which invokes the specified image using the container's entrypoint. The # default entrypoint for TFX containers is to call scripts/run_executor.py. # The arguments below are passed to this run_executor entry to run the # executor specified in `executor_class_path`. container_command = _CONTAINER_COMMAND + [ '--executor_class_path', executor_class_path, '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties, ] if not training_inputs.get('worker_pool_specs'): training_inputs['worker_pool_specs'] = [{}] for worker_pool_spec in training_inputs['worker_pool_specs']: if not worker_pool_spec.get('container_spec'): worker_pool_spec['container_spec'] = { 'image_uri': _TFX_IMAGE, } # Always use our own entrypoint instead of relying on container default. if 'command' in worker_pool_spec['container_spec']: logging.warn( 'Overriding custom value of container_spec.command') worker_pool_spec['container_spec']['command'] = container_command # Pop project_id so AIP doesn't complain about an unexpected parameter. # It's been a stowaway in aip_args and has finally reached its destination. project = training_inputs.pop('project') with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() # 'tfx_YYYYmmddHHMMSS' is the default job display name if not explicitly # specified. job_id = job_id or 'tfx_{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')) training_args = { 'job_id': job_id, 'project': project, 'training_input': training_inputs, 'job_labels': job_labels } return training_args
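# Hedged sketch of a training_inputs dict for the AI Platform (Unified)
# variant above; the keys mirror the ones the code inspects and the values
# are placeholders, not a verified CustomJobSpec.
_training_inputs = {
    'project': 'my-gcp-project',  # popped before job submission
    'worker_pool_specs': [{
        'machine_spec': {'machine_type': 'n1-standard-8'},
        'replica_count': 1,
        # 'container_spec' defaults to the TFX image when absent, and its
        # 'command' is always replaced with the run_executor entrypoint.
    }],
}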
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from evaluator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. The following keys in `custom_config` are consumed by this class: - ai_platform_serving_args: For the full set of parameters supported by Google Cloud AI Platform, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.models.versions#Version. - endpoint: Optional endpoint override. Should be in the format of `https://[region]-ml.googleapis.com`. Defaults to the global endpoint if not set. Using a regional endpoint is recommended by Cloud AI Platform. When set, the 'regions' key in ai_platform_serving_args cannot be set. For more details, please see https://cloud.google.com/ai-platform/prediction/docs/regional-endpoints#using_regional_endpoints Raises: ValueError: If ai_platform_serving_args is not in exec_properties.custom_config. If the serving model path does not start with gs://. If 'endpoint' and 'regions' are set simultaneously. RuntimeError: if the Google Cloud AI Platform training job failed. """ self._log_startup(input_dict, output_dict, exec_properties) custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError( 'custom_config in execution properties needs to be a ' 'dict.') ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY) if not ai_platform_serving_args: raise ValueError( '\'ai_platform_serving_args\' is missing in \'custom_config\'') endpoint = custom_config.get(ENDPOINT_ARGS_KEY) if endpoint and 'regions' in ai_platform_serving_args: raise ValueError( '\'endpoint\' and \'ai_platform_serving_args.regions\' cannot be set simultaneously' ) model_push = artifact_utils.get_single_instance( output_dict[standard_component_specs.PUSHED_MODEL_KEY]) if not self.CheckBlessing(input_dict): self._MarkNotPushed(model_push) return service_name, api_version = runner.get_service_name_and_api_version( ai_platform_serving_args) # Deploy the model. io_utils.copy_dir(src=self.GetModelPath(input_dict), dst=model_push.uri) model_path = model_push.uri # TODO(jjong): Introduce Versioning. # Note that we're adding "v" prefix as Cloud AI Prediction only allows the # version name that starts with letters, and contains letters, digits, # underscore only. 
model_version = 'v{}'.format(int(time.time())) executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() endpoint = endpoint or runner.DEFAULT_ENDPOINT api = discovery.build( service_name, api_version, client_options=client_options.ClientOptions(api_endpoint=endpoint), ) runner.deploy_model_for_aip_prediction( api, model_path, model_version, ai_platform_serving_args, job_labels, ) self._MarkPushed( model_push, pushed_destination=_CAIP_MODEL_VERSION_PATH_FORMAT.format( project_id=ai_platform_serving_args['project_id'], model=ai_platform_serving_args['model_name'], version=model_version), pushed_version=model_version)
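# Hedged example of a custom_config for the pusher above when using a regional
# endpoint, mirroring the keys described in its docstring. Note that 'regions'
# must be omitted from ai_platform_serving_args when 'endpoint' is set.
# All values are placeholders.
_custom_config = {
    'ai_platform_serving_args': {
        'model_name': 'my_model',
        'project_id': 'my-gcp-project',
    },
    'endpoint': 'https://us-central1-ml.googleapis.com',
}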