def testTrainerFn(self):
  """Smoke-tests `taxi_utils_bqml.trainer_fn` for one train and eval step.

  Builds `TrainerFnArgs` pointing at the checked-in test data, runs
  `tf.estimator.train_and_evaluate` on the returned specs, and then checks
  that the serving export and the TFMA eval saved model are non-empty and
  that the serving graph can be loaded into a session.
  """
  work_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  testdata = self._testdata_path
  schema_path = os.path.join(testdata, 'schema_gen/schema.pbtxt')

  fn_args = trainer_executor.TrainerFnArgs(
      train_files=os.path.join(
          testdata, 'transform/transformed_examples/train/*.gz'),
      transform_output=os.path.join(testdata, 'transform/transform_output/'),
      serving_model_dir=os.path.join(work_dir, 'serving_model_dir'),
      eval_files=os.path.join(
          testdata, 'transform/transformed_examples/eval/*.gz'),
      schema_file=schema_path,
      train_steps=1,
      eval_steps=1,
      base_model=os.path.join(testdata, 'trainer/current/serving_model_dir'),
      # Only the tf.data factory is populated; the other two accessors are
      # explicitly left as None.
      data_accessor=DataAccessor(
          tf_dataset_factory=(
              tfxio_utils.get_tf_dataset_factory_from_artifact(
                  [standard_artifacts.Examples()], [])),
          record_batch_factory=None,
          data_view_decode_fn=None))

  parsed_schema = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
  spec = taxi_utils_bqml.trainer_fn(fn_args, parsed_schema)

  estimator = spec['estimator']
  train_spec = spec['train_spec']
  eval_spec = spec['eval_spec']
  receiver_fn = spec['eval_input_receiver_fn']

  # Every entry of the training spec must have the expected type.
  for value, expected_type in (
      (estimator, tf.estimator.Estimator),
      (train_spec, tf.estimator.TrainSpec),
      (eval_spec, tf.estimator.EvalSpec),
      (receiver_fn, types.FunctionType),
  ):
    self.assertIsInstance(value, expected_type)

  # One training step followed by one evaluation step.
  metrics, export_paths = tf.estimator.train_and_evaluate(
      estimator, train_spec, eval_spec)
  self.assertGreater(metrics['loss'], 0.0)
  self.assertEqual(len(export_paths), 1)
  serving_export = export_paths[0]
  self.assertGreaterEqual(len(fileio.listdir(serving_export)), 1)

  # Export the eval saved model and make sure something was written.
  tfma_export = tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(work_dir),
      eval_input_receiver_fn=receiver_fn)
  self.assertGreaterEqual(len(fileio.listdir(tfma_export)), 1)

  # The exported serving graph must load back as a MetaGraphDef.
  with tf.compat.v1.Session() as session:
    loaded = tf.compat.v1.saved_model.loader.load(
        session, [tf.saved_model.SERVING], serving_export)
    self.assertIsInstance(loaded, tf.compat.v1.MetaGraphDef)
def _verify_model_exports(self):
  """Asserts that the eval and serving model directories both exist.

  The directories are derived from `self._model_exports.uri` through
  `path_utils.eval_model_dir` and `path_utils.serving_model_dir`.
  """
  eval_dir = path_utils.eval_model_dir(self._model_exports.uri)
  self.assertTrue(tf.io.gfile.exists(eval_dir))

  serving_dir = path_utils.serving_model_dir(self._model_exports.uri)
  self.assertTrue(tf.io.gfile.exists(serving_dir))
def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
               output_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
  """Builds the `FnArgs` passed to the user-provided training function.

  Starts from `fn_args_utils.get_common_fn_args` and attaches the output
  locations plus the optional hyperparameters config.

  Args:
    input_dict: Input artifacts keyed by input name. The hyperparameters
      entry is optional.
    output_dict: Output artifacts keyed by output name. The model and
      model-run entries are read.
    exec_properties: Execution properties forwarded to
      `fn_args_utils.get_common_fn_args`.

  Returns:
    The populated `fn_args_utils.FnArgs`.

  Raises:
    ValueError: If the resolved `custom_config` is truthy but not a dict.
  """
  hparams_key = standard_component_specs.HYPERPARAMETERS_KEY
  hparams_config = None
  if input_dict.get(hparams_key):
    # The hyperparameters artifact directory holds exactly one JSON file.
    hparams_uri = artifact_utils.get_single_uri(input_dict[hparams_key])
    hparams_file = io_utils.get_only_uri_in_dir(hparams_uri)
    hparams_config = json.loads(file_io.read_file_to_string(hparams_file))

  model_uri = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_KEY])
  serving_dir = path_utils.serving_model_dir(model_uri)
  eval_dir = path_utils.eval_model_dir(model_uri)

  run_dir = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  custom_config = fn_args.custom_config
  if custom_config and not isinstance(custom_config, dict):
    raise ValueError(
        'custom_config in execution properties needs to be a '
        'dict. Got %s instead.' % type(custom_config))

  # Mirror the common fields under the names this code assigns for the
  # training function, then attach the trainer-specific locations.
  fn_args.transform_output = fn_args.transform_graph_path
  fn_args.serving_model_dir = serving_dir
  fn_args.eval_model_dir = eval_dir
  fn_args.model_run_dir = run_dir
  fn_args.schema_file = fn_args.schema_path
  fn_args.hyperparameters = hparams_config
  return fn_args
def test_trainer_fn(self):
  """Smoke-tests `taxi_utils.trainer_fn` for one train and eval step.

  Builds `HParams` pointing at the checked-in test data, runs
  `tf.estimator.train_and_evaluate` on the returned specs, and then checks
  that the serving export and the TFMA eval saved model are non-empty and
  that the serving graph can be loaded into a session.
  """
  work_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)
  testdata = self._testdata_path
  schema_path = os.path.join(testdata, 'schema_gen/schema.pbtxt')
  model_output_dir = os.path.join(work_dir, 'output_dir')

  hparams = tf.contrib.training.HParams(
      train_files=os.path.join(
          testdata, 'transform/transformed_examples/train/*.gz'),
      transform_output=os.path.join(testdata, 'transform/transform_output/'),
      output_dir=model_output_dir,
      serving_model_dir=os.path.join(work_dir, 'serving_model_dir'),
      eval_files=os.path.join(
          testdata, 'transform/transformed_examples/eval/*.gz'),
      schema_file=schema_path,
      train_steps=1,
      eval_steps=1,
      verbosity='INFO',
      warm_start_from=os.path.join(
          testdata, 'trainer/current/serving_model_dir'))

  parsed_schema = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
  spec = taxi_utils.trainer_fn(hparams, parsed_schema)

  estimator = spec['estimator']
  train_spec = spec['train_spec']
  eval_spec = spec['eval_spec']
  receiver_fn = spec['eval_input_receiver_fn']

  # Every entry of the training spec must have the expected type.
  for value, expected_type in (
      (estimator, tf.estimator.DNNLinearCombinedClassifier),
      (train_spec, tf.estimator.TrainSpec),
      (eval_spec, tf.estimator.EvalSpec),
      (receiver_fn, types.FunctionType),
  ):
    self.assertIsInstance(value, expected_type)

  # One training step followed by one evaluation step.
  metrics, export_paths = tf.estimator.train_and_evaluate(
      estimator, train_spec, eval_spec)
  self.assertGreater(metrics['loss'], 0.0)
  self.assertEqual(len(export_paths), 1)
  serving_export = export_paths[0]
  self.assertGreaterEqual(len(tf.gfile.ListDirectory(serving_export)), 1)

  # Export the eval saved model and make sure something was written.
  tfma_export = tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(model_output_dir),
      eval_input_receiver_fn=receiver_fn)
  self.assertGreaterEqual(len(tf.gfile.ListDirectory(tfma_export)), 1)

  # The exported serving graph must load back as a MetaGraphDef.
  with tf.Session() as session:
    loaded = tf.compat.v1.saved_model.loader.load(
        session, [tf.saved_model.tag_constants.SERVING], serving_export)
    self.assertIsInstance(loaded, tf.MetaGraphDef)
def test_estimator_lifecycle(self, estimator_constructor):
  """Runs train, evaluate and both exports on an estimator without errors.

  Args:
    estimator_constructor: Callable invoked as
      `estimator_constructor(adapter, config)`; it returns the estimator
      under test.
  """
  # Run the task and AutoData components so the adapter has data to read.
  task = tfds_task.TFDSTask(tfds.builder('titanic'))
  autodata = subpipeline.AutoData(
      task.problem_statement,
      examples=task.train_and_eval_examples,
      preprocessor=basic_preprocessor.BasicPreprocessor())
  self.run_pipeline(components=task.components + autodata.components)

  # Build the trainer adapter on top of the transform graph.
  adapter = estimator_adapter.EstimatorAdapter(
      problem_statement=task.problem_statement,
      transform_graph_dir=self.artifact_dir(
          'Transform.AutoData/transform_graph'))
  run_config = tf.estimator.RunConfig(
      save_checkpoints_steps=999, keep_checkpoint_max=3)

  def _input_fn_for(split_glob):
    """Returns an input fn over one split of the transformed examples."""
    return adapter.get_input_fn(
        file_pattern=self.artifact_dir(
            'Transform.AutoData/transformed_examples', split_glob),
        batch_size=3)

  estimator = estimator_constructor(adapter, run_config)

  # Train.
  estimator.train(input_fn=_input_fn_for('train/*'), max_steps=3)

  # Evaluate; any non-empty metrics result is accepted.
  eval_metrics = estimator.evaluate(input_fn=_input_fn_for('eval/*'), steps=1)
  self.assertNotEmpty(eval_metrics)

  # Export for TFMA.
  tfma.export.export_eval_savedmodel(
      estimator=estimator,
      export_dir_base=path_utils.eval_model_dir(estimator.model_dir),
      eval_input_receiver_fn=adapter.get_eval_input_receiver_fn())

  # Export for Serving.
  estimator.export_saved_model(
      export_dir_base=os.path.join(estimator.model_dir, 'export'),
      serving_input_receiver_fn=adapter.get_serving_input_receiver_fn())
def testEstimatorModelPath(self, is_old_artifact):
  """Checks path resolution for an estimator-style Trainer output layout.

  Lays out a model directory the way it looks after the Executor has cleaned
  it up (a `saved_model.pb` directly inside the eval and the serving model
  directories) and verifies that the `path_utils` lookup helpers resolve to
  those directories.

  Args:
    is_old_artifact: Forwarded unchanged to every `path_utils` call.
  """
  output_uri = os.path.join(self.get_temp_dir(), 'model_dir')

  eval_model_path = path_utils.eval_model_dir(output_uri, is_old_artifact)
  io_utils.write_string_file(
      os.path.join(eval_model_path, 'saved_model.pb'), 'testing')

  serving_model_path = path_utils.serving_model_dir(
      output_uri, is_old_artifact)
  # The serving model file belongs in the serving directory. It used to be
  # written to the eval directory a second time, which left the serving
  # directory empty.
  io_utils.write_string_file(
      os.path.join(serving_model_path, 'saved_model.pb'), 'testing')

  # Test retrieving model folder.
  self.assertEqual(
      eval_model_path,
      path_utils.eval_model_path(output_uri, is_old_artifact))
  self.assertEqual(
      serving_model_path,
      path_utils.serving_model_path(output_uri, is_old_artifact))

  self.assertEqual(
      eval_model_path,
      path_utils.get_model_dir_by_type(
          output_uri, path_constants.TFMA_EVAL, is_old_artifact))

  # Every non-TFMA model type is expected to resolve to the serving model.
  serving_model_types = (
      path_constants.TF_KERAS,
      path_constants.TF_GENERIC,
      path_constants.TF_ESTIMATOR,
      path_constants.TF_JS,
      path_constants.TF_LITE,
  )
  for model_type in serving_model_types:
    self.assertEqual(
        serving_model_path,
        path_utils.get_model_dir_by_type(
            output_uri, model_type, is_old_artifact),
        msg='model type: %s' % model_type)
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
  """Builds the `FnArgs` passed to the user-provided training function.

  Args:
    input_dict: Input artifacts keyed by input name. The base-model and
      hyperparameters entries are optional.
    output_dict: Output artifacts keyed by output name. The model and
      model-run entries are read.
    exec_properties: Execution properties. May carry a JSON-serialized
      custom config.

  Returns:
    The common fn args from `fn_args_utils.get_common_fn_args`, extended
    with output directories, base model, hyperparameters and custom config.

  Raises:
    ValueError: If the deserialized custom config is not a dict.
  """
  # The component interface serializes a missing custom config as 'null'
  # rather than '{}', so default to 'null' when loading and substitute an
  # empty dict afterwards.
  serialized_config = exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')
  custom_config = json_utils.loads(serialized_config) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  base_model = None
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model_uri = artifact_utils.get_single_uri(
        input_dict[constants.BASE_MODEL_KEY])
    base_model = path_utils.serving_model_path(base_model_uri)

  hparams_config = None
  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hparams_uri = artifact_utils.get_single_uri(
        input_dict[constants.HYPERPARAMETERS_KEY])
    hparams_file = io_utils.get_only_uri_in_dir(hparams_uri)
    hparams_config = json.loads(file_io.read_file_to_string(hparams_file))

  model_uri = artifact_utils.get_single_uri(output_dict[constants.MODEL_KEY])
  serving_dir = path_utils.serving_model_dir(model_uri)
  eval_dir = path_utils.eval_model_dir(model_uri)

  run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  fn_args.transform_output = fn_args.transform_graph_path
  fn_args.serving_model_dir = serving_dir
  fn_args.eval_model_dir = eval_dir
  fn_args.model_run_dir = run_dir
  fn_args.schema_file = fn_args.schema_path
  fn_args.base_model = base_model
  fn_args.hyperparameters = hparams_config
  fn_args.custom_config = custom_config
  return fn_args
def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
  """Asserts one Trainer output holding one eval and one serving model.

  Args:
    pipeline_name: Name of the pipeline whose root directory is inspected.
  """
  # Exactly one Trainer execution must have written a model.
  base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  executions = fileio.listdir(base_dir)
  self.assertEqual(1, len(executions))

  # Both model directories must list exactly one 'saved_model.pb'.
  model_uri = os.path.join(base_dir, executions[0])
  model_dirs = (
      path_utils.eval_model_dir(model_uri),
      path_utils.serving_model_dir(model_uri),
  )
  for model_dir in model_dirs:
    self.assertEqual(1, fileio.listdir(model_dir).count('saved_model.pb'))
def testEstimatorModelPath(self):
  """Checks path resolution for an estimator-style Trainer output layout.

  Lays out a model directory the way it looks after the Executor has cleaned
  it up (a `saved_model.pb` directly inside the eval and the serving model
  directories) and verifies that `path_utils.eval_model_path` and
  `path_utils.serving_model_path` resolve to those directories.
  """
  output_uri = os.path.join(self.get_temp_dir(), 'model_dir')

  eval_model_path = path_utils.eval_model_dir(output_uri)
  io_utils.write_string_file(
      os.path.join(eval_model_path, 'saved_model.pb'), 'testing')

  serving_model_path = path_utils.serving_model_dir(output_uri)
  # The serving model file belongs in the serving directory. It used to be
  # written to the eval directory a second time, which left the serving
  # directory empty.
  io_utils.write_string_file(
      os.path.join(serving_model_path, 'saved_model.pb'), 'testing')

  # Test retrieving model folder.
  self.assertEqual(eval_model_path, path_utils.eval_model_path(output_uri))
  self.assertEqual(serving_model_path,
                   path_utils.serving_model_path(output_uri))
def _assertNumberOfTrainerOutputIsOne(self, pipeline_name):
  """Asserts one Trainer output holding one eval and one serving export.

  Args:
    pipeline_name: Name of the pipeline whose root directory is inspected.
  """
  # Exactly one Trainer execution must have written a model.
  base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  executions = tf.io.gfile.listdir(base_dir)
  self.assertEqual(1, len(executions))

  model_uri = os.path.join(base_dir, executions[0])

  # The eval model directory must contain a single entry.
  eval_entries = tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))
  self.assertEqual(1, len(eval_entries))

  # So must the 'export/chicago-taxi' directory of the serving model.
  serving_export_dir = os.path.join(
      path_utils.serving_model_dir(model_uri), 'export', 'chicago-taxi')
  serving_entries = tf.io.gfile.listdir(serving_export_dir)
  self.assertEqual(1, len(serving_entries))
def run_fn(fn_args: executor.TrainerFnArgs):
  """Trains the model and publishes its serving and eval saved models.

  The eval saved model is exported under `fn_args.model_run_dir`. Afterwards
  the serving and eval models found under `fn_args.model_run_dir` are copied
  to `fn_args.serving_model_dir` and `fn_args.eval_model_dir`.

  Args:
    fn_args: Holds args used to train the model as name/value pairs.
  """
  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  training_spec = _trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete.  Model written to %s',
                    fn_args.serving_model_dir)

  # Export an eval savedmodel for TFMA
  # NOTE: When trained in distributed training cluster, eval_savedmodel must be
  # exported only by the chief worker (check TF_CONFIG).
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  eval_export_dir = path_utils.eval_model_dir(fn_args.model_run_dir)

  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_export_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

  # Report the directory the export was actually written to. The copy into
  # fn_args.eval_model_dir only happens further down and is logged there.
  absl.logging.info('Exported eval_savedmodel to %s.', eval_export_dir)

  # TODO(b/160795287): Deprecate estimator based executor.
  # Copy serving and eval model from model_run to model artifact directory.
  serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(serving_source, fn_args.serving_model_dir)
  absl.logging.info('Serving model copied to: %s.', fn_args.serving_model_dir)

  eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(eval_source, fn_args.eval_model_dir)
  absl.logging.info('Eval model copied to: %s.', fn_args.eval_model_dir)
def _verify_no_eval_model_exports(self):
  """Asserts that no eval model directory exists for the model exports."""
  eval_dir = path_utils.eval_model_dir(self._model_exports.uri)
  self.assertFalse(fileio.exists(eval_dir))
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

  The Trainer Executor invokes a trainer_fn callback function provided by
  the user via the module_file parameter. With the tf.estimator returned by
  this function, the Trainer Executor then builds a TensorFlow model using
  the user-provided tf.estimator.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. When it carries 'cmle_training_args', the
        work is handed to `runner.start_cmle_training` (deprecated path).

  Returns:
    None for local training. On the deprecated 'cmle_training_args' path the
    result of `runner.start_cmle_training` is returned as-is.

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(zhitaoli): Deprecate this in a future version.
  # A missing or empty custom_config means there are no cmle args.
  custom_config = exec_properties.get('custom_config') or {}
  cmle_args = custom_config.get('cmle_training_args')
  if cmle_args:
    executor_class_path = '.'.join([Executor.__module__, Executor.__name__])
    # `absl.logging.warn` is a deprecated alias of `warning`.
    absl.logging.warning(
        'Passing \'cmle_training_args\' to trainer directly is deprecated, '
        'please use extension executor at '
        'tfx.extensions.google_cloud_ai_platform.trainer.executor instead')

    return runner.start_cmle_training(input_dict, output_dict,
                                      exec_properties, executor_class_path,
                                      cmle_args)

  trainer_fn = self._GetTrainerFn(exec_properties)

  # Set up training parameters
  train_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'train'))
  ]
  if input_dict.get('transform_output'):
    transform_output = artifact_utils.get_single_uri(
        input_dict['transform_output'])
  else:
    transform_output = None
  eval_files = [
      _all_files_pattern(
          artifact_utils.get_split_uri(input_dict['examples'], 'eval'))
  ]
  schema_file = io_utils.get_only_uri_in_dir(
      artifact_utils.get_single_uri(input_dict['schema']))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  output_path = artifact_utils.get_single_uri(output_dict['output'])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  # Assemble warm start path if needed. Warm starting is only enabled when
  # the previous serving model directory actually contains a checkpoint.
  warm_start_from = None
  if exec_properties.get('warm_starting') and exec_properties.get(
      'warm_start_from'):
    previous_model_dir = os.path.join(exec_properties['warm_start_from'],
                                      path_utils.SERVING_MODEL_DIR)
    if tf.io.gfile.exists(
        os.path.join(previous_model_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_from = previous_model_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = _HParamWrapper(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # A single uri for the model directory to warm start from.
      warm_start_from=warm_start_from)

  schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

  training_spec = trainer_fn(hparams, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info('Training complete.  Model written to %s',
                    serving_model_dir)

  # Export an eval savedmodel for TFMA
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

  absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
  """Assembles the `TrainerFnArgs` passed to the user's trainer function.

  Args:
    input_dict: Input artifacts keyed by input name. Examples and schema are
      required; transform graph, base model and hyperparameters are
      optional.
    output_dict: Output artifacts keyed by output name. The output model
      entry is read.
    exec_properties: Execution properties. 'train_args' and 'eval_args' are
      required JSON strings; 'custom_config' is optional.

  Returns:
    A `TrainerFnArgs` holding the resolved file patterns, directories, step
    counts and every entry of the custom config as an extra keyword.

  Raises:
    ValueError: If 'custom_config' is set but is not a dict.
  """
  custom_config = exec_properties.get('custom_config') or {}
  if not isinstance(custom_config, dict):
    raise ValueError('Expect custom_config to be a dict but got %s instead' %
                     type(custom_config))

  # Set up training parameters
  train_split_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY], 'train')
  train_files = [_all_files_pattern(train_split_uri)]

  transform_output = None
  if input_dict.get(TRANSFORM_GRAPH_KEY, None):
    transform_output = artifact_utils.get_single_uri(
        input_dict[TRANSFORM_GRAPH_KEY])

  eval_split_uri = artifact_utils.get_split_uri(
      input_dict[EXAMPLES_KEY], 'eval')
  eval_files = [_all_files_pattern(eval_split_uri)]

  schema_uri = artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])
  schema_file = io_utils.get_only_uri_in_dir(schema_uri)

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  base_model = None
  if input_dict.get(BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[BASE_MODEL_KEY]))

  hparams_config = None
  if input_dict.get(HYPERPARAMETERS_KEY):
    hparams_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[HYPERPARAMETERS_KEY]))
    hparams_config = json.loads(file_io.read_file_to_string(hparams_file))

  train_args = trainer_pb2.TrainArgs()
  eval_args = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args)
  json_format.Parse(exec_properties['eval_args'], eval_args)

  # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
  # num_steps=None.  Conversion of the proto to python will set the default
  # value of an int as 0 so modify the value here.  Tensorflow will raise an
  # error if num_steps <= 0.
  train_steps = train_args.num_steps or None
  eval_steps = eval_args.num_steps or None

  model_uri = artifact_utils.get_single_uri(output_dict[OUTPUT_MODEL_KEY])
  serving_dir = path_utils.serving_model_dir(model_uri)
  eval_dir = path_utils.eval_model_dir(model_uri)

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  return TrainerFnArgs(
      # A list of uris for train files.
      train_files=train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=transform_output,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_dir,
      # A single uri for the output directory of the eval model.
      # Note that this is estimator only, Keras doesn't require it for TFMA.
      eval_model_dir=eval_dir,
      # A list of uris for eval files.
      eval_files=eval_files,
      # A single uri for schema file.
      schema_file=schema_file,
      # Number of train steps.
      train_steps=train_steps,
      # Number of eval steps.
      eval_steps=eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hparams_config,
      # Additional parameters to pass to trainer function.
      **custom_config)
def testAIPlatformTrainerPipeline(self):
  """Trainer-only test pipeline on AI Platform Training."""
  pipeline_name = 'kubeflow-aip-trainer-test-{}'.format(self._random_id())

  # Training job spec with a custom scale tier: one master, one parameter
  # server and two workers, to exercise distributed training.
  training_args = {
      'project': self._gcp_project_id,
      'region': self._gcp_region,
      'jobDir': os.path.join(self._pipeline_root(pipeline_name), 'tmp'),
      'masterConfig': {
          'imageUri': self._container_image,
      },
      'scaleTier': 'CUSTOM',
      'masterType': 'large_model',
      'parameterServerType': 'standard',
      'parameterServerCount': 1,
      'workerType': 'standard',
      'workerCount': 2,
  }
  trainer = Trainer(
      custom_executor_spec=executor_spec.ExecutorClassSpec(
          ai_platform_trainer_executor.Executor),
      module_file=self._trainer_module,
      transformed_examples=self.transformed_examples_importer
      .outputs['result'],
      schema=self.schema_importer.outputs['result'],
      transform_graph=self.transform_graph_importer.outputs['result'],
      train_args=trainer_pb2.TrainArgs(num_steps=10),
      eval_args=trainer_pb2.EvalArgs(num_steps=5),
      custom_config={
          ai_platform_trainer_executor.TRAINING_ARGS_KEY: training_args,
      })

  pipeline = self._create_pipeline(pipeline_name, [
      self.schema_importer,
      self.transformed_examples_importer,
      self.transform_graph_importer,
      trainer,
  ])
  self._compile_and_run_pipeline(pipeline)

  # Exactly one Trainer execution must have written a model.
  base_dir = os.path.join(
      self._pipeline_root(pipeline_name), 'Trainer', 'model')
  executions = tf.io.gfile.listdir(base_dir)
  self.assertEqual(1, len(executions))

  model_uri = os.path.join(base_dir, executions[0])

  # The eval model directory must contain a single entry.
  eval_entries = tf.io.gfile.listdir(path_utils.eval_model_dir(model_uri))
  self.assertEqual(1, len(eval_entries))

  # So must the 'export/chicago-taxi' directory of the serving model.
  serving_export_dir = os.path.join(
      path_utils.serving_model_dir(model_uri), 'export', 'chicago-taxi')
  serving_entries = tf.io.gfile.listdir(serving_export_dir)
  self.assertEqual(1, len(serving_entries))
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
  """Assembles the `TrainerFnArgs` passed to the user's trainer function.

  Args:
    input_dict: Input artifacts keyed by input name. The base-model and
      hyperparameters entries are optional.
    output_dict: Output artifacts keyed by output name. The model and
      model-run entries are read.
    exec_properties: Execution properties. May carry a JSON-serialized
      custom config.

  Returns:
    A `TrainerFnArgs` combining the common fn args with the output
    directories, base model, hyperparameters and every entry of the custom
    config as an extra keyword.

  Raises:
    ValueError: If the deserialized custom config is not a dict.
  """
  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)

  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  # Check against the builtin `dict` rather than the `typing.Dict` alias,
  # which is meant for annotations only.
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  return TrainerFnArgs(
      # A list of uris for train files.
      train_files=fn_args.train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=fn_args.transform_graph_path,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A single uri for the output directory of the eval model.
      # Note that this is estimator only, Keras doesn't require it for TFMA.
      eval_model_dir=eval_model_dir,
      # A list of uris for eval files.
      eval_files=fn_args.eval_files,
      # A single uri for the output directory of model training related files.
      model_run_dir=model_run_dir,
      # A single uri for schema file.
      schema_file=fn_args.schema_path,
      # Number of train steps.
      train_steps=fn_args.train_steps,
      # Number of eval steps.
      eval_steps=fn_args.eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hyperparameters_config,
      # Additional parameters to pass to trainer function.
      **custom_config)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Trains a user-supplied tf.estimator locally and exports its models.

  The user's trainer_fn, resolved from the execution properties, returns the
  estimator together with its train and eval specs. Training writes into the
  model-run directory; on the chief worker the eval saved model is exported
  there as well, and both serving and eval models are then copied into the
  model artifact directory.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - examples: Examples used for training, must include 'train' and 'eval'
        splits.
      - transform_output: Optional input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - model: Exported model.
      - model_run: Model training related outputs (e.g., Tensorboard logs)
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. JSON-serialized dict of additional parameters
        to pass to trainer function.

  Returns:
    None

  Raises:
    ValueError: When neither or both of 'module_file' and 'trainer_fn'
      are present in 'exec_properties'.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  fn_args = self._GetFnArgs(input_dict, output_dict, exec_properties)
  trainer_fn = udf_utils.get_fn(exec_properties, 'trainer_fn')

  schema = io_utils.parse_pbtxt_file(fn_args.schema_file, schema_pb2.Schema())

  # TODO(b/160795287): Deprecate estimator based executor.
  # Remember the final artifact locations, then point fn_args at the
  # model_run directory so the user code uses it as its working directory.
  # The models are copied to the artifact locations at the end.
  final_serving_dir = fn_args.serving_model_dir
  final_eval_dir = fn_args.eval_model_dir
  run_dir = fn_args.model_run_dir
  fn_args.serving_model_dir = path_utils.serving_model_dir(run_dir)
  fn_args.eval_model_dir = path_utils.eval_model_dir(run_dir)

  training_spec = trainer_fn(fn_args, schema)

  # Train the model
  absl.logging.info('Training model.')
  tf.estimator.train_and_evaluate(training_spec['estimator'],
                                  training_spec['train_spec'],
                                  training_spec['eval_spec'])
  absl.logging.info(
      'Training complete. Model written to %s. ModelRun written to %s',
      fn_args.serving_model_dir, fn_args.model_run_dir)

  # In distributed training only the chief worker may write the eval
  # savedmodel, as would be done for the serving savedmodel.
  if not _is_chief():
    absl.logging.info(
        'Model export is skipped because this is not the chief worker.')
    return

  # Export an eval savedmodel for TFMA.
  absl.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=training_spec['estimator'],
      export_dir_base=fn_args.eval_model_dir,
      eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

  absl.logging.info('Exported eval_savedmodel to %s.',
                    fn_args.eval_model_dir)

  # TODO(b/160795287): Deprecate estimator based executor.
  # Copy serving and eval model from model_run to model artifact directory.
  serving_source = path_utils.serving_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(serving_source, final_serving_dir)
  absl.logging.info('Serving model copied to: %s.', final_serving_dir)

  eval_source = path_utils.eval_model_path(fn_args.model_run_dir)
  io_utils.copy_dir(eval_source, final_eval_dir)
  absl.logging.info('Eval model copied to: %s.', final_eval_dir)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    """Trains a model locally with a user-provided tf.estimator and exports it.

    The trainer function obtained from `exec_properties` is called with a
    `TrainerFnArgs` built from the inputs and with the parsed schema. Its
    result is indexed with the keys 'estimator', 'train_spec', 'eval_spec'
    and 'eval_input_receiver_fn'. After training, an eval savedmodel is
    exported for TFMA.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - examples: Examples used for training, must include 'train' and
          'eval' splits.
        - transform_output: Optional input transform graph.
        - schema: Schema of the data.
        - base_model: Optional. When present, its serving model path is
          passed to the trainer function as `base_model`.
        - hyperparameters: Optional. When present, the single file in the
          artifact directory is read as JSON and passed as `hyperparameters`.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting (not read
          directly by this method).
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on (not read
          directly by this method).
        - custom_config: Optional dict whose entries are forwarded as extra
          keyword arguments to `TrainerFnArgs`.

    Returns:
      None

    Raises:
      ValueError: When neither or both of 'module_file' and 'trainer_fn'
        are present in 'exec_properties', or when 'custom_config' is set
        but is not a dict.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    extra_fn_kwargs = exec_properties.get('custom_config') or {}
    if not isinstance(extra_fn_kwargs, dict):
        raise ValueError(
            'Expect custom_config to be a dict but got %s instead' %
            type(extra_fn_kwargs))

    trainer_fn = self._GetTrainerFn(exec_properties)

    # Set up training parameters.
    train_split_uri = artifact_utils.get_split_uri(input_dict['examples'],
                                                   'train')
    train_files = [_all_files_pattern(train_split_uri)]

    if input_dict.get('transform_output', None):
        transform_output = artifact_utils.get_single_uri(
            input_dict['transform_output'])
    else:
        transform_output = None

    eval_split_uri = artifact_utils.get_split_uri(input_dict['examples'],
                                                  'eval')
    eval_files = [_all_files_pattern(eval_split_uri)]

    schema_uri = artifact_utils.get_single_uri(input_dict['schema'])
    schema_file = io_utils.get_only_uri_in_dir(schema_uri)

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    if input_dict.get('base_model'):
        base_model = path_utils.serving_model_path(
            artifact_utils.get_single_uri(input_dict['base_model']))
    else:
        base_model = None

    hyperparameters_config = None
    if input_dict.get('hyperparameters'):
        hyperparameters_uri = artifact_utils.get_single_uri(
            input_dict['hyperparameters'])
        hyperparameters_file = io_utils.get_only_uri_in_dir(
            hyperparameters_uri)
        hyperparameters_config = json.loads(
            file_io.read_file_to_string(hyperparameters_file))

    train_args = trainer_pb2.TrainArgs()
    json_format.Parse(exec_properties['train_args'], train_args)
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties['eval_args'], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: an unset int field in the
    # proto reads back as 0, and Tensorflow raises an error for
    # num_steps <= 0, so map 0 to None here.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    output_path = artifact_utils.get_single_uri(output_dict['output'])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    train_fn_args = TrainerFnArgs(
        # List of uris for the train files.
        train_files=train_files,
        # Single uri of the transform graph produced by TFT, or None when no
        # transform output was supplied.
        transform_output=transform_output,
        # Single uri of the serving model output directory.
        serving_model_dir=serving_model_dir,
        # List of uris for the eval files.
        eval_files=eval_files,
        # Single uri of the schema file.
        schema_file=schema_file,
        # Number of train steps (None when unset).
        train_steps=train_steps,
        # Number of eval steps (None when unset).
        eval_steps=eval_steps,
        # Base model to be used for this training job, if any.
        base_model=base_model,
        # Optional kerastuner.HyperParameters config.
        hyperparameters=hyperparameters_config,
        # Additional user-supplied parameters for the trainer function.
        **extra_fn_kwargs)

    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    training_spec = trainer_fn(train_fn_args, schema)

    # Train the model.
    absl.logging.info('Training model.')
    estimator = training_spec['estimator']
    tf.estimator.train_and_evaluate(estimator, training_spec['train_spec'],
                                    training_spec['eval_spec'])
    absl.logging.info('Training complete. Model written to %s',
                      serving_model_dir)

    # Export an eval savedmodel for TFMA.
    absl.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=estimator,
        export_dir_base=eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
    absl.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
def Do(self, input_dict, output_dict, exec_properties):
    """Runs a trainer job on the given inputs.

    When 'custom_config' carries 'cmle_training_args', the whole job is handed
    to `cmle_runner.start_cmle_training` and its result is returned.
    Otherwise `trainer_fn` is imported from the module file, called with an
    HParams object and the parsed schema, and the estimator it returns is
    trained locally; an eval savedmodel is then exported for TFMA.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.
        - custom_config: Optional. If it contains 'cmle_training_args',
          training is delegated to the CMLE runner.

    Returns:
      None for local training; otherwise whatever
      `cmle_runner.start_cmle_training` returns.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # TODO(khaas): Move this to tfx/extensions.
    custom_config = exec_properties.get('custom_config', None)
    if custom_config:
        cmle_args = custom_config.get('cmle_training_args')
        if cmle_args:
            return cmle_runner.start_cmle_training(
                input_dict, output_dict, exec_properties, cmle_args)

    trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                      'trainer_fn')

    # Set up training parameters.
    train_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                          'train')
    train_files = [_all_files_pattern(train_split_uri)]
    transform_output = types.get_single_uri(input_dict['transform_output'])
    # Unlike train_files, eval_files is a single pattern string, not a list.
    eval_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                         'eval')
    eval_files = _all_files_pattern(eval_split_uri)
    schema_uri = types.get_single_uri(input_dict['schema'])
    schema_file = io_utils.get_only_uri_in_dir(schema_uri)

    train_args = trainer_pb2.TrainArgs()
    json_format.Parse(exec_properties['train_args'], train_args)
    eval_args = trainer_pb2.EvalArgs()
    json_format.Parse(exec_properties['eval_args'], eval_args)

    # https://github.com/tensorflow/tfx/issues/45: an unset int field in the
    # proto reads back as 0, and Tensorflow raises an error for
    # num_steps <= 0, so map 0 to None here.
    train_steps = train_args.num_steps or None
    eval_steps = eval_args.num_steps or None

    output_path = types.get_single_uri(output_dict['output'])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    # Assemble warm start path if needed: only used when warm starting is
    # requested and the previous serving model dir holds a checkpoint file.
    warm_start_from = None
    warm_start_requested = (exec_properties.get('warm_starting') and
                            exec_properties.get('warm_start_from'))
    if warm_start_requested:
        candidate_dir = os.path.join(exec_properties['warm_start_from'],
                                     path_utils.SERVING_MODEL_DIR)
        if candidate_dir and tf.gfile.Exists(
                os.path.join(candidate_dir, self._CHECKPOINT_FILE_NAME)):
            warm_start_from = candidate_dir

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    hparams = tf.contrib.training.HParams(
        train_files=train_files,
        transform_output=transform_output,
        output_dir=output_path,
        serving_model_dir=serving_model_dir,
        eval_files=eval_files,
        schema_file=schema_file,
        train_steps=train_steps,
        eval_steps=eval_steps,
        warm_start_from=warm_start_from)

    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    training_spec = trainer_fn(hparams, schema)

    # Train the model.
    tf.logging.info('Training model.')
    estimator = training_spec['estimator']
    tf.estimator.train_and_evaluate(estimator, training_spec['train_spec'],
                                    training_spec['eval_spec'])
    tf.logging.info('Training complete.  Model written to %s',
                    serving_model_dir)

    # Export an eval savedmodel for TFMA.
    tf.logging.info('Exporting eval_savedmodel for TFMA.')
    tfma.export.export_eval_savedmodel(
        estimator=estimator,
        export_dir_base=eval_model_dir,
        eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])
    tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)