def _GetPreprocessingFn(self, inputs, unused_outputs):
  """Loads the user-supplied `preprocessing_fn`.

  Args:
    inputs: A dictionary of labelled input values. The sole value stored under
      labels.PREPROCESSING_FN is handed to io_utils.import_func as the source
      to import from.
    unused_outputs: A dictionary of labelled output values. Not used here.

  Returns:
    Whatever io_utils.import_func resolves for the name 'preprocessing_fn'.
  """
  preprocessing_source = common.GetSoleValue(inputs, labels.PREPROCESSING_FN)
  return io_utils.import_func(preprocessing_source, 'preprocessing_fn')
def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
       output_dict: Dict[Text, List[types.TfxArtifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Trains a TensorFlow model locally from a user-supplied tf.estimator.

  The module named by exec_properties['module_file'] must expose a
  `trainer_fn` callable. It is invoked with the assembled hparams and the
  parsed schema; the mapping it returns provides the 'estimator',
  'train_spec', 'eval_spec' and 'eval_input_receiver_fn' entries that drive
  training and the eval savedmodel export for TFMA.

  Args:
    input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
      - transformed_examples: Transformed examples; the 'train' and 'eval'
        splits are read.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.
      - custom_config: Optional. When it carries a truthy
        'cmle_training_args' entry, the work is handed to cmle_runner
        instead of being run locally (deprecated path).

  Returns:
    None when training locally. On the deprecated CMLE path, the result of
    cmle_runner.start_cmle_training is returned as-is.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(zhitaoli): Deprecate this in a future version.
  custom_config = exec_properties.get('custom_config', None)
  if custom_config:
    cmle_args = custom_config.get('cmle_training_args')
    if cmle_args:
      executor_class_path = '.'.join(
          [Executor.__module__, Executor.__name__])
      tf.logging.warn(
          'Passing \'cmle_training_args\' to trainer directly is deprecated, '
          'please use extension executor at '
          'tfx.extensions.google_cloud_ai_platform.trainer.executor instead'
      )
      return cmle_runner.start_cmle_training(input_dict, output_dict,
                                             exec_properties,
                                             executor_class_path, cmle_args)

  build_training_spec = io_utils.import_func(exec_properties['module_file'],
                                             'trainer_fn')

  # Resolve the input locations handed to the user's trainer_fn.
  train_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                        'train')
  train_patterns = [_all_files_pattern(train_split_uri)]
  transform_graph_uri = types.get_single_uri(input_dict['transform_output'])
  eval_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                       'eval')
  eval_patterns = [_all_files_pattern(eval_split_uri)]
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema_path = io_utils.get_only_uri_in_dir(schema_dir)

  # Decode the JSON-encoded train/eval arguments into their protos.
  train_args_proto = trainer_pb2.TrainArgs()
  eval_args_proto = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args_proto)
  json_format.Parse(exec_properties['eval_args'], eval_args_proto)

  # https://github.com/tensorflow/tfx/issues/45: an unset int field in the
  # proto reads back as 0 in Python, and Tensorflow raises an error if
  # num_steps <= 0, so a step count of 0 is replaced with None.
  num_train_steps = train_args_proto.num_steps or None
  num_eval_steps = eval_args_proto.num_steps or None

  model_output_uri = types.get_single_uri(output_dict['output'])
  serving_dir = path_utils.serving_model_dir(model_output_uri)
  eval_export_dir = path_utils.eval_model_dir(model_output_uri)

  # Warm start only when it was requested, a source directory was given, and
  # that directory's serving model actually holds a checkpoint file.
  warm_start_dir = None
  if (exec_properties.get('warm_starting') and
      exec_properties.get('warm_start_from')):
    candidate_dir = os.path.join(exec_properties['warm_start_from'],
                                 path_utils.SERVING_MODEL_DIR)
    if candidate_dir and tf.gfile.Exists(
        os.path.join(candidate_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_dir = candidate_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      # List of uris matching the train files.
      train_files=train_patterns,
      # Uri of the transform graph produced by TFT.
      transform_output=transform_graph_uri,
      # Uri of the output directory for the serving model.
      serving_model_dir=serving_dir,
      # List of uris matching the eval files.
      eval_files=eval_patterns,
      # Uri of the schema file.
      schema_file=schema_path,
      # Number of train steps (None when unset).
      train_steps=num_train_steps,
      # Number of eval steps (None when unset).
      eval_steps=num_eval_steps,
      # Uri of the model directory to warm start from (None to skip).
      warm_start_from=warm_start_dir)

  parsed_schema = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
  spec = build_training_spec(hparams, parsed_schema)

  # Train the model.
  tf.logging.info('Training model.')
  estimator = spec['estimator']
  tf.estimator.train_and_evaluate(estimator, spec['train_spec'],
                                  spec['eval_spec'])
  tf.logging.info('Training complete.  Model written to %s', serving_dir)

  # Export an eval savedmodel for TFMA.
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=spec['estimator'],
      export_dir_base=eval_export_dir,
      eval_input_receiver_fn=spec['eval_input_receiver_fn'])
  tf.logging.info('Exported eval_savedmodel to %s.', eval_export_dir)
def testImportFuncMissingFunction(self):
  """import_func raises AttributeError for a name missing from test_fn.py."""
  module_path = os.path.join(
      os.path.dirname(__file__), 'testdata', 'test_fn.py')
  with self.assertRaises(AttributeError):
    io_utils.import_func(module_path, 'non_existing')
def testImportFunc(self):
  """import_func loads test_fn from testdata/test_fn.py and it is callable.

  The imported function is expected to return 10 for the input [1, 2, 3, 4].
  """
  module_path = os.path.join(
      os.path.dirname(__file__), 'testdata', 'test_fn.py')
  imported_fn = io_utils.import_func(module_path, 'test_fn')
  actual = imported_fn([1, 2, 3, 4])
  self.assertEqual(10, actual)
def Do(self, input_dict, output_dict, exec_properties):
  """Runs the trainer job for the given input.

  Unless a truthy 'cmle_training_args' entry is present under
  exec_properties['custom_config'] (in which case the job is handed to
  cmle_runner), the `trainer_fn` callable imported from
  exec_properties['module_file'] is called with the assembled hparams and the
  parsed schema. The mapping it returns supplies the 'estimator',
  'train_spec', 'eval_spec' and 'eval_input_receiver_fn' entries used to
  train and to export an eval savedmodel for TFMA.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - transformed_examples: Transformed examples; the 'train' and 'eval'
        splits are read.
      - transform_output: Input transform graph.
      - schema: Schema of the data.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: Exported model.
    exec_properties: A dict of execution properties.
      - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
        args for training.
      - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
        args for eval.
      - module_file: Python module file containing UDF model definition.
      - warm_starting: Whether or not we need to do warm starting.
      - warm_start_from: Optional. If warm_starting is True, this is the
        directory to find previous model to warm start on.

  Returns:
    None when training locally; otherwise the result of
    cmle_runner.start_cmle_training.
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  # TODO(khaas): Move this to tfx/extensions.
  custom_config = exec_properties.get('custom_config', None)
  if custom_config:
    cmle_args = custom_config.get('cmle_training_args')
    if cmle_args:
      return cmle_runner.start_cmle_training(input_dict, output_dict,
                                             exec_properties, cmle_args)

  build_training_spec = io_utils.import_func(exec_properties['module_file'],
                                             'trainer_fn')

  # Resolve the input locations handed to the user's trainer_fn.
  train_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                        'train')
  train_patterns = [_all_files_pattern(train_split_uri)]
  transform_graph_uri = types.get_single_uri(input_dict['transform_output'])
  eval_split_uri = types.get_split_uri(input_dict['transformed_examples'],
                                       'eval')
  # Unlike train_files, the eval pattern is passed on as a bare value, not
  # wrapped in a list.
  eval_pattern = _all_files_pattern(eval_split_uri)
  schema_dir = types.get_single_uri(input_dict['schema'])
  schema_path = io_utils.get_only_uri_in_dir(schema_dir)

  # Decode the JSON-encoded train/eval arguments into their protos.
  train_args_proto = trainer_pb2.TrainArgs()
  eval_args_proto = trainer_pb2.EvalArgs()
  json_format.Parse(exec_properties['train_args'], train_args_proto)
  json_format.Parse(exec_properties['eval_args'], eval_args_proto)

  # https://github.com/tensorflow/tfx/issues/45: an unset int field in the
  # proto reads back as 0 in Python, and Tensorflow raises an error if
  # num_steps <= 0, so a step count of 0 is replaced with None.
  num_train_steps = train_args_proto.num_steps or None
  num_eval_steps = eval_args_proto.num_steps or None

  model_output_uri = types.get_single_uri(output_dict['output'])
  serving_dir = path_utils.serving_model_dir(model_output_uri)
  eval_export_dir = path_utils.eval_model_dir(model_output_uri)

  # Warm start only when it was requested, a source directory was given, and
  # that directory's serving model actually holds a checkpoint file.
  warm_start_dir = None
  if (exec_properties.get('warm_starting') and
      exec_properties.get('warm_start_from')):
    candidate_dir = os.path.join(exec_properties['warm_start_from'],
                                 path_utils.SERVING_MODEL_DIR)
    if candidate_dir and tf.gfile.Exists(
        os.path.join(candidate_dir, self._CHECKPOINT_FILE_NAME)):
      warm_start_dir = candidate_dir

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  hparams = tf.contrib.training.HParams(
      train_files=train_patterns,
      transform_output=transform_graph_uri,
      output_dir=model_output_uri,
      serving_model_dir=serving_dir,
      eval_files=eval_pattern,
      schema_file=schema_path,
      train_steps=num_train_steps,
      eval_steps=num_eval_steps,
      warm_start_from=warm_start_dir)

  parsed_schema = io_utils.parse_pbtxt_file(schema_path, schema_pb2.Schema())
  spec = build_training_spec(hparams, parsed_schema)

  # Train the model.
  tf.logging.info('Training model.')
  estimator = spec['estimator']
  tf.estimator.train_and_evaluate(estimator, spec['train_spec'],
                                  spec['eval_spec'])
  tf.logging.info('Training complete.  Model written to %s', serving_dir)

  # Export an eval savedmodel for TFMA.
  tf.logging.info('Exporting eval_savedmodel for TFMA.')
  tfma.export.export_eval_savedmodel(
      estimator=spec['estimator'],
      export_dir_base=eval_export_dir,
      eval_input_receiver_fn=spec['eval_input_receiver_fn'])
  tf.logging.info('Exported eval_savedmodel to %s.', eval_export_dir)