def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
               output_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
  if input_dict.get(standard_component_specs.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[standard_component_specs.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[standard_component_specs.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  if result.custom_config and not isinstance(result.custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(result.custom_config))
  result.transform_output = result.transform_graph_path
  result.serving_model_dir = serving_model_dir
  result.eval_model_dir = eval_model_dir
  result.model_run_dir = model_run_dir
  result.schema_file = result.schema_path
  result.hyperparameters = hyperparameters_config
  return result
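# --- Illustrative only (not part of the executor above): a minimal sketch of
# a user-supplied `run_fn` consuming the FnArgs that `_GetFnArgs` populates.
# The toy model, the feature shape, and the 'learning_rate' entry in the
# search space are assumptions; only the FnArgs field names come from the
# executor code above.
import kerastuner
import tensorflow as tf


def run_fn(fn_args):
  # fn_args.hyperparameters is the json-loaded dict written by the Tuner;
  # it can be rehydrated into a kerastuner HyperParameters object.
  if fn_args.hyperparameters:
    hp = kerastuner.HyperParameters.from_config(fn_args.hyperparameters)
    learning_rate = hp.get('learning_rate')  # assumed to be in the space
  else:
    learning_rate = 1e-3

  model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate), loss='mse')

  # ... build tf.data datasets from fn_args.train_files / fn_args.eval_files
  # and train for fn_args.train_steps steps ...

  # The generic Trainer expects the serving model to be exported here.
  model.save(fn_args.serving_model_dir, save_format='tf')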
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  if exec_properties.get(_TUNE_ARGS_KEY):
    raise ValueError(
        "TuneArgs is not supported for default Tuner's Executor.")

  tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             self._get_tmp_dir())
  tuner_fn_result = tuner_fn(fn_args)
  tuner = tuner_fn_result.tuner
  fit_kwargs = tuner_fn_result.fit_kwargs

  # TODO(b/156966497): set logger for printing.
  tuner.search_space_summary()
  absl.logging.info('Start tuning...')
  tuner.search(**fit_kwargs)
  tuner.results_summary()
  best_hparams_config = tuner.get_best_hyperparameters()[0].get_config()
  absl.logging.info('Best hyperparameters: %s' % best_hparams_config)
  best_hparams_path = os.path.join(
      artifact_utils.get_single_uri(output_dict[_BEST_HYPERPARAMETERS_KEY]),
      _DEFAULT_FILE_NAME)
  io_utils.write_string_file(best_hparams_path,
                             json.dumps(best_hparams_config))
  absl.logging.info('Best hyperparameters are written to %s.' %
                    best_hparams_path)
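# --- Illustrative only: a minimal sketch of the user-supplied `tuner_fn`
# that udf_utils.get_fn resolves in Do() above. It must return an object
# exposing `.tuner` and `.fit_kwargs`; the TunerFnResult import path is
# assumed from tfx.components.tuner.component, and the toy search space and
# synthetic data are assumptions, not part of the source.
import kerastuner
import numpy as np
import tensorflow as tf
from tfx.components.tuner.component import TunerFnResult


def tuner_fn(fn_args):
  def _build_model(hp):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(
            hp.Int('units', min_value=8, max_value=64, step=8),
            activation='relu',
            input_shape=(3,)),
        tf.keras.layers.Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

  tuner = kerastuner.RandomSearch(
      _build_model,
      objective='val_loss',
      max_trials=3,
      directory=fn_args.working_dir,
      project_name='sketch')

  # Normally the datasets would be built from fn_args.train_files /
  # fn_args.eval_files via the DataAccessor; synthetic data keeps this
  # sketch self-contained.
  x = np.random.rand(32, 3).astype(np.float32)
  y = np.random.rand(32, 1).astype(np.float32)
  # fit_kwargs are forwarded verbatim to tuner.search(**fit_kwargs).
  return TunerFnResult(
      tuner=tuner,
      fit_kwargs={'x': x, 'y': y, 'validation_data': (x, y), 'epochs': 1})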
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  if tfx_tuner.get_tune_args(exec_properties):
    raise ValueError("TuneArgs is not supported by this Tuner's Executor.")

  metalearning_algorithm = None
  if 'metalearning_algorithm' in exec_properties:
    metalearning_algorithm = exec_properties.get('metalearning_algorithm')

  warmup_trials = 0
  warmup_trial_data = None
  if metalearning_algorithm:
    warmup_tuner, warmup_trials = self.warmup(input_dict, exec_properties,
                                              metalearning_algorithm)
    warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)
  else:
    logging.info('MetaLearning Algorithm not provided.')

  # Create new fn_args for final tuning stage.
  fn_args = fn_args_utils.get_common_fn_args(
      input_dict, exec_properties, working_dir=self._get_tmp_dir())
  tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
  tuner_fn_result = tuner_fn(fn_args)
  tuner_fn_result.tuner.oracle.max_trials = max(
      (tuner_fn_result.tuner.oracle.max_trials - warmup_trials), 1)
  tuner = self.search(tuner_fn_result)
  tuner_trial_data = extract_tuner_trial_progress(tuner)

  if warmup_trial_data:
    cumulative_tuner_trial_data, best_tuner_ix = merge_trial_data(
        warmup_trial_data, tuner_trial_data)
    cumulative_tuner_trial_data['warmup_trial_data'] = warmup_trial_data[
        BEST_CUMULATIVE_SCORE]
    cumulative_tuner_trial_data['tuner_trial_data'] = tuner_trial_data[
        BEST_CUMULATIVE_SCORE]

    if isinstance(tuner.oracle.objective, kerastuner.Objective):
      cumulative_tuner_trial_data['objective'] = tuner.oracle.objective.name
    else:
      cumulative_tuner_trial_data['objective'] = 'objective not understood'

    tuner_trial_data = cumulative_tuner_trial_data
    best_tuner = warmup_tuner if best_tuner_ix == 0 else tuner
  else:
    best_tuner = tuner

  tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)
  tuner_plot_path = os.path.join(
      artifact_utils.get_single_uri(output_dict['trial_summary_plot']),
      'tuner_plot_data.txt')
  io_utils.write_string_file(tuner_plot_path, json.dumps(tuner_trial_data))
  logging.info('Tuner plot data written at: %s', tuner_plot_path)
def testGetCommonFnArgs(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Create input dict.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir,
                              'transform/transformed_examples')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  transform_output = standard_artifacts.TransformGraph()
  transform_output.uri = os.path.join(source_data_dir,
                                      'transform/transform_graph')

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  base_model = standard_artifacts.Model()
  base_model.uri = os.path.join(source_data_dir, 'trainer/previous')

  input_dict = {
      standard_component_specs.EXAMPLES_KEY: [examples],
      standard_component_specs.TRANSFORM_GRAPH_KEY: [transform_output],
      standard_component_specs.SCHEMA_KEY: [schema],
      standard_component_specs.BASE_MODEL_KEY: [base_model],
  }

  # Create exec properties skeleton.
  exec_properties = {
      'train_args':
          proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=1000)),
      'eval_args':
          proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=500)),
  }

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             'tempdir')
  self.assertEqual(fn_args.working_dir, 'tempdir')
  self.assertEqual(fn_args.train_steps, 1000)
  self.assertEqual(fn_args.eval_steps, 500)
  self.assertLen(fn_args.train_files, 1)
  self.assertEqual(fn_args.train_files[0],
                   os.path.join(examples.uri, 'Split-train', '*'))
  self.assertLen(fn_args.eval_files, 1)
  self.assertEqual(fn_args.eval_files[0],
                   os.path.join(examples.uri, 'Split-eval', '*'))
  self.assertEqual(fn_args.schema_path,
                   os.path.join(schema.uri, 'schema.pbtxt'))
  # Depending on execution environment, the base model may have been stored
  # at .../Format-Servo/... or .../Format-Serving/... directory patterns.
  self.assertRegex(
      fn_args.base_model,
      os.path.join(base_model.uri,
                   r'Format-(Servo|Serving)/export/chicago-taxi/\d+'))
  self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
  self.assertIsInstance(fn_args.data_accessor, fn_args_utils.DataAccessor)
def testGetCommonFnArgs(self):
  source_data_dir = os.path.join(
      os.path.dirname(os.path.dirname(__file__)), 'testdata')

  # Create input dict.
  examples = standard_artifacts.Examples()
  examples.uri = os.path.join(source_data_dir,
                              'transform/transformed_examples')
  examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])

  transform_output = standard_artifacts.TransformGraph()
  transform_output.uri = os.path.join(source_data_dir,
                                      'transform/transform_graph')

  schema = standard_artifacts.Schema()
  schema.uri = os.path.join(source_data_dir, 'schema_gen')

  input_dict = {
      constants.EXAMPLES_KEY: [examples],
      constants.TRANSFORM_GRAPH_KEY: [transform_output],
      constants.SCHEMA_KEY: [schema],
  }

  # Create exec properties skeleton.
  exec_properties = {
      'train_args':
          json_format.MessageToJson(
              trainer_pb2.TrainArgs(num_steps=1000),
              preserving_proto_field_name=True),
      'eval_args':
          json_format.MessageToJson(
              trainer_pb2.EvalArgs(num_steps=500),
              preserving_proto_field_name=True),
  }

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             'tempdir')
  self.assertEqual(fn_args.working_dir, 'tempdir')
  self.assertEqual(fn_args.train_steps, 1000)
  self.assertEqual(fn_args.eval_steps, 500)
  self.assertLen(fn_args.train_files, 1)
  self.assertEqual(fn_args.train_files[0],
                   os.path.join(examples.uri, 'train', '*'))
  self.assertLen(fn_args.eval_files, 1)
  self.assertEqual(fn_args.eval_files[0],
                   os.path.join(examples.uri, 'eval', '*'))
  self.assertEqual(fn_args.schema_path,
                   os.path.join(schema.uri, 'schema.pbtxt'))
  self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
  self.assertIsInstance(fn_args.data_accessor, fn_args_utils.DataAccessor)
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
  result.transform_output = result.transform_graph_path
  result.serving_model_dir = serving_model_dir
  result.eval_model_dir = eval_model_dir
  result.model_run_dir = model_run_dir
  result.schema_file = result.schema_path
  result.base_model = base_model
  result.hyperparameters = hyperparameters_config
  result.custom_config = custom_config
  return result
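# --- Illustrative only: why `_GetFnArgs` above defaults the serialized value
# to 'null'. An absent custom_config arrives as the string 'null', not '{}',
# so the executor falls back to an empty dict. A plain-json sketch of the
# same round trip (the executor uses tfx's json_utils, assumed to behave
# like stdlib json for this case):
import json

assert json.loads('null') is None
custom_config = json.loads('null') or {}           # absent  -> {}
custom_config = json.loads('{"lr": 0.001}') or {}  # present -> {'lr': 0.001}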
def search(input_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any],
           working_dir: str) -> base_tuner.BaseTuner:
  """Conduct a single hyperparameter search loop, and return the Tuner."""
  tuner_fn = _get_tuner_fn(exec_properties)

  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                             working_dir)

  tuner_fn_result = tuner_fn(fn_args)
  result = tuner_fn_result.tuner

  # TODO(b/156966497): set logger for printing.
  result.search_space_summary()
  logging.info('Start tuning... Tuner ID: %s', result.tuner_id)
  result.search(**tuner_fn_result.fit_kwargs)
  logging.info('Finished tuning... Tuner ID: %s', result.tuner_id)
  result.results_summary()

  return result
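# --- Illustrative only: a sketch of how a caller could drive the
# module-level `search` above. `inputs` and `props` stand in for resolved
# component inputs and exec properties; `props` is assumed to carry a
# 'tuner_fn' entry pointing at a module-level function like the tuner_fn
# sketch shown earlier.
def run_one_search(inputs, props, workdir):
  tuner = search(inputs, props, workdir)
  # BaseTuner exposes the standard KerasTuner accessors once search is done.
  return tuner.get_best_hyperparameters()[0]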
def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
               output_dict: Dict[Text, List[types.Artifact]],
               exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
  fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)

  # Load and deserialize custom config from execution properties.
  # Note that in the component interface the default serialization of custom
  # config is 'null' instead of '{}'. Therefore we need to default the
  # json_utils.loads to 'null' then populate it with an empty dict when
  # needed.
  custom_config = json_utils.loads(
      exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
  if not isinstance(custom_config, dict):
    raise ValueError('custom_config in execution properties needs to be a '
                     'dict. Got %s instead.' % type(custom_config))

  # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
  if input_dict.get(constants.BASE_MODEL_KEY):
    base_model = path_utils.serving_model_path(
        artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
  else:
    base_model = None

  if input_dict.get(constants.HYPERPARAMETERS_KEY):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(
            input_dict[constants.HYPERPARAMETERS_KEY]))
    hyperparameters_config = json.loads(
        file_io.read_file_to_string(hyperparameters_file))
  else:
    hyperparameters_config = None

  output_path = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_KEY])
  serving_model_dir = path_utils.serving_model_dir(output_path)
  eval_model_dir = path_utils.eval_model_dir(output_path)

  model_run_dir = artifact_utils.get_single_uri(
      output_dict[constants.MODEL_RUN_KEY])

  # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
  return TrainerFnArgs(
      # A list of uris for train files.
      train_files=fn_args.train_files,
      # An optional single uri for transform graph produced by TFT. Will be
      # None if not specified.
      transform_output=fn_args.transform_graph_path,
      # A single uri for the output directory of the serving model.
      serving_model_dir=serving_model_dir,
      # A single uri for the output directory of the eval model.
      # Note that this is estimator only; Keras doesn't require it for TFMA.
      eval_model_dir=eval_model_dir,
      # A list of uris for eval files.
      eval_files=fn_args.eval_files,
      # A single uri for the output directory of model training related
      # files.
      model_run_dir=model_run_dir,
      # A single uri for schema file.
      schema_file=fn_args.schema_path,
      # Number of train steps.
      train_steps=fn_args.train_steps,
      # Number of eval steps.
      eval_steps=fn_args.eval_steps,
      # Base model that will be used for this training job.
      base_model=base_model,
      # An optional kerastuner.HyperParameters config.
      hyperparameters=hyperparameters_config,
      # Additional parameters to pass to trainer function.
      **custom_config)
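# --- Illustrative only: a stand-in sketch of what `**custom_config` above
# does. User-provided keys are splatted into the TrainerFnArgs constructor
# as extra keyword arguments alongside the standard fields; `_ArgsSketch`
# is a hypothetical class mimicking that pattern.
class _ArgsSketch:

  def __init__(self, **kwargs):
    self.__dict__.update(kwargs)


args = _ArgsSketch(train_steps=1000, my_custom_flag=True)
assert args.my_custom_flag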
def warmup(self, input_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any], algorithm: str):
  # Perform warmup tuning if WARMUP_HYPERPARAMETERS given.
  hparams_warmup_config_list = None
  if input_dict.get(WARMUP_HYPERPARAMETERS):
    hyperparameters_file = io_utils.get_only_uri_in_dir(
        artifact_utils.get_single_uri(input_dict[WARMUP_HYPERPARAMETERS]))
    hparams_warmup_config_list = json.loads(
        io_utils.read_string_file(hyperparameters_file))

  fn_args = fn_args_utils.get_common_fn_args(
      input_dict,
      exec_properties,
      working_dir=self._get_tmp_dir() + 'warmup')

  # TODO(nikhilmehta): Currently all algorithms need warmup_hyperparameters.
  # This may not be needed for other algorithms that can predict hyperparams.
  if not hparams_warmup_config_list:
    raise ValueError('Expected warmup_hyperparameters')

  logging.info('Algorithm: %s', algorithm)
  warmup_trials = 0
  if algorithm == 'majority_voting':
    warmup_trials = DEFAULT_WARMUP_TRIALS
    fn_args.custom_config[WARMUP_HYPERPARAMETERS] = (
        hparams_warmup_config_list[0])
  elif algorithm == 'nearest_neighbor':
    warmup_trials = DEFAULT_WARMUP_TRIALS

    if input_dict.get('metamodel'):
      metamodel_path = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict['metamodel']))
      logging.info('Meta model path: %s', metamodel_path)
      metamodel = _load_keras_model(metamodel_path)
    else:
      raise ValueError(
          f'Tuner for metalearning_algorithm={algorithm} expects metamodel.')

    if input_dict.get('metafeature'):
      metafeature_path = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(input_dict['metafeature']))
      logging.info('Metafeature: %s', metafeature_path)
      metafeature = json.loads(io_utils.read_string_file(metafeature_path))
      metafeature = metafeature['metafeature']
    else:
      raise ValueError(
          f'Tuner for metalearning_algorithm={algorithm} expects '
          'metafeature.')

    metafeature = np.array(metafeature, dtype=np.float32)
    metafeature = np.expand_dims(metafeature, axis=0)
    logits = metamodel(metafeature).numpy()[0]
    nearest_configs = [
        hparams_warmup_config_list[ix]
        for ix in np.argsort(logits)[-DEFAULT_K:]
    ]
    nearest_hparam_config = _merge_hparam_configs(nearest_configs)
    fn_args.custom_config[WARMUP_HYPERPARAMETERS] = nearest_hparam_config
  else:
    raise NotImplementedError(
        f'Tuning for metalearning_algorithm={algorithm} is not implemented.')

  # kerastuner doesn't support grid search; set max_trials large enough.
  # Track issue: https://github.com/keras-team/keras-tuner/issues/340
  fn_args.custom_config['max_trials'] = warmup_trials

  tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
  warmtuner_fn_result = tuner_fn(fn_args)
  warmup_tuner = self.search(warmtuner_fn_result)

  return warmup_tuner, warmup_trials