Example #1
    def _GetFnArgs(self, input_dict: Dict[str, List[types.Artifact]],
                   output_dict: Dict[str, List[types.Artifact]],
                   exec_properties: Dict[str, Any]) -> fn_args_utils.FnArgs:
        if input_dict.get(standard_component_specs.HYPERPARAMETERS_KEY):
            hyperparameters_file = io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[standard_component_specs.HYPERPARAMETERS_KEY]))
            hyperparameters_config = json.loads(
                file_io.read_file_to_string(hyperparameters_file))
        else:
            hyperparameters_config = None

        output_path = artifact_utils.get_single_uri(
            output_dict[standard_component_specs.MODEL_KEY])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        model_run_dir = artifact_utils.get_single_uri(
            output_dict[standard_component_specs.MODEL_RUN_KEY])

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
        if result.custom_config and not isinstance(result.custom_config, dict):
            raise ValueError(
                'custom_config in execution properties needs to be a '
                'dict. Got %s instead.' % type(result.custom_config))
        result.transform_output = result.transform_graph_path
        result.serving_model_dir = serving_model_dir
        result.eval_model_dir = eval_model_dir
        result.model_run_dir = model_run_dir
        result.schema_file = result.schema_path
        result.hyperparameters = hyperparameters_config
        return result
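
The FnArgs assembled by _GetFnArgs above is ultimately handed to user-supplied training code. As a rough illustration of how those fields get consumed, a generic-trainer style run_fn might look like the sketch below; _make_dataset and _build_keras_model are hypothetical helpers (not part of TFX), and only field names set in the example above are relied on.

# Sketch only: user module consuming the FnArgs built above.
# `_make_dataset` and `_build_keras_model` are hypothetical helpers.
import tensorflow as tf
from tfx.components.trainer.fn_args_utils import FnArgs


def run_fn(fn_args: FnArgs) -> None:
    # Hypothetical tf.data pipelines built from the file patterns in FnArgs.
    train_ds = _make_dataset(fn_args.train_files, fn_args.data_accessor,
                             fn_args.schema_file)
    eval_ds = _make_dataset(fn_args.eval_files, fn_args.data_accessor,
                            fn_args.schema_file)

    # `hyperparameters` is None when no Tuner output is wired into input_dict.
    model = _build_keras_model(fn_args.hyperparameters)
    model.fit(
        train_ds,
        steps_per_epoch=fn_args.train_steps,
        validation_data=eval_ds,
        validation_steps=fn_args.eval_steps)

    # The executor expects the exported model under serving_model_dir.
    model.save(fn_args.serving_model_dir, save_format='tf')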
Example #2
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        if exec_properties.get(_TUNE_ARGS_KEY):
            raise ValueError(
                "TuneArgs is not supported for default Tuner's Executor.")

        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                                   self._get_tmp_dir())

        tuner_fn_result = tuner_fn(fn_args)
        tuner = tuner_fn_result.tuner
        fit_kwargs = tuner_fn_result.fit_kwargs

        # TODO(b/156966497): set logger for printing.
        tuner.search_space_summary()
        absl.logging.info('Start tuning...')
        tuner.search(**fit_kwargs)
        tuner.results_summary()
        best_hparams_config = tuner.get_best_hyperparameters()[0].get_config()
        absl.logging.info('Best hyperParameters: %s' % best_hparams_config)
        best_hparams_path = os.path.join(
            artifact_utils.get_single_uri(
                output_dict[_BEST_HYPERPARAMETERS_KEY]), _DEFAULT_FILE_NAME)
        io_utils.write_string_file(best_hparams_path,
                                   json.dumps(best_hparams_config))
        absl.logging.info('Best Hyperparameters are written to %s.' %
                          best_hparams_path)
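
The tuner_fn resolved by udf_utils.get_fn above is user-provided. A minimal sketch of one, assuming keras-tuner (the `kerastuner` package in this era of TFX) and TFX's TunerFnResult container, is shown below; it returns the (tuner, fit_kwargs) pair that the executor unpacks into tuner.search(**fit_kwargs). _make_dataset is a hypothetical input helper.

# Sketch only: a user-supplied tuner_fn matching what the executor above expects.
# Assumes keras-tuner (`kerastuner`) and TFX's TunerFnResult;
# `_make_dataset` is a hypothetical input helper.
import kerastuner
import tensorflow as tf
from tfx.components.trainer.fn_args_utils import FnArgs
from tfx.components.tuner.component import TunerFnResult


def _build_model(hp: kerastuner.HyperParameters) -> tf.keras.Model:
    # Toy search space: only the hidden-layer width is tuned.
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(
            hp.Int('units', min_value=32, max_value=128, step=32),
            activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
    tuner = kerastuner.RandomSearch(
        _build_model,
        objective='val_accuracy',
        max_trials=5,
        directory=fn_args.working_dir)
    return TunerFnResult(
        tuner=tuner,
        fit_kwargs={
            # Hypothetical tf.data pipelines built from the FnArgs file patterns.
            'x': _make_dataset(fn_args.train_files, fn_args.data_accessor),
            'validation_data': _make_dataset(fn_args.eval_files,
                                             fn_args.data_accessor),
            'steps_per_epoch': fn_args.train_steps,
            'validation_steps': fn_args.eval_steps,
        })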
Example #3
    def Do(self, input_dict: Dict[str, List[types.Artifact]],
           output_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any]) -> None:

        if tfx_tuner.get_tune_args(exec_properties):
            raise ValueError(
                "TuneArgs is not supported by this Tuner's Executor.")

        metalearning_algorithm = None
        if 'metalearning_algorithm' in exec_properties:
            metalearning_algorithm = exec_properties.get(
                'metalearning_algorithm')

        warmup_trials = 0
        warmup_trial_data = None
        if metalearning_algorithm:
            warmup_tuner, warmup_trials = self.warmup(input_dict,
                                                      exec_properties,
                                                      metalearning_algorithm)
            warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)
        else:
            logging.info('MetaLearning Algorithm not provided.')

        # Create new fn_args for final tuning stage.
        fn_args = fn_args_utils.get_common_fn_args(
            input_dict, exec_properties, working_dir=self._get_tmp_dir())
        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        tuner_fn_result = tuner_fn(fn_args)
        tuner_fn_result.tuner.oracle.max_trials = max(
            (tuner_fn_result.tuner.oracle.max_trials - warmup_trials), 1)
        tuner = self.search(tuner_fn_result)
        tuner_trial_data = extract_tuner_trial_progress(tuner)

        if warmup_trial_data:
            cumulative_tuner_trial_data, best_tuner_ix = merge_trial_data(
                warmup_trial_data, tuner_trial_data)
            cumulative_tuner_trial_data[
                'warmup_trial_data'] = warmup_trial_data[BEST_CUMULATIVE_SCORE]
            cumulative_tuner_trial_data['tuner_trial_data'] = tuner_trial_data[
                BEST_CUMULATIVE_SCORE]

            if isinstance(tuner.oracle.objective, kerastuner.Objective):
                cumulative_tuner_trial_data[
                    'objective'] = tuner.oracle.objective.name
            else:
                cumulative_tuner_trial_data[
                    'objective'] = 'objective not understood'

            tuner_trial_data = cumulative_tuner_trial_data
            best_tuner = warmup_tuner if best_tuner_ix == 0 else tuner
        else:
            best_tuner = tuner
        tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)
        tuner_plot_path = os.path.join(
            artifact_utils.get_single_uri(output_dict['trial_summary_plot']),
            'tuner_plot_data.txt')
        io_utils.write_string_file(tuner_plot_path,
                                   json.dumps(tuner_trial_data))
        logging.info('Tuner plot data written at: %s', tuner_plot_path)
Example #4
    def testGetCommonFnArgs(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Create input dict.
        examples = standard_artifacts.Examples()
        examples.uri = os.path.join(source_data_dir,
                                    'transform/transformed_examples')
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        transform_output = standard_artifacts.TransformGraph()
        transform_output.uri = os.path.join(source_data_dir,
                                            'transform/transform_graph')

        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(source_data_dir, 'schema_gen')

        base_model = standard_artifacts.Model()
        base_model.uri = os.path.join(source_data_dir, 'trainer/previous')

        input_dict = {
            standard_component_specs.EXAMPLES_KEY: [examples],
            standard_component_specs.TRANSFORM_GRAPH_KEY: [transform_output],
            standard_component_specs.SCHEMA_KEY: [schema],
            standard_component_specs.BASE_MODEL_KEY: [base_model],
        }

        # Create exec properties skeleton.
        exec_properties = {
            'train_args':
            proto_utils.proto_to_json(trainer_pb2.TrainArgs(num_steps=1000)),
            'eval_args':
            proto_utils.proto_to_json(trainer_pb2.EvalArgs(num_steps=500)),
        }

        fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                                   'tempdir')
        self.assertEqual(fn_args.working_dir, 'tempdir')
        self.assertEqual(fn_args.train_steps, 1000)
        self.assertEqual(fn_args.eval_steps, 500)
        self.assertLen(fn_args.train_files, 1)
        self.assertEqual(fn_args.train_files[0],
                         os.path.join(examples.uri, 'Split-train', '*'))
        self.assertLen(fn_args.eval_files, 1)
        self.assertEqual(fn_args.eval_files[0],
                         os.path.join(examples.uri, 'Split-eval', '*'))
        self.assertEqual(fn_args.schema_path,
                         os.path.join(schema.uri, 'schema.pbtxt'))
        # Depending on execution environment, the base model may have been stored
        # at .../Format-Servo/... or .../Format-Serving/... directory patterns.
        self.assertRegex(
            fn_args.base_model,
            os.path.join(base_model.uri,
                         r'Format-(Servo|Serving)/export/chicago-taxi/\d+'))
        self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
        self.assertIsInstance(fn_args.data_accessor,
                              fn_args_utils.DataAccessor)
Example #5
    def testGetCommonFnArgs(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        # Create input dict.
        examples = standard_artifacts.Examples()
        examples.uri = os.path.join(source_data_dir,
                                    'transform/transformed_examples')
        examples.split_names = artifact_utils.encode_split_names(
            ['train', 'eval'])

        transform_output = standard_artifacts.TransformGraph()
        transform_output.uri = os.path.join(source_data_dir,
                                            'transform/transform_graph')

        schema = standard_artifacts.Schema()
        schema.uri = os.path.join(source_data_dir, 'schema_gen')

        input_dict = {
            constants.EXAMPLES_KEY: [examples],
            constants.TRANSFORM_GRAPH_KEY: [transform_output],
            constants.SCHEMA_KEY: [schema],
        }

        # Create exec properties skeleton.
        exec_properties = {
            'train_args':
            json_format.MessageToJson(trainer_pb2.TrainArgs(num_steps=1000),
                                      preserving_proto_field_name=True),
            'eval_args':
            json_format.MessageToJson(trainer_pb2.EvalArgs(num_steps=500),
                                      preserving_proto_field_name=True),
        }

        fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                                   'tempdir')
        self.assertEqual(fn_args.working_dir, 'tempdir')
        self.assertEqual(fn_args.train_steps, 1000)
        self.assertEqual(fn_args.eval_steps, 500)
        self.assertLen(fn_args.train_files, 1)
        self.assertEqual(fn_args.train_files[0],
                         os.path.join(examples.uri, 'train', '*'))
        self.assertLen(fn_args.eval_files, 1)
        self.assertEqual(fn_args.eval_files[0],
                         os.path.join(examples.uri, 'eval', '*'))
        self.assertEqual(fn_args.schema_path,
                         os.path.join(schema.uri, 'schema.pbtxt'))
        self.assertEqual(fn_args.transform_graph_path, transform_output.uri)
        self.assertIsInstance(fn_args.data_accessor,
                              fn_args_utils.DataAccessor)
Example #6
  def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
                 output_dict: Dict[Text, List[types.Artifact]],
                 exec_properties: Dict[Text, Any]) -> fn_args_utils.FnArgs:
    # Load and deserialize custom config from execution properties.
    # Note that in the component interface the default serialization of custom
    # config is 'null' instead of '{}'. Therefore we need to default the
    # json_utils.loads to 'null' then populate it with an empty dict when
    # needed.
    custom_config = json_utils.loads(
        exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
    if not isinstance(custom_config, dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict. Got %s instead.' % type(custom_config))

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    if input_dict.get(constants.BASE_MODEL_KEY):
      base_model = path_utils.serving_model_path(
          artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
    else:
      base_model = None

    if input_dict.get(constants.HYPERPARAMETERS_KEY):
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[constants.HYPERPARAMETERS_KEY]))
      hyperparameters_config = json.loads(
          file_io.read_file_to_string(hyperparameters_file))
    else:
      hyperparameters_config = None

    output_path = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    result = fn_args_utils.get_common_fn_args(input_dict, exec_properties)
    result.transform_output = result.transform_graph_path
    result.serving_model_dir = serving_model_dir
    result.eval_model_dir = eval_model_dir
    result.model_run_dir = model_run_dir
    result.schema_file = result.schema_path
    result.base_model = base_model
    result.hyperparameters = hyperparameters_config
    result.custom_config = custom_config
    return result
Example #7
def search(input_dict: Dict[str, List[types.Artifact]],
           exec_properties: Dict[str, Any],
           working_dir: str) -> base_tuner.BaseTuner:
    """Conduct a single hyperparameter search loop, and return the Tuner."""
    tuner_fn = _get_tuner_fn(exec_properties)

    fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties,
                                               working_dir)

    tuner_fn_result = tuner_fn(fn_args)
    result = tuner_fn_result.tuner

    # TODO(b/156966497): set logger for printing.
    result.search_space_summary()
    logging.info('Start tuning... Tuner ID: %s', result.tuner_id)
    result.search(**tuner_fn_result.fit_kwargs)
    logging.info('Finished tuning... Tuner ID: %s', result.tuner_id)
    result.results_summary()

    return result
Example #8
  def _GetFnArgs(self, input_dict: Dict[Text, List[types.Artifact]],
                 output_dict: Dict[Text, List[types.Artifact]],
                 exec_properties: Dict[Text, Any]) -> TrainerFnArgs:
    fn_args = fn_args_utils.get_common_fn_args(input_dict, exec_properties)

    # Load and deserialize custom config from execution properties.
    # Note that in the component interface the default serialization of custom
    # config is 'null' instead of '{}'. Therefore we need to default the
    # json_utils.loads to 'null' then populate it with an empty dict when
    # needed.
    custom_config = json_utils.loads(
        exec_properties.get(constants.CUSTOM_CONFIG_KEY, 'null')) or {}
    if not isinstance(custom_config, dict):
      raise ValueError('custom_config in execution properties needs to be a '
                       'dict. Got %s instead.' % type(custom_config))

    # TODO(ruoyu): Make this a dict of tag -> uri instead of list.
    if input_dict.get(constants.BASE_MODEL_KEY):
      base_model = path_utils.serving_model_path(
          artifact_utils.get_single_uri(input_dict[constants.BASE_MODEL_KEY]))
    else:
      base_model = None

    if input_dict.get(constants.HYPERPARAMETERS_KEY):
      hyperparameters_file = io_utils.get_only_uri_in_dir(
          artifact_utils.get_single_uri(
              input_dict[constants.HYPERPARAMETERS_KEY]))
      hyperparameters_config = json.loads(
          file_io.read_file_to_string(hyperparameters_file))
    else:
      hyperparameters_config = None

    output_path = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_KEY])
    serving_model_dir = path_utils.serving_model_dir(output_path)
    eval_model_dir = path_utils.eval_model_dir(output_path)

    model_run_dir = artifact_utils.get_single_uri(
        output_dict[constants.MODEL_RUN_KEY])

    # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
    return TrainerFnArgs(
        # A list of uris for train files.
        train_files=fn_args.train_files,
        # An optional single uri for transform graph produced by TFT. Will be
        # None if not specified.
        transform_output=fn_args.transform_graph_path,
        # A single uri for the output directory of the serving model.
        serving_model_dir=serving_model_dir,
        # A single uri for the output directory of the eval model.
        # Note that this is estimator only, Keras doesn't require it for TFMA.
        eval_model_dir=eval_model_dir,
        # A list of uris for eval files.
        eval_files=fn_args.eval_files,
        # A single uri for the output directory of model training related files.
        model_run_dir=model_run_dir,
        # A single uri for schema file.
        schema_file=fn_args.schema_path,
        # Number of train steps.
        train_steps=fn_args.train_steps,
        # Number of eval steps.
        eval_steps=fn_args.eval_steps,
        # Base model that will be used for this training job.
        base_model=base_model,
        # An optional kerastuner.HyperParameters config.
        hyperparameters=hyperparameters_config,
        # Additional parameters to pass to trainer function.
        **custom_config)
Example #9
    def warmup(self, input_dict: Dict[str, List[types.Artifact]],
               exec_properties: Dict[str, Any], algorithm: str):

        # Perform warmup tuning if WARMUP_HYPERPARAMETERS given.
        hparams_warmup_config_list = None
        if input_dict.get(WARMUP_HYPERPARAMETERS):
            hyperparameters_file = io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[WARMUP_HYPERPARAMETERS]))
            hparams_warmup_config_list = json.loads(
                io_utils.read_string_file(hyperparameters_file))

        fn_args = fn_args_utils.get_common_fn_args(
            input_dict,
            exec_properties,
            working_dir=self._get_tmp_dir() + 'warmup')

        # TODO(nikhilmehta): Currently all algorithms need warmup_hyperparameters.
        # This may not be needed for other algorithms that can predict hyperparams.
        if not hparams_warmup_config_list:
            raise ValueError('Expected warmup_hyperparameters')

        logging.info('Algorithm: %s', algorithm)
        warmup_trials = 0
        if algorithm == 'majority_voting':
            warmup_trials = DEFAULT_WARMUP_TRIALS
            fn_args.custom_config[
                WARMUP_HYPERPARAMETERS] = hparams_warmup_config_list[0]
        elif algorithm == 'nearest_neighbor':
            warmup_trials = DEFAULT_WARMUP_TRIALS

            if input_dict.get('metamodel'):
                metamodel_path = io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict['metamodel']))
                logging.info('Meta model path: %s', metamodel_path)
                metamodel = _load_keras_model(metamodel_path)
            else:
                raise ValueError(
                    f'Tuner for metalearning_algorithm={algorithm} expects metamodel.'
                )

            if input_dict.get('metafeature'):
                metafeature_path = io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(input_dict['metafeature']))
                logging.info('Metafeature: %s', metafeature_path)
                metafeature = json.loads(
                    io_utils.read_string_file(metafeature_path))
                metafeature = metafeature['metafeature']
            else:
                raise ValueError(
                    f'Tuner for metalearning_algorithm={algorithm} expects metafeature.'
                )

            metafeature = np.array(metafeature, dtype=np.float32)
            metafeature = np.expand_dims(metafeature, axis=0)
            logits = metamodel(metafeature).numpy()[0]
            nearest_configs = [
                hparams_warmup_config_list[ix]
                for ix in np.argsort(logits)[-DEFAULT_K:]
            ]
            nearest_hparam_config = _merge_hparam_configs(nearest_configs)
            fn_args.custom_config[
                WARMUP_HYPERPARAMETERS] = nearest_hparam_config
        else:
            raise NotImplementedError(
                f'Tuning for metalearning_algorithm={algorithm} is not implemented.'
            )

        # kerastuner doesn't support grid search, setting max_trials large enough.
        # Track issue: https://github.com/keras-team/keras-tuner/issues/340
        fn_args.custom_config['max_trials'] = warmup_trials
        tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
        warmtuner_fn_result = tuner_fn(fn_args)
        warmup_tuner = self.search(warmtuner_fn_result)

        return warmup_tuner, warmup_trials