Example #1
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Starts a Tuner component as a job on Google Cloud AI Platform."""
        self._log_startup(input_dict, output_dict, exec_properties)

        custom_config = json_utils.loads(
            exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY,
                                'null'))
        if custom_config is None:
            raise ValueError('custom_config is not provided')

        if not isinstance(custom_config, Dict):
            raise TypeError(
                'custom_config in execution properties must be a dict, '
                'but received %s' % type(custom_config))

        training_inputs = custom_config.get(TUNING_ARGS_KEY)
        if training_inputs is None:
            err_msg = ('\'%s\' not found in custom_config.' % TUNING_ARGS_KEY)
            logging.error(err_msg)
            raise ValueError(err_msg)
        training_inputs = training_inputs.copy()

        tune_args = tuner_executor.get_tune_args(exec_properties)

        num_parallel_trials = (1 if not tune_args else
                               tune_args.num_parallel_trials)
        if num_parallel_trials > 1:
            # Chief node is also responsible for conducting tuning loop.
            desired_worker_count = num_parallel_trials - 1

            if training_inputs.get('workerCount') != desired_worker_count:
                logging.warning('workerCount is overridden with %s',
                                desired_worker_count)
                training_inputs['workerCount'] = desired_worker_count

            training_inputs['scaleTier'] = 'CUSTOM'
            training_inputs['masterType'] = (training_inputs.get('masterType')
                                             or 'standard')
            training_inputs['workerType'] = (training_inputs.get('workerType')
                                             or 'standard')

        # 'tfx_tuner_YYYYmmddHHMMSS' is the default job ID if not specified.
        job_id = (custom_config.get(ai_platform_trainer_executor.JOB_ID_KEY)
                  or 'tfx_tuner_{}'.format(
                      datetime.datetime.now().strftime('%Y%m%d%H%M%S')))

        # TODO(b/160059039): Factor out label creation to a utility function.
        executor_class = _WorkerExecutor
        executor_class_path = '%s.%s' % (executor_class.__module__,
                                         executor_class.__name__)

        # Note: exec_properties['custom_config'] here is a dict.
        return runner.start_aip_training(input_dict, output_dict,
                                         exec_properties, executor_class_path,
                                         training_inputs, job_id)
Example #2
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Runs hyperparameter search, optionally warm-started via metalearning."""

    if tfx_tuner.get_tune_args(exec_properties):
      raise ValueError("TuneArgs is not supported by this Tuner's Executor.")

    metalearning_algorithm = None
    if 'metalearning_algorithm' in exec_properties:
      metalearning_algorithm = exec_properties.get('metalearning_algorithm')

    warmup_trials = 0
    warmup_trial_data = None
    if metalearning_algorithm:
      warmup_tuner, warmup_trials = self.warmup(input_dict, exec_properties,
                                                metalearning_algorithm)
      warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)
    else:
      logging.info('MetaLearning Algorithm not provided.')

    # Create new fn_args for final tuning stage.
    fn_args = fn_args_utils.get_common_fn_args(
        input_dict, exec_properties, working_dir=self._get_tmp_dir())
    tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
    tuner_fn_result = tuner_fn(fn_args)
    tuner_fn_result.tuner.oracle.max_trials = max(
        (tuner_fn_result.tuner.oracle.max_trials - warmup_trials), 1)
    tuner = self.search(tuner_fn_result)
    tuner_trial_data = extract_tuner_trial_progress(tuner)

    if warmup_trial_data:
      cumulative_tuner_trial_data, best_tuner_ix = merge_trial_data(
          warmup_trial_data, tuner_trial_data)
      cumulative_tuner_trial_data['warmup_trial_data'] = warmup_trial_data[
          BEST_CUMULATIVE_SCORE]
      cumulative_tuner_trial_data['tuner_trial_data'] = tuner_trial_data[
          BEST_CUMULATIVE_SCORE]

      if isinstance(tuner.oracle.objective, kerastuner.Objective):
        cumulative_tuner_trial_data['objective'] = tuner.oracle.objective.name
      else:
        cumulative_tuner_trial_data['objective'] = 'objective not understood'

      tuner_trial_data = cumulative_tuner_trial_data
      best_tuner = warmup_tuner if best_tuner_ix == 0 else tuner
    else:
      best_tuner = tuner
    tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)
    tuner_plot_path = os.path.join(
        artifact_utils.get_single_uri(output_dict['trial_summary_plot']),
        'tuner_plot_data.txt')
    io_utils.write_string_file(tuner_plot_path, json.dumps(tuner_trial_data))
    logging.info('Tuner plot data written at: %s', tuner_plot_path)
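
The example above serializes the cumulative trial progress as JSON next to the trial_summary_plot artifact. A minimal sketch, assuming a local path, of how that file could be read back; the path is hypothetical and the warmup-related keys are only present when a metalearning warmup ran.

import json

# Hypothetical location; in the pipeline it is
# <trial_summary_plot artifact URI>/tuner_plot_data.txt.
plot_path = '/tmp/trial_summary_plot/tuner_plot_data.txt'

with open(plot_path) as f:
    trial_data = json.load(f)

# When a warmup ran, the file also carries 'objective' plus the best
# cumulative scores of the warmup and final tuning stages.
print(trial_data.get('objective'))
print(trial_data.get('warmup_trial_data'), trial_data.get('tuner_trial_data'))
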
Example #3
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Starts a Tuner component as a job on Google Cloud AI Platform."""
    self._log_startup(input_dict, output_dict, exec_properties)

    custom_config = json_utils.loads(
        exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY, 'null'))
    if custom_config is None:
      raise ValueError('custom_config is not provided')

    if not isinstance(custom_config, Dict):
      raise TypeError('custom_config in execution properties must be a dict, '
                      'but received %s' % type(custom_config))

    training_inputs = custom_config.get(TUNING_ARGS_KEY)
    if training_inputs is None:
      err_msg = ('\'%s\' not found in custom_config.' % TUNING_ARGS_KEY)
      logging.error(err_msg)
      raise ValueError(err_msg)
    training_inputs = training_inputs.copy()

    tune_args = tuner_executor.get_tune_args(exec_properties)

    enable_vertex = custom_config.get(constants.ENABLE_VERTEX_KEY, False)
    vertex_region = custom_config.get(constants.VERTEX_REGION_KEY, None)
    num_parallel_trials = (1
                           if not tune_args else tune_args.num_parallel_trials)
    if num_parallel_trials > 1:
      # Chief node is also responsible for conducting tuning loop.
      desired_worker_count = num_parallel_trials - 1

      if enable_vertex:
        # worker_pool_specs follows the order detailed below. We make sure the
        # number of workers in pool 1 is consistent with num_parallel_trials.
        # https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job
        worker_pool_specs = training_inputs['job_spec'].get('worker_pool_specs')
        if worker_pool_specs is None or len(worker_pool_specs) < 1:
          training_inputs['job_spec']['worker_pool_specs'] = [
              # `WorkerPoolSpec` for worker pool 0, primary replica
              {
                  'machine_spec': {
                      'machine_type': 'n1-standard-8'
                  },
                  'replica_count': 1
              },
              # `WorkerPoolSpec` for worker pool 1
              {
                  'machine_spec': {
                      'machine_type': 'n1-standard-8'
                  },
                  'replica_count': desired_worker_count
              }
          ]
          logging.warning('worker_pool_specs are overridden with %s.',
                          training_inputs['job_spec']['worker_pool_specs'])
        elif len(worker_pool_specs) < 2:
          # primary replica set but missing workers
          worker_specs = {**training_inputs['job_spec']['worker_pool_specs'][0]}
          worker_specs['replica_count'] = desired_worker_count
          training_inputs['job_spec']['worker_pool_specs'].append(worker_specs)
          logging.warning('worker_pool_specs[1] are overridden with %s.',
                          training_inputs['job_spec']['worker_pool_specs'][1])
        elif training_inputs['job_spec']['worker_pool_specs'][1].get(
            'replica_count') != desired_worker_count:
          training_inputs['job_spec']['worker_pool_specs'][1][
              'replica_count'] = desired_worker_count
          logging.warning(
              'replica_count in worker_pool_specs[1] is overridden with %s.',
              desired_worker_count)
      else:
        if training_inputs.get('workerCount') != desired_worker_count:
          logging.warning('workerCount is overridden with %s',
                          desired_worker_count)
          training_inputs['workerCount'] = desired_worker_count

        training_inputs['scaleTier'] = 'CUSTOM'
        training_inputs['masterType'] = (
            training_inputs.get('masterType') or 'standard')
        training_inputs['workerType'] = (
            training_inputs.get('workerType') or 'standard')

    # 'tfx_tuner_YYYYmmddHHMMSS' is the default job ID if not specified.
    job_id = (
        custom_config.get(ai_platform_trainer_executor.JOB_ID_KEY) or
        'tfx_tuner_{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S')))

    # TODO(b/160059039): Factor out label creation to a utility function.
    executor_class = _WorkerExecutor
    executor_class_path = '%s.%s' % (executor_class.__module__,
                                     executor_class.__name__)

    # Note: exec_properties['custom_config'] here is a dict.
    return runner.start_cloud_training(input_dict, output_dict, exec_properties,
                                       executor_class_path, training_inputs,
                                       job_id, enable_vertex, vertex_region)