def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Starts a Tuner component as a job on Google Cloud AI Platform."""
  self._log_startup(input_dict, output_dict, exec_properties)

  custom_config = json_utils.loads(
      exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY, 'null'))
  if custom_config is None:
    raise ValueError('custom_config is not provided')

  if not isinstance(custom_config, Dict):
    raise TypeError('custom_config in execution properties must be a dict, '
                    'but received %s' % type(custom_config))

  training_inputs = custom_config.get(TUNING_ARGS_KEY)
  if training_inputs is None:
    err_msg = '\'%s\' not found in custom_config.' % TUNING_ARGS_KEY
    logging.error(err_msg)
    raise ValueError(err_msg)
  training_inputs = training_inputs.copy()

  tune_args = tuner_executor.get_tune_args(exec_properties)

  num_parallel_trials = (1
                         if not tune_args else tune_args.num_parallel_trials)
  if num_parallel_trials > 1:
    # Chief node is also responsible for conducting tuning loop.
    desired_worker_count = num_parallel_trials - 1

    if training_inputs.get('workerCount') != desired_worker_count:
      logging.warning('workerCount is overridden with %s',
                      desired_worker_count)
      training_inputs['workerCount'] = desired_worker_count

    training_inputs['scaleTier'] = 'CUSTOM'
    training_inputs['masterType'] = (
        training_inputs.get('masterType') or 'standard')
    training_inputs['workerType'] = (
        training_inputs.get('workerType') or 'standard')

  # 'tfx_tuner_YYYYmmddHHMMSS' is the default job ID if not specified.
  job_id = (
      custom_config.get(ai_platform_trainer_executor.JOB_ID_KEY) or
      'tfx_tuner_{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S')))

  # TODO(b/160059039): Factor out label creation to a utility function.
  executor_class = _WorkerExecutor
  executor_class_path = '%s.%s' % (executor_class.__module__,
                                   executor_class.__name__)

  # Note: exec_properties['custom_config'] here is a dict.
  return runner.start_aip_training(input_dict, output_dict, exec_properties,
                                   executor_class_path, training_inputs,
                                   job_id)
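
# --- Usage sketch (not part of the executor above) ---
# A minimal example of wiring this executor into a pipeline: the payload
# under TUNING_ARGS_KEY becomes `training_inputs` above. Project, region,
# image URI, and the upstream `transform`/`module_file` names are
# placeholder assumptions, not values from the source.
from tfx.components import Tuner
from tfx.dsl.components.base import executor_spec
from tfx.extensions.google_cloud_ai_platform.tuner import executor as ai_platform_tuner_executor
from tfx.proto import trainer_pb2, tuner_pb2

tuner = Tuner(
    module_file=module_file,  # assumed to define `tuner_fn`
    examples=transform.outputs['transformed_examples'],
    transform_graph=transform.outputs['transform_graph'],
    train_args=trainer_pb2.TrainArgs(num_steps=20),
    eval_args=trainer_pb2.EvalArgs(num_steps=5),
    # Chief + (num_parallel_trials - 1) workers, per the logic above.
    tune_args=tuner_pb2.TuneArgs(num_parallel_trials=3),
    custom_executor_spec=executor_spec.ExecutorClassSpec(
        ai_platform_tuner_executor.Executor),
    custom_config={
        ai_platform_tuner_executor.TUNING_ARGS_KEY: {
            'project': 'my-gcp-project',
            'region': 'us-central1',
            'masterConfig': {
                'imageUri': 'gcr.io/my-gcp-project/my-tfx-image'
            },
        },
    })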
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Runs the tuning loop, optionally warm-started by a meta-learner."""
  if tfx_tuner.get_tune_args(exec_properties):
    raise ValueError("TuneArgs is not supported by this Tuner's Executor.")

  metalearning_algorithm = None
  if 'metalearning_algorithm' in exec_properties:
    metalearning_algorithm = exec_properties.get('metalearning_algorithm')

  warmup_trials = 0
  warmup_trial_data = None
  if metalearning_algorithm:
    warmup_tuner, warmup_trials = self.warmup(input_dict, exec_properties,
                                              metalearning_algorithm)
    warmup_trial_data = extract_tuner_trial_progress(warmup_tuner)
  else:
    logging.info('MetaLearning Algorithm not provided.')

  # Create new fn_args for final tuning stage.
  fn_args = fn_args_utils.get_common_fn_args(
      input_dict, exec_properties, working_dir=self._get_tmp_dir())
  tuner_fn = udf_utils.get_fn(exec_properties, 'tuner_fn')
  tuner_fn_result = tuner_fn(fn_args)
  # Deduct the warmup trials from the overall trial budget.
  tuner_fn_result.tuner.oracle.max_trials = max(
      (tuner_fn_result.tuner.oracle.max_trials - warmup_trials), 1)
  tuner = self.search(tuner_fn_result)
  tuner_trial_data = extract_tuner_trial_progress(tuner)

  if warmup_trial_data:
    cumulative_tuner_trial_data, best_tuner_ix = merge_trial_data(
        warmup_trial_data, tuner_trial_data)
    cumulative_tuner_trial_data['warmup_trial_data'] = warmup_trial_data[
        BEST_CUMULATIVE_SCORE]
    cumulative_tuner_trial_data['tuner_trial_data'] = tuner_trial_data[
        BEST_CUMULATIVE_SCORE]

    if isinstance(tuner.oracle.objective, kerastuner.Objective):
      cumulative_tuner_trial_data['objective'] = tuner.oracle.objective.name
    else:
      cumulative_tuner_trial_data['objective'] = 'objective not understood'

    tuner_trial_data = cumulative_tuner_trial_data
    best_tuner = warmup_tuner if best_tuner_ix == 0 else tuner
  else:
    best_tuner = tuner

  tfx_tuner.write_best_hyperparameters(best_tuner, output_dict)
  tuner_plot_path = os.path.join(
      artifact_utils.get_single_uri(output_dict['trial_summary_plot']),
      'tuner_plot_data.txt')
  io_utils.write_string_file(tuner_plot_path, json.dumps(tuner_trial_data))
  logging.info('Tuner plot data written at: %s', tuner_plot_path)
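
# --- Sketch of a user-supplied `tuner_fn` (assumption) ---
# This is the shape of the function that `udf_utils.get_fn(exec_properties,
# 'tuner_fn')` resolves above: it returns a TunerFnResult whose
# `tuner.oracle.max_trials` the executor rebudgets after warmup. The
# `_build_keras_model` and `_input_fn` helpers are hypothetical.
import kerastuner
from tfx.components.tuner.component import TunerFnResult

def tuner_fn(fn_args):
  tuner = kerastuner.RandomSearch(
      _build_keras_model,  # hypothetical model-building function
      objective='val_accuracy',
      max_trials=10,
      directory=fn_args.working_dir,
      project_name='warmup_demo')
  return TunerFnResult(
      tuner=tuner,
      fit_kwargs={
          'x': _input_fn(fn_args.train_files),  # hypothetical input fn
          'validation_data': _input_fn(fn_args.eval_files),
      })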
def Do(self, input_dict: Dict[str, List[types.Artifact]],
       output_dict: Dict[str, List[types.Artifact]],
       exec_properties: Dict[str, Any]) -> None:
  """Starts a Tuner component as a job on Google Cloud AI Platform."""
  self._log_startup(input_dict, output_dict, exec_properties)

  custom_config = json_utils.loads(
      exec_properties.get(standard_component_specs.CUSTOM_CONFIG_KEY, 'null'))
  if custom_config is None:
    raise ValueError('custom_config is not provided')

  if not isinstance(custom_config, Dict):
    raise TypeError('custom_config in execution properties must be a dict, '
                    'but received %s' % type(custom_config))

  training_inputs = custom_config.get(TUNING_ARGS_KEY)
  if training_inputs is None:
    err_msg = '\'%s\' not found in custom_config.' % TUNING_ARGS_KEY
    logging.error(err_msg)
    raise ValueError(err_msg)
  training_inputs = training_inputs.copy()

  tune_args = tuner_executor.get_tune_args(exec_properties)

  enable_vertex = custom_config.get(constants.ENABLE_VERTEX_KEY, False)
  vertex_region = custom_config.get(constants.VERTEX_REGION_KEY, None)

  num_parallel_trials = (1
                         if not tune_args else tune_args.num_parallel_trials)
  if num_parallel_trials > 1:
    # Chief node is also responsible for conducting tuning loop.
    desired_worker_count = num_parallel_trials - 1

    if enable_vertex:
      # worker_pool_specs follows the order detailed below. We make sure the
      # number of workers in pool 1 is consistent with num_parallel_trials.
      # https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job
      worker_pool_specs = training_inputs['job_spec'].get('worker_pool_specs')
      if worker_pool_specs is None or len(worker_pool_specs) < 1:
        training_inputs['job_spec']['worker_pool_specs'] = [
            # `WorkerPoolSpec` for worker pool 0, primary replica.
            {
                'machine_spec': {
                    'machine_type': 'n1-standard-8'
                },
                'replica_count': 1
            },
            # `WorkerPoolSpec` for worker pool 1.
            {
                'machine_spec': {
                    'machine_type': 'n1-standard-8'
                },
                'replica_count': desired_worker_count
            }
        ]
        logging.warning('worker_pool_specs are overridden with %s.',
                        training_inputs['job_spec']['worker_pool_specs'])
      elif len(worker_pool_specs) < 2:
        # Primary replica is set but workers are missing.
        worker_specs = {**training_inputs['job_spec']['worker_pool_specs'][0]}
        worker_specs['replica_count'] = desired_worker_count
        training_inputs['job_spec']['worker_pool_specs'].append(worker_specs)
        logging.warning('worker_pool_specs[1] are overridden with %s.',
                        training_inputs['job_spec']['worker_pool_specs'][1])
      elif training_inputs['job_spec']['worker_pool_specs'][1].get(
          'replica_count') != desired_worker_count:
        training_inputs['job_spec']['worker_pool_specs'][1][
            'replica_count'] = desired_worker_count
        logging.warning(
            'replica_count in worker_pool_specs[1] is overridden with %s.',
            desired_worker_count)
    else:
      if training_inputs.get('workerCount') != desired_worker_count:
        logging.warning('workerCount is overridden with %s',
                        desired_worker_count)
        training_inputs['workerCount'] = desired_worker_count

      training_inputs['scaleTier'] = 'CUSTOM'
      training_inputs['masterType'] = (
          training_inputs.get('masterType') or 'standard')
      training_inputs['workerType'] = (
          training_inputs.get('workerType') or 'standard')

  # 'tfx_tuner_YYYYmmddHHMMSS' is the default job ID if not specified.
  job_id = (
      custom_config.get(ai_platform_trainer_executor.JOB_ID_KEY) or
      'tfx_tuner_{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S')))

  # TODO(b/160059039): Factor out label creation to a utility function.
  executor_class = _WorkerExecutor
  executor_class_path = '%s.%s' % (executor_class.__module__,
                                   executor_class.__name__)

  # Note: exec_properties['custom_config'] here is a dict.
  return runner.start_cloud_training(input_dict, output_dict, exec_properties,
                                     executor_class_path, training_inputs,
                                     job_id, enable_vertex, vertex_region)
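
# --- Sketch of a Vertex-enabled custom_config (assumption) ---
# The key constants are the ones referenced above; the project, image URI,
# and Vertex CustomJob job_spec layout shown here are placeholder values.
custom_config = {
    ai_platform_tuner_executor.TUNING_ARGS_KEY: {
        'project': 'my-gcp-project',
        'job_spec': {
            'worker_pool_specs': [{
                # Worker pool 0: primary replica.
                'machine_spec': {'machine_type': 'n1-standard-8'},
                'replica_count': 1,
                'container_spec': {
                    'image_uri': 'gcr.io/my-gcp-project/my-tfx-image'
                },
            }],
        },
    },
    constants.ENABLE_VERTEX_KEY: True,
    constants.VERTEX_REGION_KEY: 'us-central1',
}
# With TuneArgs(num_parallel_trials=3), the `elif len(worker_pool_specs) < 2`
# branch above copies pool 0 and appends it as pool 1 with replica_count=2.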