def test_run_hyperparameter_tuning_job_with_fail_raises(
    self,
    create_hyperparameter_tuning_job_mock,
    get_hyperparameter_tuning_job_mock_with_fail,
    sync,
):
    """Running a tuning job that fails must surface a ``RuntimeError``.

    The job is built from a ``CustomJob`` and a four-parameter search space,
    then ``run()`` and ``wait()`` are executed under ``pytest.raises``.
    Afterwards the test checks that the create call was issued exactly once
    with the expected proto and that the cached resource is in the failed
    state.

    NOTE(review): the two mock fixtures are defined outside this block;
    presumably they stub the create/get service calls so that the job ends
    up failed -- confirm against the fixture definitions.
    """
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    trial_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    # One parameter of each supported spec type.
    search_space = {
        "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
        "activation": hpt.CategoricalParameterSpec(
            values=["relu", "sigmoid", "elu", "selu", "tanh"]
        ),
        "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
    }

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=trial_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec=search_space,
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    run_kwargs = dict(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=sync,
    )

    # wait() is kept inside the block together with run(): either call may be
    # the one that raises, depending on the value of `sync`.
    with pytest.raises(RuntimeError):
        job.run(**run_kwargs)
        job.wait()

    create_hyperparameter_tuning_job_mock.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=_get_hyperparameter_tuning_job_proto(),
    )

    failed_state = gca_job_state_compat.JobState.JOB_STATE_FAILED
    assert job._gca_resource.state == failed_state
def tune_hyperparameters(
    project: str,
    location: str,
    container_uri: str,
    training_file_path: str,
    validation_file_path: str,
    staging_bucket: str,
    max_trial_count: int,
    parallel_trial_count: int
) -> NamedTuple('Outputs', [("best_accuracy", float), ("best_alpha", float), ("best_max_iter", int)]):
    """Run a hyperparameter tuning job and return the best trial's results.

    A single-replica custom job running ``container_uri`` on an
    ``n1-standard-4`` machine with one ``NVIDIA_TESLA_K80`` accelerator is
    tuned over ``alpha`` (double, 1e-4 to 1e-1, linear scale) and
    ``max_iter`` (discrete values 1 and 2), maximizing the ``accuracy``
    metric. The call blocks until ``hp_job.run()`` returns.

    Args:
        project: Project passed to ``aiplatform.init``.
        location: Location passed to ``aiplatform.init``.
        container_uri: Image URI of the training container.
        training_file_path: Forwarded to the container as
            ``--training_dataset_path``.
        validation_file_path: Forwarded to the container as
            ``--validation_dataset_path``.
        staging_bucket: Staging bucket passed to ``aiplatform.init``.
        max_trial_count: Total number of trials to run.
        parallel_trial_count: Number of trials to run in parallel.

    Returns:
        A ``(best_accuracy, best_alpha, best_max_iter)`` tuple taken from the
        trial with the highest reported metric value.

    Raises:
        RuntimeError: If no trial reported a final measurement, so there is
            nothing to rank.
    """
    # Imports are local to the function body.
    # NOTE(review): presumably so the function can be packaged as a
    # self-contained pipeline component -- confirm against the caller.
    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt

    aiplatform.init(project=project, location=location, staging_bucket=staging_bucket)

    worker_pool_specs = [{
        "machine_spec": {
            "machine_type": "n1-standard-4",
            "accelerator_type": "NVIDIA_TESLA_K80",
            "accelerator_count": 1,
        },
        "replica_count": 1,
        "container_spec": {
            "image_uri": container_uri,
            "args": [
                f"--training_dataset_path={training_file_path}",
                f"--validation_dataset_path={validation_file_path}",
                "--hptune"
            ],
        },
    }]

    custom_job = aiplatform.CustomJob(
        display_name='covertype_kfp_trial_job',
        worker_pool_specs=worker_pool_specs,
    )

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name='covertype_kfp_tuning_job',
        custom_job=custom_job,
        metric_spec={
            'accuracy': 'maximize',
        },
        parameter_spec={
            'alpha': hpt.DoubleParameterSpec(min=1.0e-4, max=1.0e-1, scale='linear'),
            'max_iter': hpt.DiscreteParameterSpec(values=[1, 2], scale='linear'),
        },
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
    )

    hp_job.run()

    # Read the trials once and ignore those without a final measurement:
    # indexing metrics[0] on such a trial would raise IndexError.
    scored_trials = [
        trial for trial in hp_job.trials if trial.final_measurement.metrics
    ]
    if not scored_trials:
        raise RuntimeError(
            'Hyperparameter tuning job finished without any trial reporting '
            'a final measurement.'
        )

    # Only one metric ('accuracy') is configured, so it is metrics[0].
    # max() returns the first maximal trial, like list.index(max(...)) did.
    best_trial = max(
        scored_trials,
        key=lambda trial: trial.final_measurement.metrics[0].value,
    )

    # Look parameters up by id instead of relying on their position.
    best_parameters = {
        parameter.parameter_id: parameter.value
        for parameter in best_trial.parameters
    }

    best_accuracy = float(best_trial.final_measurement.metrics[0].value)
    best_alpha = float(best_parameters['alpha'])
    best_max_iter = int(best_parameters['max_iter'])

    return best_accuracy, best_alpha, best_max_iter
def test_hyperparameter_tuning_job_get_state_raises_without_run(self):
    """Reading ``state`` on a job that was never run must raise ``RuntimeError``.

    The job is only constructed; ``run()`` is never called before the
    ``state`` property is accessed.
    """
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    trial_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    # One parameter of each supported spec type.
    search_space = {
        "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
        "activation": hpt.CategoricalParameterSpec(
            values=["relu", "sigmoid", "elu", "selu", "tanh"]
        ),
        "batch_size": hpt.DiscreteParameterSpec(
            values=[16, 32, 64], scale="linear"
        ),
    }

    unstarted_job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=trial_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec=search_space,
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    # The property access itself is what is expected to raise.
    with pytest.raises(RuntimeError):
        print(unstarted_job.state)
def test_run_hyperparameter_tuning_job_with_fail_at_creation(self):
    """A creation failure must be reported by every resource accessor.

    The job is started with ``sync=False``. The test then expects
    ``wait_for_resource_creation()`` to raise ``RuntimeError`` matching
    "Mock fail", and expects ``resource_name``, ``network`` and ``trials``
    to each raise ``RuntimeError`` with the "resource has not been created"
    message.

    NOTE(review): nothing in this block makes creation fail; presumably a
    fixture or decorator applied outside it injects the "Mock fail" error --
    confirm.
    """
    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    trial_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    # One parameter of each supported spec type.
    search_space = {
        "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
        "activation": hpt.CategoricalParameterSpec(
            values=["relu", "sigmoid", "elu", "selu", "tanh"]
        ),
        "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
    }

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=trial_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec=search_space,
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
    )

    job.run(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        sync=False,
    )

    with pytest.raises(RuntimeError) as creation_error:
        job.wait_for_resource_creation()
    assert creation_error.match("Mock fail")

    not_created_pattern = (
        "HyperparameterTuningJob resource has not been created. Resource failed with: Mock fail"
    )

    # Same expectation for each accessor, checked in the original order.
    for accessor in ("resource_name", "network", "trials"):
        with pytest.raises(RuntimeError) as access_error:
            getattr(job, accessor)
        assert access_error.match(not_created_pattern)
def hyperparameter_tuning_job_run_op(
    display_name: str,
    project: str,
    base_output_directory: str,
    worker_pool_specs: list,
    study_spec_metrics: dict,
    study_spec_parameters: list,
    max_trial_count: int,
    parallel_trial_count: int,
    max_failed_trial_count: int = 0,
    location: str = "us-central1",
    study_spec_algorithm: str = "ALGORITHM_UNSPECIFIED",
    study_spec_measurement_selection_type: str = "BEST_MEASUREMENT",
    encryption_spec_key_name: str = None,
    service_account: str = None,
    network: str = None,
) -> NamedTuple('Outputs', [
    ("trials", list),
]):
    """
    Creates a Google Cloud AI Platform HyperparameterTuning Job and waits for
    it to complete.

    For example usage, see
    https://github.com/kubeflow/pipelines/blob/master/components/google-cloud/google_cloud_pipeline_components/experimental/hyperparameter_tuning_job/hp_tuning_job_sample.ipynb.

    For more information on using hyperparameter tuning, please visit:
    https://cloud.google.com/vertex-ai/docs/training/using-hyperparameter-tuning

    Args:
        display_name (str):
            Required. The user-defined name of the HyperparameterTuningJob.
            The name can be up to 128 characters long and can consist
            of any UTF-8 characters.
        project (str):
            Required. Project to run the HyperparameterTuningJob in.
        base_output_directory (str):
            Required. The Cloud Storage location to store the output of this
            HyperparameterTuningJob. The base_output_directory of each
            child CustomJob backing a Trial is set to a subdirectory
            with name as the trial id under its parent HyperparameterTuningJob's
            base_output_directory. The following Vertex AI environment
            variables will be passed to containers or python modules
            when this field is set:

            For CustomJob backing a Trial of HyperparameterTuningJob:
            * AIP_MODEL_DIR = `<base_output_directory>/<trial_id>/model/`
            * AIP_CHECKPOINT_DIR = `<base_output_directory>/<trial_id>/checkpoints/`
            * AIP_TENSORBOARD_LOG_DIR = `<base_output_directory>/<trial_id>/logs/`
        worker_pool_specs (List[Dict]):
            Required. The spec of the worker pools including machine type and
            Docker image. All worker pools except the first one are optional
            and can be skipped by providing an empty value.
        study_spec_metrics (Dict[str, str]):
            Required. Dictionary representing metrics to optimize. The
            dictionary key is the metric_id, which is reported by your
            training job, and the dictionary value is the optimization goal
            of the metric ('minimize' or 'maximize'). example:

            metrics = {'loss': 'minimize', 'accuracy': 'maximize'}
        study_spec_parameters (list[str]):
            Required. List serialized from the parameter dictionary. The
            dictionary represents parameters to optimize. The dictionary key
            is the parameter_id, which is passed into your training job as a
            command line key word argument, and the dictionary value is the
            parameter specification of the metric.

            from google.cloud.aiplatform import hyperparameter_tuning as hpt
            from google_cloud_pipeline_components.experimental import hyperparameter_tuning_job

            parameters = hyperparameter_tuning_job.serialize_parameters({
                'lr': hpt.DoubleParameterSpec(min=0.001, max=0.1, scale='log'),
                'units': hpt.IntegerParameterSpec(min=4, max=128, scale='linear'),
                'activation': hpt.CategoricalParameterSpec(values=['relu', 'selu']),
                'batch_size': hpt.DiscreteParameterSpec(values=[128, 256], scale='linear')
            })

            Supported parameter specifications can be found in
            aiplatform.hyperparameter_tuning. These parameter specifications
            are currently supported: DoubleParameterSpec,
            IntegerParameterSpec, CategoricalParameterSpec,
            DiscreteParameterSpec
        max_trial_count (int):
            Required. The desired total number of Trials.
        parallel_trial_count (int):
            Required. The desired number of Trials to run in parallel.
        max_failed_trial_count (Optional[int]):
            The number of failed Trials that need to be seen before failing
            the HyperparameterTuningJob. If set to 0, Vertex AI decides how
            many Trials must fail before the whole job fails.
        location (Optional[str]):
            Location to run the HyperparameterTuningJob in, defaults to
            "us-central1"
        study_spec_algorithm (Optional[str]):
            The search algorithm specified for the Study. Accepts one of the
            following:

            * `ALGORITHM_UNSPECIFIED` - If you do not specify an algorithm,
              your job uses the default Vertex AI algorithm. The default
              algorithm applies Bayesian optimization to arrive at the
              optimal solution with a more effective search over the
              parameter space.
            * 'GRID_SEARCH' - A simple grid search within the feasible space.
              This option is particularly useful if you want to specify a
              quantity of trials that is greater than the number of points in
              the feasible space. In such cases, if you do not specify a grid
              search, the Vertex AI default algorithm may generate duplicate
              suggestions. To use grid search, all parameter specs must be of
              type `IntegerParameterSpec`, `CategoricalParameterSpec`, or
              `DiscreteParameterSpec`.
            * 'RANDOM_SEARCH' - A simple random search within the feasible
              space.
        study_spec_measurement_selection_type (Optional[str]):
            This indicates which measurement to use if/when the service
            automatically selects the final measurement from previously
            reported intermediate measurements. Accepts: 'BEST_MEASUREMENT',
            'LAST_MEASUREMENT'

            Choose this based on two considerations:
            A) Do you expect your measurements to monotonically improve? If
            so, choose 'LAST_MEASUREMENT'. On the other hand, if you're in a
            situation where your system can "over-train" and you expect the
            performance to get better for a while but then start declining,
            choose 'BEST_MEASUREMENT'.
            B) Are your measurements significantly noisy and/or
            irreproducible? If so, 'BEST_MEASUREMENT' will tend to be
            over-optimistic, and it may be better to choose
            'LAST_MEASUREMENT'.
            If both or neither of (A) and (B) apply, it doesn't matter which
            selection type is chosen.
        encryption_spec_key_name (Optional[str]):
            Customer-managed encryption key options for a
            HyperparameterTuningJob. If this is set, then all resources
            created by the HyperparameterTuningJob will be encrypted with the
            provided encryption key.

            Has the form:
            ``projects/my-project/locations/my-location/keyRings/my-kr/cryptoKeys/my-key``.
            The key needs to be in the same region as where the compute
            resource is created.
        service_account (Optional[str]):
            Specifies the service account for workload run-as account.
            Users submitting jobs must have act-as permission on this run-as
            account.
        network (Optional[str]):
            The full name of the Compute Engine network to which the job
            should be peered. For example,
            projects/12345/global/networks/myVPC.
            Private services access must already be configured for the
            network. If left unspecified, the job is not peered with any
            network.

    Returns:
        List of HyperparameterTuningJob trials, each serialized to JSON with
        ``study.Trial.to_json``.
    """
    from google.cloud import aiplatform
    from google.cloud.aiplatform import hyperparameter_tuning as hpt
    from google.cloud.aiplatform_v1.types import study
    from google.cloud.aiplatform.hyperparameter_tuning import _SCALE_TYPE_MAP

    # Reverse the _SCALE_TYPE_MAP dict for deserialization
    SCALE_MAP = {
        scale_type: scale_name
        for scale_name, scale_type in _SCALE_TYPE_MAP.items()
    }

    # Maps the ParameterSpec field that holds a spec's value to the hpt class
    # that is rebuilt from it.
    PARAMETER_SPEC_MAP = {
        hpt.DoubleParameterSpec._parameter_spec_value_key:
            hpt.DoubleParameterSpec,
        hpt.IntegerParameterSpec._parameter_spec_value_key:
            hpt.IntegerParameterSpec,
        hpt.CategoricalParameterSpec._parameter_spec_value_key:
            hpt.CategoricalParameterSpec,
        hpt.DiscreteParameterSpec._parameter_spec_value_key:
            hpt.DiscreteParameterSpec,
    }

    ALGORITHM_MAP = {
        'ALGORITHM_UNSPECIFIED': None,
        'GRID_SEARCH': 'grid',
        'RANDOM_SEARCH': 'random',
    }

    MEASUREMENT_SELECTION_TYPE_MAP = {
        'BEST_MEASUREMENT': 'best',
        'LAST_MEASUREMENT': 'last',
    }

    aiplatform.init(
        project=project,
        location=location,
        staging_bucket=base_output_directory,
    )

    # Deserialize the parameters
    parameters_kwargs = {}
    for serialized_parameter in study_spec_parameters:
        param = study.StudySpec.ParameterSpec.from_json(serialized_parameter)

        # The first spec type whose value field is set (truthy) wins; a
        # parameter that matches none of the known types is left out.
        for spec_value_key, spec_cls in PARAMETER_SPEC_MAP.items():
            spec_value = getattr(param, spec_value_key)
            if not spec_value:
                continue

            # Copy each constructor argument from the matching field.
            param_attrs = {
                arg_name: getattr(spec_value, field_name)
                for arg_name, field_name in spec_cls._parameter_value_map
                if hasattr(spec_value, field_name)
            }

            # Detect 'scale' in list of arguments to spec_cls.__init__
            init_code = spec_cls.__init__.__code__
            if 'scale' in init_code.co_varnames[:init_code.co_argcount]:
                param_attrs['scale'] = SCALE_MAP[param.scale_type]

            parameters_kwargs[param.parameter_id] = spec_cls(
                **param_attrs)  # pytype: disable=wrong-keyword-args
            break

    custom_job_display_name = display_name + '_custom_job'

    job = aiplatform.CustomJob(
        display_name=custom_job_display_name,
        staging_bucket=base_output_directory,
        worker_pool_specs=worker_pool_specs,
    )

    hp_job = aiplatform.HyperparameterTuningJob(
        display_name=display_name,
        custom_job=job,
        metric_spec=study_spec_metrics,
        parameter_spec=parameters_kwargs,
        max_trial_count=max_trial_count,
        parallel_trial_count=parallel_trial_count,
        max_failed_trial_count=max_failed_trial_count,
        search_algorithm=ALGORITHM_MAP[study_spec_algorithm],
        measurement_selection=MEASUREMENT_SELECTION_TYPE_MAP[
            study_spec_measurement_selection_type
        ],
        encryption_spec_key_name=encryption_spec_key_name,
    )

    hp_job.run(
        service_account=service_account,
        network=network,
    )

    trials = [study.Trial.to_json(trial) for trial in hp_job.trials]

    return trials  # pytype: disable=bad-return-type
def do_hyperparameter_tuning(data_set, timestamp, develop_mode, cpu_only_mode, tf_version):
    """Tune hyperparameters, then launch full training with the best ones.

    A custom trial job is built from the local ``model.py`` script (with a
    T4 accelerator unless ``cpu_only_mode`` is set) and tuned over
    ``train_batch_size``, ``nbuckets`` and ``dnn_hidden_units`` to minimize
    ``val_rmse``. The parameters of the best trial are turned into
    command-line arguments and handed to ``train_custom_model``.

    Args:
        data_set: Passed through unchanged to ``train_custom_model``.
        timestamp: Used in the job display names and passed through to
            ``train_custom_model``.
        develop_mode: When truthy only 2 trials are run instead of
            ``NUM_HPARAM_TRIALS``; also passed through.
        cpu_only_mode: When truthy the CPU training image is used and no
            accelerator is requested; also passed through.
        tf_version: Formatted into the training image name; also passed
            through.

    Returns:
        Whatever ``train_custom_model`` returns for the best parameters.

    Raises:
        RuntimeError: If no trial reported a final measurement, so there is
            nothing to rank.
    """
    # Vertex AI services require regional API endpoints.
    train_image = 'us-docker.pkg.dev/vertex-ai/training/tf-{}.{}:latest'.format(
        'cpu' if cpu_only_mode else 'gpu', tf_version)

    # a single trial job
    model_display_name = '{}-{}'.format(ENDPOINT_NAME, timestamp)
    trial_job_kwargs = dict(
        display_name='train-{}'.format(model_display_name),
        script_path="model.py",
        container_uri=train_image,
        args=[
            '--bucket', BUCKET,
            '--skip_full_eval',  # no need to evaluate on test data set
            '--num_epochs', '10',
            '--num_examples', '500000'  # 1/10 actual size to finish faster
        ],
        requirements=['cloudml-hypertune'],  # any extra Python packages
        replica_count=1,
        machine_type='n1-standard-4',
    )
    if not cpu_only_mode:
        # See https://cloud.google.com/vertex-ai/docs/general/locations#accelerators
        trial_job_kwargs.update(
            accelerator_type=aip.AcceleratorType.NVIDIA_TESLA_T4.name,
            accelerator_count=1,
        )
    trial_job = aiplatform.CustomJob.from_local_script(**trial_job_kwargs)

    # the tuning job
    hparam_job = aiplatform.HyperparameterTuningJob(
        # See https://googleapis.dev/python/aiplatform/latest/aiplatform.html#
        display_name='hparam-{}'.format(model_display_name),
        custom_job=trial_job,
        metric_spec={'val_rmse': 'minimize'},
        parameter_spec={
            "train_batch_size": hpt.IntegerParameterSpec(min=16, max=256, scale='log'),
            "nbuckets": hpt.IntegerParameterSpec(min=5, max=10, scale='linear'),
            "dnn_hidden_units": hpt.CategoricalParameterSpec(
                values=["64,16", "64,16,4", "64,64,64,8", "256,64,16"])
        },
        max_trial_count=2 if develop_mode else NUM_HPARAM_TRIALS,
        parallel_trial_count=2,
        search_algorithm=None,  # Bayesian
    )

    hparam_job.run(sync=True)  # has to finish before we can get trials.

    # Ignore trials without a final measurement: indexing metrics[0] on such
    # a trial would raise IndexError while ranking.
    scored_trials = [
        trial for trial in hparam_job.trials if trial.final_measurement.metrics
    ]
    if not scored_trials:
        raise RuntimeError(
            'Hyperparameter tuning job hparam-{} finished without any trial '
            'reporting a final measurement'.format(model_display_name))

    # get the parameters corresponding to the best trial; val_rmse is
    # minimized, and min() returns the first minimal trial like sorted()[0].
    best = min(scored_trials,
               key=lambda trial: trial.final_measurement.metrics[0].value)
    logging.info('Best trial: %s', best)

    best_params = []
    for param in best.parameters:
        best_params.append('--{}'.format(param.parameter_id))

        if param.parameter_id in ("train_batch_size", "nbuckets"):
            # hparam returns 10.0 even though it's an integer param. so round it.
            # but CustomTrainingJob makes integer args into floats. so make it a string
            best_params.append(str(int(round(param.value))))
        else:
            # string or float parameters
            best_params.append(param.value)

    # run the best trial to completion
    logging.info('Launching full training job with %s', best_params)
    return train_custom_model(data_set, timestamp, develop_mode, cpu_only_mode,
                              tf_version, extra_args=best_params)
def test_create_hyperparameter_tuning_job_with_enable_web_access(
    self,
    create_hyperparameter_tuning_job_mock_with_enable_web_access,
    get_hyperparameter_tuning_job_mock_with_enable_web_access,
    sync,
    caplog,
):
    """Run a tuning job with ``enable_web_access`` and verify the outcome.

    After ``run()`` and ``wait()`` the test checks that "workerpool0-0"
    shows up in the captured INFO-level log text, that the create call was
    issued exactly once with the expected proto and ``timeout=None``, and
    that the job reports the succeeded state, the test network and an empty
    trial list.

    NOTE(review): the two mock fixtures are defined outside this block;
    presumably they stub the create/get service calls -- confirm against the
    fixture definitions.
    """
    caplog.set_level(logging.INFO)

    aiplatform.init(
        project=_TEST_PROJECT,
        location=_TEST_LOCATION,
        staging_bucket=_TEST_STAGING_BUCKET,
        encryption_spec_key_name=_TEST_DEFAULT_ENCRYPTION_KEY_NAME,
    )

    trial_job = aiplatform.CustomJob(
        display_name=test_custom_job._TEST_DISPLAY_NAME,
        worker_pool_specs=test_custom_job._TEST_WORKER_POOL_SPEC,
        base_output_dir=test_custom_job._TEST_BASE_OUTPUT_DIR,
    )

    # One parameter of each supported spec type.
    search_space = {
        "lr": hpt.DoubleParameterSpec(min=0.001, max=0.1, scale="log"),
        "units": hpt.IntegerParameterSpec(min=4, max=1028, scale="linear"),
        "activation": hpt.CategoricalParameterSpec(
            values=["relu", "sigmoid", "elu", "selu", "tanh"]
        ),
        "batch_size": hpt.DiscreteParameterSpec(values=[16, 32], scale="linear"),
    }

    job = aiplatform.HyperparameterTuningJob(
        display_name=_TEST_DISPLAY_NAME,
        custom_job=trial_job,
        metric_spec={_TEST_METRIC_SPEC_KEY: _TEST_METRIC_SPEC_VALUE},
        parameter_spec=search_space,
        parallel_trial_count=_TEST_PARALLEL_TRIAL_COUNT,
        max_trial_count=_TEST_MAX_TRIAL_COUNT,
        max_failed_trial_count=_TEST_MAX_FAILED_TRIAL_COUNT,
        search_algorithm=_TEST_SEARCH_ALGORITHM,
        measurement_selection=_TEST_MEASUREMENT_SELECTION,
        labels=_TEST_LABELS,
    )

    run_kwargs = dict(
        service_account=_TEST_SERVICE_ACCOUNT,
        network=_TEST_NETWORK,
        timeout=_TEST_TIMEOUT,
        restart_job_on_worker_restart=_TEST_RESTART_JOB_ON_WORKER_RESTART,
        enable_web_access=test_custom_job._TEST_ENABLE_WEB_ACCESS,
        sync=sync,
        create_request_timeout=None,
    )
    job.run(**run_kwargs)
    job.wait()

    assert "workerpool0-0" in caplog.text

    create_mock = create_hyperparameter_tuning_job_mock_with_enable_web_access
    create_mock.assert_called_once_with(
        parent=_TEST_PARENT,
        hyperparameter_tuning_job=(
            _get_hyperparameter_tuning_job_proto_with_enable_web_access()
        ),
        timeout=None,
    )

    succeeded_state = gca_job_state_compat.JobState.JOB_STATE_SUCCEEDED
    assert job.state == succeeded_state
    assert job.network == _TEST_NETWORK
    assert job.trials == []

    caplog.clear()