def run_as_aiplatform_custom_job(
    op: dsl.ContainerOp,
    display_name: Optional[str] = None,
    replica_count: Optional[int] = None,
    machine_type: Optional[str] = None,
    accelerator_type: Optional[str] = None,
    accelerator_count: Optional[int] = None,
    boot_disk_type: Optional[str] = None,
    boot_disk_size_gb: Optional[int] = None,
    timeout: Optional[str] = None,
    restart_job_on_worker_restart: Optional[bool] = None,
    service_account: Optional[str] = None,
    network: Optional[str] = None,
    output_uri_prefix: Optional[str] = None,
    worker_pool_specs: Optional[List[Mapping[str, Any]]] = None,
) -> None:
    """Run a pipeline task using AI Platform (Unified) custom training job.

    The resulting job definition is stored on the task as
    `op.custom_job_spec`; nothing is returned.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    Args:
      op: The task (ContainerOp) object to run as aiplatform custom job.
      display_name: Optional. The name of the custom job. Falls back to
        `op.name` when not provided.
      replica_count: Optional. The number of replicas to be split between
        master workerPoolSpec and worker workerPoolSpec. (master always has 1
        replica).
      machine_type: Optional. The type of the machine to run the custom job.
        The default value is "n1-standard-4".
      accelerator_type: Optional. The type of accelerator(s) that may be
        attached to the machine as per acceleratorCount.
      accelerator_count: Optional. The number of accelerators to attach to the
        machine.
      boot_disk_type: Optional. Type of the boot disk (default is "pd-ssd").
        Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
        "pd-standard" (Persistent Disk Hard Disk Drive).
      boot_disk_size_gb: Optional. Size in GB of the boot disk (default is
        100GB).
      timeout: Optional. The maximum job running time. The default is 7 days.
        A duration in seconds with up to nine fractional digits, terminated by
        's'. Example: "3.5s"
      restart_job_on_worker_restart: Optional. Restarts the entire CustomJob if
        a worker gets restarted. This feature can be used by distributed
        training jobs that are not resilient to workers leaving and joining a
        job.
      service_account: Optional. Specifies the service account for workload
        run-as account.
      network: Optional. The full name of the Compute Engine network to which
        the job should be peered. For example,
        projects/12345/global/networks/myVPC.
      output_uri_prefix: Optional. Google Cloud Storage URI to output
        directory.
      worker_pool_specs: Optional. Complete workerPoolSpecs for distributed
        training. When given, they are used as-is (after placeholder
        resolution) and the machine, accelerator, boot disk and replica
        arguments above are not applied. For details, please see:
        https://cloud.google.com/ai-platform-unified/docs/training/distributed-training

    Raises:
      ValueError: If an entry of `worker_pool_specs` has neither a
        "containerSpec" nor a "pythonPackageSpec".
    """
    job_spec = {}

    if worker_pool_specs is not None:
        # Resolve placeholders on a private copy so the caller's specs are
        # left untouched.
        worker_pool_specs = copy.deepcopy(worker_pool_specs)

        def _is_output_parameter(output_key: str) -> bool:
            return output_key in (
                op.component_spec.output_definitions.parameters.keys())

        for worker_pool_spec in worker_pool_specs:
            if 'containerSpec' in worker_pool_spec:
                container_spec = worker_pool_spec['containerSpec']
                if 'command' in container_spec:
                    dsl_utils.resolve_cmd_lines(container_spec['command'],
                                                _is_output_parameter)
                if 'args' in container_spec:
                    dsl_utils.resolve_cmd_lines(container_spec['args'],
                                                _is_output_parameter)
            elif 'pythonPackageSpec' in worker_pool_spec:
                # For custom Python training, resolve placeholders in args
                # only.
                python_spec = worker_pool_spec['pythonPackageSpec']
                if 'args' in python_spec:
                    dsl_utils.resolve_cmd_lines(python_spec['args'],
                                                _is_output_parameter)
            else:
                # Report the offending spec itself.
                raise ValueError(
                    'Expect either "containerSpec" or "pythonPackageSpec" in each '
                    'workerPoolSpec. Got: {}'.format(worker_pool_spec))

        job_spec['workerPoolSpecs'] = worker_pool_specs
    else:
        # Build a single master pool from the task's own container settings.
        worker_pool_spec = {
            'machineSpec': {
                'machineType': machine_type or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
            },
            'replicaCount': '1',
            'containerSpec': {
                'imageUri': op.container.image,
            }
        }
        if op.container.command:
            worker_pool_spec['containerSpec']['command'] = op.container.command
        if op.container.args:
            worker_pool_spec['containerSpec']['args'] = op.container.args

        if accelerator_type is not None:
            worker_pool_spec['machineSpec'][
                'acceleratorType'] = accelerator_type
        if accelerator_count is not None:
            worker_pool_spec['machineSpec'][
                'acceleratorCount'] = accelerator_count

        if boot_disk_type is not None:
            worker_pool_spec.setdefault('diskSpec',
                                        {})['bootDiskType'] = boot_disk_type
        if boot_disk_size_gb is not None:
            worker_pool_spec.setdefault(
                'diskSpec', {})['bootDiskSizeGb'] = boot_disk_size_gb

        job_spec['workerPoolSpecs'] = [worker_pool_spec]
        if replica_count is not None and replica_count > 1:
            # The master pool always holds one replica; the remaining ones go
            # into a second pool that reuses the same machine/container spec.
            additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
            additional_worker_pool_spec['replicaCount'] = str(replica_count - 1)
            job_spec['workerPoolSpecs'].append(additional_worker_pool_spec)

    if timeout is not None:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart is not None:
        job_spec.setdefault('scheduling', {})[
            'restartJobOnWorkerRestart'] = restart_job_on_worker_restart

    if service_account is not None:
        job_spec['serviceAccount'] = service_account
    if network is not None:
        job_spec['network'] = network
    if output_uri_prefix is not None:
        job_spec['baseOutputDirectory'] = {
            'outputUriPrefix': output_uri_prefix
        }

    op.custom_job_spec = {
        'displayName': display_name or op.name,
        'jobSpec': job_spec
    }
def create_custom_training_job_op_from_component(
    component_spec: Callable,  # pylint: disable=g-bare-generic
    display_name: Optional[str] = '',
    replica_count: Optional[int] = 1,
    machine_type: Optional[str] = 'n1-standard-4',
    accelerator_type: Optional[str] = '',
    accelerator_count: Optional[int] = 1,
    boot_disk_type: Optional[str] = 'pd-ssd',
    boot_disk_size_gb: Optional[int] = 100,
    timeout: Optional[str] = '',
    restart_job_on_worker_restart: Optional[bool] = False,
    service_account: Optional[str] = '',
    network: Optional[str] = '',
    encryption_spec_key_name: Optional[str] = '',
    tensorboard: Optional[str] = '',
    enable_web_access: Optional[bool] = False,
    base_output_directory: Optional[str] = '',
    labels: Optional[Dict[str, str]] = None,
) -> Callable:  # pylint: disable=g-bare-generic
    """Create a component spec that runs a custom training in Vertex AI.

    This utility converts a given component to a CustomTrainingJobOp that runs
    a custom training in Vertex AI. This simplifies the creation of custom
    training jobs. All Inputs and Outputs of the supplied component will be
    copied over to the constructed training job.

    Note that this utility constructs a ClusterSpec where the master and all
    the workers use the same spec, meaning all disk/machine spec related
    parameters will apply to all replicas. This is suitable for use cases such
    as training with MultiWorkerMirroredStrategy or Mirrored Strategy.

    This component does not support Vertex AI Python training application.

    For more details on Vertex AI Training service, please refer to
    https://cloud.google.com/vertex-ai/docs/training/create-custom-job

    Args:
      component_spec: The component to run as Vertex AI custom job. It must
        expose a `component_spec` attribute describing its inputs, outputs
        and container implementation. The wrapped component's own spec is
        not modified.
      display_name (Optional[str]): The name of the custom job. If not
        provided the component_spec.name will be used instead.
      replica_count (Optional[int]): The count of instances in the cluster.
        One replica always counts towards the master in worker_pool_spec[0]
        and the remaining replicas will be allocated in worker_pool_spec[1].
        For more details see
        https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job.
      machine_type (Optional[str]): The type of the machine to run the custom
        job. The default value is "n1-standard-4". For more details about
        this input config, see
        https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
      accelerator_type (Optional[str]): The type of accelerator(s) that may be
        attached to the machine as per accelerator_count. For more details
        about this input config, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype.
      accelerator_count (Optional[int]): The number of accelerators to attach
        to the machine. Defaults to 1 if accelerator_type is set.
      boot_disk_type (Optional[str]): Type of the boot disk (default is
        "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive)
        or "pd-standard" (Persistent Disk Hard Disk Drive).
      boot_disk_size_gb (Optional[int]): Size in GB of the boot disk (default
        is 100GB).
      timeout (Optional[str]): The maximum job running time. The default is 7
        days. A duration in seconds with up to nine fractional digits,
        terminated by 's', for example: "3.5s".
      restart_job_on_worker_restart (Optional[bool]): Restarts the entire
        CustomJob if a worker gets restarted. This feature can be used by
        distributed training jobs that are not resilient to workers leaving
        and joining a job.
      service_account (Optional[str]): Sets the default service account for
        workload run-as account. The service account running the pipeline
        (https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account)
        submitting jobs must have act-as permission on this run-as account.
        If unspecified, the Vertex AI Custom Code Service Agent
        (https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
        for the CustomJob's project is used.
      network (Optional[str]): The full name of the Compute Engine network to
        which the job should be peered. For example,
        projects/12345/global/networks/myVPC. Format is of the form
        projects/{project}/global/networks/{network}. Where {project} is a
        project number, as in 12345, and {network} is a network name. Private
        services access must already be configured for the network. If left
        unspecified, the job is not peered with any network.
      encryption_spec_key_name (Optional[str]): Customer-managed encryption
        key options for the CustomJob. If this is set, then all resources
        created by the CustomJob will be encrypted with the provided
        encryption key.
      tensorboard (Optional[str]): The name of a Vertex AI Tensorboard
        resource to which this CustomJob will upload Tensorboard logs.
      enable_web_access (Optional[bool]): Whether you want Vertex AI to enable
        [interactive shell
        access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
        to training containers. If set to `true`, you can access interactive
        shells at the URIs given by [CustomJob.web_access_uris][].
      base_output_directory (Optional[str]): The Cloud Storage location to
        store the output of this CustomJob or HyperparameterTuningJob. See
        below for more details:
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination
      labels (Optional[Dict[str, str]]): The labels with user-defined metadata
        to organize CustomJobs. See https://goo.gl/xmQnxf for more
        information.

    Returns:
      A Custom Job component operator corresponding to the input component
      operator.
    """
    # pytype: disable=attribute-error
    wrapped_spec = component_spec.component_spec
    container = wrapped_spec.implementation.container

    # Copy the input/output lists: they are edited below and must not alias
    # the lists owned by the wrapped component.
    input_specs = list(wrapped_spec.inputs or [])
    output_specs = list(wrapped_spec.outputs or [])

    def _is_output_parameter(output_key: str) -> bool:
        for output in output_specs:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    def _resolve_cmd_lines(cmd_lines):
        """Returns a resolved copy of a command/args list."""
        resolved = cmd_lines.copy()
        dsl_utils.resolve_cmd_lines(resolved, _is_output_parameter)
        # Replace executor place holder with the json escaped placeholder.
        return [
            _EXECUTOR_PLACE_HOLDER_REPLACEMENT if val == '{{{{$}}}}' else val
            for val in resolved
        ]

    worker_pool_spec = {
        'machine_spec': {
            'machine_type': machine_type
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': container.image,
        }
    }
    if container.command:
        worker_pool_spec['container_spec']['command'] = _resolve_cmd_lines(
            container.command)
    if container.args:
        worker_pool_spec['container_spec']['args'] = _resolve_cmd_lines(
            container.args)

    if accelerator_type:
        worker_pool_spec['machine_spec']['accelerator_type'] = accelerator_type
        worker_pool_spec['machine_spec'][
            'accelerator_count'] = accelerator_count

    if boot_disk_type:
        worker_pool_spec['disk_spec'] = {
            'boot_disk_type': boot_disk_type,
            'boot_disk_size_gb': boot_disk_size_gb,
        }

    job_spec = {'worker_pool_specs': [worker_pool_spec]}

    # Convert once so that a numeric string is handled by both the comparison
    # and the subtraction.
    total_replicas = int(replica_count)
    if total_replicas > 1:
        additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
        additional_worker_pool_spec['replica_count'] = str(total_replicas - 1)
        job_spec['worker_pool_specs'].append(additional_worker_pool_spec)

    # TODO(chavoshi): Use input parameter instead of hard coded string label.
    # This requires Dictionary input type to be supported in V2.
    if labels is not None:
        job_spec['labels'] = labels

    if timeout:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart:
        job_spec.setdefault('scheduling', {})[
            'restart_job_on_worker_restart'] = restart_job_on_worker_restart

    if enable_web_access:
        job_spec['enable_web_access'] = enable_web_access

    if encryption_spec_key_name:
        job_spec['encryption_spec'] = {
            'kms_key_name':
                "{{$.inputs.parameters['encryption_spec_key_name']}}"
        }
        input_specs.append(
            structures.InputSpec(name='encryption_spec_key_name',
                                 type='String',
                                 optional=True,
                                 default=encryption_spec_key_name))

    # Drop inputs of the wrapped component that collide with the job-level
    # inputs appended further down.
    reserved_input_names = ('service_account', 'network', 'tensorboard',
                            'base_output_directory')
    input_specs = [
        input_spec for input_spec in input_specs
        if input_spec.name not in reserved_input_names
    ]

    job_spec['service_account'] = "{{$.inputs.parameters['service_account']}}"
    job_spec['network'] = "{{$.inputs.parameters['network']}}"
    job_spec['tensorboard'] = "{{$.inputs.parameters['tensorboard']}}"
    job_spec['base_output_directory'] = {
        'output_uri_prefix':
            "{{$.inputs.parameters['base_output_directory']}}"
    }

    custom_job_payload = {
        'display_name': display_name or wrapped_spec.name,
        'job_spec': job_spec
    }

    custom_job_component_spec = structures.ComponentSpec(
        name=wrapped_spec.name,
        inputs=input_specs + [
            structures.InputSpec(name='base_output_directory',
                                 type='String',
                                 optional=True,
                                 default=base_output_directory),
            structures.InputSpec(name='tensorboard',
                                 type='String',
                                 optional=True,
                                 default=tensorboard),
            structures.InputSpec(name='network',
                                 type='String',
                                 optional=True,
                                 default=network),
            structures.InputSpec(name='service_account',
                                 type='String',
                                 optional=True,
                                 default=service_account),
            structures.InputSpec(name='project', type='String'),
            structures.InputSpec(name='location', type='String'),
        ],
        outputs=output_specs +
        [structures.OutputSpec(name='gcp_resources', type='String')],
        implementation=structures.ContainerImplementation(
            container=structures.ContainerSpec(
                image=_DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE,
                command=[
                    'python3', '-u', '-m',
                    'google_cloud_pipeline_components.container.v1.gcp_launcher.launcher'
                ],
                args=[
                    '--type',
                    'CustomJob',
                    '--payload',
                    json.dumps(custom_job_payload),
                    '--project',
                    structures.InputValuePlaceholder(input_name='project'),
                    '--location',
                    structures.InputValuePlaceholder(input_name='location'),
                    '--gcp_resources',
                    structures.OutputPathPlaceholder(
                        output_name='gcp_resources'),
                ],
            )))
    # pytype: enable=attribute-error

    # tempfile.mktemp() is deprecated and race-prone; reserve a real file
    # instead and let save() overwrite it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        component_path = tmp_file.name
    custom_job_component_spec.save(component_path)

    return components.load_component_from_file(component_path)
def create_custom_training_job_op_from_component(
    component_spec: Callable,  # pylint: disable=g-bare-generic
    display_name: Optional[str] = '',
    replica_count: Optional[int] = 1,
    machine_type: Optional[str] = 'n1-standard-4',
    accelerator_type: Optional[str] = '',
    accelerator_count: Optional[int] = 1,
    boot_disk_type: Optional[str] = 'pd-ssd',
    boot_disk_size_gb: Optional[int] = 100,
    timeout: Optional[str] = '604800s',
    restart_job_on_worker_restart: Optional[bool] = False,
    service_account: Optional[str] = '',
    network: Optional[str] = '',
    encryption_spec_key_name: Optional[str] = '',
    tensorboard: Optional[str] = '',
    enable_web_access: Optional[bool] = False,
    reserved_ip_ranges: Optional[Sequence[str]] = None,
    nfs_mounts: Optional[Sequence[Dict[str, str]]] = None,
    base_output_directory: Optional[str] = '',
    labels: Optional[Dict[str, str]] = None,
) -> Callable:  # pylint: disable=g-bare-generic
    """Create a component spec that runs a custom training in Vertex AI.

    This utility converts a given component to a CustomTrainingJobOp that runs
    a custom training in Vertex AI. This simplifies the creation of custom
    training jobs. All Inputs and Outputs of the supplied component will be
    copied over to the constructed training job.

    The job-level settings are written as input defaults into a copy of the
    `component.yaml` template that sits next to this module, and the
    wrapped component's remaining inputs and outputs are appended to it.

    Note that this utility constructs a ClusterSpec where the master and all
    the workers use the same spec, meaning all disk/machine spec related
    parameters will apply to all replicas. This is suitable for use cases such
    as training with MultiWorkerMirroredStrategy or Mirrored Strategy.

    This component does not support Vertex AI Python training application.

    For more details on Vertex AI Training service, please refer to
    https://cloud.google.com/vertex-ai/docs/training/create-custom-job

    Args:
      component_spec: The component to run as Vertex AI custom job. It must
        expose a `component_spec` attribute describing its inputs, outputs
        and container implementation. The wrapped component's own spec is
        not modified.
      display_name (Optional[str]): The name of the custom job. If not
        provided the component_spec.name will be used instead.
      replica_count (Optional[int]): The count of instances in the cluster.
        One replica always counts towards the master in worker_pool_spec[0]
        and the remaining replicas will be allocated in worker_pool_spec[1].
        For more details see
        https://cloud.google.com/vertex-ai/docs/training/distributed-training#configure_a_distributed_training_job.
      machine_type (Optional[str]): The type of the machine to run the custom
        job. The default value is "n1-standard-4". For more details about
        this input config, see
        https://cloud.google.com/vertex-ai/docs/training/configure-compute#machine-types.
      accelerator_type (Optional[str]): The type of accelerator(s) that may be
        attached to the machine as per accelerator_count. For more details
        about this input config, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec#acceleratortype.
      accelerator_count (Optional[int]): The number of accelerators to attach
        to the machine. Defaults to 1 if accelerator_type is set.
      boot_disk_type (Optional[str]): Type of the boot disk (default is
        "pd-ssd"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive)
        or "pd-standard" (Persistent Disk Hard Disk Drive). boot_disk_type is
        set as a static value and cannot be changed as a pipeline parameter.
      boot_disk_size_gb (Optional[int]): Size in GB of the boot disk (default
        is 100GB). boot_disk_size_gb is set as a static value and cannot be
        changed as a pipeline parameter.
      timeout (Optional[str]): The maximum job running time. The default is 7
        days. A duration in seconds with up to nine fractional digits,
        terminated by 's', for example: "3.5s".
      restart_job_on_worker_restart (Optional[bool]): Restarts the entire
        CustomJob if a worker gets restarted. This feature can be used by
        distributed training jobs that are not resilient to workers leaving
        and joining a job.
      service_account (Optional[str]): Sets the default service account for
        workload run-as account. The service account running the pipeline
        (https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account)
        submitting jobs must have act-as permission on this run-as account.
        If unspecified, the Vertex AI Custom Code Service Agent
        (https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents)
        for the CustomJob's project is used.
      network (Optional[str]): The full name of the Compute Engine network to
        which the job should be peered. For example,
        projects/12345/global/networks/myVPC. Format is of the form
        projects/{project}/global/networks/{network}. Where {project} is a
        project number, as in 12345, and {network} is a network name. Private
        services access must already be configured for the network. If left
        unspecified, the job is not peered with any network.
      encryption_spec_key_name (Optional[str]): Customer-managed encryption
        key options for the CustomJob. If this is set, then all resources
        created by the CustomJob will be encrypted with the provided
        encryption key.
      tensorboard (Optional[str]): The name of a Vertex AI Tensorboard
        resource to which this CustomJob will upload Tensorboard logs.
      enable_web_access (Optional[bool]): Whether you want Vertex AI to enable
        [interactive shell
        access](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell)
        to training containers. If set to `true`, you can access interactive
        shells at the URIs given by [CustomJob.web_access_uris][].
      reserved_ip_ranges (Optional[Sequence[str]]): A list of names for the
        reserved ip ranges under the VPC network that can be used for this
        job. If set, we will deploy the job within the provided ip ranges.
        Otherwise, the job will be deployed to any ip ranges under the
        provided VPC network.
      nfs_mounts (Optional[Sequence[Dict]]): A list of NFS mount specs in Json
        dict format. nfs_mounts is set as a static value and cannot be
        changed as a pipeline parameter. For API spec, see
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#NfsMount
        For more details about mounting NFS for CustomJob, see
        https://cloud.google.com/vertex-ai/docs/training/train-nfs-share
      base_output_directory (Optional[str]): The Cloud Storage location to
        store the output of this CustomJob or HyperparameterTuningJob. See
        below for more details:
        https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination
      labels (Optional[Dict[str, str]]): The labels with user-defined metadata
        to organize CustomJobs. See https://goo.gl/xmQnxf for more
        information.

    Returns:
      A Custom Job component operator corresponding to the input component
      operator.
    """
    # pytype: disable=attribute-error
    wrapped_spec = component_spec.component_spec
    container = wrapped_spec.implementation.container

    # Copy the input/output lists: the input list is filtered below and must
    # not alias the list owned by the wrapped component.
    input_specs = list(wrapped_spec.inputs or [])
    output_specs = list(wrapped_spec.outputs or [])

    def _is_output_parameter(output_key: str) -> bool:
        for output in output_specs:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    def _resolve_cmd_lines(cmd_lines):
        """Returns a resolved copy of a command/args list."""
        resolved = cmd_lines.copy()
        dsl_utils.resolve_cmd_lines(resolved, _is_output_parameter)
        # Replace executor place holder with the json escaped placeholder.
        return [
            _EXECUTOR_PLACE_HOLDER_REPLACEMENT if val == '{{{{$}}}}' else val
            for val in resolved
        ]

    worker_pool_spec = {
        'machine_spec': {
            'machine_type': machine_type
        },
        'replica_count': 1,
        'container_spec': {
            'image_uri': container.image,
        }
    }
    if container.command:
        worker_pool_spec['container_spec']['command'] = _resolve_cmd_lines(
            container.command)
    if container.env:
        worker_pool_spec['container_spec']['env'] = container.env.copy()
    if container.args:
        worker_pool_spec['container_spec']['args'] = _resolve_cmd_lines(
            container.args)

    if accelerator_type:
        worker_pool_spec['machine_spec']['accelerator_type'] = accelerator_type
        worker_pool_spec['machine_spec'][
            'accelerator_count'] = accelerator_count

    if boot_disk_type:
        worker_pool_spec['disk_spec'] = {
            'boot_disk_type': boot_disk_type,
            'boot_disk_size_gb': boot_disk_size_gb,
        }

    if nfs_mounts:
        worker_pool_spec['nfs_mounts'] = list(nfs_mounts)

    worker_pool_specs = [worker_pool_spec]

    # Convert once so that a numeric string is handled by both the comparison
    # and the subtraction.
    total_replicas = int(replica_count)
    if total_replicas > 1:
        additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
        additional_worker_pool_spec['replica_count'] = str(total_replicas - 1)
        worker_pool_specs.append(additional_worker_pool_spec)

    # Remove any Vertex Training duplicate input_spec from component input
    # list. 'enable_web_access' is included because its default is also
    # overridden in the template below.
    reserved_input_names = frozenset((
        'project', 'location', 'display_name', 'worker_pool_specs', 'timeout',
        'restart_job_on_worker_restart', 'service_account', 'tensorboard',
        'enable_web_access', 'network', 'reserved_ip_ranges', 'nfs_mounts',
        'base_output_directory', 'labels', 'encryption_spec_key_name'))
    input_specs = [
        input_spec for input_spec in input_specs
        if input_spec.name not in reserved_input_names
    ]

    # The template is the component.yaml located next to this module.
    template_path = os.path.join(os.path.dirname(__file__), 'component.yaml')
    with open(template_path) as file:
        custom_training_job_json = yaml.load(file, Loader=yaml.FullLoader)

    # Defaults to write into the template, keyed by input name. Structured
    # values are JSON-encoded because the defaults are stored as strings.
    input_defaults = {
        'display_name':
            display_name if display_name else wrapped_spec.name,
        'worker_pool_specs':
            json.dumps(worker_pool_specs),
        'timeout':
            timeout,
        'restart_job_on_worker_restart':
            json.dumps(restart_job_on_worker_restart),
        'service_account':
            service_account,
        'tensorboard':
            tensorboard,
        'enable_web_access':
            json.dumps(enable_web_access),
        'network':
            network,
        'reserved_ip_ranges':
            json.dumps(reserved_ip_ranges) if reserved_ip_ranges else '[]',
        'base_output_directory':
            base_output_directory,
        'labels':
            json.dumps(labels) if labels else '{}',
        'encryption_spec_key_name':
            encryption_spec_key_name,
    }
    for input_item in custom_training_job_json['inputs']:
        # Match on the input's name only, so that another field that happens
        # to hold the same text cannot trigger an override.
        input_name = input_item.get('name')
        if input_name in input_defaults:
            input_item['default'] = input_defaults[input_name]
            input_item['optional'] = True

    # Copying over the input and output spec from the given component.
    custom_training_job_json['inputs'].extend(
        input_spec.to_dict() for input_spec in input_specs)
    custom_training_job_json['outputs'].extend(
        output_spec.to_dict() for output_spec in output_specs)

    # Copy the component name and description
    custom_training_job_json['name'] = wrapped_spec.name

    if wrapped_spec.description:
        # TODO(chavoshi) Add support for docstring parsing.
        custom_training_job_json['description'] = (
            f'A custom job that wraps {wrapped_spec.name}.'
            f'\n\nOrigional component description:\n{wrapped_spec.description}'
            '\n\nCustom Job wrapper description:\n' +
            custom_training_job_json['description'])
    # pytype: enable=attribute-error

    # tempfile.mktemp() is deprecated and race-prone; create the file
    # securely and write the generated spec straight into it.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as out_file:
        yaml.dump(custom_training_job_json, out_file)
        component_path = out_file.name

    return components.load_component_from_file(component_path)
def run_as_vertex_ai_custom_job(
    component_spec: Callable,
    display_name: Optional[str] = None,
    replica_count: Optional[int] = None,
    machine_type: Optional[str] = None,
    accelerator_type: Optional[str] = None,
    accelerator_count: Optional[int] = None,
    boot_disk_type: Optional[str] = None,
    boot_disk_size_gb: Optional[int] = None,
    timeout: Optional[str] = None,
    restart_job_on_worker_restart: Optional[bool] = None,
    service_account: Optional[str] = None,
    network: Optional[str] = None,
    worker_pool_specs: Optional[List[Mapping[str, Any]]] = None,
) -> Callable:
    """Run a pipeline task using AI Platform (Unified) custom training job.

    Builds a custom job payload from the given component and returns a new
    component whose container is the custom job launcher image
    (`_DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE`). The new component keeps the
    inputs and outputs of the original one and adds the `gcp_project` and
    `gcp_region` inputs and the `GCP_RESOURCES` output.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    Args:
      component_spec: The component to run as a custom job. Its
        `component_spec` attribute (name, inputs, outputs and container
        implementation) is read to build the job.
      display_name: Optional. The name of the custom job. If not provided
        the component_spec.name will be used instead.
      replica_count: Optional. The number of replicas to be split between
        master workerPoolSpec and worker workerPoolSpec. (master always has
        1 replica).
      machine_type: Optional. The type of the machine to run the custom job.
        Falls back to `_DEFAULT_CUSTOM_JOB_MACHINE_TYPE`.
      accelerator_type: Optional. The type of accelerator(s) that may be
        attached to the machine as per accelerator_count.
      accelerator_count: Optional. The number of accelerators to attach to
        the machine.
      boot_disk_type: Optional. Type of the boot disk (default is "pd-ssd").
        Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or
        "pd-standard" (Persistent Disk Hard Disk Drive).
      boot_disk_size_gb: Optional. Size in GB of the boot disk (default is
        100GB).
      timeout: Optional. The maximum job running time. The default is 7
        days. A duration in seconds with up to nine fractional digits,
        terminated by 's'. Example: "3.5s"
      restart_job_on_worker_restart: Optional. Restarts the entire CustomJob
        if a worker gets restarted. This feature can be used by distributed
        training jobs that are not resilient to workers leaving and joining
        a job.
      service_account: Optional. Specifies the service account for workload
        run-as account.
      network: Optional. The full name of the Compute Engine network to
        which the job should be peered. For example,
        projects/12345/global/networks/myVPC.
      worker_pool_specs: Optional. worker_pool_specs for distributed
        training. When given, this overrides all other cluster
        configuration arguments (replica_count, machine_type, accelerator_*
        and boot_disk_*). The caller's list is deep-copied, never modified.
        For details, please see:
        https://cloud.google.com/ai-platform-unified/docs/training/distributed-training

    Returns:
      A custom job component corresponding to the input component.

    Raises:
      ValueError: If an entry of `worker_pool_specs` has neither a
        "container_spec" nor a "python_package_spec".
    """
    job_spec = {}

    # As a temporary workaround for an issue with the kfp v2 based compiler,
    # which expects placeholders in their original form in args instead of
    # using fields from outputs, we add back the args from the original
    # component to the custom job component. These args will be ignored
    # by the remote launcher.
    original_args = []

    def _is_output_parameter(output_key: str) -> bool:
        # Shared by both branches below; relies only on the `outputs` list of
        # the component spec, which may be None when nothing is declared.
        for output in component_spec.component_spec.outputs or []:
            if output.name == output_key:
                return type_utils.is_parameter_type(output.type)
        return False

    if worker_pool_specs is not None:
        # Placeholders are resolved in place, so work on a private copy.
        worker_pool_specs = copy.deepcopy(worker_pool_specs)

        for worker_pool_spec in worker_pool_specs:
            if 'container_spec' in worker_pool_spec:
                container_spec = worker_pool_spec['container_spec']
                if 'command' in container_spec:
                    dsl_utils.resolve_cmd_lines(container_spec['command'],
                                                _is_output_parameter)
                if 'args' in container_spec:
                    # Snapshot before resolution; the last container spec
                    # that has args wins.
                    original_args = container_spec['args'].copy()
                    dsl_utils.resolve_cmd_lines(container_spec['args'],
                                                _is_output_parameter)
            elif 'python_package_spec' in worker_pool_spec:
                # For custom Python training, resolve placeholders in args
                # only.
                python_spec = worker_pool_spec['python_package_spec']
                if 'args' in python_spec:
                    dsl_utils.resolve_cmd_lines(python_spec['args'],
                                                _is_output_parameter)
            else:
                raise ValueError(
                    'Expect either "container_spec" or "python_package_spec" in each '
                    'workerPoolSpec. Got: {}'.format(worker_pool_spec))

        job_spec['worker_pool_specs'] = worker_pool_specs
    else:
        container = component_spec.component_spec.implementation.container

        worker_pool_spec = {
            'machine_spec': {
                'machine_type':
                    machine_type or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
            },
            'replica_count': 1,
            'container_spec': {
                'image_uri': container.image,
            }
        }

        if container.command:
            # Copy so that resolving placeholders leaves the source
            # component untouched.
            resolved_command = container.command.copy()
            dsl_utils.resolve_cmd_lines(resolved_command,
                                        _is_output_parameter)
            worker_pool_spec['container_spec']['command'] = resolved_command

        if container.args:
            resolved_args = container.args.copy()
            original_args = container.args.copy()
            dsl_utils.resolve_cmd_lines(resolved_args, _is_output_parameter)
            worker_pool_spec['container_spec']['args'] = resolved_args

        if accelerator_type is not None:
            worker_pool_spec['machine_spec'][
                'accelerator_type'] = accelerator_type
        if accelerator_count is not None:
            worker_pool_spec['machine_spec'][
                'accelerator_count'] = accelerator_count
        if boot_disk_type is not None:
            worker_pool_spec.setdefault(
                'disk_spec', {})['boot_disk_type'] = boot_disk_type
        if boot_disk_size_gb is not None:
            worker_pool_spec.setdefault(
                'disk_spec', {})['boot_disk_size_gb'] = boot_disk_size_gb

        job_spec['worker_pool_specs'] = [worker_pool_spec]
        if replica_count is not None and replica_count > 1:
            # The remaining replicas go to a second pool cloned from the
            # first one.
            # NOTE(review): this count is serialized as a string while the
            # first pool uses an int -- confirm the launcher accepts both.
            additional_worker_pool_spec = copy.deepcopy(worker_pool_spec)
            additional_worker_pool_spec['replica_count'] = str(
                replica_count - 1)
            job_spec['worker_pool_specs'].append(additional_worker_pool_spec)

    if timeout is not None:
        job_spec.setdefault('scheduling', {})['timeout'] = timeout
    if restart_job_on_worker_restart is not None:
        job_spec.setdefault('scheduling', {})[
            'restart_job_on_worker_restart'] = restart_job_on_worker_restart
    if service_account is not None:
        job_spec['service_account'] = service_account
    if network is not None:
        job_spec['network'] = network

    custom_job_payload = {
        'display_name': display_name or component_spec.component_spec.name,
        'job_spec': job_spec
    }

    # `inputs` / `outputs` may be None on a component that declares none.
    launcher_inputs = (component_spec.component_spec.inputs or []) + [
        structures.InputSpec(name='gcp_project', type='String'),
        structures.InputSpec(name='gcp_region', type='String')
    ]
    launcher_outputs = (component_spec.component_spec.outputs or []) + [
        structures.OutputSpec(name='GCP_RESOURCES', type='String')
    ]

    custom_job_component_spec = structures.ComponentSpec(
        name=component_spec.component_spec.name,
        inputs=launcher_inputs,
        outputs=launcher_outputs,
        implementation=structures.ContainerImplementation(
            container=structures.ContainerSpec(
                image=_DEFAULT_CUSTOM_JOB_CONTAINER_IMAGE,
                command=["python", "-u", "-m", "launcher"],
                args=[
                    '--type',
                    'CustomJob',
                    '--gcp_project',
                    structures.InputValuePlaceholder(input_name='gcp_project'),
                    '--gcp_region',
                    structures.InputValuePlaceholder(input_name='gcp_region'),
                    '--payload',
                    json.dumps(custom_job_payload),
                    '--gcp_resources',
                    structures.OutputPathPlaceholder(
                        output_name='GCP_RESOURCES'),
                ] + original_args,
            )))

    # mkstemp creates the file atomically; tempfile.mktemp is deprecated
    # because another process can claim the returned name first.
    fd, component_path = tempfile.mkstemp()
    os.close(fd)
    custom_job_component_spec.save(component_path)

    return components.load_component_from_file(component_path)
def custom_job(
    name: str,
    input_artifacts: Optional[Dict[str, dsl.PipelineParam]] = None,
    input_parameters: Optional[Dict[str, _ValueOrPipelineParam]] = None,
    output_artifacts: Optional[Dict[str, Type[io_types.Artifact]]] = None,
    output_parameters: Optional[Dict[str, Type[Union[str, float,
                                                     int]]]] = None,
    # Custom container training specs.
    image_uri: Optional[str] = None,
    commands: Optional[List[str]] = None,
    # Custom Python training spec.
    executor_image_uri: Optional[str] = None,
    package_uris: Optional[List[str]] = None,
    python_module: Optional[str] = None,
    # Command line args of the user program.
    args: Optional[List[Any]] = None,
    machine_type: Optional[str] = None,
    # Full-fledged custom job API spec. For details please see:
    # https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1beta1/CustomJobSpec
    additional_job_spec: Optional[Dict[str, Any]] = None
) -> AiPlatformCustomJobOp:
    """DSL representation of a AI Platform (Unified) custom training job.

    For detailed doc of the service, please refer to
    https://cloud.google.com/ai-platform-unified/docs/training/create-custom-job

    None of the arguments is modified: `additional_job_spec` is deep-copied
    and `commands` / `args` are copied before placeholders are resolved in
    them.

    Args:
      name: The name of this task.
      input_artifacts: The input artifact specification. Should be a mapping
        from input name to output from upstream tasks.
      input_parameters: The input parameter specification. Should be a
        mapping from input name to one of the following three:
        - output from upstream tasks, or
        - pipeline parameter, or
        - constant value
      output_artifacts: The output artifact declaration. Should be a mapping
        from output name to a type subclassing artifact.Artifact.
      output_parameters: The output parameter declaration. Should be a
        mapping from output name to one of 1) str, 2) float, or 3) int.
      image_uri: The URI of the container image containing the user training
        program. Applicable for custom container training.
      commands: The container command/entrypoint. Applicable for custom
        container training.
      executor_image_uri: The URI of the container image containing the
        dependencies of user training program. Applicable for custom Python
        training.
      package_uris: The Python packages that are expected to be running on
        the executor container. Applicable for custom Python training.
      python_module: The entrypoint of user training program. Applicable for
        custom Python training.
      args: The command line arguments of user training program. This is
        expected to be a list of either 1) constant string, or 2) KFP DSL
        placeholders, to connect the user program with the declared
        component I/O.
      machine_type: The machine type used to run the training program.
        Falls back to `_DEFAULT_CUSTOM_JOB_MACHINE_TYPE`. It is only applied
        when the worker pool is derived from the top-level parameters, i.e.
        when `additional_job_spec` carries no 'workerPoolSpecs'.
      additional_job_spec: Full-fledged custom job API spec. Worker pool
        fields already set here take precedence over the top-level
        parameters, which only fill in what is missing. For details please
        see:
        https://cloud.google.com/ai-platform-unified/docs/reference/rest/v1beta1/CustomJobSpec

    Returns:
      A KFP ContainerOp object represents the launcher container job, from
      which the user training program will be submitted to AI Platform
      (Unified) Custom Job service.

    Raises:
      KeyError on name collision between parameter and artifact I/O
        declaration.
      ValueError when:
        1. neither or both image_uri and executor_image_uri are provided
           (and no additional_job_spec is given); or
        2. no valid package_uris and python_module is provided for custom
           Python training; or
        3. no worker pool spec can be derived from the arguments; or
        4. a worker pool spec has neither 'containerSpec' nor
           'pythonPackageSpec'.
    """
    # Check the sanity of the provided parameters.
    input_artifacts = input_artifacts or {}
    input_parameters = input_parameters or {}
    output_artifacts = output_artifacts or {}
    output_parameters = output_parameters or {}
    if set(input_artifacts) & set(input_parameters):
        raise KeyError(
            'Input key conflict between input parameters and artifacts.')
    if set(output_artifacts) & set(output_parameters):
        raise KeyError('Output key conflict between output parameters and '
                       'artifacts.')

    if not additional_job_spec and bool(image_uri) == bool(
            executor_image_uri):
        raise ValueError(
            'The user program needs to be either a custom container '
            'training job, or a custom Python training job')

    # For Python custom training job, package URIs and modules are also
    # required.
    if executor_image_uri:
        if (not package_uris or not python_module or
                len(package_uris) > _MAX_PACKAGE_URIS):
            raise ValueError(
                'For custom Python training, package_uris with length < '
                '100 and python_module are expected.')

    # Check and scaffold the parameters to form the custom job request spec.
    # Work on a private copy: the spec is patched and its command lines are
    # resolved in place below, which must not leak into the caller's dict.
    custom_job_spec = (
        copy.deepcopy(additional_job_spec) if additional_job_spec else {})

    if not custom_job_spec.get('workerPoolSpecs'):
        # Single node training, deriving job spec from top-level parameters.
        if image_uri:
            # Single node custom container training
            worker_pool_spec = {
                'machineSpec': {
                    'machineType':
                        machine_type or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
                },
                'replicaCount': '1',
                'containerSpec': {
                    'imageUri': image_uri,
                }
            }
            if commands:
                worker_pool_spec['containerSpec']['command'] = list(commands)
            if args:
                worker_pool_spec['containerSpec']['args'] = list(args)
            custom_job_spec['workerPoolSpecs'] = [worker_pool_spec]
        if executor_image_uri:
            worker_pool_spec = {
                'machineSpec': {
                    'machineType':
                        machine_type or _DEFAULT_CUSTOM_JOB_MACHINE_TYPE
                },
                'replicaCount': '1',
                'pythonPackageSpec': {
                    'executorImageUri': executor_image_uri,
                    'packageUris': package_uris,
                    'pythonModule': python_module,
                    # Kept as None when no args are given.
                    'args': list(args) if args is not None else None
                }
            }
            custom_job_spec['workerPoolSpecs'] = [worker_pool_spec]
    else:
        # If the full-fledged job spec is provided. We'll use it as much as
        # possible, and patch some top-level parameters.
        for spec in custom_job_spec['workerPoolSpecs']:
            if image_uri:
                if (not spec.get('pythonPackageSpec') and
                        not spec.get('containerSpec', {}).get('imageUri')):
                    spec.setdefault('containerSpec',
                                    {})['imageUri'] = image_uri
            if commands:
                if (not spec.get('pythonPackageSpec') and
                        not spec.get('containerSpec', {}).get('command')):
                    spec.setdefault('containerSpec',
                                    {})['command'] = list(commands)
            if executor_image_uri:
                if (not spec.get('containerSpec') and not spec.get(
                        'pythonPackageSpec', {}).get('executorImageUri')):
                    spec.setdefault('pythonPackageSpec', {})[
                        'executorImageUri'] = executor_image_uri
            if package_uris:
                if (not spec.get('containerSpec') and not spec.get(
                        'pythonPackageSpec', {}).get('packageUris')):
                    spec.setdefault('pythonPackageSpec',
                                    {})['packageUris'] = package_uris
            if python_module:
                if (not spec.get('containerSpec') and not spec.get(
                        'pythonPackageSpec', {}).get('pythonModule')):
                    spec.setdefault('pythonPackageSpec',
                                    {})['pythonModule'] = python_module
            if args:
                # Each pool gets its own list so that in-place resolution
                # never touches a list shared with another pool.
                if (spec.get('containerSpec') and
                        not spec['containerSpec'].get('args')):
                    spec['containerSpec']['args'] = list(args)
                if (spec.get('pythonPackageSpec') and
                        not spec['pythonPackageSpec'].get('args')):
                    spec['pythonPackageSpec']['args'] = list(args)

    if custom_job_spec.get('workerPoolSpecs') is None:
        # Reached when additional_job_spec has no worker pools and neither
        # image_uri nor executor_image_uri is given.
        raise ValueError(
            'No workerPoolSpecs could be derived: provide image_uri, '
            'executor_image_uri, or workerPoolSpecs in additional_job_spec.')

    # Resolve the custom job spec by wiring it with the I/O spec.
    def _is_output_parameter(output_key: str) -> bool:
        return output_key in output_parameters

    for wp_spec in custom_job_spec['workerPoolSpecs']:
        if 'containerSpec' in wp_spec:
            # For custom container training, resolve placeholders in
            # commands and program args.
            container_spec = wp_spec['containerSpec']
            if 'command' in container_spec:
                dsl_utils.resolve_cmd_lines(container_spec['command'],
                                            _is_output_parameter)
            if 'args' in container_spec:
                dsl_utils.resolve_cmd_lines(container_spec['args'],
                                            _is_output_parameter)
        elif 'pythonPackageSpec' in wp_spec:
            # For custom Python training, resolve placeholders in args only.
            python_spec = wp_spec['pythonPackageSpec']
            if 'args' in python_spec:
                dsl_utils.resolve_cmd_lines(python_spec['args'],
                                            _is_output_parameter)
        else:
            # Explicit error instead of `assert`, which is stripped under -O.
            raise ValueError(
                'Expect either "containerSpec" or "pythonPackageSpec" in each '
                'workerPoolSpec. Got: {}'.format(wp_spec))

    job_spec = {'name': name, 'jobSpec': custom_job_spec}
    return _get_custom_job_op(task_name=name,
                              job_spec=job_spec,
                              input_artifacts=input_artifacts,
                              input_parameters=input_parameters,
                              output_artifacts=output_artifacts,
                              output_parameters=output_parameters)