def BetaArgsForClusterRef(parser):
  """Register beta-only flags for creating a Dataproc cluster."""
  flags.AddComponentFlag(parser)
  flags.AddMinCpuPlatformArgs(parser, base.ReleaseTrack.BETA)

  parser.add_argument(
      '--max-idle',
      type=arg_parsers.Duration(),
      help="""\
      The duration before the cluster is auto-deleted after the last job
      completes, such as "2h" or "1d".
      See $ gcloud topic datetimes for information on duration formats.
      """)

  auto_delete_group = parser.add_mutually_exclusive_group()
  auto_delete_group.add_argument(
      '--max-age',
      type=arg_parsers.Duration(),
      help="""\
      The lifespan of the cluster before it is auto-deleted, such as
      "2h" or "1d".
      See $ gcloud topic datetimes for information on duration formats.
      """)
  auto_delete_group.add_argument(
      '--expiration-time',
      type=arg_parsers.Datetime.Parse,
      help="""\
      The time when the cluster will be auto-deleted, such as
      "2017-08-29T18:52:51.142Z".
      See $ gcloud topic datetimes for information on time formats.
      """)

  for instance_type in ('master', 'worker'):
    help_msg = """\
      Attaches accelerators (e.g. GPUs) to the {instance_type}
      instance(s).
      """.format(instance_type=instance_type)
    if instance_type == 'worker':
      help_msg += """
      Note: No accelerators will be attached to preemptible workers,
      because preemptible VMs do not support accelerators.
      """
    help_msg += """
      *type*::: The specific type (e.g. nvidia-tesla-k80 for NVIDIA Tesla
      K80) of accelerator to attach to the instances. Use 'gcloud compute
      accelerator-types list' to learn about all available accelerator
      types.

      *count*::: The number of pieces of the accelerator to attach to each
      of the instances. The default value is 1.
      """
    parser.add_argument(
        '--{0}-accelerator'.format(instance_type),
        type=arg_parsers.ArgDict(spec={
            'type': str,
            'count': int,
        }),
        metavar='type=TYPE,[count=COUNT]',
        help=help_msg)

  AddAllocationAffinityGroup(parser)
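
# A standalone sketch (not part of the original source) of the ArgDict
# pattern used for --master-accelerator/--worker-accelerator above, written
# with plain argparse so it runs without the googlecloudsdk package. The
# spec keys ('type', 'count') mirror the real flag; the parsing and the
# count default of 1 are simplifications for illustration.
import argparse

def _accelerator_dict(value):
  """Parses 'type=TYPE,count=COUNT' into a dict, defaulting count to 1."""
  spec = {'type': str, 'count': int}
  result = {'count': 1}
  for pair in value.split(','):
    key, _, val = pair.partition('=')
    if key not in spec:
      raise argparse.ArgumentTypeError('unknown key: %r' % key)
    result[key] = spec[key](val)
  return result

_sketch_parser = argparse.ArgumentParser()
_sketch_parser.add_argument('--worker-accelerator', type=_accelerator_dict)
# _sketch_parser.parse_args(
#     ['--worker-accelerator', 'type=nvidia-tesla-k80,count=2'])
# -> Namespace(worker_accelerator={'count': 2, 'type': 'nvidia-tesla-k80'})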
def ArgsForClusterRef(parser,
                      beta=False,
                      include_deprecated=True,
                      include_ttl_config=False,
                      include_gke_platform_args=False):
  """Register flags for creating a Dataproc cluster.

  Args:
    parser: The argparse.ArgParser to configure with dataproc cluster
      arguments.
    beta: whether or not this is a beta command (may affect flag visibility)
    include_deprecated: whether deprecated flags should be included
    include_ttl_config: whether to include Scheduled Delete (TTL) args
    include_gke_platform_args: whether to include GKE-based cluster args
  """
  labels_util.AddCreateLabelsFlags(parser)
  # 30m is backend timeout + 5m for safety buffer.
  flags.AddTimeoutFlag(parser, default='35m')
  flags.AddZoneFlag(parser, short_flags=include_deprecated)
  flags.AddComponentFlag(parser)

  platform_group = parser.add_argument_group(mutex=True)
  gce_platform_group = platform_group.add_argument_group(help="""\
    Compute Engine options for Dataproc clusters.
    """)

  instances_flags.AddTagsArgs(gce_platform_group)
  gce_platform_group.add_argument(
      '--metadata',
      type=arg_parsers.ArgDict(min_length=1),
      action='append',
      default=None,
      help=('Metadata to be made available to the guest operating system '
            'running on the instances'),
      metavar='KEY=VALUE')

  # Either allow creating a single node cluster (--single-node), or specifying
  # the number of workers in the multi-node cluster (--num-workers and
  # --num-secondary-workers).
  node_group = parser.add_argument_group(mutex=True)  # Mutually exclusive
  node_group.add_argument(
      '--single-node',
      action='store_true',
      help="""\
      Create a single node cluster.

      A single node cluster has all master and worker components.
      It cannot have any separate worker nodes. If this flag is not
      specified, a cluster with separate workers is created.
      """)
  # Not mutually exclusive
  worker_group = node_group.add_argument_group(help='Multi-node cluster flags')
  worker_group.add_argument(
      '--num-workers',
      type=int,
      help='The number of worker nodes in the cluster. Defaults to '
      'server-specified.')
  worker_group.add_argument(
      '--secondary-worker-type',
      hidden=True,
      metavar='TYPE',
      choices=['preemptible', 'non-preemptible', 'unspecified'],
      default='unspecified',
      help='The type of the secondary worker group.')
  num_secondary_workers = worker_group.add_argument_group(mutex=True)
  num_secondary_workers.add_argument(
      '--num-preemptible-workers',
      action=actions.DeprecationAction(
          '--num-preemptible-workers',
          warn=('The `--num-preemptible-workers` flag is deprecated. '
                'Use the `--num-secondary-workers` flag instead.')),
      type=int,
      hidden=True,
      help='The number of preemptible worker nodes in the cluster.')
  num_secondary_workers.add_argument(
      '--num-secondary-workers',
      type=int,
      help='The number of secondary worker nodes in the cluster.')

  parser.add_argument(
      '--master-machine-type',
      help='The type of machine to use for the master. Defaults to '
      'server-specified.')
  parser.add_argument(
      '--worker-machine-type',
      help='The type of machine to use for workers. Defaults to '
      'server-specified.')

  image_parser = parser.add_mutually_exclusive_group()
  # TODO(b/73291743): Add external doc link to --image
  image_parser.add_argument(
      '--image',
      metavar='IMAGE',
      help='The full custom image URI or the custom image name that '
      'will be used to create a cluster.')
  image_parser.add_argument(
      '--image-version',
      metavar='VERSION',
      help='The image version to use for the cluster. Defaults to the '
      'latest version.')

  parser.add_argument(
      '--bucket',
      help="""\
      The Google Cloud Storage bucket to use by default to stage job
      dependencies, miscellaneous config files, and job driver console output
      when using this cluster.
      """)

  netparser = gce_platform_group.add_argument_group(mutex=True)
  netparser.add_argument(
      '--network',
      help="""\
      The Compute Engine network that the VM instances of the cluster will be
      part of. This is mutually exclusive with --subnet. If neither is
      specified, this defaults to the "default" network.
      """)
  netparser.add_argument(
      '--subnet',
      help="""\
      Specifies the subnet that the cluster will be part of. This is mutually
      exclusive with --network.
      """)

  parser.add_argument(
      '--num-worker-local-ssds',
      type=int,
      help='The number of local SSDs to attach to each worker in a cluster.')
  parser.add_argument(
      '--num-master-local-ssds',
      type=int,
      help='The number of local SSDs to attach to the master in a cluster.')
  secondary_worker_local_ssds = parser.add_argument_group(mutex=True)
  secondary_worker_local_ssds.add_argument(
      '--num-preemptible-worker-local-ssds',
      type=int,
      hidden=True,
      action=actions.DeprecationAction(
          '--num-preemptible-worker-local-ssds',
          warn=('The `--num-preemptible-worker-local-ssds` flag is '
                'deprecated. Use the `--num-secondary-worker-local-ssds` '
                'flag instead.')),
      help="""\
      The number of local SSDs to attach to each preemptible worker in
      a cluster.
      """)
  secondary_worker_local_ssds.add_argument(
      '--num-secondary-worker-local-ssds',
      type=int,
      help="""\
      The number of local SSDs to attach to each secondary worker in
      a cluster.
      """)

  parser.add_argument(
      '--initialization-actions',
      type=arg_parsers.ArgList(min_length=1),
      metavar='CLOUD_STORAGE_URI',
      help=('A list of Google Cloud Storage URIs of '
            'executables to run on each node in the cluster.'))
  parser.add_argument(
      '--initialization-action-timeout',
      type=arg_parsers.Duration(),
      metavar='TIMEOUT',
      default='10m',
      help=('The maximum duration of each initialization action. See '
            '$ gcloud topic datetimes for information on duration formats.'))
  parser.add_argument(
      '--num-masters',
      type=arg_parsers.CustomFunctionValidator(
          lambda n: int(n) in [1, 3],
          'Number of masters must be 1 (Standard) or 3 (High Availability)',
          parser=arg_parsers.BoundedInt(1, 3)),
      help="""\
      The number of master nodes in the cluster.

      Number of Masters | Cluster Mode
      --- | ---
      1 | Standard
      3 | High Availability
      """)
  parser.add_argument(
      '--properties',
      type=arg_parsers.ArgDict(),
      action=arg_parsers.UpdateAction,
      default={},
      metavar='PREFIX:PROPERTY=VALUE',
      help="""\
Specifies configuration properties for installed packages, such as Hadoop
and Spark.

Properties are mapped to configuration files by specifying a prefix, such as
"core:io.serializations". The following are supported prefixes and their
mappings:

Prefix | File | Purpose of file
--- | --- | ---
capacity-scheduler | capacity-scheduler.xml | Hadoop YARN Capacity Scheduler configuration
core | core-site.xml | Hadoop general configuration
distcp | distcp-default.xml | Hadoop Distributed Copy configuration
hadoop-env | hadoop-env.sh | Hadoop specific environment variables
hdfs | hdfs-site.xml | Hadoop HDFS configuration
hive | hive-site.xml | Hive configuration
mapred | mapred-site.xml | Hadoop MapReduce configuration
mapred-env | mapred-env.sh | Hadoop MapReduce specific environment variables
pig | pig.properties | Pig configuration
spark | spark-defaults.conf | Spark configuration
spark-env | spark-env.sh | Spark specific environment variables
yarn | yarn-site.xml | Hadoop YARN configuration
yarn-env | yarn-env.sh | Hadoop YARN specific environment variables

See https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/cluster-properties
for more information.

""")
  gce_platform_group.add_argument(
      '--service-account',
      help='The Google Cloud IAM service account to be authenticated as.')
  gce_platform_group.add_argument(
      '--scopes',
      type=arg_parsers.ArgList(min_length=1),
      metavar='SCOPE',
      help="""\
Specifies scopes for the node instances. Multiple SCOPEs can be specified,
separated by commas. Examples:

  $ {{command}} example-cluster --scopes https://www.googleapis.com/auth/bigtable.admin

  $ {{command}} example-cluster --scopes sqlservice,bigquery

The following *minimum scopes* are necessary for the cluster to function
properly and are always added, even if not explicitly specified:

  {minimum_scopes}

If the `--scopes` flag is not specified, the following *default scopes*
are also included:

  {additional_scopes}

If you want to enable all scopes use the 'cloud-platform' scope.

{scopes_help}
""".format(
    minimum_scopes='\n  '.join(constants.MINIMUM_SCOPE_URIS),
    additional_scopes='\n  '.join(constants.ADDITIONAL_DEFAULT_SCOPE_URIS),
    scopes_help=compute_helpers.SCOPES_HELP))

  if include_deprecated:
    _AddDiskArgsDeprecated(parser)
  else:
    _AddDiskArgs(parser)

  # --no-address is an exception to the no negative-flag style guideline to be
  # consistent with gcloud compute instances create --no-address
  parser.add_argument(
      '--no-address',
      action='store_true',
      help="""\
      If provided, the instances in the cluster will not be assigned external
      IP addresses.

      If omitted, the instances in the cluster will each be assigned an
      ephemeral external IP address.

      Note: Dataproc VMs need access to the Dataproc API. This can be achieved
      without external IP addresses using Private Google Access
      (https://cloud.google.com/compute/docs/private-google-access).
      """)

  boot_disk_type_detailed_help = """\
      The type of the boot disk. The value must be ``pd-standard'' or
      ``pd-ssd''.
      """
  parser.add_argument(
      '--master-boot-disk-type', help=boot_disk_type_detailed_help)
  parser.add_argument(
      '--worker-boot-disk-type', help=boot_disk_type_detailed_help)
  secondary_worker_boot_disk_type = parser.add_argument_group(mutex=True)
  secondary_worker_boot_disk_type.add_argument(
      '--preemptible-worker-boot-disk-type',
      help=boot_disk_type_detailed_help,
      hidden=True,
      action=actions.DeprecationAction(
          '--preemptible-worker-boot-disk-type',
          warn=('The `--preemptible-worker-boot-disk-type` flag is '
                'deprecated. Use the `--secondary-worker-boot-disk-type` '
                'flag instead.')))
  secondary_worker_boot_disk_type.add_argument(
      '--secondary-worker-boot-disk-type', help=boot_disk_type_detailed_help)

  autoscaling_group = parser.add_argument_group()
  flags.AddAutoscalingPolicyResourceArgForCluster(
      autoscaling_group, api_version=('v1beta2' if beta else 'v1'))

  if include_ttl_config:
    parser.add_argument(
        '--max-idle',
        type=arg_parsers.Duration(),
        help="""\
        The duration before the cluster is auto-deleted after the last job
        completes, such as "2h" or "1d".
        See $ gcloud topic datetimes for information on duration formats.
        """)

    auto_delete_group = parser.add_mutually_exclusive_group()
    auto_delete_group.add_argument(
        '--max-age',
        type=arg_parsers.Duration(),
        help="""\
        The lifespan of the cluster before it is auto-deleted, such as
        "2h" or "1d".
        See $ gcloud topic datetimes for information on duration formats.
        """)
    auto_delete_group.add_argument(
        '--expiration-time',
        type=arg_parsers.Datetime.Parse,
        help="""\
        The time when the cluster will be auto-deleted, such as
        "2017-08-29T18:52:51.142Z".
        See $ gcloud topic datetimes for information on time formats.
        """)

  AddKerberosGroup(parser)

  flags.AddMinCpuPlatformArgs(parser)

  _AddAcceleratorArgs(parser)

  AddReservationAffinityGroup(
      gce_platform_group,
      group_text='Specifies the reservation for the instance.',
      affinity_text='The type of reservation for the instance.')

  if include_gke_platform_args:
    gke_based_cluster_group = platform_group.add_argument_group(
        hidden=True,
        help="""\
          Options for creating a GKE-based Dataproc cluster. Specifying any
          of these will indicate that this cluster is intended to be a
          GKE-based cluster. These options are mutually exclusive with
          GCE-based options.
          """)
    gke_based_cluster_group.add_argument(
        '--gke-cluster',
        hidden=True,
        help="""\
          Required for GKE-based clusters. Specify the name of the GKE
          cluster to deploy this GKE-based Dataproc cluster to. This should
          be the short name and not the full path name.
          """)
    gke_based_cluster_group.add_argument(
        '--gke-cluster-namespace',
        hidden=True,
        help="""\
          Optional. Specify the name of the namespace to deploy Dataproc
          system components into. This namespace does not need to already
          exist.
          """)
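
# A standalone sketch (assumption: plain argparse rather than calliope) of
# the --num-masters validation above, approximating what
# arg_parsers.CustomFunctionValidator enforces: only 1 or 3 is accepted.
import argparse

def _num_masters(value):
  """Accepts only 1 (Standard) or 3 (High Availability) masters."""
  n = int(value)
  if n not in (1, 3):
    raise argparse.ArgumentTypeError(
        'Number of masters must be 1 (Standard) or 3 (High Availability)')
  return n

_masters_parser = argparse.ArgumentParser()
_masters_parser.add_argument('--num-masters', type=_num_masters)
# _masters_parser.parse_args(['--num-masters', '3'])
# -> Namespace(num_masters=3); '--num-masters 2' exits with an error.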
def Args(parser):
  flags.AddTemplateResourceArg(parser, 'set managed cluster')
  flags.AddComponentFlag(parser)
  parser.add_argument(
      '--cluster-name',
      help="""\
      The name of the managed Dataproc cluster.
      If unspecified, the workflow template ID will be used.""")
  clusters.ArgsForClusterRef(parser, beta=True)
  flags.AddMinCpuPlatformArgs(parser, base.ReleaseTrack.BETA)
  # TODO(b/70164645): Consolidate these arguments with the other beta args.
  # All of these arguments are duplicated from the cluster creation beta
  # track. There should be an ArgsForClusterRefBeta method in clusters.py
  # that is invoked here so that we don't have to duplicate the arguments.
  parser.add_argument(
      '--max-idle',
      type=arg_parsers.Duration(),
      help="""\
      The duration before the cluster is auto-deleted after the last job
      completes, such as "2h" or "1d".
      See $ gcloud topic datetimes for information on duration formats.
      """)

  auto_delete_group = parser.add_mutually_exclusive_group()
  auto_delete_group.add_argument(
      '--max-age',
      type=arg_parsers.Duration(),
      help="""\
      The lifespan of the cluster before it is auto-deleted, such as
      "2h" or "1d".
      See $ gcloud topic datetimes for information on duration formats.
      """)
  auto_delete_group.add_argument(
      '--expiration-time',
      type=arg_parsers.Datetime.Parse,
      help="""\
      The time when the cluster will be auto-deleted, such as
      "2017-08-29T18:52:51.142Z".
      See $ gcloud topic datetimes for information on time formats.
      """)

  for instance_type in ('master', 'worker'):
    help_msg = """\
      Attaches accelerators (e.g. GPUs) to the {instance_type}
      instance(s).
      """.format(instance_type=instance_type)
    if instance_type == 'worker':
      help_msg += """
      Note: No accelerators will be attached to preemptible workers,
      because preemptible VMs do not support accelerators.
      """
    help_msg += """
      *type*::: The specific type (e.g. nvidia-tesla-k80 for NVIDIA Tesla
      K80) of accelerator to attach to the instances. Use 'gcloud compute
      accelerator-types list' to learn about all available accelerator
      types.

      *count*::: The number of pieces of the accelerator to attach to each
      of the instances. The default value is 1.
      """
    parser.add_argument(
        '--{0}-accelerator'.format(instance_type),
        type=arg_parsers.ArgDict(spec={
            'type': str,
            'count': int,
        }),
        metavar='type=TYPE,[count=COUNT]',
        help=help_msg)
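
# The TODO above suggests hoisting the duplicated TTL and accelerator flags
# out of this command. A hypothetical consolidation (the helper name and its
# placement are illustrative, not the actual googlecloudsdk API) could look
# like this, with the duplicated parser.add_argument calls above moved into
# clusters.py so both commands register them through one entry point:
def _ArgsForClusterRefBetaSketch(parser):
  """Sketch only: registers the GA cluster flags plus the beta-only extras."""
  clusters.ArgsForClusterRef(parser, beta=True)
  clusters.BetaArgsForClusterRef(parser)  # TTL + accelerator flags above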
def ArgsForClusterRef(parser, beta=False, include_deprecated=True):
  """Register flags for creating a Dataproc cluster.

  Args:
    parser: The argparse.ArgParser to configure with dataproc cluster
      arguments.
    beta: whether or not this is a beta command (may affect flag visibility)
    include_deprecated: whether deprecated flags should be included
  """
  labels_util.AddCreateLabelsFlags(parser)
  instances_flags.AddTagsArgs(parser)
  # 30m is backend timeout + 5m for safety buffer.
  flags.AddTimeoutFlag(parser, default='35m')
  flags.AddZoneFlag(parser, short_flags=include_deprecated)
  flags.AddComponentFlag(parser, not beta)  # Hidden in GA track.

  parser.add_argument(
      '--metadata',
      type=arg_parsers.ArgDict(min_length=1),
      action='append',
      default=None,
      help=('Metadata to be made available to the guest operating system '
            'running on the instances'),
      metavar='KEY=VALUE')

  # Either allow creating a single node cluster (--single-node), or specifying
  # the number of workers in the multi-node cluster (--num-workers and
  # --num-preemptible-workers).
  node_group = parser.add_argument_group(mutex=True)  # Mutually exclusive
  node_group.add_argument(
      '--single-node',
      action='store_true',
      help="""\
      Create a single node cluster.

      A single node cluster has all master and worker components.
      It cannot have any separate worker nodes. If this flag is not
      specified, a cluster with separate workers is created.
      """)
  # Not mutually exclusive
  worker_group = node_group.add_argument_group(help='Multi-node cluster flags')
  worker_group.add_argument(
      '--num-workers',
      type=int,
      help='The number of worker nodes in the cluster. Defaults to '
      'server-specified.')
  worker_group.add_argument(
      '--num-preemptible-workers',
      type=int,
      help='The number of preemptible worker nodes in the cluster.')

  parser.add_argument(
      '--master-machine-type',
      help='The type of machine to use for the master. Defaults to '
      'server-specified.')
  parser.add_argument(
      '--worker-machine-type',
      help='The type of machine to use for workers. Defaults to '
      'server-specified.')

  image_parser = parser.add_mutually_exclusive_group()
  # TODO(b/73291743): Add external doc link to --image
  image_parser.add_argument(
      '--image',
      metavar='IMAGE',
      help='The full custom image URI or the custom image name that '
      'will be used to create a cluster.')
  image_parser.add_argument(
      '--image-version',
      metavar='VERSION',
      help='The image version to use for the cluster. Defaults to the '
      'latest version.')

  parser.add_argument(
      '--bucket',
      help='The Google Cloud Storage bucket to use with the Google Cloud '
      'Storage connector. A bucket is auto created when this parameter is '
      'not specified.')

  netparser = parser.add_mutually_exclusive_group()
  netparser.add_argument(
      '--network',
      help="""\
      The Compute Engine network that the VM instances of the cluster will be
      part of. This is mutually exclusive with --subnet. If neither is
      specified, this defaults to the "default" network.
      """)
  netparser.add_argument(
      '--subnet',
      help="""\
      Specifies the subnet that the cluster will be part of. This is mutually
      exclusive with --network.
      """)

  parser.add_argument(
      '--num-worker-local-ssds',
      type=int,
      help='The number of local SSDs to attach to each worker in a cluster.')
  parser.add_argument(
      '--num-master-local-ssds',
      type=int,
      help='The number of local SSDs to attach to the master in a cluster.')
  parser.add_argument(
      '--num-preemptible-worker-local-ssds',
      type=int,
      help="""\
      The number of local SSDs to attach to each preemptible worker in
      a cluster.
      """)

  parser.add_argument(
      '--initialization-actions',
      type=arg_parsers.ArgList(min_length=1),
      metavar='CLOUD_STORAGE_URI',
      help=('A list of Google Cloud Storage URIs of '
            'executables to run on each node in the cluster.'))
  parser.add_argument(
      '--initialization-action-timeout',
      type=arg_parsers.Duration(),
      metavar='TIMEOUT',
      default='10m',
      help=('The maximum duration of each initialization action. See '
            '$ gcloud topic datetimes for information on duration formats.'))
  parser.add_argument(
      '--num-masters',
      type=arg_parsers.CustomFunctionValidator(
          lambda n: int(n) in [1, 3],
          'Number of masters must be 1 (Standard) or 3 (High Availability)',
          parser=arg_parsers.BoundedInt(1, 3)),
      help="""\
      The number of master nodes in the cluster.

      Number of Masters | Cluster Mode
      --- | ---
      1 | Standard
      3 | High Availability
      """)
  parser.add_argument(
      '--properties',
      type=arg_parsers.ArgDict(),
      metavar='PREFIX:PROPERTY=VALUE',
      default={},
      help="""\
Specifies configuration properties for installed packages, such as Hadoop
and Spark.

Properties are mapped to configuration files by specifying a prefix, such as
"core:io.serializations". The following are supported prefixes and their
mappings:

Prefix | File | Purpose of file
--- | --- | ---
capacity-scheduler | capacity-scheduler.xml | Hadoop YARN Capacity Scheduler configuration
core | core-site.xml | Hadoop general configuration
distcp | distcp-default.xml | Hadoop Distributed Copy configuration
hadoop-env | hadoop-env.sh | Hadoop specific environment variables
hdfs | hdfs-site.xml | Hadoop HDFS configuration
hive | hive-site.xml | Hive configuration
mapred | mapred-site.xml | Hadoop MapReduce configuration
mapred-env | mapred-env.sh | Hadoop MapReduce specific environment variables
pig | pig.properties | Pig configuration
spark | spark-defaults.conf | Spark configuration
spark-env | spark-env.sh | Spark specific environment variables
yarn | yarn-site.xml | Hadoop YARN configuration
yarn-env | yarn-env.sh | Hadoop YARN specific environment variables

See https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/cluster-properties
for more information.

""")
  parser.add_argument(
      '--service-account',
      help='The Google Cloud IAM service account to be authenticated as.')
  parser.add_argument(
      '--scopes',
      type=arg_parsers.ArgList(min_length=1),
      metavar='SCOPE',
      help="""\
Specifies scopes for the node instances. Multiple SCOPEs can be specified,
separated by commas. Examples:

  $ {{command}} example-cluster --scopes https://www.googleapis.com/auth/bigtable.admin

  $ {{command}} example-cluster --scopes sqlservice,bigquery

The following *minimum scopes* are necessary for the cluster to function
properly and are always added, even if not explicitly specified:

  {minimum_scopes}

If the `--scopes` flag is not specified, the following *default scopes*
are also included:

  {additional_scopes}

If you want to enable all scopes use the 'cloud-platform' scope.

{scopes_help}
""".format(
    minimum_scopes='\n  '.join(constants.MINIMUM_SCOPE_URIS),
    additional_scopes='\n  '.join(constants.ADDITIONAL_DEFAULT_SCOPE_URIS),
    scopes_help=compute_helpers.SCOPES_HELP))

  if include_deprecated:
    _AddDiskArgsDeprecated(parser)
  else:
    _AddDiskArgs(parser)

  # --no-address is an exception to the no negative-flag style guideline to be
  # consistent with gcloud compute instances create --no-address
  parser.add_argument(
      '--no-address',
      action='store_true',
      help="""\
      If provided, the instances in the cluster will not be assigned external
      IP addresses.

      If omitted, the instances in the cluster will each be assigned an
      ephemeral external IP address.

      Note: Dataproc VMs need access to the Dataproc API. This can be achieved
      without external IP addresses using Private Google Access
      (https://cloud.google.com/compute/docs/private-google-access).
      """)

  boot_disk_type_detailed_help = """\
      The type of the boot disk. The value must be ``pd-standard'' or
      ``pd-ssd''.
      """
  parser.add_argument(
      '--master-boot-disk-type', help=boot_disk_type_detailed_help)
  parser.add_argument(
      '--worker-boot-disk-type', help=boot_disk_type_detailed_help)
  parser.add_argument(
      '--preemptible-worker-boot-disk-type',
      help=boot_disk_type_detailed_help)
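
# A standalone sketch (assumption: plain argparse) of the Duration-typed
# flags above, approximating arg_parsers.Duration() for the s/m/h/d suffixes
# and returning the value in seconds; the real parser accepts more formats
# (see $ gcloud topic datetimes).
import argparse
import re

_DURATION_UNITS = {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}

def _duration(value):
  """Converts strings like '10m', '2h', or '1d' to seconds."""
  match = re.fullmatch(r'(\d+)([smhd])', value)
  if not match:
    raise argparse.ArgumentTypeError(
        'expected a duration such as "10m", "2h", or "1d"')
  return int(match.group(1)) * _DURATION_UNITS[match.group(2)]

_duration_parser = argparse.ArgumentParser()
_duration_parser.add_argument(
    '--initialization-action-timeout', type=_duration, default='10m')
# argparse applies the type to string defaults, so:
# _duration_parser.parse_args([]).initialization_action_timeout == 600
# _duration_parser.parse_args(
#     ['--initialization-action-timeout', '2h'])
# -> Namespace(initialization_action_timeout=7200)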