Example #1
  def _GetStagedFile(self, file_str):
    """Validate file URI and register it for uploading if it is local."""
    drive, _ = os.path.splitdrive(file_str)
    uri = six.moves.urllib.parse.urlsplit(file_str, allow_fragments=False)
    # The file is considered local to this machine if no scheme besides a
    # drive letter is given. file:// URIs are interpreted as living on VMs.
    is_local = drive or not uri.scheme
    if not is_local:
      # Non-local files are already staged.
      # TODO(b/36057257): Validate scheme.
      return file_str

    if not os.path.exists(file_str):
      raise files.Error('File Not Found: [{0}].'.format(file_str))
    if self._staging_dir is None:
      # We raise this exception only if there are files to stage but the
      # staging location could not be determined. When all files are already
      # staged, this exception is not raised.
      raise exceptions.ArgumentError(
          'Could not determine where to stage local file {0}. When submitting '
          'a job to a cluster selected via --cluster-labels, either\n'
          '- a staging bucket must be provided via the --bucket argument, or\n'
          '- all provided files must be non-local.'.format(file_str))

    basename = os.path.basename(file_str)
    self.files_to_stage.append(file_str)
    staged_file = six.moves.urllib.parse.urljoin(self._staging_dir, basename)
    return staged_file
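The local-versus-remote decision in _GetStagedFile comes down to os.path.splitdrive plus urlsplit: anything without a URI scheme (other than a Windows drive letter) is staged. A minimal, standard-library-only sketch of that check, with made-up example paths:

import os
from urllib.parse import urlsplit


def is_local_file(file_str):
    # A file is treated as local when it has no URI scheme other than a
    # Windows drive letter; gs://, hdfs:// and file:// URIs are not local.
    drive, _ = os.path.splitdrive(file_str)
    uri = urlsplit(file_str, allow_fragments=False)
    return bool(drive or not uri.scheme)


print(is_local_file('/tmp/job.py'))         # True  (no scheme)
print(is_local_file('gs://bucket/job.py'))  # False (remote object store)
print(is_local_file('file:///mnt/job.py'))  # False (treated as living on a VM)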
Example #2
    def Run(self, args):
        client = self.context['dataproc_client']
        messages = self.context['dataproc_messages']

        project = properties.VALUES.core.project.Get(required=True)
        region = self.context['dataproc_region']
        request = self.GetRequest(messages, project, region, args)

        if args.cluster:
            request.clusterName = args.cluster

        if args.state_filter:
            if args.state_filter == 'active':
                request.jobStateMatcher = (
                    messages.DataprocProjectsRegionsJobsListRequest.
                    JobStateMatcherValueValuesEnum.ACTIVE)
            elif args.state_filter == 'inactive':
                request.jobStateMatcher = (
                    messages.DataprocProjectsRegionsJobsListRequest.
                    JobStateMatcherValueValuesEnum.NON_ACTIVE)
            else:
                raise exceptions.ArgumentError(
                    'Invalid state-filter; [{0}].'.format(args.state_filter))

        jobs = list_pager.YieldFromList(client.projects_regions_jobs,
                                        request,
                                        limit=args.limit,
                                        field='jobs',
                                        batch_size=args.page_size,
                                        batch_size_attribute='pageSize')
        return (TypedJob(job) for job in jobs)
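The state_filter branching above is a small flag-to-enum translation. The same mapping written table-driven, against a stand-in enum.Enum rather than the real generated JobStateMatcherValueValuesEnum (all names in this sketch are illustrative):

import enum


class JobStateMatcher(enum.Enum):
    """Stand-in for the generated JobStateMatcherValueValuesEnum."""
    ALL = 0
    ACTIVE = 1
    NON_ACTIVE = 2


_STATE_FILTER_TO_MATCHER = {
    'active': JobStateMatcher.ACTIVE,
    'inactive': JobStateMatcher.NON_ACTIVE,
}


def GetJobStateMatcher(state_filter):
    """Maps a --state-filter value to the matcher enum, rejecting unknown values."""
    try:
        return _STATE_FILTER_TO_MATCHER[state_filter]
    except KeyError:
        raise ValueError('Invalid state-filter; [{0}].'.format(state_filter))


print(GetJobStateMatcher('active'))    # JobStateMatcher.ACTIVE
print(GetJobStateMatcher('inactive'))  # JobStateMatcher.NON_ACTIVE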
Example #3
    def Run(self, args):
        dataproc = dp.Dataproc()

        project = properties.VALUES.core.project.GetOrFail()
        region = properties.VALUES.dataproc.region.GetOrFail()

        request = self.GetRequest(dataproc.messages, project, region, args)

        if args.cluster:
            request.clusterName = args.cluster

        if args.state_filter:
            if args.state_filter == 'active':
                request.jobStateMatcher = (
                    dataproc.messages.DataprocProjectsRegionsJobsListRequest.
                    JobStateMatcherValueValuesEnum.ACTIVE)
            # TODO(b/32669485) Get full flag test coverage.
            elif args.state_filter == 'inactive':
                request.jobStateMatcher = (
                    dataproc.messages.DataprocProjectsRegionsJobsListRequest.
                    JobStateMatcherValueValuesEnum.NON_ACTIVE)
            else:
                raise exceptions.ArgumentError(
                    'Invalid state-filter; [{0}].'.format(args.state_filter))

        jobs = list_pager.YieldFromList(dataproc.client.projects_regions_jobs,
                                        request,
                                        limit=args.limit,
                                        field='jobs',
                                        batch_size=args.page_size,
                                        batch_size_attribute='pageSize')
        return (TypedJob(job) for job in jobs)
Example #4
def ValidateReservationAffinityGroup(args):
    """Validates flags specifying reservation affinity."""
    affinity = getattr(args, 'reservation_affinity', None)
    if affinity == 'specific':
        if not args.IsSpecified('reservation'):
            raise exceptions.ArgumentError(
                '--reservation must be specified with --reservation-affinity=specific'
            )
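ValidateReservationAffinityGroup only needs getattr and IsSpecified from the args object, so it can be exercised with a small test double. FakeArgs below is a made-up stand-in for the parsed-args namespace, not the real calliope object:

class FakeArgs(object):
    """Minimal stand-in for a parsed-args namespace with IsSpecified()."""

    def __init__(self, **kwargs):
        self._specified = {k for k, v in kwargs.items() if v is not None}
        self.__dict__.update(kwargs)

    def IsSpecified(self, name):
        return name in self._specified


# Missing --reservation alongside --reservation-affinity=specific is rejected:
# ValidateReservationAffinityGroup(
#     FakeArgs(reservation_affinity='specific', reservation=None))
# ...raises exceptions.ArgumentError.

# Supplying both flags passes validation (the function simply returns None):
# ValidateReservationAffinityGroup(
#     FakeArgs(reservation_affinity='specific', reservation='my-reservation'))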
Example #5
    def Run(self, args):
        client = self.context['dataproc_client']
        messages = self.context['dataproc_messages']

        job_ref = util.ParseJob(args.id, self.context)

        changed_fields = []

        has_changes = False

        # Update labels if the user requested it
        labels = None
        if args.update_labels or args.remove_labels:
            has_changes = True
            changed_fields.append('labels')

            # We need to fetch the job first so we know what the labels look
            # like. labels_util.UpdateLabels will fill out the proto for us with
            # all the updates and removals, but first we need to provide the
            # current state of the labels.
            orig_job = client.projects_regions_jobs.Get(
                client.MESSAGES_MODULE.DataprocProjectsRegionsJobsGetRequest(
                    projectId=job_ref.projectId,
                    region=job_ref.region,
                    jobId=job_ref.jobId))

            labels = labels_util.UpdateLabels(orig_job.labels,
                                              messages.Job.LabelsValue,
                                              args.update_labels,
                                              args.remove_labels)

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one job parameter to update.')

        updated_job = orig_job
        updated_job.labels = labels
        request = messages.DataprocProjectsRegionsJobsPatchRequest(
            projectId=job_ref.projectId,
            region=job_ref.region,
            jobId=job_ref.jobId,
            job=updated_job,
            updateMask=','.join(changed_fields))

        returned_job = client.projects_regions_jobs.Patch(request)

        log.UpdatedResource(returned_job)
        return returned_job
Example #6
  def GetFilesByType(args):
    """Returns a dict of files by their type (jars, archives, etc.)."""
    # TODO(user): Move arg manipulation elsewhere.
    # TODO(user): Remove with GA flags 2017-04-01 (b/33298024).
    if not args.main_class and not args.main_jar:
      raise exceptions.ArgumentError('Must either specify --class or JAR.')
    if args.main_class and args.main_jar:
      log.warn(
          'You must specify exactly one of --jar and --class. '
          'This will be strictly enforced in April 2017. '
          "Use 'gcloud beta dataproc jobs submit spark' to see new behavior.")
      log.info('Passing main jar as an additional jar.')
      args.jars.append(args.main_jar)
      args.main_jar = None

    return {
        'main_jar': args.main_jar,
        'jars': args.jars,
        'archives': args.archives,
        'files': args.files}
Example #7
def GetClusterConfig(args,
                     dataproc,
                     project_id,
                     compute_resources,
                     beta=False,
                     include_deprecated=True,
                     include_ttl_config=False,
                     include_gke_platform_args=False):
    """Get dataproc cluster configuration.

  Args:
    args: Arguments parsed from argparse.ArgParser.
    dataproc: Dataproc object that contains client, messages, and resources
    project_id: Dataproc project ID
    compute_resources: compute resource for cluster
    beta: use BETA only features
    include_deprecated: whether to include deprecated args
    include_ttl_config: whether to include Scheduled Delete(TTL) args
    include_gke_platform_args: whether to include GKE-based cluster args

  Returns:
    cluster_config: Dataproc cluster configuration
  """
    master_accelerator_type = None
    worker_accelerator_type = None
    secondary_worker_accelerator_type = None

    if args.master_accelerator:
        master_accelerator_type = args.master_accelerator['type']
        master_accelerator_count = args.master_accelerator.get('count', 1)

    if args.worker_accelerator:
        worker_accelerator_type = args.worker_accelerator['type']
        worker_accelerator_count = args.worker_accelerator.get('count', 1)

    secondary_worker_accelerator = _FirstNonNone(
        args.secondary_worker_accelerator, args.preemptible_worker_accelerator)
    if secondary_worker_accelerator:
        secondary_worker_accelerator_type = secondary_worker_accelerator[
            'type']
        secondary_worker_accelerator_count = secondary_worker_accelerator.get(
            'count', 1)

    # Resolve non-zonal GCE resources
    # We will let the server resolve short names of zonal resources because
    # if auto zone is requested, we will not know the zone before sending the
    # request
    image_ref = args.image and compute_resources.Parse(
        args.image,
        params={'project': project_id},
        collection='compute.images')
    network_ref = args.network and compute_resources.Parse(
        args.network,
        params={'project': project_id},
        collection='compute.networks')
    subnetwork_ref = args.subnet and compute_resources.Parse(
        args.subnet,
        params={
            'project': project_id,
            'region': properties.VALUES.compute.region.GetOrFail,
        },
        collection='compute.subnetworks')
    timeout_str = six.text_type(args.initialization_action_timeout) + 's'
    init_actions = [
        dataproc.messages.NodeInitializationAction(
            executableFile=exe, executionTimeout=timeout_str)
        for exe in (args.initialization_actions or [])
    ]
    # Increase the client timeout for each initialization action.
    args.timeout += args.initialization_action_timeout * len(init_actions)

    expanded_scopes = compute_helpers.ExpandScopeAliases(args.scopes)

    software_config = dataproc.messages.SoftwareConfig(
        imageVersion=args.image_version)

    if include_deprecated:
        master_boot_disk_size_gb = args.master_boot_disk_size_gb
    else:
        master_boot_disk_size_gb = None
    if args.master_boot_disk_size:
        master_boot_disk_size_gb = (api_utils.BytesToGb(
            args.master_boot_disk_size))

    if include_deprecated:
        worker_boot_disk_size_gb = args.worker_boot_disk_size_gb
    else:
        worker_boot_disk_size_gb = None
    if args.worker_boot_disk_size:
        worker_boot_disk_size_gb = (api_utils.BytesToGb(
            args.worker_boot_disk_size))

    secondary_worker_boot_disk_size_gb = (api_utils.BytesToGb(
        _FirstNonNone(args.secondary_worker_boot_disk_size,
                      args.preemptible_worker_boot_disk_size)))

    if args.single_node or args.num_workers == 0:
        # Explicitly specifying --num-workers=0 gives you a single node cluster,
        # but if --num-workers is omitted, args.num_workers is None (not 0), and
        # this property will not be set
        args.properties[constants.ALLOW_ZERO_WORKERS_PROPERTY] = 'true'

    if args.properties:
        software_config.properties = encoding.DictToAdditionalPropertyMessage(
            args.properties,
            dataproc.messages.SoftwareConfig.PropertiesValue,
            sort_items=True)

    if args.components:
        software_config_cls = dataproc.messages.SoftwareConfig
        software_config.optionalComponents.extend(
            list(
                map(
                    software_config_cls.
                    OptionalComponentsValueListEntryValuesEnum,
                    args.components)))

    gce_cluster_config = dataproc.messages.GceClusterConfig(
        networkUri=network_ref and network_ref.SelfLink(),
        subnetworkUri=subnetwork_ref and subnetwork_ref.SelfLink(),
        internalIpOnly=args.no_address,
        serviceAccount=args.service_account,
        serviceAccountScopes=expanded_scopes,
        zoneUri=properties.VALUES.compute.zone.GetOrFail())

    reservation_affinity = GetReservationAffinity(args, dataproc)
    gce_cluster_config.reservationAffinity = reservation_affinity

    if args.tags:
        gce_cluster_config.tags = args.tags

    if args.metadata:
        flat_metadata = collections.OrderedDict([(k, v) for d in args.metadata
                                                 for k, v in d.items()])
        gce_cluster_config.metadata = encoding.DictToAdditionalPropertyMessage(
            flat_metadata, dataproc.messages.GceClusterConfig.MetadataValue)

    master_accelerators = []
    if master_accelerator_type:
        master_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=master_accelerator_type,
                acceleratorCount=master_accelerator_count))
    worker_accelerators = []
    if worker_accelerator_type:
        worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=worker_accelerator_type,
                acceleratorCount=worker_accelerator_count))
    secondary_worker_accelerators = []
    if secondary_worker_accelerator_type:
        secondary_worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=secondary_worker_accelerator_type,
                acceleratorCount=secondary_worker_accelerator_count))

    cluster_config = dataproc.messages.ClusterConfig(
        configBucket=args.bucket,
        gceClusterConfig=gce_cluster_config,
        masterConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_masters,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.master_machine_type,
            accelerators=master_accelerators,
            diskConfig=GetDiskConfig(dataproc, args.master_boot_disk_type,
                                     master_boot_disk_size_gb,
                                     args.num_master_local_ssds),
            minCpuPlatform=args.master_min_cpu_platform),
        workerConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.worker_machine_type,
            accelerators=worker_accelerators,
            diskConfig=GetDiskConfig(
                dataproc,
                args.worker_boot_disk_type,
                worker_boot_disk_size_gb,
                args.num_worker_local_ssds,
            ),
            minCpuPlatform=args.worker_min_cpu_platform),
        initializationActions=init_actions,
        softwareConfig=software_config,
    )

    if args.kerberos_config_file or args.kerberos_root_principal_password_uri:
        cluster_config.securityConfig = dataproc.messages.SecurityConfig()
        if args.kerberos_config_file:
            cluster_config.securityConfig.kerberosConfig = ParseKerberosConfigFile(
                dataproc, args.kerberos_config_file)
        else:
            kerberos_config = dataproc.messages.KerberosConfig()
            kerberos_config.enableKerberos = True
            if args.kerberos_root_principal_password_uri:
                kerberos_config.rootPrincipalPasswordUri = \
                  args.kerberos_root_principal_password_uri
                kerberos_kms_ref = args.CONCEPTS.kerberos_kms_key.Parse()
                kerberos_config.kmsKeyUri = kerberos_kms_ref.RelativeName()
            cluster_config.securityConfig.kerberosConfig = kerberos_config

    if args.autoscaling_policy:
        cluster_config.autoscalingConfig = dataproc.messages.AutoscalingConfig(
            policyUri=args.CONCEPTS.autoscaling_policy.Parse().RelativeName())

    if include_ttl_config:
        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = six.text_type(args.max_age) + 's'
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = six.text_type(args.max_idle) + 's'
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config

    if hasattr(args.CONCEPTS, 'kms_key'):
        kms_ref = args.CONCEPTS.kms_key.Parse()
        if kms_ref:
            encryption_config = dataproc.messages.EncryptionConfig()
            encryption_config.gcePdKmsKeyName = kms_ref.RelativeName()
            cluster_config.encryptionConfig = encryption_config
        else:
            # Did user use any gce-pd-kms-key flags?
            for keyword in [
                    'gce-pd-kms-key', 'gce-pd-kms-key-project',
                    'gce-pd-kms-key-location', 'gce-pd-kms-key-keyring'
            ]:
                if getattr(args, keyword.replace('-', '_'), None):
                    raise exceptions.ArgumentError(
                        '--gce-pd-kms-key was not fully specified.')

    # The secondary worker group is optional. However, users may specify
    # configuration for future secondary (preemptible) workers at creation time.
    num_secondary_workers = _FirstNonNone(args.num_secondary_workers,
                                          args.num_preemptible_workers)
    secondary_worker_boot_disk_type = _FirstNonNone(
        args.secondary_worker_boot_disk_type,
        args.preemptible_worker_boot_disk_type)
    num_secondary_worker_local_ssds = _FirstNonNone(
        args.num_secondary_worker_local_ssds,
        args.num_preemptible_worker_local_ssds)
    if (num_secondary_workers is not None
            or secondary_worker_boot_disk_size_gb is not None
            or secondary_worker_boot_disk_type is not None
            or num_secondary_worker_local_ssds is not None
            or args.worker_min_cpu_platform is not None
            or args.secondary_worker_type != 'unspecified'):
        cluster_config.secondaryWorkerConfig = (
            dataproc.messages.InstanceGroupConfig(
                numInstances=num_secondary_workers,
                accelerators=secondary_worker_accelerators,
                diskConfig=GetDiskConfig(
                    dataproc,
                    secondary_worker_boot_disk_type,
                    secondary_worker_boot_disk_size_gb,
                    num_secondary_worker_local_ssds,
                ),
                minCpuPlatform=args.worker_min_cpu_platform,
                preemptibility=_GetType(dataproc, args.secondary_worker_type)))

    if include_gke_platform_args:
        if args.enable_component_gateway:
            cluster_config.endpointConfig = dataproc.messages.EndpointConfig(
                enableHttpPortAccess=args.enable_component_gateway)
        if args.gke_cluster is not None:
            location = args.zone or args.region
            target_gke_cluster = 'projects/{0}/locations/{1}/clusters/{2}'.format(
                project_id, location, args.gke_cluster)
            cluster_config.gkeClusterConfig = dataproc.messages.GkeClusterConfig(
                namespacedGkeDeploymentTarget=dataproc.messages.
                NamespacedGkeDeploymentTarget(
                    targetGkeCluster=target_gke_cluster,
                    clusterNamespace=args.gke_cluster_namespace))
            cluster_config.gceClusterConfig = None
            cluster_config.masterConfig = None
            cluster_config.workerConfig = None
            cluster_config.secondaryWorkerConfig = None

    return cluster_config
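GetClusterConfig leans on a _FirstNonNone helper that is not shown in this snippet. Judging from its call sites (new secondary-worker flags versus the deprecated preemptible-worker flags), it presumably behaves like the sketch below; treat this as an assumption, not the SDK's definition:

def _FirstNonNone(first, second):
    """Presumed behavior of the helper: returns the first value that is not None."""
    return first if first is not None else second


# The new-style --secondary-worker-* flags win when set; otherwise the
# deprecated --preemptible-worker-* flags are used. Unlike `first or second`,
# an explicit 0 is preserved.
print(_FirstNonNone(4, 2))     # 4
print(_FirstNonNone(None, 2))  # 2
print(_FirstNonNone(0, 2))     # 0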
Example #8
def GetClusterConfig(args,
                     dataproc,
                     project_id,
                     compute_resources,
                     beta=False):
    """Get dataproc cluster configuration.

  Args:
    args: Arguments parsed from argparse.ArgParser.
    dataproc: Dataproc object that contains client, messages, and resources
    project_id: Dataproc project ID
    compute_resources: compute resource for cluster
    beta: use BETA only features

  Returns:
    cluster_config: Dataproc cluster configuration
  """
    master_accelerator_type = None
    worker_accelerator_type = None
    master_accelerator_count = None
    worker_accelerator_count = None
    if beta:
        if args.master_accelerator:
            master_accelerator_type = args.master_accelerator['type']
            master_accelerator_count = args.master_accelerator.get('count', 1)
        if args.worker_accelerator:
            worker_accelerator_type = args.worker_accelerator['type']
            worker_accelerator_count = args.worker_accelerator.get('count', 1)

    # Resolve non-zonal GCE resources
    # We will let the server resolve short names of zonal resources because
    # if auto zone is requested, we will not know the zone before sending the
    # request
    image_ref = args.image and compute_resources.Parse(
        args.image,
        params={'project': project_id},
        collection='compute.images')
    network_ref = args.network and compute_resources.Parse(
        args.network,
        params={'project': project_id},
        collection='compute.networks')
    subnetwork_ref = args.subnet and compute_resources.Parse(
        args.subnet,
        params={
            'project': project_id,
            'region': properties.VALUES.compute.region.GetOrFail,
        },
        collection='compute.subnetworks')
    timeout_str = str(args.initialization_action_timeout) + 's'
    init_actions = [
        dataproc.messages.NodeInitializationAction(
            executableFile=exe, executionTimeout=timeout_str)
        for exe in (args.initialization_actions or [])
    ]
    # Increase the client timeout for each initialization action.
    args.timeout += args.initialization_action_timeout * len(init_actions)

    expanded_scopes = compute_helpers.ExpandScopeAliases(args.scopes)

    software_config = dataproc.messages.SoftwareConfig(
        imageVersion=args.image_version)

    master_boot_disk_size_gb = args.master_boot_disk_size_gb
    if args.master_boot_disk_size:
        master_boot_disk_size_gb = (api_utils.BytesToGb(
            args.master_boot_disk_size))

    worker_boot_disk_size_gb = args.worker_boot_disk_size_gb
    if args.worker_boot_disk_size:
        worker_boot_disk_size_gb = (api_utils.BytesToGb(
            args.worker_boot_disk_size))

    preemptible_worker_boot_disk_size_gb = (api_utils.BytesToGb(
        args.preemptible_worker_boot_disk_size))

    if args.single_node or args.num_workers == 0:
        # Explicitly specifying --num-workers=0 gives you a single node cluster,
        # but if --num-workers is omitted, args.num_workers is None (not 0), and
        # this property will not be set
        args.properties[constants.ALLOW_ZERO_WORKERS_PROPERTY] = 'true'

    if args.properties:
        software_config.properties = encoding.DictToMessage(
            args.properties, dataproc.messages.SoftwareConfig.PropertiesValue)

    if beta:
        if args.components:
            software_config_cls = dataproc.messages.SoftwareConfig
            software_config.optionalComponents.extend(
                list(
                    map(
                        software_config_cls.
                        OptionalComponentsValueListEntryValuesEnum,
                        args.components)))

    gce_cluster_config = dataproc.messages.GceClusterConfig(
        networkUri=network_ref and network_ref.SelfLink(),
        subnetworkUri=subnetwork_ref and subnetwork_ref.SelfLink(),
        internalIpOnly=args.no_address,
        serviceAccount=args.service_account,
        serviceAccountScopes=expanded_scopes,
        zoneUri=properties.VALUES.compute.zone.GetOrFail())

    if args.tags:
        gce_cluster_config.tags = args.tags

    if args.metadata:
        flat_metadata = dict(
            (k, v) for d in args.metadata for k, v in d.items())
        gce_cluster_config.metadata = encoding.DictToMessage(
            flat_metadata, dataproc.messages.GceClusterConfig.MetadataValue)

    master_accelerators = []
    if master_accelerator_type:
        master_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=master_accelerator_type,
                acceleratorCount=master_accelerator_count))
    worker_accelerators = []
    if worker_accelerator_type:
        worker_accelerators.append(
            dataproc.messages.AcceleratorConfig(
                acceleratorTypeUri=worker_accelerator_type,
                acceleratorCount=worker_accelerator_count))

    cluster_config = dataproc.messages.ClusterConfig(
        configBucket=args.bucket,
        gceClusterConfig=gce_cluster_config,
        masterConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_masters,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.master_machine_type,
            accelerators=master_accelerators,
            diskConfig=GetDiskConfig(dataproc, args.master_boot_disk_type,
                                     master_boot_disk_size_gb,
                                     args.num_master_local_ssds)),
        workerConfig=dataproc.messages.InstanceGroupConfig(
            numInstances=args.num_workers,
            imageUri=image_ref and image_ref.SelfLink(),
            machineTypeUri=args.worker_machine_type,
            accelerators=worker_accelerators,
            diskConfig=GetDiskConfig(
                dataproc,
                args.worker_boot_disk_type,
                worker_boot_disk_size_gb,
                args.num_worker_local_ssds,
            )),
        initializationActions=init_actions,
        softwareConfig=software_config,
    )

    if beta:
        cluster_config.masterConfig.minCpuPlatform = args.master_min_cpu_platform
        cluster_config.workerConfig.minCpuPlatform = args.worker_min_cpu_platform

    if beta:
        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config

    if beta and hasattr(args.CONCEPTS, 'kms_key'):
        kms_ref = args.CONCEPTS.kms_key.Parse()
        if kms_ref:
            encryption_config = dataproc.messages.EncryptionConfig()
            encryption_config.gcePdKmsKeyName = kms_ref.RelativeName()
            cluster_config.encryptionConfig = encryption_config
        else:
            # Did user use any gce-pd-kms-key flags?
            for keyword in [
                    'gce-pd-kms-key', 'gce-pd-kms-key-project',
                    'gce-pd-kms-key-location', 'gce-pd-kms-key-keyring'
            ]:
                if getattr(args, keyword.replace('-', '_'), None):
                    raise exceptions.ArgumentError(
                        '--gce-pd-kms-key was not fully specified.')

    # The preemptible worker group is optional. However, users may specify
    # configuration for future preemptible VMs at creation time.
    if (args.num_preemptible_workers is not None
            or preemptible_worker_boot_disk_size_gb is not None
            or args.preemptible_worker_boot_disk_type is not None
            or (beta and args.worker_min_cpu_platform is not None)):
        cluster_config.secondaryWorkerConfig = (
            dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers,
                diskConfig=GetDiskConfig(
                    dataproc,
                    args.preemptible_worker_boot_disk_type,
                    preemptible_worker_boot_disk_size_gb,
                    None,
                )))
        if beta and args.worker_min_cpu_platform:
            cluster_config.secondaryWorkerConfig.minCpuPlatform = (
                args.worker_min_cpu_platform)

    return cluster_config
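Both versions of GetClusterConfig convert the --*-boot-disk-size byte values to whole gigabytes via api_utils.BytesToGb before building the disk config. The sketch below only illustrates that conversion under assumed semantics; the real SDK helper may additionally validate that the size is an exact multiple of one GB:

_BYTES_PER_GB = 1 << 30


def bytes_to_gb(size_bytes):
    """Illustrative byte-count-to-gigabytes conversion; None passes through."""
    if size_bytes is None:
        return None
    return size_bytes // _BYTES_PER_GB


print(bytes_to_gb(500 * (1 << 30)))  # 500
print(bytes_to_gb(None))             # None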
Example #9
    def Run(self, args):
        dataproc = dp.Dataproc(self.ReleaseTrack())

        cluster_ref = util.ParseCluster(args.name, dataproc)

        cluster_config = dataproc.messages.ClusterConfig()
        changed_fields = []

        has_changes = False

        if args.num_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_workers)
            cluster_config.workerConfig = worker_config
            changed_fields.append('config.worker_config.num_instances')
            has_changes = True

        if args.num_preemptible_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers)
            cluster_config.secondaryWorkerConfig = worker_config
            changed_fields.append(
                'config.secondary_worker_config.num_instances')
            has_changes = True

        if self.ReleaseTrack() == base.ReleaseTrack.BETA:
            lifecycle_config = dataproc.messages.LifecycleConfig()
            changed_config = False
            if args.max_age is not None:
                lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
                changed_config = True
            if args.expiration_time is not None:
                lifecycle_config.autoDeleteTime = times.FormatDateTime(
                    args.expiration_time)
                changed_config = True
            if args.max_idle is not None:
                lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
                changed_config = True
            if changed_config:
                cluster_config.lifecycleConfig = lifecycle_config
                changed_fields.append('config.lifecycle_config')
                has_changes = True

        # Update labels if the user requested it
        labels = None
        if args.update_labels or args.remove_labels:
            has_changes = True
            changed_fields.append('labels')

            # We need to fetch the cluster first so we know what the labels look
            # like. labels_util.UpdateLabels will fill out the proto for us with
            # all the updates and removals, but first we need to provide the
            # current state of the labels.
            get_cluster_request = (
                dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
                    projectId=cluster_ref.projectId,
                    region=cluster_ref.region,
                    clusterName=cluster_ref.clusterName))
            current_cluster = dataproc.client.projects_regions_clusters.Get(
                get_cluster_request)
            labels = labels_util.UpdateLabels(
                current_cluster.labels, dataproc.messages.Cluster.LabelsValue,
                args.update_labels, args.remove_labels)

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one cluster parameter to update.')

        cluster = dataproc.messages.Cluster(
            config=cluster_config,
            clusterName=cluster_ref.clusterName,
            labels=labels,
            projectId=cluster_ref.projectId)

        request = dataproc.messages.DataprocProjectsRegionsClustersPatchRequest(
            clusterName=cluster_ref.clusterName,
            region=cluster_ref.region,
            projectId=cluster_ref.projectId,
            cluster=cluster,
            updateMask=','.join(changed_fields))

        if (self.ReleaseTrack() == base.ReleaseTrack.BETA
                and args.graceful_decommission_timeout):
            request.gracefulDecommissionTimeout = (
                str(args.graceful_decommission_timeout) + 's')

        operation = dataproc.client.projects_regions_clusters.Patch(request)

        if args.async_:
            log.status.write('Updating [{0}] with operation [{1}].'.format(
                cluster_ref, operation.name))
            return

        util.WaitForOperation(dataproc,
                              operation,
                              message='Waiting for cluster update operation',
                              timeout_s=args.timeout)

        request = dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        cluster = dataproc.client.projects_regions_clusters.Get(request)
        log.UpdatedResource(cluster_ref)
        return cluster
Example #10
    def Run(self, args):
        dataproc = dp.Dataproc(self.ReleaseTrack())

        cluster_ref = util.ParseCluster(args.name, dataproc)

        cluster_config = dataproc.messages.ClusterConfig()
        changed_fields = []

        has_changes = False

        if args.num_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_workers)
            cluster_config.workerConfig = worker_config
            changed_fields.append('config.worker_config.num_instances')
            has_changes = True

        if args.num_preemptible_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers)
            cluster_config.secondaryWorkerConfig = worker_config
            changed_fields.append(
                'config.secondary_worker_config.num_instances')
            has_changes = True

        if self.ReleaseTrack() == base.ReleaseTrack.BETA:
            if args.autoscaling_policy:
                cluster_config.autoscalingConfig = dataproc.messages.AutoscalingConfig(
                    policyUri=args.CONCEPTS.autoscaling_policy.Parse(
                    ).RelativeName())
                changed_fields.append('config.autoscaling_config.policy_uri')
                has_changes = True
            elif args.autoscaling_policy == '' or args.disable_autoscaling:  # pylint: disable=g-explicit-bool-comparison
                # Disabling autoscaling. Don't need to explicitly set
                # cluster_config.autoscaling_config to None.
                changed_fields.append('config.autoscaling_config.policy_uri')
                has_changes = True

            lifecycle_config = dataproc.messages.LifecycleConfig()
            changed_config = False
            if args.max_age is not None:
                lifecycle_config.autoDeleteTtl = str(args.max_age) + 's'
                changed_fields.append(
                    'config.lifecycle_config.auto_delete_ttl')
                changed_config = True
            if args.expiration_time is not None:
                lifecycle_config.autoDeleteTime = times.FormatDateTime(
                    args.expiration_time)
                changed_fields.append(
                    'config.lifecycle_config.auto_delete_time')
                changed_config = True
            if args.max_idle is not None:
                lifecycle_config.idleDeleteTtl = str(args.max_idle) + 's'
                changed_fields.append(
                    'config.lifecycle_config.idle_delete_ttl')
                changed_config = True
            if args.no_max_age:
                lifecycle_config.autoDeleteTtl = None
                changed_fields.append(
                    'config.lifecycle_config.auto_delete_ttl')
                changed_config = True
            if args.no_max_idle:
                lifecycle_config.idleDeleteTtl = None
                changed_fields.append(
                    'config.lifecycle_config.idle_delete_ttl')
                changed_config = True
            if changed_config:
                cluster_config.lifecycleConfig = lifecycle_config
                has_changes = True

        # Put in a thunk so we only make this call if needed
        def _GetCurrentLabels():
            # We need to fetch the cluster first so we know what the labels look
            # like. labels_util will fill out the proto for us with all the
            # updates and removals, but first we need to provide the current
            # state of the labels.
            get_cluster_request = (
                dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
                    projectId=cluster_ref.projectId,
                    region=cluster_ref.region,
                    clusterName=cluster_ref.clusterName))
            current_cluster = dataproc.client.projects_regions_clusters.Get(
                get_cluster_request)
            return current_cluster.labels

        labels_update = labels_util.ProcessUpdateArgsLazy(
            args,
            dataproc.messages.Cluster.LabelsValue,
            orig_labels_thunk=_GetCurrentLabels)
        if labels_update.needs_update:
            has_changes = True
            changed_fields.append('labels')
        labels = labels_update.GetOrNone()

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one cluster parameter to update.')

        cluster = dataproc.messages.Cluster(
            config=cluster_config,
            clusterName=cluster_ref.clusterName,
            labels=labels,
            projectId=cluster_ref.projectId)

        request = dataproc.messages.DataprocProjectsRegionsClustersPatchRequest(
            clusterName=cluster_ref.clusterName,
            region=cluster_ref.region,
            projectId=cluster_ref.projectId,
            cluster=cluster,
            updateMask=','.join(changed_fields),
            requestId=util.GetUniqueId())

        if args.graceful_decommission_timeout is not None:
            request.gracefulDecommissionTimeout = (
                str(args.graceful_decommission_timeout) + 's')

        operation = dataproc.client.projects_regions_clusters.Patch(request)

        if args.async_:
            log.status.write('Updating [{0}] with operation [{1}].'.format(
                cluster_ref, operation.name))
            return

        util.WaitForOperation(dataproc,
                              operation,
                              message='Waiting for cluster update operation',
                              timeout_s=args.timeout)

        request = dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        cluster = dataproc.client.projects_regions_clusters.Get(request)
        log.UpdatedResource(cluster_ref)
        return cluster
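The _GetCurrentLabels thunk handed to labels_util.ProcessUpdateArgsLazy exists so that the extra clusters.Get call is only issued when the labels actually change. A toy illustration of that lazy pattern; the helper below is made up and not part of the SDK:

def process_update_lazy(needs_update, orig_thunk):
    """Evaluates orig_thunk (e.g. a Get RPC) only when an update is needed."""
    if not needs_update:
        return None
    return orig_thunk()


calls = []


def fetch_current_labels():
    calls.append('GET')        # stands in for clusters.Get
    return {'env': 'dev'}


process_update_lazy(False, fetch_current_labels)
print(calls)                   # [] -- no RPC issued
process_update_lazy(True, fetch_current_labels)
print(calls)                   # ['GET']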
Example #11
    def Run(self, args):
        dataproc = dp.Dataproc(self.ReleaseTrack())

        cluster_ref = args.CONCEPTS.cluster.Parse()

        cluster_config = dataproc.messages.ClusterConfig()
        changed_fields = []

        has_changes = False

        if args.num_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=args.num_workers)
            cluster_config.workerConfig = worker_config
            changed_fields.append('config.worker_config.num_instances')
            has_changes = True

        num_secondary_workers = _FirstNonNone(args.num_preemptible_workers,
                                              args.num_secondary_workers)
        if num_secondary_workers is not None:
            worker_config = dataproc.messages.InstanceGroupConfig(
                numInstances=num_secondary_workers)
            cluster_config.secondaryWorkerConfig = worker_config
            changed_fields.append(
                'config.secondary_worker_config.num_instances')
            has_changes = True

        if args.autoscaling_policy:
            cluster_config.autoscalingConfig = dataproc.messages.AutoscalingConfig(
                policyUri=args.CONCEPTS.autoscaling_policy.Parse(
                ).RelativeName())
            changed_fields.append('config.autoscaling_config.policy_uri')
            has_changes = True
        elif args.autoscaling_policy == '' or args.disable_autoscaling:  # pylint: disable=g-explicit-bool-comparison
            # Disabling autoscaling. Don't need to explicitly set
            # cluster_config.autoscaling_config to None.
            changed_fields.append('config.autoscaling_config.policy_uri')
            has_changes = True

        lifecycle_config = dataproc.messages.LifecycleConfig()
        changed_config = False
        if args.max_age is not None:
            lifecycle_config.autoDeleteTtl = six.text_type(args.max_age) + 's'
            changed_fields.append('config.lifecycle_config.auto_delete_ttl')
            changed_config = True
        if args.expiration_time is not None:
            lifecycle_config.autoDeleteTime = times.FormatDateTime(
                args.expiration_time)
            changed_fields.append('config.lifecycle_config.auto_delete_time')
            changed_config = True
        if args.max_idle is not None:
            lifecycle_config.idleDeleteTtl = six.text_type(args.max_idle) + 's'
            changed_fields.append('config.lifecycle_config.idle_delete_ttl')
            changed_config = True
        if args.no_max_age:
            lifecycle_config.autoDeleteTtl = None
            changed_fields.append('config.lifecycle_config.auto_delete_ttl')
            changed_config = True
        if args.no_max_idle:
            lifecycle_config.idleDeleteTtl = None
            changed_fields.append('config.lifecycle_config.idle_delete_ttl')
            changed_config = True
        if changed_config:
            cluster_config.lifecycleConfig = lifecycle_config
            has_changes = True

        def _GetCurrentCluster():
            # This is used for labels and auxiliary_node_pool_configs
            get_cluster_request = (
                dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
                    projectId=cluster_ref.projectId,
                    region=cluster_ref.region,
                    clusterName=cluster_ref.clusterName))
            current_cluster = dataproc.client.projects_regions_clusters.Get(
                get_cluster_request)
            return current_cluster

        # Put in a thunk so we only make this call if needed
        def _GetCurrentLabels():
            # We need to fetch the cluster first so we know what the labels look
            # like. labels_util will fill out the proto for us with all the
            # updates and removals, but first we need to provide the current
            # state of the labels.
            current_cluster = _GetCurrentCluster()
            return current_cluster.labels

        labels_update = labels_util.ProcessUpdateArgsLazy(
            args,
            dataproc.messages.Cluster.LabelsValue,
            orig_labels_thunk=_GetCurrentLabels)
        if labels_update.needs_update:
            has_changes = True
            changed_fields.append('labels')
        labels = labels_update.GetOrNone()

        if args.driver_pool_size is not None:
            # Fetch the current node pools so we have the node pool IDs and other
            # attributes that are not exposed to the user. Driver pools can
            # currently only be updated with no other changes in the same request;
            # we rely on frontend validation to enforce this until driver pools
            # can be updated together with other fields.
            auxiliary_node_pools = _GetCurrentCluster(
            ).config.auxiliaryNodePoolConfigs

            # Find the index of the current cluster's driver pool in the auxiliary
            # node pool list; index_driver_pools is itself a list that should hold
            # at most one element.
            index_driver_pools = [
                i for i, n in enumerate(auxiliary_node_pools)
                if dataproc.messages.NodePoolConfig.
                RolesValueListEntryValuesEnum.DRIVER in n.roles
            ]

            if len(index_driver_pools) > 1:
                raise exceptions.ArgumentError(
                    'At most one driver pool can be specified per cluster.')
            elif len(index_driver_pools) == 1:
                index = index_driver_pools[0]
                auxiliary_node_pools[
                    index].nodePoolConfig.numInstances = args.driver_pool_size
            else:
                # This case is only relevant when scaling from 0 -> N driver nodes.
                # It will not be supported initially; we rely on frontend
                # validation to prevent or allow it.
                worker_config = dataproc.messages.InstanceGroupConfig(
                    numInstances=args.driver_pool_size)
                node_config = dataproc.messages.NodePoolConfig(
                    nodePoolConfig=worker_config,
                    roles=[
                        dataproc.messages.NodePoolConfig.
                        RolesValueListEntryValuesEnum.DRIVER
                    ])
                auxiliary_node_pools.append(node_config)

            cluster_config.auxiliaryNodePoolConfigs = auxiliary_node_pools
            changed_fields.append('config.auxiliary_node_pool_configs')
            has_changes = True

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one cluster parameter to update.')

        cluster = dataproc.messages.Cluster(
            config=cluster_config,
            clusterName=cluster_ref.clusterName,
            labels=labels,
            projectId=cluster_ref.projectId)

        request = dataproc.messages.DataprocProjectsRegionsClustersPatchRequest(
            clusterName=cluster_ref.clusterName,
            region=cluster_ref.region,
            projectId=cluster_ref.projectId,
            cluster=cluster,
            updateMask=','.join(changed_fields),
            requestId=util.GetUniqueId())

        if args.graceful_decommission_timeout is not None:
            request.gracefulDecommissionTimeout = (
                six.text_type(args.graceful_decommission_timeout) + 's')

        operation = dataproc.client.projects_regions_clusters.Patch(request)

        if args.async_:
            log.status.write('Updating [{0}] with operation [{1}].'.format(
                cluster_ref, operation.name))
            return

        util.WaitForOperation(dataproc,
                              operation,
                              message='Waiting for cluster update operation',
                              timeout_s=args.timeout)

        request = dataproc.messages.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        cluster = dataproc.client.projects_regions_clusters.Get(request)
        log.UpdatedResource(cluster_ref)
        return cluster
Example #12
    def Run(self, args):
        client = self.context['dataproc_client']
        messages = self.context['dataproc_messages']

        cluster_ref = util.ParseCluster(args.name, self.context)

        cluster_config = messages.ClusterConfig()
        changed_fields = []

        has_changes = False

        if args.new_num_workers is not None:
            log.warn(
                '--new-num-workers parameter is deprecated and will be removed '
                'in a future release. Please use --num-workers instead')
            args.num_workers = args.new_num_workers

        if args.num_workers is not None:
            worker_config = messages.InstanceGroupConfig(
                numInstances=args.num_workers)
            cluster_config.workerConfig = worker_config
            changed_fields.append('config.worker_config.num_instances')
            has_changes = True

        if args.num_preemptible_workers is not None:
            worker_config = messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers)
            cluster_config.secondaryWorkerConfig = worker_config
            changed_fields.append(
                'config.secondary_worker_config.num_instances')
            has_changes = True

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one cluster parameter to update.')

        cluster = messages.Cluster(config=cluster_config,
                                   clusterName=cluster_ref.clusterName,
                                   projectId=cluster_ref.projectId)

        request = messages.DataprocProjectsRegionsClustersPatchRequest(
            clusterName=cluster_ref.clusterName,
            region=cluster_ref.region,
            projectId=cluster_ref.projectId,
            cluster=cluster,
            updateMask=','.join(changed_fields))

        operation = client.projects_regions_clusters.Patch(request)

        if args.async_:
            log.status.write('Updating [{0}] with operation [{1}].'.format(
                cluster_ref, operation.name))
            return

        util.WaitForOperation(operation,
                              self.context,
                              message='Waiting for cluster update operation',
                              timeout_s=3600 * 3)

        request = client.MESSAGES_MODULE.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        cluster = client.projects_regions_clusters.Get(request)
        log.UpdatedResource(cluster_ref)
        return cluster
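All of the update commands above follow the same pattern: build a sparse Cluster message, record each changed field path, and send the comma-joined list as the Patch updateMask so the server only touches those fields. A stripped-down, dict-based sketch of that shape (not gcloud code):

def build_patch_request(num_workers=None, num_preemptible_workers=None):
    """Sketch of the changed_fields / updateMask pattern used by the update commands."""
    changed_fields = []
    config = {}

    if num_workers is not None:
        config['worker_config'] = {'num_instances': num_workers}
        changed_fields.append('config.worker_config.num_instances')
    if num_preemptible_workers is not None:
        config['secondary_worker_config'] = {'num_instances': num_preemptible_workers}
        changed_fields.append('config.secondary_worker_config.num_instances')

    if not changed_fields:
        raise ValueError('Must specify at least one cluster parameter to update.')

    # Only the fields named in the mask are touched by the server-side Patch.
    return {'cluster': {'config': config}, 'updateMask': ','.join(changed_fields)}


print(build_patch_request(num_workers=5)['updateMask'])
# config.worker_config.num_instances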
Example #13
    def Run(self, args):
        client = self.context['dataproc_client']
        messages = self.context['dataproc_messages']

        cluster_ref = util.ParseCluster(args.name, self.context)

        cluster_config = messages.ClusterConfig()
        changed_fields = []

        has_changes = False

        if args.num_workers is not None:
            worker_config = messages.InstanceGroupConfig(
                numInstances=args.num_workers)
            cluster_config.workerConfig = worker_config
            changed_fields.append('config.worker_config.num_instances')
            has_changes = True

        if args.num_preemptible_workers is not None:
            worker_config = messages.InstanceGroupConfig(
                numInstances=args.num_preemptible_workers)
            cluster_config.secondaryWorkerConfig = worker_config
            changed_fields.append(
                'config.secondary_worker_config.num_instances')
            has_changes = True

        # Update labels if the user requested it
        labels = None
        if args.update_labels or args.remove_labels:
            has_changes = True
            changed_fields.append('labels')

            # We need to fetch the cluster first so we know what the labels look
            # like. labels_util.UpdateLabels will fill out the proto for us with
            # all the updates and removals, but first we need to provide the
            # current state of the labels.
            get_cluster_request = (client.MESSAGES_MODULE.
                                   DataprocProjectsRegionsClustersGetRequest(
                                       projectId=cluster_ref.projectId,
                                       region=cluster_ref.region,
                                       clusterName=cluster_ref.clusterName))
            current_cluster = client.projects_regions_clusters.Get(
                get_cluster_request)
            labels = labels_util.UpdateLabels(current_cluster.labels,
                                              messages.Cluster.LabelsValue,
                                              args.update_labels,
                                              args.remove_labels)

        if not has_changes:
            raise exceptions.ArgumentError(
                'Must specify at least one cluster parameter to update.')

        cluster = messages.Cluster(config=cluster_config,
                                   clusterName=cluster_ref.clusterName,
                                   labels=labels,
                                   projectId=cluster_ref.projectId)

        request = messages.DataprocProjectsRegionsClustersPatchRequest(
            clusterName=cluster_ref.clusterName,
            region=cluster_ref.region,
            projectId=cluster_ref.projectId,
            cluster=cluster,
            updateMask=','.join(changed_fields))

        operation = client.projects_regions_clusters.Patch(request)

        if args.async_:
            log.status.write('Updating [{0}] with operation [{1}].'.format(
                cluster_ref, operation.name))
            return

        util.WaitForOperation(operation,
                              self.context,
                              message='Waiting for cluster update operation',
                              timeout_s=3600 * 3)

        request = client.MESSAGES_MODULE.DataprocProjectsRegionsClustersGetRequest(
            projectId=cluster_ref.projectId,
            region=cluster_ref.region,
            clusterName=cluster_ref.clusterName)
        cluster = client.projects_regions_clusters.Get(request)
        log.UpdatedResource(cluster_ref)
        return cluster
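For reference, labels_util.UpdateLabels merges the --update-labels and --remove-labels values into the resource's existing labels before the Patch call. The plain-dict analogue below is an assumption about those merge semantics, not the SDK implementation:

def merge_labels(current, updates=None, removals=None):
    """Plain-dict analogue of the update/remove merge done by labels_util.UpdateLabels."""
    merged = dict(current or {})
    merged.update(updates or {})
    for key in removals or ():
        merged.pop(key, None)
    return merged


print(merge_labels({'env': 'dev', 'team': 'data'},
                   updates={'env': 'prod'},
                   removals=['team']))
# {'env': 'prod'}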