Example #1
    def _Create(self):
        """Creates the cluster."""
        if self.min_cpu_platform or self.gpu_count:
            cmd = util.GcloudCommand(self, 'beta', 'container', 'clusters',
                                     'create', self.name)
        else:
            cmd = util.GcloudCommand(self, 'container', 'clusters', 'create',
                                     self.name)

        cmd.flags['cluster-version'] = self.cluster_version
        if FLAGS.gke_enable_alpha:
            cmd.args.append('--enable-kubernetes-alpha')
            cmd.args.append('--no-enable-autorepair')
            cmd.args.append('--no-enable-autoupgrade')

        user = util.GetDefaultUser()
        if FLAGS.gcp_service_account:
            cmd.flags['service-account'] = FLAGS.gcp_service_account
        elif 'gserviceaccount.com' in user:
            cmd.flags['service-account'] = user
            self.use_application_default_credentials = False
        else:
            cmd.flags['scopes'] = 'cloud-platform'

        if self.gpu_count:
            cmd.flags['accelerator'] = (
                gce_virtual_machine.GenerateAcceleratorSpecString(
                    self.gpu_type, self.gpu_count))
        if self.min_cpu_platform:
            cmd.flags['min-cpu-platform'] = self.min_cpu_platform

        if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
            cmd.args.append('--enable-autoscaling')
            cmd.flags['max-nodes'] = self.max_nodes
            cmd.flags['min-nodes'] = self.min_nodes

        cmd.flags['num-nodes'] = self.num_nodes

        if self.machine_type is None:
            cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
                self.cpus, self.memory)
        else:
            cmd.flags['machine-type'] = self.machine_type

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        # This command needs a long timeout due to the many minutes it
        # can take to provision a large GPU-accelerated GKE cluster.
        _, stderr, retcode = cmd.Issue(timeout=900,
                                       env=self._GetRequiredGkeEnv(),
                                       raise_on_failure=False)
        if retcode != 0:
            # Log specific type of failure, if known.
            if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
                logging.exception('Container resources exhausted: %s', stderr)
                raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                    'Container resources exhausted in zone %s: %s' %
                    (self.zone, stderr))
            raise errors.Resource.CreationError(stderr)
  def _Create(self):
    """Creates the cluster."""
    cmd = self._GcloudCommand('container', 'clusters', 'create', self.name)

    self._AddNodeParamsToCmd(self.vm_config, self.num_nodes,
                             container_service.DEFAULT_NODEPOOL, cmd)

    if self.cluster_version:
      if self.cluster_version in RELEASE_CHANNELS:
        if FLAGS.gke_enable_alpha:
          raise errors.Config.InvalidValue(
              'Kubernetes Alpha is not compatible with release channels')
        cmd.flags['release-channel'] = self.cluster_version
      else:
        cmd.flags['cluster-version'] = self.cluster_version
    if FLAGS.gke_enable_alpha:
      cmd.args.append('--enable-kubernetes-alpha')
      cmd.args.append('--no-enable-autorepair')

    user = util.GetDefaultUser()
    if FLAGS.gcp_service_account:
      cmd.flags['service-account'] = FLAGS.gcp_service_account
    # Matches service accounts that either definitely belong to this project
    # or are GCP-managed service accounts, like the GCE default service
    # account, for which we can't tell which project they belong to.
    elif re.match(SERVICE_ACCOUNT_PATTERN, user):
      logging.info('Re-using configured service-account for GKE Cluster: %s',
                   user)
      cmd.flags['service-account'] = user
      self.use_application_default_credentials = False
    else:
      logging.info('Using default GCE service account for GKE cluster')
      cmd.flags['scopes'] = 'cloud-platform'

    if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
      cmd.args.append('--enable-autoscaling')
      cmd.flags['max-nodes'] = self.max_nodes
      cmd.flags['min-nodes'] = self.min_nodes

    cmd.flags['cluster-ipv4-cidr'] = f'/{_CalculateCidrSize(self.max_nodes)}'

    if self.vm_config.network:
      cmd.flags['network'] = self.vm_config.network.network_resource.name

    cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
    cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    cmd.args.append('--no-enable-shielded-nodes')
    self._IssueResourceCreationCommand(cmd)

    self._CreateNodePools()
  def _AddTags(self):
    """Tags all VMs in the cluster."""
    vms_in_cluster = []
    for instance_group in self._GetInstanceGroups():
      vms_in_cluster.extend(self._GetInstancesFromInstanceGroup(instance_group))

    for vm_name in vms_in_cluster:
      cmd = util.GcloudCommand(self, 'compute', 'instances', 'add-metadata',
                               vm_name)
      cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
      cmd.Issue()

      cmd = util.GcloudCommand(self, 'compute', 'disks', 'add-labels', vm_name)
      cmd.flags['labels'] = util.MakeFormattedDefaultTags()
      cmd.Issue()
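Every snippet in this collection ultimately passes util.MakeFormattedDefaultTags() (or util.FormatTags(util.GetDefaultTags())) to a --metadata or --labels flag. A minimal sketch of that idea, assuming the helpers simply join key=value pairs with commas (placeholder tag names; the real PerfKitBenchmarker implementation may differ):

def GetDefaultTags():
    """Returns the default benchmark tags as a dict (placeholder keys/values)."""
    return {'owner': 'perfkit', 'benchmark': 'example'}


def FormatTags(tags_dict):
    """Formats a dict of tags as the comma-separated key=value list gcloud expects."""
    return ','.join('{}={}'.format(k, v) for k, v in sorted(tags_dict.items()))


def MakeFormattedDefaultTags():
    """Returns the default benchmark tags, pre-formatted for gcloud flags."""
    return FormatTags(GetDefaultTags())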
Example #4
    def _Create(self):
        """Creates the cluster."""
        if self.min_cpu_platform or self.gpu_count:
            cmd = util.GcloudCommand(self, 'beta', 'container', 'clusters',
                                     'create', self.name)
        else:
            cmd = util.GcloudCommand(self, 'container', 'clusters', 'create',
                                     self.name)

        cmd.flags['cluster-version'] = self.cluster_version
        if FLAGS.gke_enable_alpha:
            cmd.args.append('--enable-kubernetes-alpha')
            cmd.args.append('--no-enable-autorepair')
            cmd.args.append('--no-enable-autoupgrade')

        user = util.GetDefaultUser()
        if 'gserviceaccount.com' in user:
            cmd.flags['service-account'] = user
            self.use_application_default_credentials = False
        else:
            cmd.flags['scopes'] = 'cloud-platform'

        if self.gpu_count:
            cmd.flags['accelerator'] = (
                gce_virtual_machine.GenerateAcceleratorSpecString(
                    self.gpu_type, self.gpu_count))
        if self.min_cpu_platform:
            cmd.flags['min-cpu-platform'] = self.min_cpu_platform

        if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
            cmd.args.append('--enable-autoscaling')
            cmd.flags['max-nodes'] = self.max_nodes
            cmd.flags['min-nodes'] = self.min_nodes

        cmd.flags['num-nodes'] = self.num_nodes

        if self.machine_type is None:
            cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
                self.cpus, self.memory)
        else:
            cmd.flags['machine-type'] = self.machine_type

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        # This command needs a long timeout due to the many minutes it
        # can take to provision a large GPU-accelerated GKE cluster.
        cmd.Issue(timeout=900, env=self._GetRequiredGkeEnv())
    def AddMetadata(self, **kwargs):
        """Adds metadata to the VM and disk."""
        if not kwargs:
            return
        cmd = util.GcloudCommand(self, 'compute', 'instances', 'add-metadata',
                                 self.name)
        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        cmd.flags['metadata'] = '{metadata},{kwargs}'.format(
            metadata=cmd.flags['metadata'], kwargs=util.FormatTags(kwargs))
        cmd.Issue()

        cmd = util.GcloudCommand(self, 'compute', 'disks', 'add-labels',
                                 self.name)
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()
        cmd.Issue()
 def AddMetadata(self, **kwargs):
     """Adds metadata to disk."""
     # vm metadata added to vm on creation.
     cmd = util.GcloudCommand(self, 'compute', 'disks', 'add-labels',
                              self.name)
     cmd.flags['labels'] = util.MakeFormattedDefaultTags()
     cmd.Issue()
    def SubmitJob(self,
                  jarfile,
                  classname,
                  job_poll_interval=None,
                  job_arguments=None,
                  job_stdout_file=None,
                  job_type=None):
        """See base class."""
        cmd = util.GcloudCommand(self, 'dataproc', 'jobs', 'submit', job_type)
        cmd.flags['cluster'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        if classname:
            cmd.flags['jars'] = jarfile
            cmd.flags['class'] = classname
        else:
            cmd.flags['jar'] = jarfile

        # Dataproc prints to stdout an object describing job execution.
        # Its stderr contains a mix of the job's stderr and the job's stdout.
        # We set the root driver log level via FLAGS.dpb_log_level to suppress
        # those messages, so that we can then, hopefully, separate the job's
        # standard out from the log messages.
        cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)

        if job_arguments:
            cmd.additional_flags = ['--'] + job_arguments

        stdout, stderr, retcode = cmd.Issue(timeout=None)
        if retcode != 0:
            return {dpb_service.SUCCESS: False}

        stats = self._GetStats(stdout)
        return stats
Example #8
    def _Create(self):
        """Creates the cluster."""
        cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
        if self.project is not None:
            cmd.flags['project'] = self.project

        if self.spec.worker_count:
            # The number of worker machines in the cluster
            cmd.flags['num-workers'] = self.spec.worker_count
        else:
            cmd.flags['single-node'] = True

        # Initialize applications on the dataproc cluster
        if self.spec.applications:
            logging.info('Include the requested applications')
            cmd.flags['optional-components'] = ','.join(self.spec.applications)

        # TODO(pclay): stop ignoring spec.master_group?
        for role in ['worker', 'master']:
            # Set machine type
            if self.spec.worker_group.vm_spec.machine_type:
                self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                               self.spec.worker_group.vm_spec.machine_type)
            # Set boot_disk_size
            if self.spec.worker_group.disk_spec.disk_size:
                size_in_gb = '{}GB'.format(
                    str(self.spec.worker_group.disk_spec.disk_size))
                self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                               size_in_gb)
            # Set boot_disk_type
            if self.spec.worker_group.disk_spec.disk_type:
                self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                               self.spec.worker_group.disk_spec.disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[
                    self.spec.worker_group.disk_spec.disk_type]

            # Set ssd count
            if self.spec.worker_group.vm_spec.num_local_ssds:
                self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                               self.spec.worker_group.vm_spec.num_local_ssds)
        # Set zone
        cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version:
            cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            cmd.flags['image'] = FLAGS.gcp_dataproc_image

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        timeout = 900  # 15 min
        # TODO(saksena): Retrieve the cluster create time and hold in a var
        _, stderr, retcode = cmd.Issue(timeout=timeout, raise_on_failure=False)
        if retcode:
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
Example #9
  def _Create(self):
    """Creates the cluster."""

    if self.cluster_id is None:
      self.cluster_id = 'pkb-' + FLAGS.run_uri
    cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
    if self.project is not None:
      cmd.flags['project'] = self.project
    cmd.flags['num-workers'] = self.spec.worker_group.vm_count

    for group_type, group_spec in [
        ('worker', self.spec.worker_group),
        ('master', self.spec.master_group)]:
      flag_name = group_type + '-machine-type'
      cmd.flags[flag_name] = group_spec.vm_spec.machine_type

      if group_spec.vm_spec.num_local_ssds:
        ssd_flag = 'num-{0}-local-ssds'.format(group_type)
        cmd.flags[ssd_flag] = group_spec.vm_spec.num_local_ssds

      if group_spec.vm_spec.boot_disk_size:
        disk_flag = group_type + '-boot-disk-size'
        cmd.flags[disk_flag] = group_spec.vm_spec.boot_disk_size

      if group_spec.vm_spec.boot_disk_type:
        disk_flag = group_type + '-boot-disk-type'
        cmd.flags[disk_flag] = group_spec.vm_spec.boot_disk_type

    if FLAGS.gcp_dataproc_subnet:
      cmd.flags['subnet'] = FLAGS.gcp_dataproc_subnet
      cmd.additional_flags.append('--no-address')

    if FLAGS.gcp_dataproc_property:
      cmd.flags['properties'] = ','.join(FLAGS.gcp_dataproc_property)

    if FLAGS.gcp_dataproc_image:
      cmd.flags['image'] = FLAGS.gcp_dataproc_image

    cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
    cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    cmd.Issue()
Example #10
 def _Create(self):
   """Creates the instance."""
   cmd = util.GcloudCommand(self, 'redis', 'instances', 'create',
                            self.name)
   cmd.flags['region'] = self.redis_region
   cmd.flags['zone'] = FLAGS.zones[0]
   cmd.flags['network'] = FLAGS.gce_network_name
   cmd.flags['tier'] = self.tier
   cmd.flags['size'] = self.size
   cmd.flags['redis-version'] = self.redis_version
   cmd.flags['labels'] = util.MakeFormattedDefaultTags()
   cmd.Issue(timeout=COMMAND_TIMEOUT)
Example #11
  def SubmitJob(self,
                jarfile=None,
                classname=None,
                pyspark_file=None,
                query_file=None,
                job_poll_interval=None,
                job_stdout_file=None,
                job_arguments=None,
                job_files=None,
                job_jars=None,
                job_type=None):
    """See base class."""
    args = ['jobs', 'submit', job_type]

    if job_type == self.PYSPARK_JOB_TYPE:
      args.append(pyspark_file)

    cmd = self.DataprocGcloudCommand(*args)

    cmd.flags['cluster'] = self.cluster_id
    cmd.flags['labels'] = util.MakeFormattedDefaultTags()

    if classname:
      cmd.flags['jars'] = jarfile
      cmd.flags['class'] = classname
    elif jarfile:
      cmd.flags['jar'] = jarfile

    if query_file:
      cmd.flags['file'] = query_file

    if job_files:
      cmd.flags['files'] = ','.join(job_files)
    if job_jars:
      cmd.flags['jars'] = ','.join(job_jars)

    # Dataproc prints to stdout an object describing job execution.
    # Its stderr contains a mix of the job's stderr and the job's stdout.
    # We set the root driver log level via FLAGS.dpb_log_level to suppress
    # those messages, so that we can then, hopefully, separate the job's
    # standard out from the log messages.
    cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)

    if job_arguments:
      cmd.additional_flags = ['--'] + job_arguments

    stdout, _, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
    if retcode != 0:
      return {dpb_service.SUCCESS: False}

    stats = self._GetStats(stdout)
    stats[dpb_service.SUCCESS] = True
    return stats
Example #12
 def _Create(self):
     """Creates the disk."""
     cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name)
     cmd.flags['size'] = self.disk_size
     cmd.flags['type'] = self.disk_type
     cmd.flags['labels'] = util.MakeFormattedDefaultTags()
     if self.image:
         cmd.flags['image'] = self.image
     if self.image_project:
         cmd.flags['image-project'] = self.image_project
     _, stderr, retcode = cmd.Issue(raise_on_failure=False)
     util.CheckGcloudResponseKnownFailures(stderr, retcode)
Example #13
 def _PostCreate(self):
   """Get the cluster's data and tag it."""
   cmd = self.DataprocGcloudCommand('clusters', 'describe', self.cluster_id)
   stdout, _, _ = cmd.Issue()
   config = json.loads(stdout)['config']
   master = config['masterConfig']
   worker = config['workerConfig']
   for disk in master['instanceNames'] + worker['instanceNames']:
     cmd = util.GcloudCommand(
         self, 'compute', 'disks', 'add-labels', disk)
     cmd.flags['labels'] = util.MakeFormattedDefaultTags()
     cmd.flags['zone'] = self.dpb_service_zone
     cmd.Issue()
    def _Create(self):
        """Creates the cluster."""

        if self.cluster_id is None:
            self.cluster_id = 'pkb-' + FLAGS.run_uri
        cmd = util.GcloudCommand(self, 'dataproc', 'clusters', 'create',
                                 self.cluster_id)
        if self.project is not None:
            cmd.flags['project'] = self.project

        # The number of worker machines in the cluster
        cmd.flags['num-workers'] = self.spec.worker_count

        # Initialize applications on the dataproc cluster
        if self.spec.applications:
            logging.info('Include the requested applications')

        for role in ['worker', 'master']:
            # Set machine type
            if self.spec.worker_group.vm_spec.machine_type:
                self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                               self.spec.worker_group.vm_spec.machine_type)

            # Set boot_disk_size
            if self.spec.worker_group.vm_spec.boot_disk_size:
                self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                               self.spec.worker_group.vm_spec.boot_disk_size)
            # Set boot_disk_type
            if self.spec.worker_group.vm_spec.boot_disk_type:
                self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                               self.spec.worker_group.vm_spec.boot_disk_type)

            # Set ssd count
            if self.spec.worker_group.vm_spec.num_local_ssds:
                self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                               self.spec.worker_group.vm_spec.num_local_ssds)

        self.append_region(cmd, True)

        if self.dpb_dataproc_image_version:
            cmd.flags['image-version'] = self.dpb_dataproc_image_version

        if FLAGS.gcp_dataproc_image:
            cmd.flags['image'] = FLAGS.gcp_dataproc_image

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        # TODO(saksena): Retrieve the cluster create time and hold in a var
        cmd.Issue()
    def _Create(self):
        """Creates the cluster."""
        cmd = util.GcloudCommand(self, 'dataproc', 'clusters', 'create',
                                 self.cluster_id)
        if self.project is not None:
            cmd.flags['project'] = self.project

        # The number of worker machines in the cluster
        cmd.flags['num-workers'] = self.spec.worker_count

        # Initialize applications on the dataproc cluster
        if self.spec.applications:
            logging.info('Include the requested applications')

        for role in ['worker', 'master']:
            # Set machine type
            if self.spec.worker_group.vm_spec.machine_type:
                self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                               self.spec.worker_group.vm_spec.machine_type)
            # Set boot_disk_size
            if self.spec.worker_group.disk_spec.disk_size:
                size_in_gb = '{}GB'.format(
                    str(self.spec.worker_group.disk_spec.disk_size))
                self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                               size_in_gb)
            # Set boot_disk_type
            if self.spec.worker_group.disk_spec.disk_type:
                self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                               self.spec.worker_group.disk_spec.disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[
                    self.spec.worker_group.disk_spec.disk_type]

            # Set ssd count
            if self.spec.worker_group.vm_spec.num_local_ssds:
                self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                               self.spec.worker_group.vm_spec.num_local_ssds)
        # Set zone
        cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version != 'latest':
            cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            cmd.flags['image'] = FLAGS.gcp_dataproc_image

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        # TODO(saksena): Retrieve the cluster create time and hold in a var
        cmd.Issue()
Example #16
 def _Create(self):
     logging.info('Creating NFS server %s', self.name)
     volume_arg = 'name={0},capacity={1}'.format(
         self.server_directory.strip('/'), self.disk_spec.disk_size)
     network_arg = 'name={0}'.format(self.network)
     args = [
         '--file-share', volume_arg, '--network', network_arg, '--labels',
         util.MakeFormattedDefaultTags()
     ]
     if self.nfs_tier:
         args += ['--tier', self.nfs_tier]
     try:
         self._NfsCommand('create', *args)
     except errors.Error as ex:
         # if this NFS service already exists reuse it
         if self._Exists():
             logging.info('Reusing existing NFS server %s', self.name)
         else:
             raise errors.Resource.RetryableCreationError(
                 'Error creating NFS service %s' % self.name, ex)
  def _Create(self):
    """Creates the disk."""
    cmd = util.GcloudCommand(self, 'compute', 'disks', 'create', self.name)
    cmd.flags['size'] = self.disk_size
    cmd.flags['type'] = self.disk_type
    if self.provisioned_iops and self.disk_type == PD_EXTREME:
      cmd.flags['provisioned-iops'] = self.provisioned_iops
    cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    if self.image:
      cmd.flags['image'] = self.image
    if self.image_project:
      cmd.flags['image-project'] = self.image_project

    if self.replica_zones:
      cmd.flags['region'] = self.region
      cmd.flags['replica-zones'] = ','.join(self.replica_zones)
      del cmd.flags['zone']

    _, stderr, retcode = cmd.Issue(raise_on_failure=False)
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _BuildContext(launcher_vm, booter_template_vm):
    """Returns the context variables for Jinja2 template during rendering."""
    context = {
        'boot_machine_type': booter_template_vm.machine_type,
        'cloud': FLAGS.cloud,
        'contact_launcher': FLAGS.vms_contact_launcher,
        'launcher_vm_name': launcher_vm.name,
        'os_type': 'linux' if _IsLinux() else 'windows',
        'server_ip': launcher_vm.internal_ip,
        'server_port': _PORT,
        'start_time_file': _START_TIME_FILE_PATH,
        'timeout': _TIMEOUT_SECONDS,
        'vm_count': FLAGS.boots_per_launcher,
        'zone': launcher_vm.zone,
        'use_public_ip': '' if FLAGS.use_public_ip else 'no-',
    }
    cloud = FLAGS.cloud
    if cloud == 'GCP':
        context.update({
            'boot_disk_size':
            booter_template_vm.boot_disk_size,
            'boot_vm_name_prefix':
            _BOOT_VM_NAME_PREFIX.format(launcher_name=launcher_vm.name),
            'image_family':
            booter_template_vm.image_family,
            'image_project':
            booter_template_vm.image_project,
            'gcloud_path':
            FLAGS.gcloud_path,
            'project':
            FLAGS.project,
            'tags':
            gcp_util.MakeFormattedDefaultTags(),
        })
    elif cloud == 'AWS':
        tags = aws_util.MakeDefaultTags()
        tags.update({'launcher_id': launcher_vm.name})
        context.update({
            'group_name':
            booter_template_vm.placement_group.name,
            'image':
            booter_template_vm.image,
            'key_name':
            'perfkit-key-{0}'.format(FLAGS.run_uri),
            'region':
            aws_util.GetRegionFromZone(launcher_vm.zone),
            'subnet_id':
            booter_template_vm.network.subnet.id,
            'tags':
            aws_util.FormatTagSpecifications('instance', tags),
        })
    elif cloud == 'Azure':
        context.update({
            'boot_vm_name_prefix':
            launcher_vm.name.split('-', 1)[1],
            'location':
            launcher_vm.region,
            'image':
            booter_template_vm.image,
            'storage_sku':
            booter_template_vm.os_disk.disk_type,
            'resource_group':
            launcher_vm.resource_group.name,
            'nic':
            _BOOT_NIC_NAME_PREFIX.format(run_uri=FLAGS.run_uri),
            'password':
            booter_template_vm.password,
            'start_id':
            GetAzBootVMStartIdByLauncher(launcher_vm.name),
        })

    return context
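_BuildContext only assembles the template variables; a minimal sketch of how such a context might be consumed with Jinja2 (the function name, template-path handling, and StrictUndefined choice here are assumptions, not the benchmark's actual code):

import jinja2


def _RenderBootScript(template_path, launcher_vm, booter_template_vm):
    """Renders a boot-script template with the context built above (sketch)."""
    with open(template_path) as f:
        # StrictUndefined makes a missing context key fail loudly instead of
        # silently rendering an empty string.
        template = jinja2.Template(f.read(), undefined=jinja2.StrictUndefined)
    return template.render(**_BuildContext(launcher_vm, booter_template_vm))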
Example #19
  def SubmitJob(self, jarfile, classname, job_script=None,
                job_poll_interval=None,
                job_arguments=None, job_stdout_file=None,
                job_type=spark_service.SPARK_JOB_TYPE):
    cmd = self.DataprocGcloudCommand('jobs', 'submit', job_type)
    cmd.flags['cluster'] = self.cluster_id
    cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    # If we don't put this here, zone is automatically added to the command,
    # which breaks dataproc jobs submit.
    cmd.flags['zone'] = []

    cmd.additional_flags = []
    if classname and jarfile:
      cmd.flags['jars'] = jarfile
      cmd.flags['class'] = classname
    elif jarfile:
      cmd.flags['jar'] = jarfile
    elif job_script:
      cmd.additional_flags += [job_script]

    # Dataproc prints to stdout an object describing job execution.
    # Its stderr contains a mix of the job's stderr and the job's stdout.
    # We set the root driver log level via FLAGS.spark_service_log_level to
    # suppress those messages, so that we can then, hopefully, separate the
    # job's standard out from the log messages.
    cmd.flags['driver-log-levels'] = 'root={}'.format(
        FLAGS.spark_service_log_level)
    if job_arguments:
      cmd.additional_flags += ['--'] + job_arguments
    stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
    if retcode != 0:
      return {spark_service.SUCCESS: False}

    stats = self._GetStats(stdout)
    stats[spark_service.SUCCESS] = True

    if job_stdout_file:
      with open(job_stdout_file, 'w') as f:
        lines = stderr.splitlines(True)
        if (not re.match(r'Job \[.*\] submitted.', lines[0]) or
            not re.match(r'Waiting for job output...', lines[1])):
          raise Exception('Dataproc output in unexpected format.')
        i = 2
        if job_type == spark_service.SPARK_JOB_TYPE:
          if not re.match(r'\r', lines[i]):
            raise Exception('Dataproc output in unexpected format.')
          i += 1
          # Eat these status lines.  They end in \r, so they overwrite
          # themselves at the console or when you cat a file.  But they
          # are part of this string.
          while re.match(r'\[Stage \d+:', lines[i]):
            i += 1
          if not re.match(r' *\r$', lines[i]):
            raise Exception('Dataproc output in unexpected format.')

        while i < len(lines) and not re.match(r'Job \[.*\]', lines[i]):
          f.write(lines[i])
          i += 1
        if i != len(lines) - 1:
          raise Exception('Dataproc output in unexpected format.')
    return stats
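For orientation, the stderr parsing above assumes Dataproc output of roughly this shape (an illustrative sketch, not captured output):

    Job [e5f6a7b8] submitted.
    Waiting for job output...
    [Stage 0:>                    (0 + 1) / 1]    (Spark progress lines ending in '\r', skipped)
    <the job's own standard output>
    Job [e5f6a7b8] finished successfully.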
Example #20
    def _Create(self):
        """Creates the cluster."""
        cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
        if self.project is not None:
            cmd.flags['project'] = self.project

        if self.spec.worker_count:
            # The number of worker machines in the cluster
            cmd.flags['num-workers'] = self.spec.worker_count
        else:
            cmd.flags['single-node'] = True

        # Initialize applications on the dataproc cluster
        if self.spec.applications:
            logging.info('Include the requested applications')
            cmd.flags['optional-components'] = ','.join(self.spec.applications)

        # Enable component gateway for debuggability. Does not impact performance.
        cmd.flags['enable-component-gateway'] = True

        # TODO(pclay): stop ignoring spec.master_group?
        for role in ['worker', 'master']:
            # Set machine type
            if self.spec.worker_group.vm_spec.machine_type:
                self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                               self.spec.worker_group.vm_spec.machine_type)
            # Set boot_disk_size
            if self.spec.worker_group.disk_spec.disk_size:
                size_in_gb = '{}GB'.format(
                    str(self.spec.worker_group.disk_spec.disk_size))
                self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                               size_in_gb)
            # Set boot_disk_type
            if self.spec.worker_group.disk_spec.disk_type:
                self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                               self.spec.worker_group.disk_spec.disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[
                    self.spec.worker_group.disk_spec.disk_type]

            # Set ssd count
            if self.spec.worker_group.vm_spec.num_local_ssds:
                self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                               self.spec.worker_group.vm_spec.num_local_ssds)
                # This will actually be used for storage
                self.dpb_hdfs_type = 'Local SSD'
        # Set zone
        cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version:
            cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            cmd.flags['image'] = FLAGS.gcp_dataproc_image

        if FLAGS.dpb_cluster_properties:
            cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties)

        # Ideally DpbServiceSpec would have a network spec which we would
        # create to resolve the name, but EMR provisions its own VPC and we
        # are generally happy using pre-existing networks for Dataproc, so
        # just use the underlying flag instead.
        if FLAGS.gce_network_name:
            cmd.flags['network'] = FLAGS.gce_network_name

        metadata = util.GetDefaultTags()
        metadata.update(
            flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
        cmd.flags['metadata'] = util.FormatTags(metadata)
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()
        timeout = 900  # 15 min
        stdout, stderr, retcode = cmd.Issue(timeout=timeout,
                                            raise_on_failure=False)
        self._cluster_create_time = self._ParseClusterCreateTime(stdout)
        if retcode:
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
    def _GenerateCreateCommand(self, ssh_keys_path):
        """Generates a command to create the VM instance.

        Args:
          ssh_keys_path: string. Path to a file containing the sshKeys metadata.

        Returns:
          GcloudCommand. gcloud command to issue in order to create the VM
          instance.
        """
        args = ['compute', 'instances', 'create', self.name]

        cmd = util.GcloudCommand(self, *args)
        if self.network.subnet_resource is not None:
            cmd.flags['subnet'] = self.network.subnet_resource.name
        else:
            cmd.flags['network'] = self.network.network_resource.name
        if self.image:
            cmd.flags['image'] = self.image
        elif self.image_family:
            cmd.flags['image-family'] = self.image_family
        if self.image_project is not None:
            cmd.flags['image-project'] = self.image_project
        cmd.flags['boot-disk-auto-delete'] = True
        if self.boot_disk_size:
            cmd.flags['boot-disk-size'] = self.boot_disk_size
        if self.boot_disk_type:
            cmd.flags['boot-disk-type'] = self.boot_disk_type
        if self.machine_type is None:
            cmd.flags['custom-cpu'] = self.cpus
            cmd.flags['custom-memory'] = '{0}MiB'.format(self.memory_mib)
            if self.min_cpu_platform:
                cmd.flags['min-cpu-platform'] = self.min_cpu_platform
        else:
            cmd.flags['machine-type'] = self.machine_type
            if self.min_cpu_platform and 'n1-' in self.machine_type:
                cmd.flags['min-cpu-platform'] = self.min_cpu_platform
            elif self.min_cpu_platform:
                logging.warning('Cannot set min-cpu-platform for %s',
                                self.machine_type)
        if self.gpu_count and self.machine_type and 'a2-' not in self.machine_type:
            # A2 machine type already has predefined GPU type and count.
            cmd.flags['accelerator'] = GenerateAcceleratorSpecString(
                self.gpu_type, self.gpu_count)
        cmd.flags['tags'] = ','.join(['perfkitbenchmarker'] +
                                     (self.gce_tags or []))
        cmd.flags['no-restart-on-failure'] = True
        if self.node_group:
            cmd.flags['node-group'] = self.node_group.name
        if self.gce_shielded_secure_boot:
            cmd.flags['shielded-secure-boot'] = True

        if self.network.placement_group:
            self.metadata.update(
                self.network.placement_group.GetResourceMetadata())
            cmd.flags['resource-policies'] = self.network.placement_group.name
            cmd.flags['maintenance-policy'] = 'TERMINATE'
        else:
            self.metadata[
                'placement_group_style'] = placement_group.PLACEMENT_GROUP_NONE

        metadata_from_file = {'sshKeys': ssh_keys_path}
        parsed_metadata_from_file = flag_util.ParseKeyValuePairs(
            FLAGS.gcp_instance_metadata_from_file)
        for key, value in six.iteritems(parsed_metadata_from_file):
            if key in metadata_from_file:
                logging.warning(
                    'Metadata "%s" is set internally. Cannot be overridden '
                    'from command line.', key)
                continue
            metadata_from_file[key] = value
        cmd.flags['metadata-from-file'] = ','.join(
            ['%s=%s' % (k, v) for k, v in six.iteritems(metadata_from_file)])

        metadata = {}
        metadata.update(self.boot_metadata)
        metadata.update(util.GetDefaultTags())

        additional_metadata = {}
        additional_metadata.update(self.vm_metadata)
        additional_metadata.update(
            flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))

        for key, value in six.iteritems(additional_metadata):
            if key in metadata:
                logging.warning(
                    'Metadata "%s" is set internally. Cannot be overridden '
                    'from command line.', key)
                continue
            metadata[key] = value

        if self.preemptible:
            cmd.flags['preemptible'] = True
            preemptible_status_bucket = (
                f'gs://{FLAGS.gcp_preemptible_status_bucket}/{FLAGS.run_uri}/')
            self.preempt_marker = f'{preemptible_status_bucket}{self.name}'
            metadata.update([self._PreemptibleMetadataKeyValue()])

        cmd.flags['metadata'] = util.FormatTags(metadata)

        # TODO(user): If GCE one day supports live migration on GPUs,
        # this can be revised.
        if (FLAGS['gce_migrate_on_maintenance'].present
                and FLAGS.gce_migrate_on_maintenance and self.gpu_count):
            raise errors.Config.InvalidValue(
                'Cannot set flag gce_migrate_on_maintenance on instances with GPUs, '
                'as it is not supported by GCP.')
        if not FLAGS.gce_migrate_on_maintenance or self.gpu_count:
            cmd.flags['maintenance-policy'] = 'TERMINATE'
        cmd.flags['local-ssd'] = (
            ['interface={0}'.format(FLAGS.gce_ssd_interface)] *
            self.max_local_disks)
        if FLAGS.gcloud_scopes:
            cmd.flags['scopes'] = ','.join(
                re.split(r'[,; ]', FLAGS.gcloud_scopes))
        cmd.flags['network-tier'] = self.gce_network_tier.upper()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        return cmd
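Unlike the _Create methods elsewhere in this collection, this helper only builds the command; a hypothetical caller (the ssh-keys helper name below is assumed) would issue it the same way the other examples do:

    def _Create(self):
        """Creates the VM instance (caller sketch, mirroring the other examples)."""
        ssh_keys_path = self._WriteSshKeysToTempFile()  # assumed helper
        cmd = self._GenerateCreateCommand(ssh_keys_path)
        _, stderr, retcode = cmd.Issue(raise_on_failure=False)
        if retcode:
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)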
Example #22
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['batches', 'submit', job_type]
        additional_args = []

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['batch'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            additional_args.append(query_file)

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        if FLAGS.gce_network_name:
            cmd.flags['network'] = FLAGS.gce_network_name

        if self.dpb_version:
            cmd.flags['version'] = self.dpb_version
        if FLAGS.gcp_dataproc_image:
            cmd.flags['container-image'] = FLAGS.gcp_dataproc_image

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            additional_args += ['--'] + job_arguments
        cmd.additional_flags = additional_args

        _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        fetch_batch_cmd = self.DataprocGcloudCommand('batches', 'describe',
                                                     self.cluster_id)
        stdout, stderr, retcode = fetch_batch_cmd.Issue(timeout=None,
                                                        raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise retcode would not have been 0
        assert results['state'] == 'SUCCEEDED'
        done_time = self._ParseTime(results['stateTime'])
        pending_time = None
        start_time = None
        for state in results['stateHistory']:
            if state['state'] == 'PENDING':
                pending_time = self._ParseTime(state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = self._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
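The '^@^' prefix above uses gcloud's alternate-delimiter escaping (the escaping topic linked in the comment) so that property values may themselves contain commas. An illustrative example of the flag value this produces, using hypothetical properties:

all_properties = {
    'spark.driver.memory': '4g',
    'spark.executor.extraJavaOptions': '-Dopts=a,b',  # value contains a comma
}
flag_value = '^@^' + '@'.join(
    '{}={}'.format(k, v) for k, v in all_properties.items())
# flag_value == '^@^spark.driver.memory=4g@spark.executor.extraJavaOptions=-Dopts=a,b'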
Example #23
    def SubmitJob(self,
                  jarfile=None,
                  classname=None,
                  pyspark_file=None,
                  query_file=None,
                  job_poll_interval=None,
                  job_stdout_file=None,
                  job_arguments=None,
                  job_files=None,
                  job_jars=None,
                  job_type=None,
                  properties=None):
        """See base class."""
        assert job_type
        args = ['jobs', 'submit', job_type]

        if job_type == self.PYSPARK_JOB_TYPE:
            args.append(pyspark_file)

        cmd = self.DataprocGcloudCommand(*args)

        cmd.flags['cluster'] = self.cluster_id
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        job_jars = job_jars or []
        if classname:
            if jarfile:
                # Dataproc does not support both a main class and a main jar so just
                # make the main jar an additional jar instead.
                job_jars.append(jarfile)
            cmd.flags['class'] = classname
        elif jarfile:
            cmd.flags['jar'] = jarfile

        if query_file:
            cmd.flags['file'] = query_file

        if job_files:
            cmd.flags['files'] = ','.join(job_files)
        if job_jars:
            cmd.flags['jars'] = ','.join(job_jars)

        # Dataproc prints to stdout an object describing job execution.
        # Its stderr contains a mix of the job's stderr and the job's stdout.
        # We set the root driver log level via FLAGS.dpb_log_level to suppress
        # those messages, so that we can then, hopefully, separate the job's
        # standard out from the log messages.
        cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)

        all_properties = self.GetJobProperties()
        all_properties.update(properties or {})
        if all_properties:
            # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
            cmd.flags['properties'] = '^@^' + '@'.join(
                '{}={}'.format(k, v) for k, v in all_properties.items())

        if job_arguments:
            cmd.additional_flags = ['--'] + job_arguments

        stdout, stderr, retcode = cmd.Issue(timeout=None,
                                            raise_on_failure=False)
        if retcode != 0:
            raise dpb_service.JobSubmissionError(stderr)

        results = json.loads(stdout)
        # Otherwise retcode would not have been 0
        assert results['status']['state'] == 'DONE'
        done_time = GcpDpbDataproc._ParseTime(
            results['status']['stateStartTime'])
        pending_time = None
        start_time = None
        for state in results['statusHistory']:
            if state['state'] == 'PENDING':
                pending_time = GcpDpbDataproc._ParseTime(
                    state['stateStartTime'])
            elif state['state'] == 'RUNNING':
                start_time = GcpDpbDataproc._ParseTime(state['stateStartTime'])

        assert pending_time and start_time and done_time

        return dpb_service.JobResult(
            run_time=(done_time - start_time).total_seconds(),
            pending_time=(start_time - pending_time).total_seconds())
Example #24
    def _Create(self):
        """Creates the cluster."""
        cmd = util.GcloudCommand(self, 'container', 'clusters', 'create',
                                 self.name)

        cmd.flags['cluster-version'] = self.cluster_version
        if FLAGS.gke_enable_alpha:
            cmd.args.append('--enable-kubernetes-alpha')
            cmd.args.append('--no-enable-autorepair')
            cmd.args.append('--no-enable-autoupgrade')

        user = util.GetDefaultUser()
        if FLAGS.gcp_service_account:
            cmd.flags['service-account'] = FLAGS.gcp_service_account
        # Matches service accounts that either definitely belong to this
        # project or are GCP-managed service accounts, like the GCE default
        # service account, for which we can't tell which project they belong
        # to.
        elif re.match(SERVICE_ACCOUNT_PATTERN, user):
            logging.info(
                'Re-using configured service-account for GKE Cluster: %s',
                user)
            cmd.flags['service-account'] = user
            self.use_application_default_credentials = False
        else:
            logging.info('Using default GCE service account for GKE cluster')
            cmd.flags['scopes'] = 'cloud-platform'

        if self.vm_config.gpu_count:
            cmd.flags['accelerator'] = (
                gce_virtual_machine.GenerateAcceleratorSpecString(
                    self.vm_config.gpu_type, self.vm_config.gpu_count))
        if self.vm_config.min_cpu_platform:
            cmd.flags['min-cpu-platform'] = self.vm_config.min_cpu_platform

        if self.vm_config.boot_disk_size:
            cmd.flags['disk-size'] = self.vm_config.boot_disk_size
        if self.vm_config.boot_disk_type:
            cmd.flags['disk-type'] = self.vm_config.boot_disk_type
        if self.vm_config.max_local_disks:
            # TODO(pclay): Switch to local-ssd-volumes which support NVME when it
            # leaves alpha. See
            # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
            cmd.flags['local-ssd-count'] = self.vm_config.max_local_disks

        if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
            cmd.args.append('--enable-autoscaling')
            cmd.flags['max-nodes'] = self.max_nodes
            cmd.flags['min-nodes'] = self.min_nodes

        cmd.flags['num-nodes'] = self.num_nodes

        if self.vm_config.machine_type is None:
            cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
                self.vm_config.cpus, self.vm_config.memory_mib)
        else:
            cmd.flags['machine-type'] = self.vm_config.machine_type

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        # This command needs a long timeout due to the many minutes it
        # can take to provision a large GPU-accelerated GKE cluster.
        _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
        if retcode:
            # Log specific type of failure, if known.
            if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
                logging.exception('Container resources exhausted: %s', stderr)
                raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                    'Container resources exhausted in zone %s: %s' %
                    (self.zone, stderr))
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
Example #25
    def _CreateGcloudSqlInstance(self):
        storage_size = self.spec.db_disk_spec.disk_size
        instance_zone = self.spec.db_spec.zone

        authorized_network = self._GetAuthorizedNetworks([self.client_vm])

        database_version_string = self._GetEngineVersionString(
            self.spec.engine, self.spec.engine_version)

        cmd_string = [
            self,
            'beta',
            'sql',
            'instances',
            'create',
            self.instance_id,
            '--quiet',
            '--format=json',
            '--activation-policy=ALWAYS',
            '--assign-ip',
            '--authorized-networks=%s' % authorized_network,
            '--zone=%s' % instance_zone,
            '--database-version=%s' % database_version_string,
            '--storage-size=%d' % storage_size,
            '--labels=%s' % util.MakeFormattedDefaultTags(),
        ]
        if self.spec.engine == relational_db.MYSQL:
            cmd_string.append('--enable-bin-log')

        if self.spec.engine == relational_db.SQLSERVER:
            # `--root-password` is required when creating SQL Server instances.
            cmd_string.append('--root-password={0}'.format(
                self.spec.database_password))

        if (self.spec.db_spec.cpus and self.spec.db_spec.memory):
            self._ValidateSpec()
            memory = self.spec.db_spec.memory
            cpus = self.spec.db_spec.cpus
            self._ValidateMachineType(memory, cpus)
            cmd_string.append('--cpu={}'.format(cpus))
            cmd_string.append('--memory={}MiB'.format(memory))
        elif hasattr(self.spec.db_spec, 'machine_type'):
            machine_type_flag = '--tier=%s' % self.spec.db_spec.machine_type
            cmd_string.append(machine_type_flag)
        else:
            raise Exception('Unspecified machine type')

        if self.spec.high_availability:
            cmd_string.append(self._GetHighAvailabilityFlag())

        if self.spec.backup_enabled:
            cmd_string.append('--backup')
            cmd_string.append('--backup-start-time={}'.format(
                self.spec.backup_start_time))
        else:
            cmd_string.append('--no-backup')
        cmd = util.GcloudCommand(*cmd_string)
        cmd.flags['project'] = self.project

        _, stderr, retcode = cmd.Issue(timeout=CREATION_TIMEOUT)

        util.CheckGcloudResponseKnownFailures(stderr, retcode)
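Judging from how GcloudCommand is used throughout these examples, the snippet above would end up issuing roughly the following command (an approximate reconstruction; exact flag ordering and defaults depend on GcloudCommand):

    gcloud beta sql instances create <instance_id> --quiet --format=json \
        --activation-policy=ALWAYS --assign-ip \
        --authorized-networks=<network> --zone=<zone> \
        --database-version=<version> --storage-size=<size_gb> \
        --labels=<key=value,...> --project=<project>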