def _Create(self):
  """Creates the GKE cluster via `gcloud container clusters create`.

  Uses the `beta` gcloud track when a minimum CPU platform or GPUs are
  requested, since those flags require it.

  Raises:
    errors.Benchmarks.InsufficientCapacityCloudFailure: if the zone has
        run out of resources (ZONE_RESOURCE_POOL_EXHAUSTED).
    errors.Resource.CreationError: for any other creation failure.
  """
  if self.min_cpu_platform or self.gpu_count:
    cmd = util.GcloudCommand(
        self, 'beta', 'container', 'clusters', 'create', self.name)
  else:
    cmd = util.GcloudCommand(
        self, 'container', 'clusters', 'create', self.name)
  cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    # Alpha clusters cannot enable node auto-repair / auto-upgrade.
    cmd.args.append('--enable-kubernetes-alpha')
    cmd.args.append('--no-enable-autorepair')
    cmd.args.append('--no-enable-autoupgrade')
  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  elif 'gserviceaccount.com' in user:
    # Reuse the caller's service account on the nodes; application default
    # credentials are then unnecessary.
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    cmd.flags['scopes'] = 'cloud-platform'
  if self.gpu_count:
    cmd.flags['accelerator'] = (
        gce_virtual_machine.GenerateAcceleratorSpecString(
            self.gpu_type, self.gpu_count))
  if self.min_cpu_platform:
    cmd.flags['min-cpu-platform'] = self.min_cpu_platform
  if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes
  cmd.flags['num-nodes'] = self.num_nodes
  if self.machine_type is None:
    cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        self.cpus, self.memory)
  else:
    cmd.flags['machine-type'] = self.machine_type
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # This command needs a long timeout due to the many minutes it
  # can take to provision a large GPU-accelerated GKE cluster.
  _, stderr, retcode = cmd.Issue(
      timeout=900, env=self._GetRequiredGkeEnv(), raise_on_failure=False)
  if retcode != 0:
    # Log specific type of failure, if known.
    if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
      # Use logging.error, not logging.exception: we are not inside an
      # except block, so exception() would append a spurious
      # 'NoneType: None' traceback to the log.
      logging.error('Container resources exhausted: %s', stderr)
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Container resources exhausted in zone %s: %s' %
          (self.zone, stderr))
    raise errors.Resource.CreationError(stderr)
def _Create(self):
  """Creates the GKE cluster and its node pools.

  Builds the `gcloud container clusters create` command, configures the
  cluster version or release channel, service account, autoscaling, CIDR
  sizing, network, and tags, then issues the command and creates any
  additional node pools.
  """
  cmd = self._GcloudCommand('container', 'clusters', 'create', self.name)
  # Apply machine-type/GPU/etc. settings for the default node pool.
  self._AddNodeParamsToCmd(self.vm_config, self.num_nodes,
                           container_service.DEFAULT_NODEPOOL, cmd)
  if self.cluster_version:
    if self.cluster_version in RELEASE_CHANNELS:
      if FLAGS.gke_enable_alpha:
        raise errors.Config.InvalidValue(
            'Kubernetes Alpha is not compatible with release channels')
      cmd.flags['release-channel'] = self.cluster_version
    else:
      cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    # Alpha clusters cannot enable node auto-repair.
    cmd.args.append('--enable-kubernetes-alpha')
    cmd.args.append('--no-enable-autorepair')
  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  # Matches service accounts that either definitely belongs to this project or
  # are a GCP managed service account like the GCE default service account,
  # which we can't tell to which project they belong.
  elif re.match(SERVICE_ACCOUNT_PATTERN, user):
    logging.info('Re-using configured service-account for GKE Cluster: %s',
                 user)
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    logging.info('Using default GCE service account for GKE cluster')
    cmd.flags['scopes'] = 'cloud-platform'
  if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes
  # Size the pod CIDR to fit the maximum possible node count.
  cmd.flags['cluster-ipv4-cidr'] = f'/{_CalculateCidrSize(self.max_nodes)}'
  if self.vm_config.network:
    cmd.flags['network'] = self.vm_config.network.network_resource.name
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  cmd.args.append('--no-enable-shielded-nodes')
  self._IssueResourceCreationCommand(cmd)
  self._CreateNodePools()
def _AddTags(self):
  """Tags all VMs in the cluster.

  Applies the default PKB tags as instance metadata and as disk labels on
  every VM belonging to the cluster's instance groups.
  """
  cluster_vms = [
      vm
      for group in self._GetInstanceGroups()
      for vm in self._GetInstancesFromInstanceGroup(group)
  ]
  for vm_name in cluster_vms:
    metadata_cmd = util.GcloudCommand(
        self, 'compute', 'instances', 'add-metadata', vm_name)
    metadata_cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
    metadata_cmd.Issue()
    labels_cmd = util.GcloudCommand(
        self, 'compute', 'disks', 'add-labels', vm_name)
    labels_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    labels_cmd.Issue()
def _Create(self):
  """Creates the GKE cluster.

  The `beta` gcloud track is selected whenever a minimum CPU platform or
  GPUs are requested, since those flags are only available there.
  """
  if self.min_cpu_platform or self.gpu_count:
    create_cmd = util.GcloudCommand(
        self, 'beta', 'container', 'clusters', 'create', self.name)
  else:
    create_cmd = util.GcloudCommand(
        self, 'container', 'clusters', 'create', self.name)
  create_cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    # Alpha clusters cannot enable node auto-repair / auto-upgrade.
    create_cmd.args.append('--enable-kubernetes-alpha')
    create_cmd.args.append('--no-enable-autorepair')
    create_cmd.args.append('--no-enable-autoupgrade')
  user = util.GetDefaultUser()
  if 'gserviceaccount.com' in user:
    # Run nodes under the caller's service account instead of broad scopes.
    create_cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    create_cmd.flags['scopes'] = 'cloud-platform'
  if self.gpu_count:
    accelerator_spec = gce_virtual_machine.GenerateAcceleratorSpecString(
        self.gpu_type, self.gpu_count)
    create_cmd.flags['accelerator'] = accelerator_spec
  if self.min_cpu_platform:
    create_cmd.flags['min-cpu-platform'] = self.min_cpu_platform
  wants_autoscaling = (self.min_nodes != self.num_nodes or
                       self.max_nodes != self.num_nodes)
  if wants_autoscaling:
    create_cmd.args.append('--enable-autoscaling')
    create_cmd.flags['max-nodes'] = self.max_nodes
    create_cmd.flags['min-nodes'] = self.min_nodes
  create_cmd.flags['num-nodes'] = self.num_nodes
  if self.machine_type is None:
    create_cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        self.cpus, self.memory)
  else:
    create_cmd.flags['machine-type'] = self.machine_type
  create_cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # Provisioning a large GPU-accelerated GKE cluster can take many
  # minutes, hence the long timeout.
  create_cmd.Issue(timeout=900, env=self._GetRequiredGkeEnv())
def AddMetadata(self, **kwargs):
  """Adds metadata to the VM and labels to its disk.

  The default PKB tags are always included; the caller-supplied key/value
  pairs are appended to the instance metadata. No-op when no kwargs are
  given.

  Args:
    **kwargs: additional key/value metadata pairs to set on the instance.
  """
  if not kwargs:
    return
  cmd = util.GcloudCommand(self, 'compute', 'instances', 'add-metadata',
                           self.name)
  # The early return above guarantees kwargs is non-empty, so the default
  # tags and caller pairs can be combined unconditionally (the original
  # re-checked `if kwargs:`, which was always true here).
  cmd.flags['metadata'] = '{metadata},{kwargs}'.format(
      metadata=util.MakeFormattedDefaultTags(),
      kwargs=util.FormatTags(kwargs))
  cmd.Issue()
  cmd = util.GcloudCommand(self, 'compute', 'disks', 'add-labels', self.name)
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  cmd.Issue()
def AddMetadata(self, **kwargs):
  """Adds the default labels to the disk.

  VM metadata is already applied at VM creation time, so only the disk
  labels are set here; kwargs are accepted for interface compatibility.
  """
  label_cmd = util.GcloudCommand(
      self, 'compute', 'disks', 'add-labels', self.name)
  label_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  label_cmd.Issue()
def SubmitJob(self, jarfile, classname, job_poll_interval=None,
              job_arguments=None, job_stdout_file=None, job_type=None):
  """See base class.

  Submits a job to the Dataproc cluster and returns a stats dict that
  includes a dpb_service.SUCCESS entry.
  """
  cmd = util.GcloudCommand(self, 'dataproc', 'jobs', 'submit', job_type)
  cmd.flags['cluster'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  if classname:
    cmd.flags['jars'] = jarfile
    cmd.flags['class'] = classname
  else:
    cmd.flags['jar'] = jarfile
  # Dataproc gives as stdout an object describing job execution.
  # Its stderr contains a mix of the stderr of the job, and the
  # stdout of the job. We set the driver log level to FATAL
  # to suppress those messages, and we can then separate, hopefully
  # the job standard out from the log messages.
  cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)
  if job_arguments:
    cmd.additional_flags = ['--'] + job_arguments
  # raise_on_failure=False so a non-zero exit reaches the retcode check
  # below instead of raising (matches the sibling SubmitJob methods);
  # without it the failure branch was unreachable.
  stdout, _, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    return {dpb_service.SUCCESS: False}
  stats = self._GetStats(stdout)
  # Mark success explicitly, consistent with the other SubmitJob
  # implementations whose callers read stats[dpb_service.SUCCESS].
  stats[dpb_service.SUCCESS] = True
  return stats
def _Create(self):
  """Creates the Dataproc cluster.

  Raises:
    errors.Resource.CreationError: if cluster creation fails.
  """
  cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  if self.spec.worker_count:
    # The number of worker machines in the cluster
    cmd.flags['num-workers'] = self.spec.worker_count
  else:
    cmd.flags['single-node'] = True
  # Initialize applications on the dataproc cluster
  if self.spec.applications:
    logging.info('Include the requested applications')
    cmd.flags['optional-components'] = ','.join(self.spec.applications)
  # TODO(pclay): stop ignoring spec.master_group?
  for role in ['worker', 'master']:
    # NOTE(review): worker_group settings are intentionally applied to both
    # roles; master_group is ignored (see TODO above).
    # Set machine type
    if self.spec.worker_group.vm_spec.machine_type:
      self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                     self.spec.worker_group.vm_spec.machine_type)
    # Set boot_disk_size
    if self.spec.worker_group.disk_spec.disk_size:
      size_in_gb = '{}GB'.format(
          str(self.spec.worker_group.disk_spec.disk_size))
      self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role), size_in_gb)
    # Set boot_disk_type
    if self.spec.worker_group.disk_spec.disk_type:
      self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                     self.spec.worker_group.disk_spec.disk_type)
      # Record what kind of storage backs HDFS for reporting.
      self.dpb_hdfs_type = disk_to_hdfs_map[
          self.spec.worker_group.disk_spec.disk_type]
    # Set ssd count
    if self.spec.worker_group.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                     self.spec.worker_group.vm_spec.num_local_ssds)
  # Set zone
  cmd.flags['zone'] = self.dpb_service_zone
  if self.dpb_version:
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  timeout = 900  # 15 min
  # TODO(saksena): Retrieve the cluster create time and hold in a var
  _, stderr, retcode = cmd.Issue(timeout=timeout, raise_on_failure=False)
  if retcode:
    # Surface known quota/capacity failures before raising generically.
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
def _Create(self):
  """Creates the Dataproc cluster, configuring worker and master groups."""
  if self.cluster_id is None:
    self.cluster_id = 'pkb-' + FLAGS.run_uri
  create_cmd = self.DataprocGcloudCommand('clusters', 'create',
                                          self.cluster_id)
  if self.project is not None:
    create_cmd.flags['project'] = self.project
  create_cmd.flags['num-workers'] = self.spec.worker_group.vm_count
  # Apply per-group machine, SSD, and boot-disk settings.
  group_specs = [('worker', self.spec.worker_group),
                 ('master', self.spec.master_group)]
  for group_type, group_spec in group_specs:
    vm_spec = group_spec.vm_spec
    create_cmd.flags[group_type + '-machine-type'] = vm_spec.machine_type
    if vm_spec.num_local_ssds:
      create_cmd.flags['num-{0}-local-ssds'.format(group_type)] = (
          vm_spec.num_local_ssds)
    if vm_spec.boot_disk_size:
      create_cmd.flags[group_type + '-boot-disk-size'] = (
          vm_spec.boot_disk_size)
    if vm_spec.boot_disk_type:
      create_cmd.flags[group_type + '-boot-disk-type'] = (
          vm_spec.boot_disk_type)
  if FLAGS.gcp_dataproc_subnet:
    create_cmd.flags['subnet'] = FLAGS.gcp_dataproc_subnet
    create_cmd.additional_flags.append('--no-address')
  if FLAGS.gcp_dataproc_property:
    create_cmd.flags['properties'] = ','.join(FLAGS.gcp_dataproc_property)
  if FLAGS.gcp_dataproc_image:
    create_cmd.flags['image'] = FLAGS.gcp_dataproc_image
  create_cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  create_cmd.Issue()
def _Create(self):
  """Creates the Cloud Memorystore Redis instance."""
  create_cmd = util.GcloudCommand(
      self, 'redis', 'instances', 'create', self.name)
  create_cmd.flags.update({
      'region': self.redis_region,
      'zone': FLAGS.zones[0],
      'network': FLAGS.gce_network_name,
      'tier': self.tier,
      'size': self.size,
      'redis-version': self.redis_version,
      'labels': util.MakeFormattedDefaultTags(),
  })
  create_cmd.Issue(timeout=COMMAND_TIMEOUT)
def SubmitJob(self, jarfile=None, classname=None, pyspark_file=None,
              query_file=None, job_poll_interval=None, job_stdout_file=None,
              job_arguments=None, job_files=None, job_jars=None,
              job_type=None):
  """See base class."""
  gcloud_args = ['jobs', 'submit', job_type]
  if job_type == self.PYSPARK_JOB_TYPE:
    gcloud_args.append(pyspark_file)
  job_cmd = self.DataprocGcloudCommand(*gcloud_args)
  job_cmd.flags['cluster'] = self.cluster_id
  job_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  if classname:
    job_cmd.flags['jars'] = jarfile
    job_cmd.flags['class'] = classname
  elif jarfile:
    job_cmd.flags['jar'] = jarfile
  if query_file:
    job_cmd.flags['file'] = query_file
  if job_files:
    job_cmd.flags['files'] = ','.join(job_files)
  if job_jars:
    job_cmd.flags['jars'] = ','.join(job_jars)
  # Dataproc prints an object describing job execution on stdout, while
  # stderr mixes the job's own stderr and stdout. Raising the driver log
  # level suppresses log chatter so the job output can be separated.
  job_cmd.flags['driver-log-levels'] = 'root={}'.format(FLAGS.dpb_log_level)
  if job_arguments:
    job_cmd.additional_flags = ['--'] + job_arguments
  stdout, _, retcode = job_cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    return {dpb_service.SUCCESS: False}
  stats = self._GetStats(stdout)
  stats[dpb_service.SUCCESS] = True
  return stats
def _Create(self):
  """Creates the persistent disk, tolerating known gcloud failures."""
  create_cmd = util.GcloudCommand(
      self, 'compute', 'disks', 'create', self.name)
  create_cmd.flags.update({
      'size': self.disk_size,
      'type': self.disk_type,
      'labels': util.MakeFormattedDefaultTags(),
  })
  if self.image:
    create_cmd.flags['image'] = self.image
  if self.image_project:
    create_cmd.flags['image-project'] = self.image_project
  _, stderr, retcode = create_cmd.Issue(raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _PostCreate(self):
  """Fetches the cluster's configuration and labels its member disks."""
  describe_cmd = self.DataprocGcloudCommand(
      'clusters', 'describe', self.cluster_id)
  stdout, _, _ = describe_cmd.Issue()
  config = json.loads(stdout)['config']
  instance_names = (config['masterConfig']['instanceNames'] +
                    config['workerConfig']['instanceNames'])
  for disk in instance_names:
    label_cmd = util.GcloudCommand(
        self, 'compute', 'disks', 'add-labels', disk)
    label_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
    label_cmd.flags['zone'] = self.dpb_service_zone
    label_cmd.Issue()
def _Create(self):
  """Creates the Dataproc cluster for the Spark/Hadoop benchmark."""
  if self.cluster_id is None:
    self.cluster_id = 'pkb-' + FLAGS.run_uri
  cmd = util.GcloudCommand(self, 'dataproc', 'clusters', 'create',
                           self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  # The number of worker machines in the cluster
  cmd.flags['num-workers'] = self.spec.worker_count
  # Initialize applications on the dataproc cluster
  # NOTE(review): this branch only logs — no flag is added for the
  # requested applications, unlike the newer _Create which sets
  # optional-components. Confirm whether that is intentional here.
  if self.spec.applications:
    logging.info('Include the requested applications')
  for role in ['worker', 'master']:
    # NOTE(review): worker_group settings are applied to both roles;
    # the master group spec is not consulted.
    # Set machine type
    if self.spec.worker_group.vm_spec.machine_type:
      self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                     self.spec.worker_group.vm_spec.machine_type)
    # Set boot_disk_size
    if self.spec.worker_group.vm_spec.boot_disk_size:
      self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                     self.spec.worker_group.vm_spec.boot_disk_size)
    # Set boot_disk_type
    if self.spec.worker_group.vm_spec.boot_disk_type:
      self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                     self.spec.worker_group.vm_spec.boot_disk_type)
    # Set ssd count
    if self.spec.worker_group.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                     self.spec.worker_group.vm_spec.num_local_ssds)
  self.append_region(cmd, True)
  if self.dpb_dataproc_image_version:
    cmd.flags['image-version'] = self.dpb_dataproc_image_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  # TODO(saksena): Retrieve the cluster create time and hold in a var
  cmd.Issue()
def _Create(self):
  """Creates the Dataproc cluster."""
  cmd = util.GcloudCommand(self, 'dataproc', 'clusters', 'create',
                           self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  # The number of worker machines in the cluster
  cmd.flags['num-workers'] = self.spec.worker_count
  # Initialize applications on the dataproc cluster
  # NOTE(review): this branch only logs — no flag is added for the
  # requested applications. Confirm whether that is intentional.
  if self.spec.applications:
    logging.info('Include the requested applications')
  for role in ['worker', 'master']:
    # NOTE(review): worker_group settings are applied to both roles;
    # the master group spec is not consulted.
    # Set machine type
    if self.spec.worker_group.vm_spec.machine_type:
      self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                     self.spec.worker_group.vm_spec.machine_type)
    # Set boot_disk_size
    if self.spec.worker_group.disk_spec.disk_size:
      size_in_gb = '{}GB'.format(
          str(self.spec.worker_group.disk_spec.disk_size))
      self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role), size_in_gb)
    # Set boot_disk_type
    if self.spec.worker_group.disk_spec.disk_type:
      self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                     self.spec.worker_group.disk_spec.disk_type)
      # Record what kind of storage backs HDFS for reporting.
      self.dpb_hdfs_type = disk_to_hdfs_map[
          self.spec.worker_group.disk_spec.disk_type]
    # Set ssd count
    if self.spec.worker_group.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                     self.spec.worker_group.vm_spec.num_local_ssds)
  # Set zone
  cmd.flags['zone'] = self.dpb_service_zone
  if self.dpb_version != 'latest':
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
  # TODO(saksena): Retrieve the cluster create time and hold in a var
  cmd.Issue()
def _Create(self):
  """Creates the Filestore (NFS) instance, reusing an existing one."""
  logging.info('Creating NFS server %s', self.name)
  volume_arg = 'name={0},capacity={1}'.format(
      self.server_directory.strip('/'), self.disk_spec.disk_size)
  network_arg = 'name={0}'.format(self.network)
  create_args = [
      '--file-share', volume_arg,
      '--network', network_arg,
      '--labels', util.MakeFormattedDefaultTags(),
  ]
  if self.nfs_tier:
    create_args.extend(['--tier', self.nfs_tier])
  try:
    self._NfsCommand('create', *create_args)
  except errors.Error as ex:
    # if this NFS service already exists reuse it
    if not self._Exists():
      raise errors.Resource.RetryableCreationError(
          'Error creating NFS service %s' % self.name, ex)
    logging.info('Reusing existing NFS server %s', self.name)
def _Create(self):
  """Creates the disk, optionally regional with replica zones."""
  create_cmd = util.GcloudCommand(
      self, 'compute', 'disks', 'create', self.name)
  create_cmd.flags['size'] = self.disk_size
  create_cmd.flags['type'] = self.disk_type
  # Provisioned IOPS is only meaningful for pd-extreme disks.
  if self.provisioned_iops and self.disk_type == PD_EXTREME:
    create_cmd.flags['provisioned-iops'] = self.provisioned_iops
  create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  if self.image:
    create_cmd.flags['image'] = self.image
  if self.image_project:
    create_cmd.flags['image-project'] = self.image_project
  if self.replica_zones:
    # Regional disks take a region plus replica zones instead of a zone.
    create_cmd.flags['region'] = self.region
    create_cmd.flags['replica-zones'] = ','.join(self.replica_zones)
    del create_cmd.flags['zone']
  _, stderr, retcode = create_cmd.Issue(raise_on_failure=False)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
def _BuildContext(launcher_vm, booter_template_vm):
  """Returns the context variables for Jinja2 template during rendering.

  Args:
    launcher_vm: the VM that will run the boot-launcher script.
    booter_template_vm: VM whose settings are used as the template for the
        VMs that will be booted.

  Returns:
    dict of template variables; cloud-specific keys are merged in based on
    FLAGS.cloud ('GCP', 'AWS', or 'Azure').
  """
  # Variables common to all clouds.
  context = {
      'boot_machine_type': booter_template_vm.machine_type,
      'cloud': FLAGS.cloud,
      'contact_launcher': FLAGS.vms_contact_launcher,
      'launcher_vm_name': launcher_vm.name,
      'os_type': 'linux' if _IsLinux() else 'windows',
      'server_ip': launcher_vm.internal_ip,
      'server_port': _PORT,
      'start_time_file': _START_TIME_FILE_PATH,
      'timeout': _TIMEOUT_SECONDS,
      'vm_count': FLAGS.boots_per_launcher,
      'zone': launcher_vm.zone,
      # Rendered as a prefix: '' → use public IP, 'no-' → disable it.
      'use_public_ip': '' if FLAGS.use_public_ip else 'no-',
  }
  cloud = FLAGS.cloud
  if cloud == 'GCP':
    context.update({
        'boot_disk_size': booter_template_vm.boot_disk_size,
        'boot_vm_name_prefix': _BOOT_VM_NAME_PREFIX.format(
            launcher_name=launcher_vm.name),
        'image_family': booter_template_vm.image_family,
        'image_project': booter_template_vm.image_project,
        'gcloud_path': FLAGS.gcloud_path,
        'project': FLAGS.project,
        'tags': gcp_util.MakeFormattedDefaultTags(),
    })
  elif cloud == 'AWS':
    # AWS tags also record which launcher created the booted VMs.
    tags = aws_util.MakeDefaultTags()
    tags.update({'launcher_id': launcher_vm.name})
    context.update({
        'group_name': booter_template_vm.placement_group.name,
        'image': booter_template_vm.image,
        'key_name': 'perfkit-key-{0}'.format(FLAGS.run_uri),
        'region': aws_util.GetRegionFromZone(launcher_vm.zone),
        'subnet_id': booter_template_vm.network.subnet.id,
        'tags': aws_util.FormatTagSpecifications('instance', tags),
    })
  elif cloud == 'Azure':
    context.update({
        # Strip the resource-group prefix from the launcher name.
        'boot_vm_name_prefix': launcher_vm.name.split('-', 1)[1],
        'location': launcher_vm.region,
        'image': booter_template_vm.image,
        'storage_sku': booter_template_vm.os_disk.disk_type,
        'resource_group': launcher_vm.resource_group.name,
        'nic': _BOOT_NIC_NAME_PREFIX.format(run_uri=FLAGS.run_uri),
        'password': booter_template_vm.password,
        'start_id': GetAzBootVMStartIdByLauncher(launcher_vm.name),
    })
  return context
def SubmitJob(self, jarfile, classname, job_script=None,
              job_poll_interval=None, job_arguments=None,
              job_stdout_file=None, job_type=spark_service.SPARK_JOB_TYPE):
  """Submits a job to the Dataproc cluster and optionally captures stdout.

  Returns a stats dict with a spark_service.SUCCESS entry. When
  job_stdout_file is given, the job's standard output is extracted from
  Dataproc's mixed stderr stream and written to that file.
  """
  cmd = self.DataprocGcloudCommand('jobs', 'submit', job_type)
  cmd.flags['cluster'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  # If we don't put this here, zone is auotmatically added to the command
  # which breaks dataproc jobs submit
  cmd.flags['zone'] = []
  cmd.additional_flags = []
  if classname and jarfile:
    cmd.flags['jars'] = jarfile
    cmd.flags['class'] = classname
  elif jarfile:
    cmd.flags['jar'] = jarfile
  elif job_script:
    cmd.additional_flags += [job_script]
  # Dataproc gives as stdout an object describing job execution.
  # Its stderr contains a mix of the stderr of the job, and the
  # stdout of the job. We can set the driver log level to FATAL
  # to suppress those messages, and we can then separate, hopefully
  # the job standard out from the log messages.
  cmd.flags['driver-log-levels'] = 'root={}'.format(
      FLAGS.spark_service_log_level)
  if job_arguments:
    cmd.additional_flags += ['--'] + job_arguments
  stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    return {spark_service.SUCCESS: False}
  stats = self._GetStats(stdout)
  stats[spark_service.SUCCESS] = True
  if job_stdout_file:
    with open(job_stdout_file, 'w') as f:
      # Parse Dataproc's stderr: a submission banner, optional Spark
      # progress lines, then the job's stdout, then a terminating
      # 'Job [...]' line.
      lines = stderr.splitlines(True)
      if (not re.match(r'Job \[.*\] submitted.', lines[0]) or
          not re.match(r'Waiting for job output...', lines[1])):
        raise Exception('Dataproc output in unexpected format.')
      i = 2
      if job_type == spark_service.SPARK_JOB_TYPE:
        if not re.match(r'\r', lines[i]):
          raise Exception('Dataproc output in unexpected format.')
        i += 1
        # Eat these status lines. They end in \r, so they overwrite
        # themselves at the console or when you cat a file. But they
        # are part of this string.
        while re.match(r'\[Stage \d+:', lines[i]):
          i += 1
        if not re.match(r' *\r$', lines[i]):
          raise Exception('Dataproc output in unexpected format.')
      # Copy the job's stdout until the closing 'Job [...]' marker.
      while i < len(lines) and not re.match(r'Job \[.*\]', lines[i]):
        f.write(lines[i])
        i += 1
      if i != len(lines) - 1:
        raise Exception('Dataproc output in unexpected format.')
  return stats
def _Create(self):
  """Creates the Dataproc cluster and records its creation time.

  Raises:
    errors.Resource.CreationError: if cluster creation fails.
  """
  cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project
  if self.spec.worker_count:
    # The number of worker machines in the cluster
    cmd.flags['num-workers'] = self.spec.worker_count
  else:
    cmd.flags['single-node'] = True
  # Initialize applications on the dataproc cluster
  if self.spec.applications:
    logging.info('Include the requested applications')
    cmd.flags['optional-components'] = ','.join(self.spec.applications)
  # Enable component gateway for debuggability. Does not impact performance.
  cmd.flags['enable-component-gateway'] = True
  # TODO(pclay): stop ignoring spec.master_group?
  for role in ['worker', 'master']:
    # NOTE(review): worker_group settings are applied to both roles; the
    # master_group spec is ignored (see TODO above).
    # Set machine type
    if self.spec.worker_group.vm_spec.machine_type:
      self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                     self.spec.worker_group.vm_spec.machine_type)
    # Set boot_disk_size
    if self.spec.worker_group.disk_spec.disk_size:
      size_in_gb = '{}GB'.format(
          str(self.spec.worker_group.disk_spec.disk_size))
      self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role), size_in_gb)
    # Set boot_disk_type
    if self.spec.worker_group.disk_spec.disk_type:
      self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                     self.spec.worker_group.disk_spec.disk_type)
      self.dpb_hdfs_type = disk_to_hdfs_map[
          self.spec.worker_group.disk_spec.disk_type]
    # Set ssd count
    if self.spec.worker_group.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                     self.spec.worker_group.vm_spec.num_local_ssds)
      # This will actually be used for storage
      # (overrides the boot-disk-based hdfs type recorded above).
      self.dpb_hdfs_type = 'Local SSD'
  # Set zone
  cmd.flags['zone'] = self.dpb_service_zone
  if self.dpb_version:
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  if FLAGS.dpb_cluster_properties:
    cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties)
  # Ideally DpbServiceSpec would have a network spec, which we would use to
  # resolve the network name. But EMR provisions its own VPC and we are
  # generally happy using pre-existing networks for Dataproc, so just use
  # the underlying flag instead.
  if FLAGS.gce_network_name:
    cmd.flags['network'] = FLAGS.gce_network_name
  metadata = util.GetDefaultTags()
  metadata.update(flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
  cmd.flags['metadata'] = util.FormatTags(metadata)
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  timeout = 900  # 15 min
  stdout, stderr, retcode = cmd.Issue(timeout=timeout,
                                      raise_on_failure=False)
  self._cluster_create_time = self._ParseClusterCreateTime(stdout)
  if retcode:
    # Surface known quota/capacity failures before raising generically.
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
def _GenerateCreateCommand(self, ssh_keys_path):
  """Generates a command to create the VM instance.

  Args:
    ssh_keys_path: string. Path to a file containing the sshKeys metadata.

  Returns:
    GcloudCommand. gcloud command to issue in order to create the VM
    instance.

  Raises:
    errors.Config.InvalidValue: if gce_migrate_on_maintenance is requested
        on a GPU instance (unsupported by GCP).
  """
  args = ['compute', 'instances', 'create', self.name]
  cmd = util.GcloudCommand(self, *args)
  # Prefer an explicit subnet; otherwise attach to the network directly.
  if self.network.subnet_resource is not None:
    cmd.flags['subnet'] = self.network.subnet_resource.name
  else:
    cmd.flags['network'] = self.network.network_resource.name
  if self.image:
    cmd.flags['image'] = self.image
  elif self.image_family:
    cmd.flags['image-family'] = self.image_family
  if self.image_project is not None:
    cmd.flags['image-project'] = self.image_project
  cmd.flags['boot-disk-auto-delete'] = True
  if self.boot_disk_size:
    cmd.flags['boot-disk-size'] = self.boot_disk_size
  if self.boot_disk_type:
    cmd.flags['boot-disk-type'] = self.boot_disk_type
  if self.machine_type is None:
    # No predefined machine type: build a custom machine from vCPU/memory.
    cmd.flags['custom-cpu'] = self.cpus
    cmd.flags['custom-memory'] = '{0}MiB'.format(self.memory_mib)
    if self.min_cpu_platform:
      cmd.flags['min-cpu-platform'] = self.min_cpu_platform
  else:
    cmd.flags['machine-type'] = self.machine_type
    # min-cpu-platform is only honored for N1 machine types here.
    if self.min_cpu_platform and 'n1-' in self.machine_type:
      cmd.flags['min-cpu-platform'] = self.min_cpu_platform
    elif self.min_cpu_platform:
      logging.warning('Cannot set min-cpu-platform for %s',
                      self.machine_type)
  if self.gpu_count and self.machine_type and 'a2-' not in self.machine_type:
    # A2 machine type already has predefined GPU type and count.
    cmd.flags['accelerator'] = GenerateAcceleratorSpecString(
        self.gpu_type, self.gpu_count)
  cmd.flags['tags'] = ','.join(['perfkitbenchmarker'] +
                               (self.gce_tags or []))
  cmd.flags['no-restart-on-failure'] = True
  if self.node_group:
    cmd.flags['node-group'] = self.node_group.name
  if self.gce_shielded_secure_boot:
    cmd.flags['shielded-secure-boot'] = True
  if self.network.placement_group:
    self.metadata.update(
        self.network.placement_group.GetResourceMetadata())
    cmd.flags['resource-policies'] = self.network.placement_group.name
    # Placement policies require TERMINATE maintenance behavior.
    cmd.flags['maintenance-policy'] = 'TERMINATE'
  else:
    self.metadata[
        'placement_group_style'] = placement_group.PLACEMENT_GROUP_NONE
  # sshKeys is always set internally; user-supplied file metadata may add
  # keys but never override internal ones.
  metadata_from_file = {'sshKeys': ssh_keys_path}
  parsed_metadata_from_file = flag_util.ParseKeyValuePairs(
      FLAGS.gcp_instance_metadata_from_file)
  for key, value in six.iteritems(parsed_metadata_from_file):
    if key in metadata_from_file:
      logging.warning(
          'Metadata "%s" is set internally. Cannot be overridden '
          'from command line.', key)
      continue
    metadata_from_file[key] = value
  cmd.flags['metadata-from-file'] = ','.join(
      ['%s=%s' % (k, v) for k, v in six.iteritems(metadata_from_file)])
  # Same precedence rule for inline metadata: internal values win over
  # values supplied via flags.
  metadata = {}
  metadata.update(self.boot_metadata)
  metadata.update(util.GetDefaultTags())
  additional_metadata = {}
  additional_metadata.update(self.vm_metadata)
  additional_metadata.update(
      flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
  for key, value in six.iteritems(additional_metadata):
    if key in metadata:
      logging.warning(
          'Metadata "%s" is set internally. Cannot be overridden '
          'from command line.', key)
      continue
    metadata[key] = value
  if self.preemptible:
    cmd.flags['preemptible'] = True
    # GCS marker object used to detect preemption after the fact.
    preemptible_status_bucket = (
        f'gs://{FLAGS.gcp_preemptible_status_bucket}/{FLAGS.run_uri}/')
    self.preempt_marker = f'{preemptible_status_bucket}{self.name}'
    metadata.update([self._PreemptibleMetadataKeyValue()])
  cmd.flags['metadata'] = util.FormatTags(metadata)
  # TODO(user): If GCE one day supports live migration on GPUs
  # this can be revised.
  if (FLAGS['gce_migrate_on_maintenance'].present and
      FLAGS.gce_migrate_on_maintenance and self.gpu_count):
    raise errors.Config.InvalidValue(
        'Cannot set flag gce_migrate_on_maintenance on instances with GPUs, '
        'as it is not supported by GCP.')
  if not FLAGS.gce_migrate_on_maintenance or self.gpu_count:
    cmd.flags['maintenance-policy'] = 'TERMINATE'
  cmd.flags['local-ssd'] = (
      ['interface={0}'.format(FLAGS.gce_ssd_interface)] *
      self.max_local_disks)
  if FLAGS.gcloud_scopes:
    cmd.flags['scopes'] = ','.join(
        re.split(r'[,; ]', FLAGS.gcloud_scopes))
  cmd.flags['network-tier'] = self.gce_network_tier.upper()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  return cmd
def SubmitJob(self, jarfile=None, classname=None, pyspark_file=None,
              query_file=None, job_poll_interval=None, job_stdout_file=None,
              job_arguments=None, job_files=None, job_jars=None,
              job_type=None, properties=None):
  """See base class.

  Submits a Dataproc Serverless batch, waits for completion, then
  describes the batch to compute pending/run durations.

  Raises:
    dpb_service.JobSubmissionError: if submission or the describe call
        fails.
  """
  assert job_type
  args = ['batches', 'submit', job_type]
  additional_args = []
  if job_type == self.PYSPARK_JOB_TYPE:
    args.append(pyspark_file)
  cmd = self.DataprocGcloudCommand(*args)
  cmd.flags['batch'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()
  job_jars = job_jars or []
  if classname:
    if jarfile:
      # Dataproc does not support both a main class and a main jar so just
      # make the main jar an additional jar instead.
      job_jars.append(jarfile)
    cmd.flags['class'] = classname
  elif jarfile:
    cmd.flags['jar'] = jarfile
  if query_file:
    # NOTE(review): += iterates its operand; this is only correct if
    # query_file is a list of args, not a plain string — confirm callers.
    additional_args += query_file
  if job_files:
    cmd.flags['files'] = ','.join(job_files)
  if job_jars:
    cmd.flags['jars'] = ','.join(job_jars)
  if FLAGS.gce_network_name:
    cmd.flags['network'] = FLAGS.gce_network_name
  if self.dpb_version:
    cmd.flags['version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['container-image'] = FLAGS.gcp_dataproc_image
  all_properties = self.GetJobProperties()
  all_properties.update(properties or {})
  if all_properties:
    # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
    cmd.flags['properties'] = '^@^' + '@'.join(
        '{}={}'.format(k, v) for k, v in all_properties.items())
  if job_arguments:
    additional_args += ['--'] + job_arguments
  cmd.additional_flags = additional_args
  _, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)
  # Fetch the finished batch to extract state-transition timestamps.
  fetch_batch_cmd = self.DataprocGcloudCommand('batches', 'describe',
                                               self.cluster_id)
  stdout, stderr, retcode = fetch_batch_cmd.Issue(timeout=None,
                                                  raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)
  results = json.loads(stdout)
  # Otherwise retcode would not have been 0
  assert results['state'] == 'SUCCEEDED'
  done_time = self._ParseTime(results['stateTime'])
  pending_time = None
  start_time = None
  for state in results['stateHistory']:
    if state['state'] == 'PENDING':
      pending_time = self._ParseTime(state['stateStartTime'])
    elif state['state'] == 'RUNNING':
      start_time = self._ParseTime(state['stateStartTime'])
  assert pending_time and start_time and done_time
  return dpb_service.JobResult(
      run_time=(done_time - start_time).total_seconds(),
      pending_time=(start_time - pending_time).total_seconds())
def SubmitJob(self,
              jarfile=None,
              classname=None,
              pyspark_file=None,
              query_file=None,
              job_poll_interval=None,
              job_stdout_file=None,
              job_arguments=None,
              job_files=None,
              job_jars=None,
              job_type=None,
              properties=None):
  """See base class.

  Submits a job to the Dataproc cluster and parses the job resource that
  gcloud prints on success to compute pending and run durations.
  """
  assert job_type
  gcloud_args = ['jobs', 'submit', job_type]
  if job_type == self.PYSPARK_JOB_TYPE:
    gcloud_args.append(pyspark_file)
  cmd = self.DataprocGcloudCommand(*gcloud_args)
  cmd.flags['cluster'] = self.cluster_id
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()

  extra_jars = job_jars or []
  if classname:
    if jarfile:
      # Dataproc does not support both a main class and a main jar so just
      # make the main jar an additional jar instead.
      extra_jars.append(jarfile)
    cmd.flags['class'] = classname
  elif jarfile:
    cmd.flags['jar'] = jarfile
  if query_file:
    cmd.flags['file'] = query_file
  if job_files:
    cmd.flags['files'] = ','.join(job_files)
  if extra_jars:
    cmd.flags['jars'] = ','.join(extra_jars)

  # Dataproc gives as stdout an object describing job execution.
  # Its stderr contains a mix of the stderr of the job, and the
  # stdout of the job. We set the driver log level to FATAL
  # to suppress those messages, and we can then separate, hopefully
  # the job standard out from the log messages.
  cmd.flags['driver-log-levels'] = 'root=%s' % FLAGS.dpb_log_level

  merged_properties = self.GetJobProperties()
  merged_properties.update(properties or {})
  if merged_properties:
    # For commas: https://cloud.google.com/sdk/gcloud/reference/topic/escaping
    property_pairs = [
        '{}={}'.format(k, v) for k, v in merged_properties.items()
    ]
    cmd.flags['properties'] = '^@^' + '@'.join(property_pairs)

  if job_arguments:
    cmd.additional_flags = ['--'] + job_arguments

  stdout, stderr, retcode = cmd.Issue(timeout=None, raise_on_failure=False)
  if retcode != 0:
    raise dpb_service.JobSubmissionError(stderr)

  job_resource = json.loads(stdout)
  # Otherwise retcode would not have been 0
  assert job_resource['status']['state'] == 'DONE'
  finished_at = GcpDpbDataproc._ParseTime(
      job_resource['status']['stateStartTime'])
  queued_at = None
  started_at = None
  for status in job_resource['statusHistory']:
    if status['state'] == 'PENDING':
      queued_at = GcpDpbDataproc._ParseTime(status['stateStartTime'])
    elif status['state'] == 'RUNNING':
      started_at = GcpDpbDataproc._ParseTime(status['stateStartTime'])
  assert queued_at and started_at and finished_at
  return dpb_service.JobResult(
      run_time=(finished_at - started_at).total_seconds(),
      pending_time=(started_at - queued_at).total_seconds())
def _Create(self):
  """Creates the cluster.

  Builds and issues a `gcloud container clusters create` command from the
  configured VM spec, node counts, and service-account flags.

  Raises:
    errors.Benchmarks.InsufficientCapacityCloudFailure: if the zone has no
        container resources left (ZONE_RESOURCE_POOL_EXHAUSTED).
    errors.Resource.CreationError: on any other gcloud failure.
  """
  cmd = util.GcloudCommand(self, 'container', 'clusters', 'create', self.name)
  cmd.flags['cluster-version'] = self.cluster_version
  if FLAGS.gke_enable_alpha:
    cmd.args.append('--enable-kubernetes-alpha')
    cmd.args.append('--no-enable-autorepair')
    cmd.args.append('--no-enable-autoupgrade')

  user = util.GetDefaultUser()
  if FLAGS.gcp_service_account:
    cmd.flags['service-account'] = FLAGS.gcp_service_account
  # Matches service accounts that either definitely belongs to this project or
  # are a GCP managed service account like the GCE default service account,
  # which we can't tell to which project they belong.
  elif re.match(SERVICE_ACCOUNT_PATTERN, user):
    logging.info('Re-using configured service-account for GKE Cluster: %s',
                 user)
    cmd.flags['service-account'] = user
    self.use_application_default_credentials = False
  else:
    logging.info('Using default GCE service account for GKE cluster')
    cmd.flags['scopes'] = 'cloud-platform'

  if self.vm_config.gpu_count:
    cmd.flags['accelerator'] = (
        gce_virtual_machine.GenerateAcceleratorSpecString(
            self.vm_config.gpu_type, self.vm_config.gpu_count))
  if self.vm_config.min_cpu_platform:
    cmd.flags['min-cpu-platform'] = self.vm_config.min_cpu_platform
  if self.vm_config.boot_disk_size:
    cmd.flags['disk-size'] = self.vm_config.boot_disk_size
  if self.vm_config.boot_disk_type:
    cmd.flags['disk-type'] = self.vm_config.boot_disk_type
  if self.vm_config.max_local_disks:
    # TODO(pclay): Switch to local-ssd-volumes which support NVME when it
    # leaves alpha. See
    # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
    cmd.flags['local-ssd-count'] = self.vm_config.max_local_disks

  if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
    cmd.args.append('--enable-autoscaling')
    cmd.flags['max-nodes'] = self.max_nodes
    cmd.flags['min-nodes'] = self.min_nodes
  cmd.flags['num-nodes'] = self.num_nodes

  if self.vm_config.machine_type is None:
    # No machine type configured: derive a GCE custom machine type.
    cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
        self.vm_config.cpus, self.vm_config.memory_mib)
  else:
    cmd.flags['machine-type'] = self.vm_config.machine_type

  # Fix: compute the formatted default tags once; the original called
  # util.MakeFormattedDefaultTags() twice for identical output.
  tags = util.MakeFormattedDefaultTags()
  cmd.flags['metadata'] = tags
  cmd.flags['labels'] = tags

  # This command needs a long timeout due to the many minutes it
  # can take to provision a large GPU-accelerated GKE cluster.
  _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
  if retcode:
    # Log specific type of failure, if known.
    if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
      # Fix: logging.exception() is only valid inside an `except` block
      # (it appends the active traceback); there is no active exception
      # here, so it logged a spurious "NoneType: None". Use logging.error.
      logging.error('Container resources exhausted: %s', stderr)
      raise errors.Benchmarks.InsufficientCapacityCloudFailure(
          'Container resources exhausted in zone %s: %s' %
          (self.zone, stderr))
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
def _CreateGcloudSqlInstance(self):
  """Provisions the Cloud SQL instance via `gcloud beta sql instances create`.

  Assembles the argument list from the relational DB spec (engine, machine
  shape, HA, and backup settings) and issues the creation command.
  """
  disk_size = self.spec.db_disk_spec.disk_size
  db_zone = self.spec.db_spec.zone
  client_networks = self._GetAuthorizedNetworks([self.client_vm])
  engine_version = self._GetEngineVersionString(self.spec.engine,
                                                self.spec.engine_version)

  create_args = [
      self,
      'beta',
      'sql',
      'instances',
      'create',
      self.instance_id,
      '--quiet',
      '--format=json',
      '--activation-policy=ALWAYS',
      '--assign-ip',
      '--authorized-networks=%s' % client_networks,
      '--zone=%s' % db_zone,
      '--database-version=%s' % engine_version,
      '--storage-size=%d' % disk_size,
      '--labels=%s' % util.MakeFormattedDefaultTags(),
  ]
  if self.spec.engine == relational_db.MYSQL:
    create_args.append('--enable-bin-log')
  if self.spec.engine == relational_db.SQLSERVER:
    # `--root-password` is required when creating SQL Server instances.
    create_args.append('--root-password=%s' % self.spec.database_password)

  if self.spec.db_spec.cpus and self.spec.db_spec.memory:
    # A custom machine shape was requested: validate it and pass the
    # cpu/memory pair explicitly.
    self._ValidateSpec()
    memory_mib = self.spec.db_spec.memory
    cpu_count = self.spec.db_spec.cpus
    self._ValidateMachineType(memory_mib, cpu_count)
    create_args.append('--cpu={}'.format(cpu_count))
    create_args.append('--memory={}MiB'.format(memory_mib))
  elif hasattr(self.spec.db_spec, 'machine_type'):
    create_args.append('--tier=%s' % self.spec.db_spec.machine_type)
  else:
    raise Exception('Unspecified machine type')

  if self.spec.high_availability:
    create_args.append(self._GetHighAvailabilityFlag())

  if self.spec.backup_enabled:
    create_args.append('--backup')
    create_args.append('--backup-start-time=%s' % self.spec.backup_start_time)
  else:
    create_args.append('--no-backup')

  cmd = util.GcloudCommand(*create_args)
  cmd.flags['project'] = self.project
  _, stderr, retcode = cmd.Issue(timeout=CREATION_TIMEOUT)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)