def CreateDataset(self, dataset=None, description=None):
  """Creates a new dataset.

  See https://cloud.google.com/bigquery/docs/tables

  Args:
    dataset: Optional name of the dataset. If none, will be extracted from the
      cluster_identifier.
    description: Optional description of the dataset. Escape double quotes.
  """
  project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
  cmd = [
      'bq', 'mk', '--dataset',
      '--default_table_expiration=%d' % DEFAULT_TABLE_EXPIRATION
  ]
  if description:
    cmd.extend(['--description', '"%s"' % description])
  cmd.append(project_dataset)
  vm_util.IssueCommand(cmd)

  # Apply the default labels to the newly created dataset.
  cmd = ['bq', 'update']
  for key, value in gcp_util.GetDefaultTags().items():
    cmd.extend(['--set_label', f'{key}:{value}'])
  cmd.append(project_dataset)
  vm_util.IssueCommand(cmd)
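# A minimal standalone sketch of the label-update step above, assuming a tags
# dict shaped like what gcp_util.GetDefaultTags() returns. The keys and the
# 'my-project:pkb_dataset' target are hypothetical, not from this module.
example_tags = {'owner': 'perfkit', 'benchmark': 'edw'}
example_cmd = ['bq', 'update']
for example_key, example_value in example_tags.items():
  # Each label becomes its own --set_label key:value argument pair.
  example_cmd.extend(['--set_label', f'{example_key}:{example_value}'])
example_cmd.append('my-project:pkb_dataset')
# example_cmd == ['bq', 'update', '--set_label', 'owner:perfkit',
#                 '--set_label', 'benchmark:edw', 'my-project:pkb_dataset']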
def _Create(self) -> None:
  """Creates the instance, the database, and updates the schema."""
  cmd = util.GcloudCommand(self, 'spanner', 'instances', 'create', self.name)
  cmd.flags['description'] = self._description
  cmd.flags['nodes'] = self.nodes
  cmd.flags['config'] = self._config
  _, _, retcode = cmd.Issue(raise_on_failure=False)
  if retcode != 0:
    logging.error('Create GCP Spanner instance failed.')
    return

  self._UpdateLabels(util.GetDefaultTags())

  cmd = util.GcloudCommand(self, 'spanner', 'databases', 'create',
                           self.database)
  cmd.flags['instance'] = self.name
  _, _, retcode = cmd.Issue(raise_on_failure=False)
  if retcode != 0:
    logging.error('Create GCP Spanner database failed.')
    return

  cmd = util.GcloudCommand(self, 'spanner', 'databases', 'ddl', 'update',
                           self.database)
  cmd.flags['instance'] = self.name
  cmd.flags['ddl'] = self._ddl
  _, _, retcode = cmd.Issue(raise_on_failure=False)
  if retcode != 0:
    logging.error('Update GCP Spanner database schema failed.')
  else:
    logging.info('Created GCP Spanner instance and database.')
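# Hedged sketch of the CLI the first GcloudCommand above resolves to. The
# instance name, description, node count, and config values are hypothetical,
# and the exact flag rendering of util.GcloudCommand is an assumption.
sketch_args = [
    'gcloud', 'spanner', 'instances', 'create', 'pkb-spanner-instance',
    '--description=PKB test instance', '--nodes=1',
    '--config=regional-us-central1',
]
print(' '.join(sketch_args))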
def MakeBucket(self, bucket, raise_on_failure=True):
  """Creates a GCS bucket and applies the default labels to it."""
  command = ['gsutil', 'mb']
  if self.location:
    command.extend(['-l', self.location])
  if self.location and '-' in self.location:
    # Regional locations contain a dash (e.g. us-central1) and must use the
    # 'regional' storage class.
    command.extend(['-c', 'regional'])
  elif FLAGS.object_storage_storage_class is not None:
    command.extend(['-c', FLAGS.object_storage_storage_class])
  if FLAGS.project:
    command.extend(['-p', FLAGS.project])
  command.extend(['gs://%s' % bucket])
  _, stderr, ret_code = vm_util.IssueCommand(command, raise_on_failure=False)
  if ret_code and raise_on_failure:
    raise errors.Benchmarks.BucketCreationError(stderr)

  command = ['gsutil', 'label', 'ch']
  for key, value in util.GetDefaultTags().items():
    command.extend(['-l', f'{key}:{value}'])
  command.extend([f'gs://{bucket}'])
  _, stderr, ret_code = vm_util.IssueCommand(command, raise_on_failure=False)
  if ret_code and raise_on_failure:
    raise errors.Benchmarks.BucketCreationError(stderr)
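# A minimal sketch of the storage-class decision above: regional locations
# contain a dash ('us-central1'), multi-region locations do not ('us'). The
# helper name here is hypothetical; the real code inlines this logic.
def _storage_class_args(location, storage_class=None):
  if location and '-' in location:  # regional location
    return ['-c', 'regional']
  if storage_class is not None:
    return ['-c', storage_class]
  return []

assert _storage_class_args('us-central1') == ['-c', 'regional']
assert _storage_class_args('us', 'NEARLINE') == ['-c', 'NEARLINE']
assert _storage_class_args(None) == []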
def _Create(self):
  """Creates the instance."""
  cmd = _GetBigtableGcloudCommand(self, 'bigtable', 'instances', 'create',
                                  self.name)
  cmd.flags['display-name'] = self.name
  cmd.flags['cluster-storage-type'] = self.storage_type
  cmd.flags['project'] = self.project
  cmd.flags['cluster-config'] = self._BuildClusterConfigs()
  logging.info('Creating instance %s.', self.name)

  _, stderr, _ = cmd.Issue()
  if 'Insufficient node quota' in stderr:
    raise errors.Benchmarks.QuotaFailure(
        f'Insufficient node quota in project {self.project} '
        f'and zone {self.zone}')

  self._UpdateLabels(util.GetDefaultTags())

  if self.multicluster_routing:
    cmd = _GetBigtableGcloudCommand(
        self, 'bigtable', 'app-profiles', 'update', 'default')
    cmd.flags['instance'] = self.name
    cmd.flags['route-any'] = True
    cmd.flags['force'] = True
    cmd.Issue()
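# Hedged sketch of the app-profile update that the multicluster_routing
# branch issues, written as a plain argument list. The instance name is
# hypothetical, and the real code builds this via _GetBigtableGcloudCommand
# rather than a literal list.
routing_cmd = [
    'gcloud', 'bigtable', 'app-profiles', 'update', 'default',
    '--instance=pkb-bigtable', '--route-any', '--force',
]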
def LoadDataset(self,
                source_bucket,
                tables,
                schema_dir,
                dataset=None,
                append=True,
                skip_header_row=True,
                field_delimiter=','):
  """Loads all tables in a dataset to a database from CSV object storage.

  See https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv

  Args:
    source_bucket: Name of the bucket to load the data from. Should already
      exist. Each table must have its own subfolder in the bucket named after
      the table, containing one or more csv files that make up the table data.
    tables: List of table names to load.
    schema_dir: GCS directory containing json schemas of all tables to load.
    dataset: Optional name of the dataset. If none, will be extracted from the
      cluster_identifier.
    append: If True, appends loaded data to the existing set. If False,
      replaces the existing data (if any).
    skip_header_row: If True, skips the first row of data being loaded.
    field_delimiter: The separator for fields in the CSV file.
  """
  project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
  for table in tables:
    schema_path = schema_dir + table + '.json'
    local_schema = './%s.json' % table
    vm_util.IssueCommand(['gsutil', 'cp', schema_path, local_schema])
    cmd = [
        'bq', 'load', '--noreplace' if append else '--replace',
        '--source_format=CSV',
        '--field_delimiter=%s' % field_delimiter,
        '--skip_leading_rows=%d' % (1 if skip_header_row else 0),
        '%s.%s' % (project_dataset, table),
        'gs://%s/%s/*.csv' % (source_bucket, table), local_schema
    ]
    _, stderr, retcode = vm_util.IssueCommand(cmd, raise_on_failure=False)
    if retcode:
      logging.warning('Loading table %s failed. stderr: %s, retcode: %s',
                      table, stderr, retcode)

    cmd = ['bq', 'update']
    for key, value in gcp_util.GetDefaultTags().items():
      cmd.extend(['--set_label', f'{key}:{value}'])
    cmd.append(f'{project_dataset}.{table}')
    vm_util.IssueCommand(cmd)
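# A minimal sketch of one per-table load command built by the loop above,
# with hypothetical bucket, project, dataset, and table names and the
# default argument values (append=True, skip_header_row=True):
sketch_table = 'lineitem'
load_cmd = [
    'bq', 'load', '--noreplace', '--source_format=CSV',
    '--field_delimiter=,', '--skip_leading_rows=1',
    'my-project:pkb_dataset.%s' % sketch_table,
    'gs://source-bucket/%s/*.csv' % sketch_table,
    './%s.json' % sketch_table,
]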
def _GenerateCreateCommand(self, ssh_keys_path):
  """Generates a command to create the VM instance.

  Args:
    ssh_keys_path: string. Path to a file containing the sshKeys metadata.

  Returns:
    GcloudCommand. gcloud command to issue in order to create the VM
    instance.
  """
  args = ['compute', 'instances', 'create', self.name]

  cmd = util.GcloudCommand(self, *args)
  if self.network.subnet_resource is not None:
    cmd.flags['subnet'] = self.network.subnet_resource.name
  else:
    cmd.flags['network'] = self.network.network_resource.name
  if self.image:
    cmd.flags['image'] = self.image
  elif self.image_family:
    cmd.flags['image-family'] = self.image_family
  if self.image_project is not None:
    cmd.flags['image-project'] = self.image_project
  cmd.flags['boot-disk-auto-delete'] = True
  if self.boot_disk_size:
    cmd.flags['boot-disk-size'] = self.boot_disk_size
  if self.boot_disk_type:
    cmd.flags['boot-disk-type'] = self.boot_disk_type
  if self.machine_type is None:
    cmd.flags['custom-cpu'] = self.cpus
    cmd.flags['custom-memory'] = '{0}MiB'.format(self.memory_mib)
    if self.min_cpu_platform:
      cmd.flags['min-cpu-platform'] = self.min_cpu_platform
  else:
    cmd.flags['machine-type'] = self.machine_type
    if self.min_cpu_platform and 'n1-' in self.machine_type:
      cmd.flags['min-cpu-platform'] = self.min_cpu_platform
    elif self.min_cpu_platform:
      logging.warning('Cannot set min-cpu-platform for %s', self.machine_type)
  if self.gpu_count and self.machine_type and 'a2-' not in self.machine_type:
    # A2 machine type already has predefined GPU type and count.
    cmd.flags['accelerator'] = GenerateAcceleratorSpecString(
        self.gpu_type, self.gpu_count)
  cmd.flags['tags'] = ','.join(['perfkitbenchmarker'] + (self.gce_tags or []))
  cmd.flags['no-restart-on-failure'] = True
  if self.node_group:
    cmd.flags['node-group'] = self.node_group.name
  if self.gce_shielded_secure_boot:
    cmd.flags['shielded-secure-boot'] = True

  if self.network.placement_group:
    self.metadata.update(self.network.placement_group.GetResourceMetadata())
    cmd.flags['resource-policies'] = self.network.placement_group.name
    cmd.flags['maintenance-policy'] = 'TERMINATE'
  else:
    self.metadata[
        'placement_group_style'] = placement_group.PLACEMENT_GROUP_NONE

  metadata_from_file = {'sshKeys': ssh_keys_path}
  parsed_metadata_from_file = flag_util.ParseKeyValuePairs(
      FLAGS.gcp_instance_metadata_from_file)
  for key, value in six.iteritems(parsed_metadata_from_file):
    if key in metadata_from_file:
      logging.warning(
          'Metadata "%s" is set internally. Cannot be overridden '
          'from command line.', key)
      continue
    metadata_from_file[key] = value
  cmd.flags['metadata-from-file'] = ','.join(
      ['%s=%s' % (k, v) for k, v in six.iteritems(metadata_from_file)])

  metadata = {}
  metadata.update(self.boot_metadata)
  metadata.update(util.GetDefaultTags())

  additional_metadata = {}
  additional_metadata.update(self.vm_metadata)
  additional_metadata.update(
      flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
  for key, value in six.iteritems(additional_metadata):
    if key in metadata:
      logging.warning(
          'Metadata "%s" is set internally. Cannot be overridden '
          'from command line.', key)
      continue
    metadata[key] = value

  if self.preemptible:
    cmd.flags['preemptible'] = True
    preemptible_status_bucket = (
        f'gs://{FLAGS.gcp_preemptible_status_bucket}/{FLAGS.run_uri}/')
    self.preempt_marker = f'{preemptible_status_bucket}{self.name}'
    metadata.update([self._PreemptibleMetadataKeyValue()])

  cmd.flags['metadata'] = util.FormatTags(metadata)

  # TODO(user): If GCE one day supports live migration on GPUs,
  # this can be revised.
  if (FLAGS['gce_migrate_on_maintenance'].present and
      FLAGS.gce_migrate_on_maintenance and self.gpu_count):
    raise errors.Config.InvalidValue(
        'Cannot set flag gce_migrate_on_maintenance on instances with GPUs, '
        'as it is not supported by GCP.')
  if not FLAGS.gce_migrate_on_maintenance or self.gpu_count:
    cmd.flags['maintenance-policy'] = 'TERMINATE'
  cmd.flags['local-ssd'] = (
      ['interface={0}'.format(FLAGS.gce_ssd_interface)] *
      self.max_local_disks)
  if FLAGS.gcloud_scopes:
    cmd.flags['scopes'] = ','.join(re.split(r'[,; ]', FLAGS.gcloud_scopes))
  cmd.flags['network-tier'] = self.gce_network_tier.upper()
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()

  return cmd
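# A minimal sketch of the metadata precedence enforced above: keys set
# internally (like sshKeys) win over user-supplied flag values, which are
# skipped with a warning. The keys and values here are hypothetical.
internal_metadata = {'sshKeys': '/tmp/perfkit-keys'}
user_metadata = {'sshKeys': '/home/user/keys', 'team': 'perf'}
for meta_key, meta_value in user_metadata.items():
  if meta_key in internal_metadata:
    continue  # internal value kept; the user override is ignored
  internal_metadata[meta_key] = meta_value
assert internal_metadata == {'sshKeys': '/tmp/perfkit-keys', 'team': 'perf'}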
def _UpdateTimeout(self, timeout_minutes: int) -> None:
  """See base class."""
  labels = util.GetDefaultTags(timeout_minutes)
  self._UpdateLabels(labels)
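# Hedged sketch of what a default-tags helper like util.GetDefaultTags might
# return when given a timeout. The exact keys and timestamp format are
# assumptions, not the real helper's contract.
import datetime

def _default_tags_sketch(timeout_minutes=None):
  tags = {'owner': 'perfkit'}  # hypothetical base tag
  if timeout_minutes:
    expiry = (datetime.datetime.utcnow() +
              datetime.timedelta(minutes=timeout_minutes))
    tags['timeout_utc'] = expiry.strftime('%Y-%m-%dt%H-%M-%S')
  return tags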
def _Create(self):
  """Creates the cluster."""
  cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
  if self.project is not None:
    cmd.flags['project'] = self.project

  if self.spec.worker_count:
    # The number of worker machines in the cluster
    cmd.flags['num-workers'] = self.spec.worker_count
  else:
    cmd.flags['single-node'] = True

  # Initialize applications on the dataproc cluster
  if self.spec.applications:
    logging.info('Include the requested applications')
    cmd.flags['optional-components'] = ','.join(self.spec.applications)

  # Enable component gateway for debuggability. Does not impact performance.
  cmd.flags['enable-component-gateway'] = True

  # TODO(pclay): stop ignoring spec.master_group?
  for role in ['worker', 'master']:
    # Set machine type
    if self.spec.worker_group.vm_spec.machine_type:
      self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                     self.spec.worker_group.vm_spec.machine_type)
    # Set boot_disk_size
    if self.spec.worker_group.disk_spec.disk_size:
      size_in_gb = '{}GB'.format(
          str(self.spec.worker_group.disk_spec.disk_size))
      self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role), size_in_gb)
    # Set boot_disk_type
    if self.spec.worker_group.disk_spec.disk_type:
      self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                     self.spec.worker_group.disk_spec.disk_type)
      self.dpb_hdfs_type = disk_to_hdfs_map[
          self.spec.worker_group.disk_spec.disk_type]
    # Set ssd count
    if self.spec.worker_group.vm_spec.num_local_ssds:
      self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                     self.spec.worker_group.vm_spec.num_local_ssds)
      # This will actually be used for storage
      self.dpb_hdfs_type = 'Local SSD'

  # Set zone
  cmd.flags['zone'] = self.dpb_service_zone

  if self.dpb_version:
    cmd.flags['image-version'] = self.dpb_version
  if FLAGS.gcp_dataproc_image:
    cmd.flags['image'] = FLAGS.gcp_dataproc_image
  if FLAGS.dpb_cluster_properties:
    cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties)

  # Ideally DpbServiceSpec would have a network spec, which we would use to
  # resolve the name. But because EMR provisions its own VPC and we are
  # generally happy using pre-existing networks for Dataproc, just use the
  # underlying flag instead.
  if FLAGS.gce_network_name:
    cmd.flags['network'] = FLAGS.gce_network_name

  metadata = util.GetDefaultTags()
  metadata.update(flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
  cmd.flags['metadata'] = util.FormatTags(metadata)
  cmd.flags['labels'] = util.MakeFormattedDefaultTags()

  timeout = 900  # 15 min
  stdout, stderr, retcode = cmd.Issue(timeout=timeout,
                                      raise_on_failure=False)
  self._cluster_create_time = self._ParseClusterCreateTime(stdout)
  if retcode:
    util.CheckGcloudResponseKnownFailures(stderr, retcode)
    raise errors.Resource.CreationError(stderr)
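# A minimal sketch of the key=value metadata join that a formatter like
# util.FormatTags presumably performs (the exact helper behavior is an
# assumption; the tag keys and values here are hypothetical):
dataproc_metadata = {'owner': 'perfkit', 'benchmark': 'dpb'}
formatted_metadata = ','.join(
    f'{k}={v}' for k, v in sorted(dataproc_metadata.items()))
assert formatted_metadata == 'benchmark=dpb,owner=perfkit'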