def CreateDataset(self, dataset=None, description=None):
    """Creates a new dataset.

    See https://cloud.google.com/bigquery/docs/tables

    Args:
      dataset: Optional name of the dataset. If none, will be extracted from the
        cluster_identifier.
      description: Optional description of the dataset. Double quotes must be
        escaped.
    """
    project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
    cmd = [
        'bq', 'mk', '--dataset',
        '--default_table_expiration=%d' % DEFAULT_TABLE_EXPIRATION
    ]
    if description:
      cmd.extend(['--description', '"%s"' % description])
    cmd.append(project_dataset)
    vm_util.IssueCommand(cmd)

    cmd = ['bq', 'update']
    for key, value in gcp_util.GetDefaultTags().items():
      cmd.extend(['--set_label', f'{key}:{value}'])
    cmd.append(project_dataset)
    vm_util.IssueCommand(cmd)
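The same two-step pattern (create the dataset, then attach one --set_label flag per key/value pair) can be sketched outside PerfKitBenchmarker with plain subprocess calls. The project, dataset, and label values below are placeholders, and an installed, authenticated bq CLI is assumed.

import subprocess


def create_labeled_dataset(project_dataset, description, labels,
                           expiration_secs=3600):
  """Sketch: create a BigQuery dataset, then label it with `bq update`."""
  # Create the dataset with a default table expiration (in seconds).
  mk_cmd = ['bq', 'mk', '--dataset',
            '--default_table_expiration=%d' % expiration_secs]
  if description:
    mk_cmd.extend(['--description', description])
  mk_cmd.append(project_dataset)
  subprocess.run(mk_cmd, check=True)

  # Attach one --set_label flag per key/value pair.
  update_cmd = ['bq', 'update']
  for key, value in labels.items():
    update_cmd.extend(['--set_label', f'{key}:{value}'])
  update_cmd.append(project_dataset)
  subprocess.run(update_cmd, check=True)


# Hypothetical usage; the dataset name and labels are placeholders.
# create_labeled_dataset('my-project:pkb_dataset', 'benchmark data',
#                        {'owner': 'perfkit'})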
    def _Create(self) -> None:
        """Creates the instance, the database, and update the schema."""
        cmd = util.GcloudCommand(self, 'spanner', 'instances', 'create',
                                 self.name)
        cmd.flags['description'] = self._description
        cmd.flags['nodes'] = self.nodes
        cmd.flags['config'] = self._config
        _, _, retcode = cmd.Issue(raise_on_failure=False)
        if retcode != 0:
            logging.error('Create GCP Spanner instance failed.')
            return

        self._UpdateLabels(util.GetDefaultTags())

        cmd = util.GcloudCommand(self, 'spanner', 'databases', 'create',
                                 self.database)
        cmd.flags['instance'] = self.name
        _, _, retcode = cmd.Issue(raise_on_failure=False)
        if retcode != 0:
            logging.error('Create GCP Spanner database failed.')
            return

        cmd = util.GcloudCommand(self, 'spanner', 'databases', 'ddl', 'update',
                                 self.database)
        cmd.flags['instance'] = self.name
        cmd.flags['ddl'] = self._ddl
        _, _, retcode = cmd.Issue(raise_on_failure=False)
        if retcode != 0:
            logging.error('Update GCP Spanner database schema failed.')
        else:
            logging.info('Created GCP Spanner instance and database.')
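A minimal standalone sketch of the same sequence (instance, then database, then a DDL update) using direct gcloud calls instead of util.GcloudCommand. The instance name, config, and DDL are placeholders, and an installed, authenticated gcloud CLI is assumed.

import subprocess


def create_spanner_database(instance, database, config, nodes, ddl):
  """Sketch: instance -> database -> schema, mirroring the order above."""
  # 1. Create the instance in the given instance config with N nodes.
  subprocess.run(['gcloud', 'spanner', 'instances', 'create', instance,
                  '--config', config, '--description', 'pkb sketch',
                  '--nodes', str(nodes)], check=True)
  # 2. Create the database inside that instance.
  subprocess.run(['gcloud', 'spanner', 'databases', 'create', database,
                  '--instance', instance], check=True)
  # 3. Apply the schema with a DDL update.
  subprocess.run(['gcloud', 'spanner', 'databases', 'ddl', 'update', database,
                  '--instance', instance, '--ddl', ddl], check=True)


# Hypothetical usage; all names are placeholders.
# create_spanner_database('pkb-instance', 'pkb-db', 'regional-us-central1', 1,
#                         'CREATE TABLE t (id INT64) PRIMARY KEY (id)')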
    def MakeBucket(self, bucket, raise_on_failure=True):
        """Creates a GCS bucket and applies the default labels to it."""
        command = ['gsutil', 'mb']
        if self.location:
            command.extend(['-l', self.location])
        if self.location and '-' in self.location:
            # A location containing '-' names a single region, so create a
            # regional bucket.
            command.extend(['-c', 'regional'])
        elif FLAGS.object_storage_storage_class is not None:
            command.extend(['-c', FLAGS.object_storage_storage_class])
        if FLAGS.project:
            command.extend(['-p', FLAGS.project])
        command.extend(['gs://%s' % bucket])

        _, stderr, ret_code = vm_util.IssueCommand(command,
                                                   raise_on_failure=False)
        if ret_code and raise_on_failure:
            raise errors.Benchmarks.BucketCreationError(stderr)

        command = ['gsutil', 'label', 'ch']
        for key, value in util.GetDefaultTags().items():
            command.extend(['-l', f'{key}:{value}'])
        command.extend([f'gs://{bucket}'])
        _, stderr, ret_code = vm_util.IssueCommand(command,
                                                   raise_on_failure=False)
        if ret_code and raise_on_failure:
            raise errors.Benchmarks.BucketCreationError(stderr)
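The create-then-label flow above can also be sketched with plain subprocess calls; the bucket name, location, and labels below are placeholders, and an installed, authenticated gsutil is assumed.

import subprocess


def make_labeled_bucket(bucket, location, labels):
  """Sketch: create a GCS bucket, then attach labels with `gsutil label ch`."""
  mb_cmd = ['gsutil', 'mb']
  if location:
    mb_cmd.extend(['-l', location])
    if '-' in location:
      # A single region (e.g. us-central1) rather than a multi-region.
      mb_cmd.extend(['-c', 'regional'])
  mb_cmd.append(f'gs://{bucket}')
  subprocess.run(mb_cmd, check=True)

  label_cmd = ['gsutil', 'label', 'ch']
  for key, value in labels.items():
    label_cmd.extend(['-l', f'{key}:{value}'])  # one -l flag per label
  label_cmd.append(f'gs://{bucket}')
  subprocess.run(label_cmd, check=True)


# Hypothetical usage; all values are placeholders.
# make_labeled_bucket('pkb-example-bucket', 'us-central1', {'owner': 'perfkit'})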
  def _Create(self):
    """Creates the instance."""
    cmd = _GetBigtableGcloudCommand(self, 'bigtable', 'instances', 'create',
                                    self.name)
    cmd.flags['display-name'] = self.name
    cmd.flags['cluster-storage-type'] = self.storage_type
    cmd.flags['project'] = self.project
    cmd.flags['cluster-config'] = self._BuildClusterConfigs()

    logging.info('Creating instance %s.', self.name)

    _, stderr, _ = cmd.Issue()
    if 'Insufficient node quota' in stderr:
      raise errors.Benchmarks.QuotaFailure(
          f'Insufficient node quota in project {self.project} '
          f'and zone {self.zone}')

    self._UpdateLabels(util.GetDefaultTags())

    if self.multicluster_routing:
      cmd = _GetBigtableGcloudCommand(
          self, 'bigtable', 'app-profiles', 'update', 'default')
      cmd.flags['instance'] = self.name
      cmd.flags['route-any'] = True
      cmd.flags['force'] = True
      cmd.Issue()
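The value returned by self._BuildClusterConfigs() is not shown here; a minimal sketch of what the resulting gcloud invocation could look like for a single SSD cluster follows, assuming the documented id/zone/nodes key form of --cluster-config. Instance name, zone, and node count are placeholders.

import subprocess

# Sketch only: instance name, zone, and node count are placeholders.
instance = 'pkb-bigtable'
subprocess.run(['gcloud', 'bigtable', 'instances', 'create', instance,
                '--display-name', instance,
                '--cluster-storage-type', 'ssd',
                '--cluster-config',
                'id=pkb-bigtable-0,zone=us-central1-b,nodes=3'], check=True)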
    def LoadDataset(self,
                    source_bucket,
                    tables,
                    schema_dir,
                    dataset=None,
                    append=True,
                    skip_header_row=True,
                    field_delimiter=','):
        """Load all tables in a dataset to a database from CSV object storage.

    See https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv

    Args:
      source_bucket: Name of the bucket to load the data from. Should already
        exist. Each table must have its own subfolder in the bucket named after
        the table, containing one or more csv files that make up the table data.
      tables: List of table names to load.
      schema_dir: GCS directory containing json schemas of all tables to load.
      dataset: Optional name of the dataset. If none, will be extracted from the
        cluster_identifier.
      append: If True, appends loaded data to the existing set. If False,
        replaces the existing data (if any).
      skip_header_row: If True, skips the first row of data being loaded.
      field_delimiter: The separator for fields in the CSV file.
    """
        project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
        for table in tables:
            schema_path = schema_dir + table + '.json'
            local_schema = './%s.json' % table
            vm_util.IssueCommand(['gsutil', 'cp', schema_path, local_schema])
            cmd = [
                'bq', 'load', '--noreplace' if append else '--replace',
                '--source_format=CSV',
                '--field_delimiter=%s' % field_delimiter,
                '--skip_leading_rows=%d' % (1 if skip_header_row else 0),
                '%s.%s' % (project_dataset, table),
                'gs://%s/%s/*.csv' % (source_bucket, table), local_schema
            ]
            _, stderr, retcode = vm_util.IssueCommand(cmd,
                                                      raise_on_failure=False)
            if retcode:
                logging.warning(
                    'Loading table %s failed. stderr: %s, retcode: %s', table,
                    stderr, retcode)

            cmd = ['bq', 'update']
            for key, value in gcp_util.GetDefaultTags().items():
                cmd.extend(['--set_label', f'{key}:{value}'])
            cmd.append(f'{project_dataset}.{table}')
            vm_util.IssueCommand(cmd)
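For reference, a single `bq load` call as assembled in the loop above might look like the following; the bucket, dataset, and table names are placeholders, and the schema file is assumed to have been copied locally first.

import subprocess

table = 'lineitem'
project_dataset = 'my-project:tpch'
subprocess.run([
    'bq', 'load', '--noreplace',       # append to any existing data
    '--source_format=CSV',
    '--field_delimiter=,',
    '--skip_leading_rows=1',           # skip the CSV header row
    f'{project_dataset}.{table}',      # destination table
    f'gs://my-bucket/{table}/*.csv',   # one subfolder of CSVs per table
    f'./{table}.json',                 # local copy of the table schema
], check=True)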
    def _GenerateCreateCommand(self, ssh_keys_path):
        """Generates a command to create the VM instance.

    Args:
      ssh_keys_path: string. Path to a file containing the sshKeys metadata.

    Returns:
      GcloudCommand. gcloud command to issue in order to create the VM instance.
    """
        args = ['compute', 'instances', 'create', self.name]

        cmd = util.GcloudCommand(self, *args)
        if self.network.subnet_resource is not None:
            cmd.flags['subnet'] = self.network.subnet_resource.name
        else:
            cmd.flags['network'] = self.network.network_resource.name
        if self.image:
            cmd.flags['image'] = self.image
        elif self.image_family:
            cmd.flags['image-family'] = self.image_family
        if self.image_project is not None:
            cmd.flags['image-project'] = self.image_project
        cmd.flags['boot-disk-auto-delete'] = True
        if self.boot_disk_size:
            cmd.flags['boot-disk-size'] = self.boot_disk_size
        if self.boot_disk_type:
            cmd.flags['boot-disk-type'] = self.boot_disk_type
        if self.machine_type is None:
            cmd.flags['custom-cpu'] = self.cpus
            cmd.flags['custom-memory'] = '{0}MiB'.format(self.memory_mib)
            if self.min_cpu_platform:
                cmd.flags['min-cpu-platform'] = self.min_cpu_platform
        else:
            cmd.flags['machine-type'] = self.machine_type
            if self.min_cpu_platform and 'n1-' in self.machine_type:
                cmd.flags['min-cpu-platform'] = self.min_cpu_platform
            elif self.min_cpu_platform:
                logging.warning('Cannot set min-cpu-platform for %s',
                                self.machine_type)
        if self.gpu_count and self.machine_type and 'a2-' not in self.machine_type:
            # A2 machine type already has predefined GPU type and count.
            cmd.flags['accelerator'] = GenerateAcceleratorSpecString(
                self.gpu_type, self.gpu_count)
        cmd.flags['tags'] = ','.join(['perfkitbenchmarker'] +
                                     (self.gce_tags or []))
        cmd.flags['no-restart-on-failure'] = True
        if self.node_group:
            cmd.flags['node-group'] = self.node_group.name
        if self.gce_shielded_secure_boot:
            cmd.flags['shielded-secure-boot'] = True

        if self.network.placement_group:
            self.metadata.update(
                self.network.placement_group.GetResourceMetadata())
            cmd.flags['resource-policies'] = self.network.placement_group.name
            cmd.flags['maintenance-policy'] = 'TERMINATE'
        else:
            self.metadata[
                'placement_group_style'] = placement_group.PLACEMENT_GROUP_NONE

        metadata_from_file = {'sshKeys': ssh_keys_path}
        parsed_metadata_from_file = flag_util.ParseKeyValuePairs(
            FLAGS.gcp_instance_metadata_from_file)
        for key, value in six.iteritems(parsed_metadata_from_file):
            if key in metadata_from_file:
                logging.warning(
                    'Metadata "%s" is set internally. Cannot be overridden '
                    'from command line.', key)
                continue
            metadata_from_file[key] = value
        cmd.flags['metadata-from-file'] = ','.join(
            ['%s=%s' % (k, v) for k, v in six.iteritems(metadata_from_file)])

        metadata = {}
        metadata.update(self.boot_metadata)
        metadata.update(util.GetDefaultTags())

        additional_metadata = {}
        additional_metadata.update(self.vm_metadata)
        additional_metadata.update(
            flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))

        for key, value in six.iteritems(additional_metadata):
            if key in metadata:
                logging.warning(
                    'Metadata "%s" is set internally. Cannot be overridden '
                    'from command line.', key)
                continue
            metadata[key] = value

        if self.preemptible:
            cmd.flags['preemptible'] = True
            preemptible_status_bucket = (
                f'gs://{FLAGS.gcp_preemptible_status_bucket}/{FLAGS.run_uri}/')
            self.preempt_marker = f'{preemptible_status_bucket}{self.name}'
            metadata.update([self._PreemptibleMetadataKeyValue()])

        cmd.flags['metadata'] = util.FormatTags(metadata)

        # TODO(user): If GCE one day supports live migration on GPUs,
        # this can be revised.
        if (FLAGS['gce_migrate_on_maintenance'].present
                and FLAGS.gce_migrate_on_maintenance and self.gpu_count):
            raise errors.Config.InvalidValue(
                'Cannot set flag gce_migrate_on_maintenance on instances with GPUs, '
                'as it is not supported by GCP.')
        if not FLAGS.gce_migrate_on_maintenance or self.gpu_count:
            cmd.flags['maintenance-policy'] = 'TERMINATE'
        cmd.flags['local-ssd'] = (
            ['interface={0}'.format(FLAGS.gce_ssd_interface)] *
            self.max_local_disks)
        if FLAGS.gcloud_scopes:
            cmd.flags['scopes'] = ','.join(
                re.split(r'[,; ]', FLAGS.gcloud_scopes))
        cmd.flags['network-tier'] = self.gce_network_tier.upper()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        return cmd
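The metadata handling above boils down to a merge in which internally set keys win, followed by formatting into the comma-separated key=value form gcloud expects. A small self-contained sketch (all names are illustrative):

def merge_metadata(internal, user_supplied):
  """Sketch of the merge above: internal keys cannot be overridden."""
  merged = dict(internal)
  for key, value in user_supplied.items():
    if key in merged:
      continue  # mirrors the logged warning above
    merged[key] = value
  # gcloud --metadata / --metadata-from-file take comma-joined key=value pairs.
  return ','.join(f'{k}={v}' for k, v in merged.items())


# merge_metadata({'sshKeys': '/tmp/keys'}, {'sshKeys': 'other', 'owner': 'me'})
# -> 'sshKeys=/tmp/keys,owner=me'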
  def _UpdateTimeout(self, timeout_minutes: int) -> None:
    """See base class."""
    labels = util.GetDefaultTags(timeout_minutes)
    self._UpdateLabels(labels)
    def _Create(self):
        """Creates the cluster."""
        cmd = self.DataprocGcloudCommand('clusters', 'create', self.cluster_id)
        if self.project is not None:
            cmd.flags['project'] = self.project

        if self.spec.worker_count:
            # The number of worker machines in the cluster
            cmd.flags['num-workers'] = self.spec.worker_count
        else:
            cmd.flags['single-node'] = True

        # Initialize applications on the dataproc cluster
        if self.spec.applications:
            logging.info('Include the requested applications')
            cmd.flags['optional-components'] = ','.join(self.spec.applications)

        # Enable component gateway for debuggability. Does not impact performance.
        cmd.flags['enable-component-gateway'] = True

        # TODO(pclay): stop ignoring spec.master_group?
        for role in ['worker', 'master']:
            # Set machine type
            if self.spec.worker_group.vm_spec.machine_type:
                self._AddToCmd(cmd, '{0}-machine-type'.format(role),
                               self.spec.worker_group.vm_spec.machine_type)
            # Set boot_disk_size
            if self.spec.worker_group.disk_spec.disk_size:
                size_in_gb = '{}GB'.format(
                    str(self.spec.worker_group.disk_spec.disk_size))
                self._AddToCmd(cmd, '{0}-boot-disk-size'.format(role),
                               size_in_gb)
            # Set boot_disk_type
            if self.spec.worker_group.disk_spec.disk_type:
                self._AddToCmd(cmd, '{0}-boot-disk-type'.format(role),
                               self.spec.worker_group.disk_spec.disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[
                    self.spec.worker_group.disk_spec.disk_type]

            # Set ssd count
            if self.spec.worker_group.vm_spec.num_local_ssds:
                self._AddToCmd(cmd, 'num-{0}-local-ssds'.format(role),
                               self.spec.worker_group.vm_spec.num_local_ssds)
                # This will actually be used for storage
                self.dpb_hdfs_type = 'Local SSD'
        # Set zone
        cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version:
            cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            cmd.flags['image'] = FLAGS.gcp_dataproc_image

        if FLAGS.dpb_cluster_properties:
            cmd.flags['properties'] = ','.join(FLAGS.dpb_cluster_properties)

        # Ideally DpbServiceSpec would have a network spec, which we would
        # create to resolve the name. But EMR provisions its own VPC, and we
        # are generally happy using pre-existing networks for Dataproc, so
        # just use the underlying flag instead.
        if FLAGS.gce_network_name:
            cmd.flags['network'] = FLAGS.gce_network_name

        metadata = util.GetDefaultTags()
        metadata.update(
            flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
        cmd.flags['metadata'] = util.FormatTags(metadata)
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()
        timeout = 900  # 15 min
        stdout, stderr, retcode = cmd.Issue(timeout=timeout,
                                            raise_on_failure=False)
        self._cluster_create_time = self._ParseClusterCreateTime(stdout)
        if retcode:
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
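A trimmed-down version of the gcloud call this method builds, with only the core sizing flags, might look like the sketch below. Region, zone, machine types, and disk sizes are placeholders, and the real command also carries the metadata, labels, and optional components set above.

import subprocess

subprocess.run([
    'gcloud', 'dataproc', 'clusters', 'create', 'pkb-cluster',
    '--region', 'us-central1',
    '--zone', 'us-central1-b',
    '--num-workers', '2',
    '--master-machine-type', 'n1-standard-4',
    '--worker-machine-type', 'n1-standard-4',
    '--master-boot-disk-size', '500GB',
    '--worker-boot-disk-size', '500GB',
    '--enable-component-gateway',
], check=True)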