Esempio n. 1
0
    def _Create(self):
        """Creates the cluster.

        Assembles the `clusters create` command for this cluster id, filling
        in flags from the service spec and the command-line FLAGS, and then
        issues it. The worker group's VM and disk settings are applied to both
        the worker and the master role.

        Raises:
          errors.Resource.CreationError: If the command exits non-zero and
            util.CheckGcloudResponseKnownFailures did not raise first.
        """
        create_cmd = self.DataprocGcloudCommand('clusters', 'create',
                                                self.cluster_id)
        if self.project is not None:
            create_cmd.flags['project'] = self.project

        if not self.spec.worker_count:
            # No workers requested: ask for a single-node cluster instead.
            create_cmd.flags['single-node'] = True
        else:
            # The number of worker machines in the cluster
            create_cmd.flags['num-workers'] = self.spec.worker_count

        # Initialize applications on the dataproc cluster
        requested_apps = self.spec.applications
        if requested_apps:
            logging.info('Include the requested applications')
            create_cmd.flags['optional-components'] = ','.join(requested_apps)

        # TODO(pclay): stop ignoring spec.master_group?
        # Both roles are configured from the worker group's specs.
        node_vm_spec = self.spec.worker_group.vm_spec
        node_disk_spec = self.spec.worker_group.disk_spec
        for role in ['worker', 'master']:
            # Set machine type
            machine_type = node_vm_spec.machine_type
            if machine_type:
                self._AddToCmd(create_cmd, '{0}-machine-type'.format(role),
                               machine_type)

            # Set boot_disk_size
            disk_size = node_disk_spec.disk_size
            if disk_size:
                self._AddToCmd(create_cmd,
                               '{0}-boot-disk-size'.format(role),
                               '{}GB'.format(str(disk_size)))

            # Set boot_disk_type and remember the matching HDFS type
            disk_type = node_disk_spec.disk_type
            if disk_type:
                self._AddToCmd(create_cmd,
                               '{0}-boot-disk-type'.format(role), disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[disk_type]

            # Set ssd count
            ssd_count = node_vm_spec.num_local_ssds
            if ssd_count:
                self._AddToCmd(create_cmd, 'num-{0}-local-ssds'.format(role),
                               ssd_count)

        # Set zone
        create_cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version:
            create_cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            create_cmd.flags['image'] = FLAGS.gcp_dataproc_image

        create_cmd.flags['metadata'] = util.MakeFormattedDefaultTags()

        # Allow up to 15 minutes (900 s) for the create command.
        # TODO(saksena): Retrieve the cluster create time and hold in a var
        _, stderr, retcode = create_cmd.Issue(timeout=900,
                                              raise_on_failure=False)
        if not retcode:
            return
        util.CheckGcloudResponseKnownFailures(stderr, retcode)
        raise errors.Resource.CreationError(stderr)
    def _Create(self):
        """Create a GCE VM instance.

        Writes the `user:public_key` line to a temporary key-metadata file,
        issues the generated create command, and maps the failure modes
        recognised below onto specific exceptions.

        Raises:
          errors.Resource.CreationError: If a dedicated host cannot fit
            num_vms_per_host VMs, or for any other unrecognised failure.
          errors.Resource.RetryableCreationError: If a dedicated host ran out
            of capacity and num_vms_per_host is unset; a host is made
            available in self.node_group before raising.
          errors.Benchmarks.InsufficientCapacityCloudFailure: On a stockout
            without dedicated hosts, or when a preemptible VM is interrupted
            before it starts.
          errors.Benchmarks.QuotaFailure.RateLimitExceededError: If stderr
            contains the rate-limit message.
        """
        # Snapshot the host count so that, once the lock is held, we can tell
        # whether a host was appended to host_list in the meantime.
        hosts_before = len(self.host_list)
        with open(self.ssh_public_key) as key_file:
            public_key = key_file.read().rstrip('\n')
        with vm_util.NamedTemporaryFile(
                mode='w', dir=vm_util.GetTempDir(),
                prefix='key-metadata') as key_metadata:
            key_metadata.write('%s:%s\n' % (self.user_name, public_key))
            # Close the file before its path is handed to the command.
            key_metadata.close()
            create_cmd = self._GenerateCreateCommand(key_metadata.name)
            _, stderr, retcode = create_cmd.Issue(
                timeout=_GCE_VM_CREATE_TIMEOUT, raise_on_failure=False)

        if retcode and _INSUFFICIENT_HOST_CAPACITY in stderr:
            if not self.use_dedicated_host:
                logging.error(util.STOCKOUT_MESSAGE)
                raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                    util.STOCKOUT_MESSAGE)
            if self.num_vms_per_host:
                raise errors.Resource.CreationError(
                    'Failed to create host: %d vms of type %s per host exceeds '
                    'memory capacity limits of the host' %
                    (self.num_vms_per_host, self.machine_type))
            logging.warning(
                'Creation failed due to insufficient host capacity. A new host will '
                'be created and instance creation will be retried.')
            with self._host_lock:
                # Only add a host if nobody else extended the list since the
                # snapshot; either way, use the newest host for the retry.
                if hosts_before == len(self.host_list):
                    new_host = GceSoleTenantNodeGroup(self.node_template,
                                                      self.zone, self.project)
                    self.host_list.append(new_host)
                    new_host.Create()
                self.node_group = self.host_list[-1]
            raise errors.Resource.RetryableCreationError()

        util.CheckGcloudResponseKnownFailures(stderr, retcode)
        if not retcode:
            return

        already_created = (create_cmd.rate_limited
                           and 'already exists' in stderr
                           and FLAGS.retry_on_rate_limited)
        if already_created:
            # Gcloud create commands may still create VMs despite being rate
            # limited.
            return
        if util.RATE_LIMITED_MESSAGE in stderr:
            raise errors.Benchmarks.QuotaFailure.RateLimitExceededError(
                stderr)
        if self.preemptible and _FAILED_TO_START_DUE_TO_PREEMPTION in stderr:
            self.spot_early_termination = True
            raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                'Interrupted before VM started')
        raise errors.Resource.CreationError(
            'Failed to create VM: %s return code: %s' % (stderr, retcode))
Esempio n. 3
0
 def _Create(self):
     """Creates the disk.

     Issues `compute disks create` for self.name with the configured size
     and type, adds the image flags only when they are set, and hands the
     command's stderr and return code to
     util.CheckGcloudResponseKnownFailures.
     """
     create_cmd = util.GcloudCommand(
         self, 'compute', 'disks', 'create', self.name)
     create_cmd.flags['size'] = self.disk_size
     create_cmd.flags['type'] = self.disk_type
     # Image flags are optional; skip any that are unset.
     optional_flags = (('image', self.image),
                       ('image-project', self.image_project))
     for flag_name, flag_value in optional_flags:
         if flag_value:
             create_cmd.flags[flag_name] = flag_value
     _, err_output, exit_code = create_cmd.Issue()
     util.CheckGcloudResponseKnownFailures(err_output, exit_code)
Esempio n. 4
0
 def _Create(self):
     """Creates the disk.

     Issues `compute disks create` for self.name with size, type and the
     default labels, plus the image flags when they are set. The command is
     issued with raise_on_failure=False, and its stderr and return code are
     handed to util.CheckGcloudResponseKnownFailures.
     """
     create_cmd = util.GcloudCommand(
         self, 'compute', 'disks', 'create', self.name)
     create_cmd.flags['size'] = self.disk_size
     create_cmd.flags['type'] = self.disk_type
     create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()
     # Image flags are optional; skip any that are unset.
     optional_flags = (('image', self.image),
                       ('image-project', self.image_project))
     for flag_name, flag_value in optional_flags:
         if flag_value:
             create_cmd.flags[flag_name] = flag_value
     _, err_output, exit_code = create_cmd.Issue(raise_on_failure=False)
     util.CheckGcloudResponseKnownFailures(err_output, exit_code)
Esempio n. 5
0
    def _IssueResourceCreationCommand(self, cmd):
        """Issues a command to gcloud to create resources.

        Args:
          cmd: The command object to run. Its Issue() method is called with
            a 1200 second timeout and raise_on_failure=False, and must return
            a (stdout, stderr, retcode) tuple.

        Raises:
          errors.Benchmarks.InsufficientCapacityCloudFailure: If the command
            fails and stderr contains 'ZONE_RESOURCE_POOL_EXHAUSTED'.
          errors.Resource.CreationError: For any other non-zero return code
            that util.CheckGcloudResponseKnownFailures does not raise for.
        """

        # This command needs a long timeout due to the many minutes it
        # can take to provision a large GPU-accelerated GKE cluster.
        _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
        if retcode:
            # Log specific type of failure, if known.
            if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
                # logging.error, not logging.exception: no exception is being
                # handled here, so there is no traceback to attach.
                logging.error('Container resources exhausted: %s', stderr)
                raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                    'Container resources exhausted in zone %s: %s' %
                    (self.zone, stderr))
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
Esempio n. 6
0
  def _Create(self):
    """Creates the disk.

    Issues `compute disks create` for self.name. Provisioned IOPS are only
    requested for PD_EXTREME disks, image flags are added when set, and a
    disk with replica zones is created with region/replica-zones flags in
    place of the zone flag. The command's stderr and return code are handed
    to util.CheckGcloudResponseKnownFailures.
    """
    create_cmd = util.GcloudCommand(
        self, 'compute', 'disks', 'create', self.name)
    create_cmd.flags['size'] = self.disk_size
    create_cmd.flags['type'] = self.disk_type

    wants_provisioned_iops = (
        self.provisioned_iops and self.disk_type == PD_EXTREME)
    if wants_provisioned_iops:
      create_cmd.flags['provisioned-iops'] = self.provisioned_iops

    create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()

    # Image flags are optional; skip any that are unset.
    optional_flags = (('image', self.image),
                      ('image-project', self.image_project))
    for flag_name, flag_value in optional_flags:
      if flag_value:
        create_cmd.flags[flag_name] = flag_value

    if self.replica_zones:
      # Replicated disks get region + replica-zones; the existing zone flag
      # (presumably populated by util.GcloudCommand) is removed.
      create_cmd.flags['region'] = self.region
      create_cmd.flags['replica-zones'] = ','.join(self.replica_zones)
      del create_cmd.flags['zone']

    _, err_output, exit_code = create_cmd.Issue(raise_on_failure=False)
    util.CheckGcloudResponseKnownFailures(err_output, exit_code)
Esempio n. 7
0
    def _Create(self):
        """Create a GCE VM instance.

        Writes the `user:public_key` line to a temporary key-metadata file,
        issues the generated create command, and maps the failure modes
        recognised below onto specific exceptions.

        Raises:
          errors.Resource.RetryableCreationError: If a dedicated host ran out
            of capacity and num_vms_per_host is unset; a host is made
            available in self.node_group before raising.
          errors.Benchmarks.InsufficientCapacityCloudFailure: On insufficient
            capacity when dedicated hosts are not in use.
          errors.Resource.CreationError: For any other non-zero return code
            that util.CheckGcloudResponseKnownFailures does not raise for.
        """
        # Snapshot the host count so that, once the lock is held, we can tell
        # whether a host was appended to host_list in the meantime.
        num_hosts = len(self.host_list)
        with open(self.ssh_public_key) as f:
            public_key = f.read().rstrip('\n')
        with vm_util.NamedTemporaryFile(mode='w',
                                        dir=vm_util.GetTempDir(),
                                        prefix='key-metadata') as tf:
            tf.write('%s:%s\n' % (self.user_name, public_key))
            # Close the file before its path is handed to the command.
            tf.close()
            create_cmd = self._GenerateCreateCommand(tf.name)
            _, stderr, retcode = create_cmd.Issue(
                timeout=_GCE_VM_CREATE_TIMEOUT)

        if (self.use_dedicated_host and retcode
                and _INSUFFICIENT_HOST_CAPACITY in stderr
                and not self.num_vms_per_host):
            logging.warning(
                'Creation failed due to insufficient host capacity. A new host will '
                'be created and instance creation will be retried.')
            with self._host_lock:
                # Only add a host if nobody else extended the list since the
                # snapshot; either way, use the newest host for the retry.
                if num_hosts == len(self.host_list):
                    host = GceSoleTenantNodeGroup(self.node_template,
                                                  self.zone, self.project)
                    self.host_list.append(host)
                    host.Create()
                self.node_group = self.host_list[-1]
            raise errors.Resource.RetryableCreationError()
        if (not self.use_dedicated_host and retcode
                and _INSUFFICIENT_HOST_CAPACITY in stderr):
            logging.error(STOCKOUT_MESSAGE)
            raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                STOCKOUT_MESSAGE)
        util.CheckGcloudResponseKnownFailures(stderr, retcode)
        if retcode:
            # Argument order matches the message: failure text first, then
            # the return code.
            raise errors.Resource.CreationError(
                'Failed to create VM: %s return code: %s' % (stderr, retcode))
Esempio n. 8
0
  def ExtractDataset(self,
                     dest_bucket,
                     dataset=None,
                     tables=None,
                     dest_format='CSV'):
    """Extract all tables in a dataset to a GCS bucket.

    The bucket is emptied first (best effort), then each table is exported
    with `bq extract` into its own `<bucket>/<table>/` prefix. Note that the
    destination file pattern always ends in `*.csv`, whatever dest_format is.

    Args:
      dest_bucket: Name of the bucket to extract the data to. Should already
        exist.
      dataset: Optional name of the dataset. If none, will be extracted from the
        cluster_identifier.
      tables: Optional list of table names to extract. If none, all tables in
        the dataset will be extracted.
      dest_format: Format to extract data in. Can be one of: CSV, JSON, or Avro.
    """
    if tables is None:
      tables = self.GetAllTablesInDataset(dataset)
    gcs_uri = 'gs://' + dest_bucket

    # Make sure the bucket is empty. Failures are tolerated here
    # (raise_on_failure=False).
    clear_bucket_cmd = ['gsutil', '-m', 'rm', gcs_uri + '/**']
    vm_util.IssueCommand(clear_bucket_cmd, raise_on_failure=False)

    project_dataset = self.FormatProjectAndDatasetForCommand(dataset)
    for table in tables:
      source_table = '%s.%s' % (project_dataset, table)
      destination_uri = '%s/%s/*.csv' % (gcs_uri, table)
      extract_cmd = [
          'bq', 'extract',
          '--destination_format=%s' % dest_format,
          source_table,
          destination_uri,
      ]
      _, stderr, retcode = vm_util.IssueCommand(extract_cmd)
      # There is a 10T daily limit on extracting from BQ. Large datasets will
      # inherently hit this limit and benchmarks shouldn't use those.
      gcp_util.CheckGcloudResponseKnownFailures(stderr, retcode)
Esempio n. 9
0
    def _Create(self):
        """Creates the cluster.

        Builds a `container clusters create` command from the cluster and
        VM configuration plus command-line FLAGS, then issues it with a
        1200 second timeout.

        Raises:
          errors.Benchmarks.InsufficientCapacityCloudFailure: If the command
            fails and stderr contains 'ZONE_RESOURCE_POOL_EXHAUSTED'.
          errors.Resource.CreationError: For any other non-zero return code
            that util.CheckGcloudResponseKnownFailures does not raise for.
        """
        cmd = util.GcloudCommand(self, 'container', 'clusters', 'create',
                                 self.name)

        cmd.flags['cluster-version'] = self.cluster_version
        if FLAGS.gke_enable_alpha:
            cmd.args.append('--enable-kubernetes-alpha')
            cmd.args.append('--no-enable-autorepair')
            cmd.args.append('--no-enable-autoupgrade')

        user = util.GetDefaultUser()
        if FLAGS.gcp_service_account:
            cmd.flags['service-account'] = FLAGS.gcp_service_account
        # Matches service accounts that either definitely belongs to this project or
        # are a GCP managed service account like the GCE default service account,
        # which we can't tell to which project they belong.
        elif re.match(SERVICE_ACCOUNT_PATTERN, user):
            logging.info(
                'Re-using configured service-account for GKE Cluster: %s',
                user)
            cmd.flags['service-account'] = user
            self.use_application_default_credentials = False
        else:
            logging.info('Using default GCE service account for GKE cluster')
            cmd.flags['scopes'] = 'cloud-platform'

        if self.vm_config.gpu_count:
            cmd.flags['accelerator'] = (
                gce_virtual_machine.GenerateAcceleratorSpecString(
                    self.vm_config.gpu_type, self.vm_config.gpu_count))
        if self.vm_config.min_cpu_platform:
            cmd.flags['min-cpu-platform'] = self.vm_config.min_cpu_platform

        if self.vm_config.boot_disk_size:
            cmd.flags['disk-size'] = self.vm_config.boot_disk_size
        if self.vm_config.boot_disk_type:
            cmd.flags['disk-type'] = self.vm_config.boot_disk_type
        if self.vm_config.max_local_disks:
            # TODO(pclay): Switch to local-ssd-volumes which support NVME when it
            # leaves alpha. See
            # https://cloud.google.com/sdk/gcloud/reference/alpha/container/clusters/create
            cmd.flags['local-ssd-count'] = self.vm_config.max_local_disks

        # Autoscaling is only enabled when the node-count bounds differ from
        # the initial node count.
        if self.min_nodes != self.num_nodes or self.max_nodes != self.num_nodes:
            cmd.args.append('--enable-autoscaling')
            cmd.flags['max-nodes'] = self.max_nodes
            cmd.flags['min-nodes'] = self.min_nodes

        cmd.flags['num-nodes'] = self.num_nodes

        if self.vm_config.machine_type is None:
            # No named machine type: describe a custom one by CPUs and memory.
            cmd.flags['machine-type'] = 'custom-{0}-{1}'.format(
                self.vm_config.cpus, self.vm_config.memory_mib)
        else:
            cmd.flags['machine-type'] = self.vm_config.machine_type

        cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
        cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        # This command needs a long timeout due to the many minutes it
        # can take to provision a large GPU-accelerated GKE cluster.
        _, stderr, retcode = cmd.Issue(timeout=1200, raise_on_failure=False)
        if retcode:
            # Log specific type of failure, if known.
            if 'ZONE_RESOURCE_POOL_EXHAUSTED' in stderr:
                # logging.error, not logging.exception: no exception is being
                # handled here, so there is no traceback to attach.
                logging.error('Container resources exhausted: %s', stderr)
                raise errors.Benchmarks.InsufficientCapacityCloudFailure(
                    'Container resources exhausted in zone %s: %s' %
                    (self.zone, stderr))
            util.CheckGcloudResponseKnownFailures(stderr, retcode)
            raise errors.Resource.CreationError(stderr)
Esempio n. 10
0
    def _CreateGcloudSqlInstance(self):
        """Creates the SQL instance via `beta sql instances create`.

        The argument list is assembled from the relational-db spec: engine
        version, storage size, zone, authorized networks for the client VM,
        engine-specific options, machine shape (custom cpu/memory or tier),
        high availability and backup settings. The command is then issued
        with CREATION_TIMEOUT and its stderr and return code are handed to
        util.CheckGcloudResponseKnownFailures.

        Raises:
          Exception: If the spec has neither cpus and memory nor a
            machine_type attribute.
        """
        storage_size = self.spec.db_disk_spec.disk_size
        instance_zone = self.spec.db_spec.zone

        authorized_network = self._GetAuthorizedNetworks([self.client_vm])

        database_version_string = self._GetEngineVersionString(
            self.spec.engine, self.spec.engine_version)

        # The first element is the resource passed to util.GcloudCommand;
        # the rest are the gcloud arguments.
        gcloud_args = [
            self,
            'beta',
            'sql',
            'instances',
            'create',
            self.instance_id,
            '--quiet',
            '--format=json',
            '--activation-policy=ALWAYS',
            '--assign-ip',
            '--authorized-networks=%s' % authorized_network,
            '--zone=%s' % instance_zone,
            '--database-version=%s' % database_version_string,
            '--storage-size=%d' % storage_size,
            '--labels=%s' % util.MakeFormattedDefaultTags(),
        ]

        # Engine-specific options.
        if self.spec.engine == relational_db.MYSQL:
            gcloud_args.append('--enable-bin-log')

        if self.spec.engine == relational_db.SQLSERVER:
            # `--root-password` is required when creating SQL Server instances.
            gcloud_args.append('--root-password={0}'.format(
                self.spec.database_password))

        # Machine shape: custom cpu/memory takes precedence over a tier.
        if self.spec.db_spec.cpus and self.spec.db_spec.memory:
            self._ValidateSpec()
            memory = self.spec.db_spec.memory
            cpus = self.spec.db_spec.cpus
            self._ValidateMachineType(memory, cpus)
            gcloud_args.extend([
                '--cpu={}'.format(cpus),
                '--memory={}MiB'.format(memory),
            ])
        elif hasattr(self.spec.db_spec, 'machine_type'):
            gcloud_args.append('--tier=%s' % self.spec.db_spec.machine_type)
        else:
            raise Exception('Unspecified machine type')

        if self.spec.high_availability:
            gcloud_args.append(self._GetHighAvailabilityFlag())

        if not self.spec.backup_enabled:
            gcloud_args.append('--no-backup')
        else:
            gcloud_args.extend([
                '--backup',
                '--backup-start-time={}'.format(self.spec.backup_start_time),
            ])

        create_cmd = util.GcloudCommand(*gcloud_args)
        create_cmd.flags['project'] = self.project

        _, stderr, retcode = create_cmd.Issue(timeout=CREATION_TIMEOUT)

        util.CheckGcloudResponseKnownFailures(stderr, retcode)
Esempio n. 11
0
    def _Create(self):
        """Creates the cluster.

        Assembles the `clusters create` command for this cluster id, filling
        in flags from the service spec and the command-line FLAGS, issues it,
        and records the cluster create time parsed from the command's stdout.
        The worker group's VM and disk settings are applied to both the
        worker and the master role.

        Raises:
          errors.Resource.CreationError: If the command exits non-zero and
            util.CheckGcloudResponseKnownFailures did not raise first.
        """
        create_cmd = self.DataprocGcloudCommand('clusters', 'create',
                                                self.cluster_id)
        if self.project is not None:
            create_cmd.flags['project'] = self.project

        if not self.spec.worker_count:
            # No workers requested: ask for a single-node cluster instead.
            create_cmd.flags['single-node'] = True
        else:
            # The number of worker machines in the cluster
            create_cmd.flags['num-workers'] = self.spec.worker_count

        # Initialize applications on the dataproc cluster
        requested_apps = self.spec.applications
        if requested_apps:
            logging.info('Include the requested applications')
            create_cmd.flags['optional-components'] = ','.join(requested_apps)

        # Enable component gateway for debuggability. Does not impact performance.
        create_cmd.flags['enable-component-gateway'] = True

        # TODO(pclay): stop ignoring spec.master_group?
        # Both roles are configured from the worker group's specs.
        node_vm_spec = self.spec.worker_group.vm_spec
        node_disk_spec = self.spec.worker_group.disk_spec
        for role in ['worker', 'master']:
            # Set machine type
            machine_type = node_vm_spec.machine_type
            if machine_type:
                self._AddToCmd(create_cmd, '{0}-machine-type'.format(role),
                               machine_type)

            # Set boot_disk_size
            disk_size = node_disk_spec.disk_size
            if disk_size:
                self._AddToCmd(create_cmd,
                               '{0}-boot-disk-size'.format(role),
                               '{}GB'.format(str(disk_size)))

            # Set boot_disk_type and remember the matching HDFS type
            disk_type = node_disk_spec.disk_type
            if disk_type:
                self._AddToCmd(create_cmd,
                               '{0}-boot-disk-type'.format(role), disk_type)
                self.dpb_hdfs_type = disk_to_hdfs_map[disk_type]

            # Set ssd count
            ssd_count = node_vm_spec.num_local_ssds
            if ssd_count:
                self._AddToCmd(create_cmd, 'num-{0}-local-ssds'.format(role),
                               ssd_count)
                # This will actually be used for storage
                self.dpb_hdfs_type = 'Local SSD'

        # Set zone
        create_cmd.flags['zone'] = self.dpb_service_zone
        if self.dpb_version:
            create_cmd.flags['image-version'] = self.dpb_version

        if FLAGS.gcp_dataproc_image:
            create_cmd.flags['image'] = FLAGS.gcp_dataproc_image

        if FLAGS.dpb_cluster_properties:
            create_cmd.flags['properties'] = ','.join(
                FLAGS.dpb_cluster_properties)

        # Ideally DpbServiceSpec would have a network spec, which we would
        # create to resolve the name, but because EMR provisions its own VPC
        # and we are generally happy using pre-existing networks for Dataproc,
        # just use the underlying flag instead.
        if FLAGS.gce_network_name:
            create_cmd.flags['network'] = FLAGS.gce_network_name

        # Instance metadata is the default tags overlaid with any key/value
        # pairs from the gcp_instance_metadata flag.
        instance_metadata = util.GetDefaultTags()
        instance_metadata.update(
            flag_util.ParseKeyValuePairs(FLAGS.gcp_instance_metadata))
        create_cmd.flags['metadata'] = util.FormatTags(instance_metadata)
        create_cmd.flags['labels'] = util.MakeFormattedDefaultTags()

        # Allow up to 15 minutes (900 s) for the create command.
        stdout, stderr, retcode = create_cmd.Issue(timeout=900,
                                                   raise_on_failure=False)
        self._cluster_create_time = self._ParseClusterCreateTime(stdout)
        if not retcode:
            return
        util.CheckGcloudResponseKnownFailures(stderr, retcode)
        raise errors.Resource.CreationError(stderr)