Example #1
    def __init__(self, network_spec):
        super(GceNetwork, self).__init__(network_spec)
        self.project = network_spec.project
        self.vpn_gateway = {}

        #  Figuring out the type of network here.
        #  Precedence: User Managed > MULTI > SINGLE > DEFAULT
        self.net_type = network.NetType.DEFAULT.value
        self.cidr = NETWORK_RANGE
        if FLAGS.gce_subnet_region:
            self.net_type = network.NetType.SINGLE.value
            self.cidr = FLAGS.gce_subnet_addr
        if network_spec.cidr:
            self.net_type = network.NetType.MULTI.value
            self.cidr = network_spec.cidr

        name = self._MakeGceNetworkName()

        subnet_region = (FLAGS.gce_subnet_region if not network_spec.cidr else
                         util.GetRegionFromZone(network_spec.zone))
        mode = 'auto' if subnet_region is None else 'custom'
        self.network_resource = GceNetworkResource(name, mode, self.project)
        if subnet_region is None:
            self.subnet_resource = None
        else:
            self.subnet_resource = GceSubnetResource(
                FLAGS.gce_subnet_name or name, name, subnet_region, self.cidr,
                self.project)

        # Stage FW rules.
        self.all_nets = self._GetNetworksFromSpec(
            network_spec)  # Holds the different networks in this run.
        self.external_nets_rules = {
        }  # Holds FW rules for any external subnets.

        #  Set the default rule to allow all traffic within this network's subnet.
        firewall_name = self._MakeGceFWRuleName()
        self.default_firewall_rule = GceFirewallRule(firewall_name,
                                                     self.project, ALLOW_ALL,
                                                     name, self.cidr)

        # Set external rules to allow traffic from other subnets in this benchmark.
        for ext_net in self.all_nets:
            if ext_net == self.cidr:
                continue  # We've already added our own network to the default rule.
            rule_name = self._MakeGceFWRuleName(dst_cidr=ext_net)
            self.external_nets_rules[rule_name] = GceFirewallRule(
                rule_name, self.project, ALLOW_ALL, name, ext_net)

        # Add VpnGateways to the network.
        if FLAGS.use_vpn:
            for gatewaynum in range(0, FLAGS.vpn_service_gateway_count):
                vpn_gateway_name = 'vpngw-%s-%s-%s' % (util.GetRegionFromZone(
                    network_spec.zone), gatewaynum, FLAGS.run_uri)
                self.vpn_gateway[vpn_gateway_name] = GceVpnGateway(
                    vpn_gateway_name, name,
                    util.GetRegionFromZone(network_spec.zone),
                    network_spec.cidr, self.project)
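
Throughout these examples, util.GetRegionFromZone maps a GCE zone name to its region. The function body is not shown on this page, but the unit test in Example #21 pins the behavior; a minimal sketch consistent with it, assuming zones follow the '<region>-<suffix>' pattern:

def _region_from_zone_sketch(zone: str) -> str:
    # Drop the final '-suffix' component: 'us-central1-a' -> 'us-central1'.
    return zone.rsplit('-', 1)[0]

assert _region_from_zone_sketch('us-central1-a') == 'us-central1'
assert _region_from_zone_sketch('europe-west4-b') == 'europe-west4'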
Example #2
def Prepare(benchmark_spec):
    """Install and set up MNIST on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
    benchmark_spec.always_call_cleanup = True
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if not benchmark_spec.tpus:
        vm.Install('tensorflow')
    vm.Install('cloud_tpu_models')
    vm.Install('tensorflow_models')
    if benchmark_spec.tpus:
        storage_service = gcs.GoogleCloudStorageService()
        benchmark_spec.storage_service = storage_service
        bucket = 'pkb{}'.format(FLAGS.run_uri)
        benchmark_spec.bucket = bucket
        benchmark_spec.model_dir = 'gs://{}'.format(bucket)
        location = benchmark_spec.tpu_groups['train'].GetZone()
        storage_service.PrepareService(util.GetRegionFromZone(location))
        storage_service.MakeBucket(bucket)
        storage_service.AclBucket(benchmark_spec.gcp_service_account,
                                  gcs.WRITER, bucket)
    else:
        benchmark_spec.model_dir = '/tmp'
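
The TPU branch above co-locates the results bucket with the TPU by deriving the bucket region from the TPU's zone. A condensed sketch of that pattern (storage_service stands in for gcs.GoogleCloudStorageService; the helper name is invented):

def _make_colocated_bucket(storage_service, tpu_zone: str, run_uri: str) -> str:
    # Create the model bucket in the TPU's region so reads/writes stay regional.
    region = tpu_zone.rsplit('-', 1)[0]
    bucket = 'pkb{}'.format(run_uri)
    storage_service.PrepareService(region)
    storage_service.MakeBucket(bucket)
    return 'gs://{}'.format(bucket)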
Example #3
def GetCommonMetadata(custom_metadata: Optional[Dict[str, Any]] = None) -> str:
  """Returns pkb metadata associated with this run as cloudharmony metadata.

  Cloudharmony benchmarks take in benchmark setup configurations as inputs and
  include them in the output as metadata for the run. This function creates a
  string of input metadata from pkb flags to be included as run parameter for
  cloudharmony benchmarks.

  Args:
     custom_metadata: a dictionary of metadata key-value pairs that is merged
       into, and overrides, the flag-derived values chosen in this function.
  Returns:
     A string of metadata that should be appended to the cloudharmony
     benchmark run.
  """
  if FLAGS.cloud != providers.GCP:
    # CloudHarmony metadata should only be included for GCP runs.
    return ''

  metadata = {
      'meta_compute_service': 'Google Compute Engine',
      'meta_compute_service_id': 'google:compute',
      'meta_instance_id': FLAGS.machine_type,
      'meta_provider': 'Google Cloud Platform',
      'meta_provider_id': 'google',
      'meta_region': gcp_util.GetRegionFromZone(FLAGS.zone[0]),
      'meta_zone': FLAGS.zone[0],
      'meta_test_id': FLAGS.run_uri,
  }
  if custom_metadata:
    metadata.update(custom_metadata)

  metadata_pair = [f'--{key} {value}' for key, value in metadata.items()]
  return ' '.join(metadata_pair)
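
For illustration, with a couple of representative entries the returned flag string looks like this (values hypothetical):

metadata = {
    'meta_compute_service_id': 'google:compute',
    'meta_region': 'us-central1',
    'meta_test_id': 'abc123',
}
print(' '.join(f'--{key} {value}' for key, value in metadata.items()))
# --meta_compute_service_id google:compute --meta_region us-central1 --meta_test_id abc123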
Example #4
    def __init__(self, dpb_service_spec):
        super(UnmanagedDpbService, self).__init__(dpb_service_spec)
        #  Dictionary to hold the cluster vms.
        self.vms = {}
        self.cloud = dpb_service_spec.worker_group.cloud
        if not self.dpb_service_zone:
            raise errors.Setup.InvalidSetupError(
                'dpb_service_zone must be provided for provisioning.')
        if self.cloud == 'GCP':
            self.region = gcp_util.GetRegionFromZone(FLAGS.dpb_service_zone)
            self.storage_service = gcs.GoogleCloudStorageService()
            self.persistent_fs_prefix = 'gs://'
        elif self.cloud == 'AWS':
            self.region = aws_util.GetRegionFromZone(FLAGS.dpb_service_zone)
            self.storage_service = s3.S3Service()
            self.persistent_fs_prefix = 's3://'
        else:
            self.region = None
            self.storage_service = None
            self.persistent_fs_prefix = None
            self.manage_bucket = False
            logging.warning(
                'Cloud provider %s does not support object storage. '
                'Some benchmarks will not work.', self.cloud)

        if self.storage_service:
            self.storage_service.PrepareService(location=self.region)

        # set in _Create of derived classes
        self.leader = None
Example #5
    def __init__(self, dpb_service_spec):
        super(UnmanagedDpbService, self).__init__(dpb_service_spec)
        #  Dictionary to hold the cluster vms.
        self.vms = {}
        self.cloud = dpb_service_spec.worker_group.cloud
        if not self.dpb_service_zone:
            raise errors.Setup.InvalidSetupError(
                'dpb_service_zone must be provided for provisioning.')
        if self.cloud == 'GCP':
            self.region = gcp_util.GetRegionFromZone(FLAGS.dpb_service_zone)
            self.storage_service = gcs.GoogleCloudStorageService()
            self.persistent_fs_prefix = 'gs://'
        elif self.cloud == 'AWS':
            self.region = aws_util.GetRegionFromZone(FLAGS.dpb_service_zone)
            self.storage_service = s3.S3Service()
            self.persistent_fs_prefix = 's3://'
        else:
            raise errors.Config.InvalidValue(
                f'Unsupported Cloud provider {self.cloud}')

        if self.storage_service:
            self.storage_service.PrepareService(location=self.region)

        # set in _Create of derived classes
        self.leader = None
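
Examples #4 and #5 are two revisions of the same constructor: the first logs a warning and continues without object storage, while this one fails fast on an unsupported cloud. The branching could equally be written as a lookup table; a sketch, with provider strings and prefixes taken from the examples:

_FS_PREFIX_BY_CLOUD = {'GCP': 'gs://', 'AWS': 's3://'}

def _fs_prefix(cloud: str) -> str:
    try:
        return _FS_PREFIX_BY_CLOUD[cloud]
    except KeyError:
        raise ValueError(f'Unsupported Cloud provider {cloud}')

assert _fs_prefix('GCP') == 'gs://'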
Example #6
  def __init__(self,
               disk_spec,
               name,
               zone,
               project,
               image=None,
               image_project=None,
               replica_zones=None):
    super(GceDisk, self).__init__(disk_spec)
    self.attached_vm_name = None
    self.image = image
    self.image_project = image_project
    self.name = name
    self.zone = zone
    self.project = project
    self.replica_zones = replica_zones
    self.region = util.GetRegionFromZone(self.zone)
    self.provisioned_iops = None
    if self.disk_type == PD_EXTREME:
      self.provisioned_iops = FLAGS.gcp_provisioned_iops

    # Copy the per-type template so that setting regional replication below
    # does not mutate the module-level DISK_METADATA shared by later disks.
    disk_metadata = DISK_METADATA[disk_spec.disk_type].copy()
    if self.replica_zones:
      disk_metadata[disk.REPLICATION] = disk.REGION
      self.metadata['replica_zones'] = replica_zones
    self.metadata.update(disk_metadata)
    if self.disk_type == disk.LOCAL:
      self.metadata['interface'] = FLAGS.gce_ssd_interface
    if self.provisioned_iops and self.disk_type == PD_EXTREME:
      self.metadata['provisioned_iops'] = self.provisioned_iops
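
One subtlety in this constructor: DISK_METADATA is a module-level dict, so writing disk.REPLICATION into the looked-up entry without copying it first would leak regional-replication metadata into every later disk of the same type (hence the .copy() above). A minimal illustration of the hazard:

DISK_METADATA = {'pd-ssd': {'media': 'ssd'}}

aliased = DISK_METADATA['pd-ssd']        # alias, not a copy
aliased['replication'] = 'regional'      # silently edits the shared template
assert 'replication' in DISK_METADATA['pd-ssd']  # the state has leaked

safe = DISK_METADATA['pd-ssd'].copy()    # independent per-disk metadata
safe['interface'] = 'NVME'
assert 'interface' not in DISK_METADATA['pd-ssd']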
Example #7
 def GetFullRegistryTag(self, image):
   """Gets the full tag of the image."""
   region = util.GetMultiRegionFromRegion(util.GetRegionFromZone(self.zone))
   hostname = '{region}.gcr.io'.format(region=region)
   full_tag = '{hostname}/{project}/{name}'.format(
       hostname=hostname, project=self.project, name=image)
   return full_tag
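
util.GetMultiRegionFromRegion is not shown here. Container Registry hostnames use multi-region prefixes ('us', 'eu', 'asia'); a rough sketch, assuming the first region component is the multi-region (note that the real mapping special-cases names such as 'europe' -> 'eu'):

def _full_registry_tag_sketch(project: str, image: str, zone: str) -> str:
    region = zone.rsplit('-', 1)[0]       # 'us-central1-a' -> 'us-central1'
    multi_region = region.split('-')[0]   # 'us-central1'   -> 'us'
    return f'{multi_region}.gcr.io/{project}/{image}'

assert _full_registry_tag_sketch('my-proj', 'spark', 'us-central1-a') == 'us.gcr.io/my-proj/spark'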
Example #8
 def _GetDefaultConfig(self):
     """Gets the config that corresponds the region used for the test."""
     try:
         region = util.GetRegionFromZone(
             FLAGS.zones[0] if FLAGS.zones else FLAGS.zone[0])
     except IndexError:
         region = _DEFAULT_REGION
     return f'regional-{region}'
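
A self-contained rendering of the fallback chain above (flag plumbing replaced by plain arguments; the default region stands in for whatever _DEFAULT_REGION the module defines):

def _default_config_sketch(zones, zone, default_region='us-central1'):
    # Prefer --zones, then --zone; fall back to a default region if both are empty.
    try:
        region = (zones or zone)[0].rsplit('-', 1)[0]
    except IndexError:
        region = default_region
    return f'regional-{region}'

assert _default_config_sketch([], ['us-east1-b']) == 'regional-us-east1'
assert _default_config_sketch([], []) == 'regional-us-central1'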
Example #9
    def _Create(self):
        """Creates the Cloud SQL instance and authorizes traffic from anywhere."""
        storage_size = self.spec.disk_spec.disk_size
        instance_zone = self.spec.vm_spec.zone

        # TODO: We should create the client VM with a static IP, and
        # only authorize that specific IP address. The client VM can be accessed
        # like so: self.client_vm
        authorized_network = '0.0.0.0/0'
        database_version_string = self._GetEngineVersionString(
            self.spec.engine, self.spec.engine_version)

        cmd_string = [
            self,  # GcloudCommand(*cmd_string) receives this resource as its first argument.
            'beta',
            'sql',
            'instances',
            'create',
            self.instance_id,
            '--quiet',
            '--format=json',
            '--async',
            '--activation-policy=ALWAYS',
            '--assign-ip',
            '--authorized-networks=%s' % authorized_network,
            '--enable-bin-log',
            '--gce-zone=%s' % instance_zone,
            '--region=%s' % util.GetRegionFromZone(instance_zone),
            '--database-version=%s' % database_version_string,
            '--pricing-plan=%s' % self.GCP_PRICING_PLAN,
            '--storage-size=%d' % storage_size,
        ]
        # TODO(ferneyhough): add tier machine types support for Postgres
        if self.spec.engine == managed_relational_db.MYSQL:
            machine_type_flag = '--tier=%s' % self.spec.vm_spec.machine_type
            cmd_string.append(machine_type_flag)
        else:
            self._ValidateSpec()
            memory = self.spec.vm_spec.memory
            cpus = self.spec.vm_spec.cpus
            self._ValidateMachineType(memory, cpus)
            cmd_string.append('--cpu={}'.format(cpus))
            cmd_string.append('--memory={}MiB'.format(memory))

        if self.spec.high_availability:
            cmd_string.append(self._GetHighAvailabilityFlag())

        if self.spec.backup_enabled:
            cmd_string.append('--backup')
            cmd_string.append('--backup-start-time={}'.format(
                self.spec.backup_start_time))
        else:
            cmd_string.append('--no-backup')
        cmd = util.GcloudCommand(*cmd_string)
        cmd.flags['project'] = self.project

        _, _, _ = cmd.Issue()
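
Conceptually, util.GcloudCommand joins these pieces into a single gcloud invocation; a hypothetical rendering of the assembled command (identifiers invented for the sketch):

instance_zone = 'us-central1-b'
args = ['beta', 'sql', 'instances', 'create', 'pkb-db-instance-123',
        '--region=%s' % instance_zone.rsplit('-', 1)[0],
        '--tier=db-n1-standard-4']
print('gcloud ' + ' '.join(args))
# gcloud beta sql instances create pkb-db-instance-123 --region=us-central1 --tier=db-n1-standard-4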
Example #10
def Prepare(benchmark_spec):
    """Install and set up MNIST on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
    benchmark_spec.always_call_cleanup = True
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if not benchmark_spec.use_tpu:
        vm.Install('tensorflow')
    vm.Install('cloud_tpu_models')
    vm.RemoteCommand('git clone https://github.com/tensorflow/models.git',
                     should_log=True)
    if benchmark_spec.use_tpu:
        storage_service = gcs.GoogleCloudStorageService()
        storage_service.PrepareVM(vm)
        benchmark_spec.storage_service = storage_service
        model_dir = 'gs://{}'.format(FLAGS.run_uri)
        benchmark_spec.model_dir = model_dir
        vm.RemoteCommand(
            '{gsutil} mb -c regional -l {location} {model_dir}'.format(
                gsutil=vm.gsutil_path,
                location=util.GetRegionFromZone(
                    benchmark_spec.tpu_groups['train'].GetZone()),
                model_dir=benchmark_spec.model_dir),
            should_log=True)
        vm.RemoteCommand(
            '{gsutil} acl ch -u {service_account}:W {model_dir}'.format(
                gsutil=vm.gsutil_path,
                service_account=benchmark_spec.gcp_service_account,
                model_dir=benchmark_spec.model_dir),
            should_log=True)
    else:
        benchmark_spec.model_dir = '/tmp'

    if (FLAGS.imagenet_data_dir
            or FLAGS.t2t_data_dir) and FLAGS.cloud != 'GCP':
        vm.Install('google_cloud_sdk')
        vm.RemoteCommand('echo "export {}" >> ~/.bashrc'.format(GCP_ENV),
                         login_shell=True)
        credential_path = os.path.join('~', '.config', 'gcloud')
        vm.RemoteCommand('mkdir -p {}'.format(credential_path),
                         login_shell=True)
        credential_file = os.path.join(credential_path,
                                       'application_default_credentials.json')
        vm.PushFile(FLAGS.gcp_credential, credential_file)
        vm.RemoteCommand(
            '{env} gcloud auth '
            'activate-service-account --key-file {key_file}'.format(
                env=GCP_ENV, key_file=credential_file),
            login_shell=True)
Example #11
 def __init__(self, spec):
   super(GkeCluster, self).__init__(spec)
   self.project = spec.vm_spec.project
   self.cluster_version = FLAGS.container_cluster_version
   self.use_application_default_credentials = True
   self.zones = self.zone and self.zone.split(',')
   if not self.zones:
     raise errors.Config.MissingOption(
         'container_cluster.vm_spec.GCP.zone is required.')
   elif len(self.zones) == 1 and util.IsRegion(self.zone):
     self.region = self.zone
     self.zones = []
     logging.info("Interpreting zone '%s' as a region", self.zone)
   else:
     self.region = util.GetRegionFromZone(self.zones[0])
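
The same zone-or-region disambiguation in a standalone form (util.IsRegion is assumed to test the '<geo>-<area><digit>' shape with no zone suffix):

import re

def _resolve_region_sketch(zone_flag: str) -> str:
    zones = zone_flag.split(',') if zone_flag else []
    if not zones:
        raise ValueError('zone is required')
    if len(zones) == 1 and re.fullmatch(r'[a-z]+-[a-z]+\d', zones[0]):
        return zones[0]                 # the flag already names a region
    return zones[0].rsplit('-', 1)[0]   # derive the region from the first zone

assert _resolve_region_sketch('us-central1') == 'us-central1'
assert _resolve_region_sketch('us-central1-a,us-central1-b') == 'us-central1'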
Example #12
def Prepare(benchmark_spec):
    """Install and set up MNIST on the target vm.

  Args:
    benchmark_spec: The benchmark specification
  """
    benchmark_spec.always_call_cleanup = True
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    vm = benchmark_spec.vms[0]
    if not benchmark_spec.tpus:
        vm.Install('tensorflow')
    vm.Install('cloud_tpu_models')
    vm.Install('tensorflow_models')
    if benchmark_spec.tpus:
        storage_service = gcs.GoogleCloudStorageService()
        benchmark_spec.storage_service = storage_service
        bucket = 'pkb{}'.format(FLAGS.run_uri)
        benchmark_spec.bucket = bucket
        benchmark_spec.model_dir = 'gs://{}'.format(bucket)
        location = benchmark_spec.tpu_groups['train'].GetZone()
        storage_service.PrepareService(util.GetRegionFromZone(location))
        storage_service.MakeBucket(bucket)
        storage_service.ChmodBucket(benchmark_spec.gcp_service_account, 'W',
                                    bucket)
    else:
        benchmark_spec.model_dir = '/tmp'

    if (FLAGS.imagenet_data_dir
            or FLAGS.t2t_data_dir) and FLAGS.cloud != 'GCP':
        vm.Install('google_cloud_sdk')
        vm.RemoteCommand('echo "export {}" >> ~/.bashrc'.format(GCP_ENV),
                         login_shell=True)
        credential_path = os.path.join('~', '.config', 'gcloud')
        vm.RemoteCommand('mkdir -p {}'.format(credential_path),
                         login_shell=True)
        credential_file = os.path.join(credential_path,
                                       'application_default_credentials.json')
        vm.PushFile(FLAGS.gcp_credential, credential_file)
        vm.RemoteCommand(
            '{env} gcloud auth '
            'activate-service-account --key-file {key_file}'.format(
                env=GCP_ENV, key_file=credential_file),
            login_shell=True)
Example #13
    def __init__(self, gce_placement_group_spec):
        """Init method for GcePlacementGroup.

    Args:
      gce_placement_group_spec: Object containing the
        information needed to create a GcePlacementGroup.
    """
        super(GcePlacementGroup, self).__init__(gce_placement_group_spec)
        self.project = gce_placement_group_spec.project
        self.region = gcp_util.GetRegionFromZone(gce_placement_group_spec.zone)
        self.zone = None
        self.num_vms = gce_placement_group_spec.num_vms
        self.name = 'perfkit-{}'.format(context.GetThreadBenchmarkSpec().uuid)
        self.style = gce_placement_group_spec.placement_group_style
        self.availability_domain_count = FLAGS.gce_availability_domain_count
        self.metadata.update({
            'placement_group_name': self.name,
            'placement_group_style': self.style
        })
Example #14
def _PrepareBucket(benchmark_spec):
    """Prepare storage bucket for profiling results, if needed.

  Args:
    benchmark_spec: The benchmark specification
  """
    if (mlperf_benchmark.NONE in FLAGS.mlperf_profiler
            and not FLAGS.mlperf_keep_nccl_log):
        return

    if FLAGS.cloud != 'GCP':
        return

    location = benchmark_spec.zones[0]
    bucket = benchmark_spec.bucket
    storage_service = benchmark_spec.storage_service
    storage_service.PrepareService(util.GetRegionFromZone(location))
    storage_service.MakeBucket(bucket, raise_on_failure=False)
    storage_service.AclBucket(benchmark_spec.gcp_service_account, gcs.WRITER,
                              bucket)
Example #15
    def __init__(self, dpb_service_spec):
        super().__init__(dpb_service_spec)
        self.dpb_service_type = self.SERVICE_TYPE
        # Set DPB version as Spark version for metadata
        self.dpb_version = 'spark_' + FLAGS.spark_version

        benchmark_spec = context.GetThreadBenchmarkSpec()
        self.k8s_cluster = benchmark_spec.container_cluster
        assert self.k8s_cluster
        assert self.k8s_cluster.CLUSTER_TYPE == container_service.KUBERNETES
        self.cloud = self.k8s_cluster.CLOUD
        self.container_registry = benchmark_spec.container_registry
        assert self.container_registry

        self.spark_drivers = []

        # TODO(pclay): Support overriding image?
        # Corresponds with data/docker/spark directory
        self.image = 'spark'

        if self.cloud == 'GCP':
            self.region = gcp_util.GetRegionFromZone(self.k8s_cluster.zone)
            self.storage_service = gcs.GoogleCloudStorageService()
            self.persistent_fs_prefix = 'gs://'
        elif self.cloud == 'AWS':
            self.region = self.k8s_cluster.region
            self.storage_service = s3.S3Service()
            self.persistent_fs_prefix = 's3://'
        else:
            raise errors.Config.InvalidValue(
                f'Unsupported Cloud provider {self.cloud}')

        self.storage_service.PrepareService(location=self.region)

        # TODO(pclay): support
        assert not FLAGS.dpb_cluster_properties

        if self.k8s_cluster.num_nodes < 2:
            raise errors.Config.InvalidValue(
                f'Cluster type {KUBERNETES_SPARK_CLUSTER} requires at least '
                f'2 nodes. Found {self.k8s_cluster.num_nodes}.')
Example #16
    def __init__(self, network_spec: GceNetworkSpec):
        super(GceNetwork, self).__init__(network_spec)
        self.project: Optional[str] = network_spec.project
        self.vpn_gateway: Dict[str, GceVpnGateway] = {}

        #  Figuring out the type of network here.
        #  Precedence: User Managed > MULTI > SINGLE > DEFAULT
        self.net_type = network.NetType.DEFAULT.value
        self.cidr = NETWORK_RANGE
        if FLAGS.gce_subnet_region:
            self.net_type = network.NetType.SINGLE.value
            self.cidr = FLAGS.gce_subnet_addr
        if network_spec.cidr:
            self.net_type = network.NetType.MULTI.value
            self.cidr = network_spec.cidr
        self.mtu = network_spec.mtu

        name = self._MakeGceNetworkName()

        subnet_region = (FLAGS.gce_subnet_region if not network_spec.cidr else
                         util.GetRegionFromZone(network_spec.zone))
        mode = 'auto' if subnet_region is None else 'custom'
        self.network_resource = GceNetworkResource(name, mode, self.project,
                                                   self.mtu)
        if subnet_region is None:
            self.subnet_resource = None
        else:
            self.subnet_resource = GceSubnetResource(
                FLAGS.gce_subnet_name or name, name, subnet_region, self.cidr,
                self.project)

        # Stage FW rules.
        self.all_nets = self._GetNetworksFromSpec(
            network_spec)  # Holds the different networks in this run.
        # Holds FW rules for any external subnets.
        self.external_nets_rules: Dict[str, GceFirewallRule] = {}

        #  Set the default rule to allow all traffic within this network's subnet.
        firewall_name = self._MakeGceFWRuleName()
        self.default_firewall_rule = GceFirewallRule(firewall_name,
                                                     self.project, ALLOW_ALL,
                                                     name, self.cidr)

        # Set external rules to allow traffic from other subnets in this benchmark.
        for ext_net in self.all_nets:
            if ext_net == self.cidr:
                continue  # We've already added our own network to the default rule.
            rule_name = self._MakeGceFWRuleName(dst_cidr=ext_net)
            self.external_nets_rules[rule_name] = GceFirewallRule(
                rule_name, self.project, ALLOW_ALL, name, ext_net)

        # Add VpnGateways to the network.
        if FLAGS.use_vpn:
            for gatewaynum in range(0, FLAGS.vpn_service_gateway_count):
                vpn_gateway_name = 'vpngw-%s-%s-%s' % (util.GetRegionFromZone(
                    network_spec.zone), gatewaynum, FLAGS.run_uri)
                self.vpn_gateway[vpn_gateway_name] = GceVpnGateway(
                    vpn_gateway_name, name,
                    util.GetRegionFromZone(network_spec.zone),
                    network_spec.cidr, self.project)

        # Add GCE Placement Group
        no_placement_group = (not FLAGS.placement_group_style
                              or FLAGS.placement_group_style
                              == placement_group.PLACEMENT_GROUP_NONE)
        if no_placement_group:
            self.placement_group = None
        else:
            placement_group_spec = gce_placement_group.GcePlacementGroupSpec(
                'GcePlacementGroupSpec',
                flag_values=FLAGS,
                zone=network_spec.zone,
                project=self.project,
                num_vms=self._GetNumberVms())
            self.placement_group = gce_placement_group.GcePlacementGroup(
                placement_group_spec)
Example #17
def Prepare(benchmark_spec, vm=None):
    """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config.
  """
    _UpdateBenchmarkSpecWithFlags(benchmark_spec)
    if vm is None:
        vm = benchmark_spec.vms[0]

    if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
        raise errors.Config.InvalidValue(
            'Invalid configuration. GPUs and TPUs cannot both be present in the config.'
        )

    vm.RemoteCommand(
        'if [ ! -d "$HOME/training_results_v0.6" ]; then '
        '  git clone https://github.com/mlperf/training_results_v0.6.git ; '
        'fi',
        should_log=True)
    vm.InstallPackages('python3-pip')

    if benchmark_spec.tpus:
        if vm == benchmark_spec.vms[0]:
            storage_service = gcs.GoogleCloudStorageService()
            benchmark_spec.storage_service = storage_service
            bucket = 'pkb{}'.format(FLAGS.run_uri)
            benchmark_spec.bucket = bucket
            benchmark_spec.model_dir = 'gs://{}'.format(bucket)
            location = benchmark_spec.tpu_groups['train'].GetZone()
            storage_service.PrepareService(util.GetRegionFromZone(location))
            storage_service.MakeBucket(bucket)
            storage_service.ChmodBucket(benchmark_spec.gcp_service_account,
                                        'W', bucket)

        # For MLPerf v0.6, the benchmark code differs across hardware.
        if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() in
                ('v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048')):
            run_path = (
                '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
                .format(model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

        if 'mask' in benchmark_spec.benchmark:
            model = 'mask_rcnn'
        elif 'gnmt' in benchmark_spec.benchmark:
            model = 'nmt'
        else:
            model = benchmark_spec.benchmark

        code_path = (
            '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
            .format(
                model=benchmark_spec.benchmark,
                tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

        vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
        vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
        if ('mask' in benchmark_spec.benchmark
                or 'ssd' in benchmark_spec.benchmark):
            # TODO(b/141876878): coco whl package for python 3.5
            vm.RemoteCommand(
                'cd /tmp && '
                'wget https://storage.cloud.google.com/mlperf_artifcats/v0.6_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
            )

            vm.RemoteCommand('cd {path} && '
                             'sed "s/--progress-bar off/ /g" ./setup.sh | '
                             'sed "s/pip /pip3 /g" > ./setup1.sh && '
                             'chmod 755 ./setup1.sh && '
                             './setup1.sh'.format(path=run_path))
        else:
            vm.RemoteCommand(
                'cd {path} && '
                'sed "s/--progress-bar off/ /g" ./setup.sh > ./setup1.sh && '
                'chmod 755 ./setup1.sh && '
                './setup1.sh'.format(path=run_path))

        if 'mask' not in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'pip3 uninstall -y tf-estimator-nightly && '
                'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

        vm.RemoteCommand(
            r'cd {path} && '
            r'sed "s/--model_dir=.*/--model_dir=gs:\/\/{bucket} \\\/g" run_and_time.sh | '
            r'sed "s/--tpu=.*/--tpu={tpu} \\\/g" | '
            r'sed "s/--output_dir=.*/--output_dir=gs:\/\/{bucket} \\\/g" | '
            r'sed "s/--cloud_tpu_name=.*/--cloud_tpu_name={tpu} \\\/g" | '
            r'sed "s/--out_dir=.*/--out_dir=gs:\/\/{bucket} \\\/g" | '
            r'sed "s/--tpu_name=.*/--tpu_name={tpu} \\\/g" > run_and_time1.sh && '
            r'chmod 755 run_and_time1.sh '.format(
                path=run_path,
                bucket=bucket,
                tpu=benchmark_spec.tpu_groups['train'].GetName()))

        if 'gnmt' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd {code_path}/{model} && '
                'cp metric.py metric0.py && '
                'sed "s/ sacrebleu -t/ python3 -m sacrebleu -t/g" metric0.py > metric.py'
                .format(code_path=code_path, model=model))

    else:
        benchmark_spec.model_dir = '/tmp'

        has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
        if has_gpu:
            vm.Install('cuda_toolkit')

        vm.Install('nvidia_docker')
        vm.RemoteCommand(
            'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

        if 'resnet' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
                ' sudo docker build --pull --network=host . -t mlperf-nvidia:image_classification',
                should_log=True)
            _DownloadData(benchmark_spec.imagenet_data_dir,
                          posixpath.join('/data', 'imagenet'), vm)

        if 'transformer' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
                ' sudo docker build --pull --network=host . -t mlperf-nvidia:translation',
                should_log=True)
            _DownloadData(benchmark_spec.wmt_data_dir,
                          posixpath.join('/data', 'wmt'), vm)

        if 'minigo' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:minigo .',
                should_log=True)

        if 'mask' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:object_detection . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco2017_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if 'gnmt' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:rnn_translator . ',
                should_log=True)
            _DownloadData(benchmark_spec.gnmt_data_dir,
                          posixpath.join('/data', 'gnmt'), vm)

        if 'ssd' in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'cd training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch && '
                'sudo docker build --pull --network=host -t mlperf-nvidia:single_stage_detector . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco2017_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)
Example #18
def GetRegionFromZone(zone: str) -> str:
  # Only GCP is supported, since CloudHarmony metadata is exclusive to GCP
  # runs; for any other cloud the zone is returned unchanged.
  if FLAGS.cloud == 'GCP':
    return gcp_util.GetRegionFromZone(zone)
  else:
    return zone
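
A runnable restatement of the pass-through contract, with the FLAGS lookup replaced by an explicit argument:

def _region_or_zone_sketch(zone: str, cloud: str) -> str:
    # GCP zones map to their region; other clouds keep the zone verbatim.
    return zone.rsplit('-', 1)[0] if cloud == 'GCP' else zone

assert _region_or_zone_sketch('us-central1-a', 'GCP') == 'us-central1'
assert _region_or_zone_sketch('us-west-2a', 'AWS') == 'us-west-2a'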
Example #19
def PrepareRunner(benchmark_spec, vm=None):
    """Install and set up MLPerf on the target vm.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs are present in the config.
  """
    vm = vm or benchmark_spec.vms[0]
    if benchmark_spec.tpus:
        if vm == benchmark_spec.vms[0]:
            storage_service = gcs.GoogleCloudStorageService()
            benchmark_spec.storage_service = storage_service
            if FLAGS.mlperf_bucket:
                bucket = FLAGS.mlperf_bucket
                benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
            else:
                bucket = f'pkb-{FLAGS.run_uri}'
                benchmark_spec.model_dir = f'gs://{bucket}'

            benchmark_spec.bucket = bucket
            location = benchmark_spec.tpu_groups['train'].GetZone()
            storage_service.PrepareService(util.GetRegionFromZone(location))
            storage_service.MakeBucket(bucket)
            storage_service.AclBucket(benchmark_spec.gcp_service_account,
                                      gcs.WRITER, bucket)

        # For MLPerf 1.0, the benchmark code differs across hardware.
        if (benchmark_spec.tpu_groups['train'].GetAcceleratorType() in
                ('v3-32', 'v3-128', 'v3-256', 'v3-512', 'v3-1024', 'v3-2048')):
            run_path = (
                '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
                .format(version=MLPERF_VERSION,
                        model=benchmark_spec.benchmark,
                        tpus=benchmark_spec.tpu_groups['train'].
                        GetAcceleratorType()))
        else:
            raise ValueError(
                'MLPerf configurations do not support the hardware in PKB. PKB may '
                'need to be updated if this is a new TPU type.')

        if MASK in benchmark_spec.benchmark:
            model = 'mask_rcnn'
        elif GNMT in benchmark_spec.benchmark:
            model = 'nmt'
        else:
            model = benchmark_spec.benchmark

        code_path = (
            '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
            .format(
                version=MLPERF_VERSION,
                model=benchmark_spec.benchmark,
                tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

        vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
        vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
        if (MASK in benchmark_spec.benchmark
                or SSD in benchmark_spec.benchmark):
            # Install the coco package, to load the coco dataset for Mask-RCNN
            # and SSD benchmarks.
            # TODO(user): coco whl package for python 3.5
            vm.RemoteCommand(
                'cd /tmp && '
                f'wget https://storage.cloud.google.com/mlperf_artifcats/{MLPERF_VERSION}_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
            )

        setup_script = posixpath.join(run_path, 'setup.sh')
        vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
        vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
        vm.RemoteCommand(
            'chmod 755 {script} && {script}'.format(script=setup_script))

        if MASK not in benchmark_spec.benchmark:
            vm.RemoteCommand(
                'pip3 uninstall -y tf-estimator-nightly && '
                'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

        if RESNET in benchmark_spec.benchmark:
            data_dir = benchmark_spec.imagenet_data_dir
        elif TRANSFORMER in benchmark_spec.benchmark:
            data_dir = benchmark_spec.wmt_data_dir
        elif MASK in benchmark_spec.benchmark:
            data_dir = benchmark_spec.coco_data_dir
        elif GNMT in benchmark_spec.benchmark:
            data_dir = benchmark_spec.gnmt_data_dir
        elif SSD in benchmark_spec.benchmark:
            data_dir = benchmark_spec.coco_data_dir
        elif BERT in benchmark_spec.benchmark:
            data_dir = benchmark_spec.bert_data_dir
        else:
            raise ValueError(
                'Unknown benchmark {}: no matching data directory.'.format(
                    benchmark_spec.benchmark))

        run_script = posixpath.join(run_path, 'run_and_time.sh')
        data_dir = data_dir.replace('/', r'\/')
        checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
        decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
        tpu = benchmark_spec.tpu_groups['train'].GetName()
        vm_util.ReplaceText(vm, '--model_dir=.*',
                            r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--data_dir=.*',
                            r'--data_dir={} \\\\'.format(data_dir), run_script)
        vm_util.ReplaceText(
            vm, '--training_file_pattern=.*',
            r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--validation_file_pattern=.*',
            r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--val_json_file=.*',
            r'--val_json_file={}\/instances_val2017.json \\\\'.format(
                data_dir), run_script)
        vm_util.ReplaceText(vm, '--resnet_checkpoint=.*',
                            r'--resnet_checkpoint={} \\\\'.format(checkpoint),
                            run_script)
        vm_util.ReplaceText(
            vm, '--decode_from_file=.*',
            r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--decode_reference=.*',
            r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
            run_script)
        vm_util.ReplaceText(
            vm, '--decode_to_file=.*',
            r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
            r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
            run_script)
        vm_util.ReplaceText(vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu),
                            run_script)
        vm_util.ReplaceText(vm, '--output_dir=.*',
                            r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--cloud_tpu_name=.*',
                            r'--cloud_tpu_name={} \\\\'.format(tpu),
                            run_script)
        vm_util.ReplaceText(vm, '--out_dir=.*',
                            r'--out_dir=gs:\/\/{} \\\\'.format(bucket),
                            run_script)
        vm_util.ReplaceText(vm, '--tpu_name=.*',
                            r'--tpu_name={} \\\\'.format(tpu), run_script)
        vm.RemoteCommand('chmod 755 {}'.format(run_script))

        if GNMT in benchmark_spec.benchmark:
            metric_script = posixpath.join(code_path, model, 'metric.py')
            vm_util.ReplaceText(vm, ' sacrebleu -t',
                                ' python3 -m sacrebleu -t', metric_script)
    else:
        benchmark_spec.model_dir = '/tmp'

        has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
        if has_gpu:
            vm.Install('cuda_toolkit')

        vm.Install('nvidia_docker')
        vm.RemoteCommand(
            'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

        if RESNET in benchmark_spec.benchmark:
            run_script = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet/run_and_time.sh'
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
                ' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
                should_log=True)
            _DownloadData(benchmark_spec.imagenet_data_dir,
                          posixpath.join('/data', 'imagenet'), vm)

        if TRANSFORMER in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
                ' sudo docker build --network=host . -t mlperf-nvidia:translation',
                should_log=True)
            _DownloadData(benchmark_spec.wmt_data_dir,
                          posixpath.join('/data', 'wmt'), vm)

        if MINIGO in benchmark_spec.benchmark:
            build_path = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
            run_script = posixpath.join(build_path, 'run_and_time.sh')
            vm_util.ReplaceText(
                vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
                    FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
            vm.RemoteCommand('cd {} && sudo docker build --network=host -t '
                             'mlperf-nvidia:minigo .'.format(build_path),
                             should_log=True)

        if MASK in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if GNMT in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
                should_log=True)
            _DownloadData(benchmark_spec.gnmt_data_dir,
                          posixpath.join('/data', 'gnmt'), vm)

        if SSD in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/ssd/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
                should_log=True)
            _DownloadData(benchmark_spec.coco_data_dir,
                          posixpath.join('/data', 'coco2017'), vm)

        if BERT in benchmark_spec.benchmark:
            vm.RemoteCommand(
                f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/implementations/pytorch && '
                'sudo docker build --network=host -t mlperf-nvidia:language_model . ',
                should_log=True)
            _DownloadData(benchmark_spec.bert_data_dir,
                          posixpath.join('/data', 'bert_data'), vm)
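
The vm_util.ReplaceText calls above are remote sed substitutions over run_and_time.sh. Locally, the model_dir rewrite is equivalent to something like this (string values hypothetical):

import re

line = '  --model_dir=/tmp/old \\'
bucket = 'pkb-12345'
print(re.sub(r'--model_dir=.*', r'--model_dir=gs://%s \\' % bucket, line))
#   --model_dir=gs://pkb-12345 \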
Example #20
 def __init__(self, name, node_type, zone, project):
   super(GceSoleTenantNodeTemplate, self).__init__()
   self.name = name
   self.node_type = node_type
   self.region = util.GetRegionFromZone(zone)
   self.project = project
Example #21
 def testGetRegionFromZone(self):
     zone = 'us-central1-xyz'
     self.assertEqual(util.GetRegionFromZone(zone), 'us-central1')
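
Assuming the suffix-stripping behavior sketched earlier, an extra case along these lines would tighten the test (method name hypothetical):

 def testGetRegionFromZoneMultiComponent(self):
     # Hypothetical additional case: only the final suffix is dropped.
     self.assertEqual(util.GetRegionFromZone('europe-west4-b'), 'europe-west4')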