def __init__(self, network_spec):
  """Stages the network, subnet, firewall rules and VPN gateways.

  Args:
    network_spec: Spec supplying the project, zone and (optionally) the CIDR
      used to configure this network.
  """
  super(GceNetwork, self).__init__(network_spec)
  self.project = network_spec.project
  self.vpn_gateway = {}

  # Work out which kind of network this is.
  # Precedence: User Managed > MULTI > SINGLE > DEFAULT
  if network_spec.cidr:
    self.net_type = network.NetType.MULTI.value
    self.cidr = network_spec.cidr
  elif FLAGS.gce_subnet_region:
    self.net_type = network.NetType.SINGLE.value
    self.cidr = FLAGS.gce_subnet_addr
  else:
    self.net_type = network.NetType.DEFAULT.value
    self.cidr = NETWORK_RANGE

  name = self._MakeGceNetworkName()

  # A CIDR in the spec ties the subnet to the region of the spec's zone;
  # without one the subnet region comes from the flag (possibly None).
  if network_spec.cidr:
    subnet_region = util.GetRegionFromZone(network_spec.zone)
  else:
    subnet_region = FLAGS.gce_subnet_region
  has_subnet = subnet_region is not None

  self.network_resource = GceNetworkResource(
      name, 'custom' if has_subnet else 'auto', self.project)
  self.subnet_resource = None
  if has_subnet:
    self.subnet_resource = GceSubnetResource(
        FLAGS.gce_subnet_name or name, name, subnet_region, self.cidr,
        self.project)

  # Stage FW rules.
  # All the networks taking part in this run.
  self.all_nets = self._GetNetworksFromSpec(network_spec)
  # FW rules for subnets other than our own, keyed by rule name.
  self.external_nets_rules = {}

  # The default rule allows all traffic within this network's own subnet.
  self.default_firewall_rule = GceFirewallRule(
      self._MakeGceFWRuleName(), self.project, ALLOW_ALL, name, self.cidr)

  # One extra rule per other subnet in this benchmark; our own CIDR is
  # already covered by the default rule above.
  for other_cidr in self.all_nets:
    if other_cidr != self.cidr:
      rule_name = self._MakeGceFWRuleName(dst_cidr=other_cidr)
      self.external_nets_rules[rule_name] = GceFirewallRule(
          rule_name, self.project, ALLOW_ALL, name, other_cidr)

  # Add VpnGateways to the network.
  if FLAGS.use_vpn:
    for gateway_index in range(FLAGS.vpn_service_gateway_count):
      gateway_region = util.GetRegionFromZone(network_spec.zone)
      gateway_name = 'vpngw-%s-%s-%s' % (gateway_region, gateway_index,
                                         FLAGS.run_uri)
      self.vpn_gateway[gateway_name] = GceVpnGateway(
          gateway_name, name, gateway_region, network_spec.cidr,
          self.project)
def Prepare(benchmark_spec):
  """Install and set up MNIST on the target vm.

  Installs the model packages on the first VM and, when TPUs are configured,
  creates a GCS bucket to serve as the model directory.

  Args:
    benchmark_spec: The benchmark specification
  """
  benchmark_spec.always_call_cleanup = True
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  target_vm = benchmark_spec.vms[0]
  uses_tpu = bool(benchmark_spec.tpus)

  # TensorFlow itself is only installed on the VM for non-TPU runs.
  packages = ['cloud_tpu_models', 'tensorflow_models']
  if not uses_tpu:
    packages.insert(0, 'tensorflow')
  for package in packages:
    target_vm.Install(package)

  if not uses_tpu:
    benchmark_spec.model_dir = '/tmp'
    return

  service = gcs.GoogleCloudStorageService()
  benchmark_spec.storage_service = service
  bucket_name = 'pkb{}'.format(FLAGS.run_uri)
  benchmark_spec.bucket = bucket_name
  benchmark_spec.model_dir = 'gs://{}'.format(bucket_name)
  # The bucket lives in the region of the training TPU group.
  tpu_zone = benchmark_spec.tpu_groups['train'].GetZone()
  service.PrepareService(util.GetRegionFromZone(tpu_zone))
  service.MakeBucket(bucket_name)
  service.AclBucket(benchmark_spec.gcp_service_account, gcs.WRITER,
                    bucket_name)
def GetCommonMetadata(custom_metadata: Optional[Dict[str, Any]] = None) -> str:
  """Returns pkb metadata associated with this run as cloudharmony metadata.

  Cloudharmony benchmarks accept their setup configuration as input and echo
  it back as run metadata. This builds that input from pkb flags, rendered as
  space-separated `--key value` pairs.

  Args:
    custom_metadata: a dictionary of metadata key value pairs that should
      override any flag chosen in the function, or should also be included.

  Returns:
    A string of metadata that should be appended to the cloudharmony
    benchmark run; empty for non-GCP runs.
  """
  # Should not be including cloudharmony metadata for non-gcp runs.
  if FLAGS.cloud != providers.GCP:
    return ''

  first_zone = FLAGS.zone[0]
  defaults = {
      'meta_compute_service': 'Google Compute Engine',
      'meta_compute_service_id': 'google:compute',
      'meta_instance_id': FLAGS.machine_type,
      'meta_provider': 'Google Cloud Platform',
      'meta_provider_id': 'google',
      'meta_region': gcp_util.GetRegionFromZone(first_zone),
      'meta_zone': first_zone,
      'meta_test_id': FLAGS.run_uri,
  }
  # Caller-supplied entries win over the flag-derived defaults.
  merged = {**defaults, **(custom_metadata or {})}
  return ' '.join(f'--{key} {value}' for key, value in merged.items())
def __init__(self, dpb_service_spec):
  """Selects region and object storage for the worker group's cloud.

  Args:
    dpb_service_spec: Spec of the service; its worker group names the cloud.

  Raises:
    errors.Setup.InvalidSetupError: if no dpb_service_zone is set.
  """
  super(UnmanagedDpbService, self).__init__(dpb_service_spec)

  # Dictionary to hold the cluster vms.
  self.vms = {}
  self.cloud = dpb_service_spec.worker_group.cloud

  if not self.dpb_service_zone:
    raise errors.Setup.InvalidSetupError(
        'dpb_service_zone must be provided, for provisioning.')

  if self.cloud == 'GCP':
    region = gcp_util.GetRegionFromZone(FLAGS.dpb_service_zone)
    storage, prefix = gcs.GoogleCloudStorageService(), 'gs://'
  elif self.cloud == 'AWS':
    region = aws_util.GetRegionFromZone(FLAGS.dpb_service_zone)
    storage, prefix = s3.S3Service(), 's3://'
  else:
    # No object storage for this cloud: leave everything unset and warn.
    region = storage = prefix = None
    self.manage_bucket = False
    logging.warning(
        'Cloud provider %s does not support object storage. '
        'Some benchmarks will not work.', self.cloud)

  self.region = region
  self.storage_service = storage
  self.persistent_fs_prefix = prefix

  if self.storage_service:
    self.storage_service.PrepareService(location=self.region)

  # set in _Create of derived classes
  self.leader = None
def __init__(self, dpb_service_spec):
  """Selects region and object storage for the worker group's cloud.

  Args:
    dpb_service_spec: Spec of the service; its worker group names the cloud.

  Raises:
    errors.Setup.InvalidSetupError: if no dpb_service_zone is set.
    errors.Config.InvalidValue: if the cloud is neither GCP nor AWS.
  """
  super(UnmanagedDpbService, self).__init__(dpb_service_spec)

  # Dictionary to hold the cluster vms.
  self.vms = {}
  self.cloud = dpb_service_spec.worker_group.cloud

  if not self.dpb_service_zone:
    raise errors.Setup.InvalidSetupError(
        'dpb_service_zone must be provided, for provisioning.')

  if self.cloud == 'GCP':
    region = gcp_util.GetRegionFromZone(FLAGS.dpb_service_zone)
    storage, prefix = gcs.GoogleCloudStorageService(), 'gs://'
  elif self.cloud == 'AWS':
    region = aws_util.GetRegionFromZone(FLAGS.dpb_service_zone)
    storage, prefix = s3.S3Service(), 's3://'
  else:
    raise errors.Config.InvalidValue(
        f'Unsupported Cloud provider {self.cloud}')

  self.region = region
  self.storage_service = storage
  self.persistent_fs_prefix = prefix

  if self.storage_service:
    self.storage_service.PrepareService(location=self.region)

  # set in _Create of derived classes
  self.leader = None
def __init__(self, disk_spec, name, zone, project, image=None,
             image_project=None, replica_zones=None):
  """Records the disk's identity and fills in its metadata.

  Args:
    disk_spec: Spec of the disk; its disk_type selects the DISK_METADATA
      entry merged into this disk's metadata.
    name: Name of the disk.
    zone: Zone of the disk; the region is derived from it.
    project: Project of the disk.
    image: Optional image, stored as-is.
    image_project: Optional image project, stored as-is.
    replica_zones: Optional replica zones; when truthy the disk is marked as
      regionally replicated in its metadata.
  """
  super(GceDisk, self).__init__(disk_spec)
  self.attached_vm_name = None
  self.image = image
  self.image_project = image_project
  self.name = name
  self.zone = zone
  self.project = project
  self.replica_zones = replica_zones
  self.region = util.GetRegionFromZone(self.zone)

  self.provisioned_iops = None
  if self.disk_type == PD_EXTREME:
    self.provisioned_iops = FLAGS.gcp_provisioned_iops

  # Work on a copy: writing the replication marker into the shared
  # DISK_METADATA entry would leak it into every later disk of this type.
  disk_metadata = dict(DISK_METADATA[disk_spec.disk_type])
  if self.replica_zones:
    disk_metadata[disk.REPLICATION] = disk.REGION
    self.metadata['replica_zones'] = replica_zones
  self.metadata.update(disk_metadata)

  if self.disk_type == disk.LOCAL:
    self.metadata['interface'] = FLAGS.gce_ssd_interface
  # Only reported when the flag supplied a truthy IOPS value.
  if self.provisioned_iops and self.disk_type == PD_EXTREME:
    self.metadata['provisioned_iops'] = self.provisioned_iops
def GetFullRegistryTag(self, image):
  """Gets the full tag of the image.

  Args:
    image: Name of the image.

  Returns:
    The tag in the form <multi-region>.gcr.io/<project>/<image>, where the
    multi-region is derived from this object's zone.
  """
  zone_region = util.GetRegionFromZone(self.zone)
  multi_region = util.GetMultiRegionFromRegion(zone_region)
  return '{hostname}/{project}/{name}'.format(
      hostname='{region}.gcr.io'.format(region=multi_region),
      project=self.project,
      name=image)
def _GetDefaultConfig(self):
  """Gets the config that corresponds the region used for the test.

  Returns:
    'regional-<region>', using the region of the first zone in FLAGS.zones
    (or FLAGS.zone when FLAGS.zones is empty); falls back to the default
    region when an IndexError is raised while resolving it.
  """
  try:
    candidate_zones = FLAGS.zones or FLAGS.zone
    region = util.GetRegionFromZone(candidate_zones[0])
  except IndexError:
    region = _DEFAULT_REGION
  return f'regional-{region}'
def _Create(self):
  """Creates the Cloud SQL instance and authorizes traffic from anywhere.

  Assembles the `beta sql instances create` argument list from the spec
  (engine, machine shape, high availability, backups) and issues it
  asynchronously against this object's project.
  """
  disk_size = self.spec.disk_spec.disk_size
  zone = self.spec.vm_spec.zone
  # TODO: We should create the client VM with a static IP, and
  # only authorize that specific IP address. The client VM can be accessed
  # like so: self.client_vm
  open_network = '0.0.0.0/0'
  engine_version = self._GetEngineVersionString(self.spec.engine,
                                                self.spec.engine_version)

  args = [
      self,
      'beta',
      'sql',
      'instances',
      'create',
      self.instance_id,
      '--quiet',
      '--format=json',
      '--async',
      '--activation-policy=ALWAYS',
      '--assign-ip',
      '--authorized-networks=%s' % open_network,
      '--enable-bin-log',
      '--gce-zone=%s' % zone,
      '--region=%s' % util.GetRegionFromZone(zone),
      '--database-version=%s' % engine_version,
      '--pricing-plan=%s' % self.GCP_PRICING_PLAN,
      '--storage-size=%d' % disk_size,
  ]

  # TODO(ferneyhough): add tier machine types support for Postgres
  if self.spec.engine == managed_relational_db.MYSQL:
    args.append('--tier=%s' % self.spec.vm_spec.machine_type)
  else:
    # Other engines are sized by explicit cpu/memory, validated first.
    self._ValidateSpec()
    memory = self.spec.vm_spec.memory
    cpus = self.spec.vm_spec.cpus
    self._ValidateMachineType(memory, cpus)
    args.extend(['--cpu={}'.format(cpus), '--memory={}MiB'.format(memory)])

  if self.spec.high_availability:
    args.append(self._GetHighAvailabilityFlag())

  if self.spec.backup_enabled:
    args.extend([
        '--backup',
        '--backup-start-time={}'.format(self.spec.backup_start_time),
    ])
  else:
    args.append('--no-backup')

  create_cmd = util.GcloudCommand(*args)
  create_cmd.flags['project'] = self.project
  _, _, _ = create_cmd.Issue()
def Prepare(benchmark_spec):
  """Install and set up MNIST on the target vm.

  Installs the model code on the first VM, creates a regional GCS bucket as
  the model directory for TPU runs, and activates GCP credentials on the VM
  when data lives in GCS but the benchmark runs on another cloud.

  Args:
    benchmark_spec: The benchmark specification
  """
  benchmark_spec.always_call_cleanup = True
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  target_vm = benchmark_spec.vms[0]

  if not benchmark_spec.use_tpu:
    target_vm.Install('tensorflow')
  target_vm.Install('cloud_tpu_models')
  target_vm.RemoteCommand(
      'git clone https://github.com/tensorflow/models.git', should_log=True)

  if benchmark_spec.use_tpu:
    service = gcs.GoogleCloudStorageService()
    service.PrepareVM(target_vm)
    benchmark_spec.storage_service = service
    benchmark_spec.model_dir = 'gs://{}'.format(FLAGS.run_uri)

    # Make the bucket in the region of the training TPU group.
    make_bucket_cmd = '{gsutil} mb -c regional -l {location} {model_dir}'.format(
        gsutil=target_vm.gsutil_path,
        location=util.GetRegionFromZone(
            benchmark_spec.tpu_groups['train'].GetZone()),
        model_dir=benchmark_spec.model_dir)
    target_vm.RemoteCommand(make_bucket_cmd, should_log=True)

    # Give the service account write access to that bucket.
    grant_cmd = '{gsutil} acl ch -u {service_account}:W {model_dir}'.format(
        gsutil=target_vm.gsutil_path,
        service_account=benchmark_spec.gcp_service_account,
        model_dir=benchmark_spec.model_dir)
    target_vm.RemoteCommand(grant_cmd, should_log=True)
  else:
    benchmark_spec.model_dir = '/tmp'

  wants_data_dir = FLAGS.imagenet_data_dir or FLAGS.t2t_data_dir
  if not wants_data_dir or FLAGS.cloud == 'GCP':
    return

  # A data dir was given but the VM is not on GCP: install the SDK and
  # activate the pushed service-account key.
  target_vm.Install('google_cloud_sdk')
  target_vm.RemoteCommand(
      'echo "export {}" >> ~/.bashrc'.format(GCP_ENV), login_shell=True)
  gcloud_dir = os.path.join('~', '.config', 'gcloud')
  target_vm.RemoteCommand('mkdir -p {}'.format(gcloud_dir), login_shell=True)
  key_path = os.path.join(gcloud_dir, 'application_default_credentials.json')
  target_vm.PushFile(FLAGS.gcp_credential, key_path)
  target_vm.RemoteCommand(
      '{env} gcloud auth '
      'activate-service-account --key-file {key_file}'.format(
          env=GCP_ENV, key_file=key_path),
      login_shell=True)
def __init__(self, spec):
  """Reads project and version, and resolves the cluster's zones and region.

  Args:
    spec: Cluster spec; its vm_spec supplies the project.

  Raises:
    errors.Config.MissingOption: if no zone is configured.
  """
  super(GkeCluster, self).__init__(spec)
  self.project = spec.vm_spec.project
  self.cluster_version = FLAGS.container_cluster_version
  self.use_application_default_credentials = True

  # self.zone may hold several comma-separated zones.
  self.zones = self.zone.split(',') if self.zone else self.zone
  if not self.zones:
    raise errors.Config.MissingOption(
        'container_cluster.vm_spec.GCP.zone is required.')

  if len(self.zones) == 1 and util.IsRegion(self.zone):
    # The single "zone" is really a region: no explicit zones remain.
    self.region = self.zone
    self.zones = []
    logging.info("Interpreting zone '%s' as a region", self.zone)
  else:
    self.region = util.GetRegionFromZone(self.zones[0])
def Prepare(benchmark_spec):
  """Install and set up MNIST on the target vm.

  Installs the model packages on the first VM, creates a GCS bucket as the
  model directory for TPU runs, and activates GCP credentials on the VM when
  data lives in GCS but the benchmark runs on another cloud.

  Args:
    benchmark_spec: The benchmark specification
  """
  benchmark_spec.always_call_cleanup = True
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  target_vm = benchmark_spec.vms[0]

  if not benchmark_spec.tpus:
    target_vm.Install('tensorflow')
  for package in ('cloud_tpu_models', 'tensorflow_models'):
    target_vm.Install(package)

  if benchmark_spec.tpus:
    service = gcs.GoogleCloudStorageService()
    benchmark_spec.storage_service = service
    bucket_name = 'pkb{}'.format(FLAGS.run_uri)
    benchmark_spec.bucket = bucket_name
    benchmark_spec.model_dir = 'gs://{}'.format(bucket_name)
    # The bucket lives in the region of the training TPU group.
    tpu_zone = benchmark_spec.tpu_groups['train'].GetZone()
    service.PrepareService(util.GetRegionFromZone(tpu_zone))
    service.MakeBucket(bucket_name)
    service.ChmodBucket(benchmark_spec.gcp_service_account, 'W', bucket_name)
  else:
    benchmark_spec.model_dir = '/tmp'

  wants_data_dir = FLAGS.imagenet_data_dir or FLAGS.t2t_data_dir
  if not wants_data_dir or FLAGS.cloud == 'GCP':
    return

  # A data dir was given but the VM is not on GCP: install the SDK and
  # activate the pushed service-account key.
  target_vm.Install('google_cloud_sdk')
  target_vm.RemoteCommand(
      'echo "export {}" >> ~/.bashrc'.format(GCP_ENV), login_shell=True)
  gcloud_dir = os.path.join('~', '.config', 'gcloud')
  target_vm.RemoteCommand('mkdir -p {}'.format(gcloud_dir), login_shell=True)
  key_path = os.path.join(gcloud_dir, 'application_default_credentials.json')
  target_vm.PushFile(FLAGS.gcp_credential, key_path)
  target_vm.RemoteCommand(
      '{env} gcloud auth '
      'activate-service-account --key-file {key_file}'.format(
          env=GCP_ENV, key_file=key_path),
      login_shell=True)
def __init__(self, gce_placement_group_spec):
  """Init method for GcePlacementGroup.

  Args:
    gce_placement_group_spec: Object containing the information needed to
      create an GcePlacementGroup.
  """
  spec = gce_placement_group_spec
  super(GcePlacementGroup, self).__init__(spec)
  self.project = spec.project
  # Only the region derived from the spec's zone is kept; zone is cleared.
  self.region = gcp_util.GetRegionFromZone(spec.zone)
  self.zone = None
  self.num_vms = spec.num_vms
  # Named after the uuid of the current thread's benchmark spec.
  benchmark_uuid = context.GetThreadBenchmarkSpec().uuid
  self.name = 'perfkit-{}'.format(benchmark_uuid)
  self.style = spec.placement_group_style
  self.availability_domain_count = FLAGS.gce_availability_domain_count
  self.metadata.update(
      placement_group_name=self.name, placement_group_style=self.style)
def _PrepareBucket(benchmark_spec):
  """Prepare storage bucket for profiling results, if needed.

  Does nothing when profiling is NONE and NCCL logs are not kept, or when the
  run is not on GCP.

  Args:
    benchmark_spec: The benchmark specification
  """
  nothing_to_store = (mlperf_benchmark.NONE in FLAGS.mlperf_profiler and
                      not FLAGS.mlperf_keep_nccl_log)
  if nothing_to_store or FLAGS.cloud != 'GCP':
    return

  bucket_name = benchmark_spec.bucket
  service = benchmark_spec.storage_service
  # The bucket goes in the region of the benchmark's first zone.
  service.PrepareService(util.GetRegionFromZone(benchmark_spec.zones[0]))
  service.MakeBucket(bucket_name, raise_on_failure=False)
  service.AclBucket(benchmark_spec.gcp_service_account, gcs.WRITER,
                    bucket_name)
def __init__(self, dpb_service_spec):
  """Binds the service to the benchmark's Kubernetes cluster and storage.

  Args:
    dpb_service_spec: Spec of the DPB service.

  Raises:
    errors.Config.InvalidValue: if the cluster's cloud is neither GCP nor
      AWS, or the cluster has fewer than 2 nodes.
  """
  super().__init__(dpb_service_spec)
  self.dpb_service_type = self.SERVICE_TYPE
  # Set DPB version as Spark version for metadata
  self.dpb_version = 'spark_' + FLAGS.spark_version

  benchmark_spec = context.GetThreadBenchmarkSpec()
  self.k8s_cluster = benchmark_spec.container_cluster
  assert self.k8s_cluster
  assert self.k8s_cluster.CLUSTER_TYPE == container_service.KUBERNETES
  self.cloud = self.k8s_cluster.CLOUD
  self.container_registry = benchmark_spec.container_registry
  assert self.container_registry

  self.spark_drivers = []

  # TODO(pclay): Support overriding image?
  # Corresponds with data/docker/spark directory
  self.image = 'spark'

  if self.cloud == 'GCP':
    self.region = gcp_util.GetRegionFromZone(self.k8s_cluster.zone)
    self.storage_service = gcs.GoogleCloudStorageService()
    self.persistent_fs_prefix = 'gs://'
  elif self.cloud == 'AWS':
    self.region = self.k8s_cluster.region
    self.storage_service = s3.S3Service()
    self.persistent_fs_prefix = 's3://'
  else:
    raise errors.Config.InvalidValue(
        f'Unsupported Cloud provider {self.cloud}')

  self.storage_service.PrepareService(location=self.region)

  # TODO(pclay): support
  assert not FLAGS.dpb_cluster_properties

  if self.k8s_cluster.num_nodes < 2:
    # Trailing space keeps the two sentences apart in the joined message.
    raise errors.Config.InvalidValue(
        f'Cluster type {KUBERNETES_SPARK_CLUSTER} requires at least 2 nodes. '
        f'Found {self.k8s_cluster.num_nodes}.')
def __init__(self, network_spec: GceNetworkSpec):
  """Stages the network, subnet, firewall, VPN and placement-group resources.

  Args:
    network_spec: Spec supplying the project, zone, MTU and (optionally) the
      CIDR used to configure this network.
  """
  super(GceNetwork, self).__init__(network_spec)
  self.project: Optional[str] = network_spec.project
  self.vpn_gateway: Dict[str, GceVpnGateway] = {}

  # Work out which kind of network this is.
  # Precedence: User Managed > MULTI > SINGLE > DEFAULT
  if network_spec.cidr:
    self.net_type = network.NetType.MULTI.value
    self.cidr = network_spec.cidr
  elif FLAGS.gce_subnet_region:
    self.net_type = network.NetType.SINGLE.value
    self.cidr = FLAGS.gce_subnet_addr
  else:
    self.net_type = network.NetType.DEFAULT.value
    self.cidr = NETWORK_RANGE
  self.mtu = network_spec.mtu

  name = self._MakeGceNetworkName()

  # A CIDR in the spec ties the subnet to the region of the spec's zone;
  # without one the subnet region comes from the flag (possibly None).
  if network_spec.cidr:
    subnet_region = util.GetRegionFromZone(network_spec.zone)
  else:
    subnet_region = FLAGS.gce_subnet_region
  has_subnet = subnet_region is not None

  self.network_resource = GceNetworkResource(
      name, 'custom' if has_subnet else 'auto', self.project, self.mtu)
  self.subnet_resource = None
  if has_subnet:
    self.subnet_resource = GceSubnetResource(
        FLAGS.gce_subnet_name or name, name, subnet_region, self.cidr,
        self.project)

  # Stage FW rules.
  # All the networks taking part in this run.
  self.all_nets = self._GetNetworksFromSpec(network_spec)
  # Holds FW rules for any external subnets.
  self.external_nets_rules: Dict[str, GceFirewallRule] = {}

  # The default rule allows all traffic within this network's own subnet.
  self.default_firewall_rule = GceFirewallRule(
      self._MakeGceFWRuleName(), self.project, ALLOW_ALL, name, self.cidr)

  # One extra rule per other subnet in this benchmark; our own CIDR is
  # already covered by the default rule above.
  for other_cidr in self.all_nets:
    if other_cidr != self.cidr:
      rule_name = self._MakeGceFWRuleName(dst_cidr=other_cidr)
      self.external_nets_rules[rule_name] = GceFirewallRule(
          rule_name, self.project, ALLOW_ALL, name, other_cidr)

  # Add VpnGateways to the network.
  if FLAGS.use_vpn:
    for gateway_index in range(FLAGS.vpn_service_gateway_count):
      gateway_region = util.GetRegionFromZone(network_spec.zone)
      gateway_name = 'vpngw-%s-%s-%s' % (gateway_region, gateway_index,
                                         FLAGS.run_uri)
      self.vpn_gateway[gateway_name] = GceVpnGateway(
          gateway_name, name, gateway_region, network_spec.cidr,
          self.project)

  # Add GCE Placement Group
  wants_placement_group = (
      FLAGS.placement_group_style and
      FLAGS.placement_group_style != placement_group.PLACEMENT_GROUP_NONE)
  if wants_placement_group:
    group_spec = gce_placement_group.GcePlacementGroupSpec(
        'GcePlacementGroupSpec',
        flag_values=FLAGS,
        zone=network_spec.zone,
        project=self.project,
        num_vms=self._GetNumberVms())
    self.placement_group = gce_placement_group.GcePlacementGroup(group_spec)
  else:
    self.placement_group = None
def Prepare(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  Clones the MLPerf v0.6 training results repository on the VM, then either
  prepares the TPU code path (bucket, setup script, run script) or, when an
  NVIDIA GPU is found, builds the docker image and downloads the data of the
  selected benchmark.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on; defaults to the first VM of the benchmark.

  Raises:
    errors.Config.InvalidValue: if both GPUs and TPUs appear in the config.
    ValueError: if the TPU accelerator type is not a supported one.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  if vm is None:
    vm = benchmark_spec.vms[0]

  if (bool(benchmark_spec.tpus) and cuda_toolkit.CheckNvidiaGpuExists(vm)):
    raise errors.Config.InvalidValue(
        'Invalid configuration. GPUs and TPUs can not both present in the config.'
    )

  vm.RemoteCommand(
      'if [ ! -d "$HOME/training_results_v0.6" ]; then '
      ' git clone https://github.com/mlperf/training_results_v0.6.git ; '
      'fi',
      should_log=True)
  vm.InstallPackages('python3-pip')

  if benchmark_spec.tpus:
    # Bind the bucket name for every VM: the run script rewrite below needs
    # it even when this VM is not the one that creates the bucket.
    bucket = 'pkb{}'.format(FLAGS.run_uri)
    if vm == benchmark_spec.vms[0]:
      # Only the first VM creates the bucket and records it on the spec.
      storage_service = gcs.GoogleCloudStorageService()
      benchmark_spec.storage_service = storage_service
      benchmark_spec.bucket = bucket
      benchmark_spec.model_dir = 'gs://{}'.format(bucket)
      location = benchmark_spec.tpu_groups['train'].GetZone()
      storage_service.PrepareService(util.GetRegionFromZone(location))
      storage_service.MakeBucket(bucket)
      storage_service.ChmodBucket(benchmark_spec.gcp_service_account, 'W',
                                  bucket)

    # For MLPerf v0.6, the benchmake code of different hardware are different.
    accelerator_type = benchmark_spec.tpu_groups['train'].GetAcceleratorType()
    if accelerator_type not in ('v3-32', 'v3-128', 'v3-256', 'v3-512',
                                'v3-1024', 'v3-2048'):
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
    run_path = (
        '$HOME/training_results_v0.6/Google/benchmarks/{model}/tpu-{tpus}'
        .format(model=benchmark_spec.benchmark, tpus=accelerator_type))

    if 'mask' in benchmark_spec.benchmark:
      model = 'mask_rcnn'
    elif 'gnmt' in benchmark_spec.benchmark:
      model = 'nmt'
    else:
      model = benchmark_spec.benchmark

    code_path = (
        '$HOME/training_results_v0.6/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
        .format(model=benchmark_spec.benchmark, tpus=accelerator_type))

    vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
    vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
    if ('mask' in benchmark_spec.benchmark or
        'ssd' in benchmark_spec.benchmark):
      # TODO(b/141876878): coco whl package for python 3.5
      vm.RemoteCommand(
          'cd /tmp && '
          'wget https://storage.cloud.google.com/mlperf_artifcats/v0.6_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
      )

      # These benchmarks additionally get `pip` rewritten to `pip3`.
      vm.RemoteCommand('cd {path} && '
                       'sed "s/--progress-bar off/ /g" ./setup.sh | '
                       'sed "s/pip /pip3 /g" > ./setup1.sh && '
                       'chmod 755 ./setup1.sh && '
                       './setup1.sh'.format(path=run_path))
    else:
      vm.RemoteCommand(
          'cd {path} && '
          'sed "s/--progress-bar off/ /g" ./setup.sh > ./setup1.sh && '
          'chmod 755 ./setup1.sh && '
          './setup1.sh'.format(path=run_path))

    if 'mask' not in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'pip3 uninstall -y tf-estimator-nightly && '
          'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

    # Point the run script at this run's bucket and TPU; the edited copy is
    # written to run_and_time1.sh.
    vm.RemoteCommand(
        r'cd {path} && '
        r'sed "s/--model_dir=.*/--model_dir=gs:\/\/{bucket} \\\/g" run_and_time.sh | '
        r'sed "s/--tpu=.*/--tpu={tpu} \\\/g" | '
        r'sed "s/--output_dir=.*/--output_dir=gs:\/\/{bucket} \\\/g" | '
        r'sed "s/--cloud_tpu_name=.*/--cloud_tpu_name={tpu} \\\/g" | '
        r'sed "s/--out_dir=.*/--out_dir=gs:\/\/{bucket} \\\/g" | '
        r'sed "s/--tpu_name=.*/--tpu_name={tpu} \\\/g" > run_and_time1.sh && '
        r'chmod 755 run_and_time1.sh '.format(
            path=run_path,
            bucket=bucket,
            tpu=benchmark_spec.tpu_groups['train'].GetName()))

    if 'gnmt' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd {code_path}/{model} && '
          'cp metric.py metric0.py && '
          'sed "s/ sacrebleu -t/ python3 -m sacrebleu -t/g" metric0.py > metric.py'
          .format(code_path=code_path, model=model))
  else:
    benchmark_spec.model_dir = '/tmp'

  has_gpu = cuda_toolkit.CheckNvidiaGpuExists(vm)
  if has_gpu:
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_docker')
    vm.RemoteCommand(
        'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

    if 'resnet' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
          ' sudo docker build --pull --network=host . -t mlperf-nvidia:image_classification',
          should_log=True)
      _DownloadData(benchmark_spec.imagenet_data_dir,
                    posixpath.join('/data', 'imagenet'), vm)

    if 'transformer' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
          ' sudo docker build --pull --network=host . -t mlperf-nvidia:translation',
          should_log=True)
      _DownloadData(benchmark_spec.wmt_data_dir,
                    posixpath.join('/data', 'wmt'), vm)

    if 'minigo' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:minigo .',
          should_log=True)

    if 'mask' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:object_detection . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco2017_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if 'gnmt' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:rnn_translator . ',
          should_log=True)
      _DownloadData(benchmark_spec.gnmt_data_dir,
                    posixpath.join('/data', 'gnmt'), vm)

    if 'ssd' in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'cd training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch && '
          'sudo docker build --pull --network=host -t mlperf-nvidia:single_stage_detector . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco2017_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)
def GetRegionFromZone(zone: str) -> str:
  """Returns the GCP region of `zone`, or `zone` itself on other clouds.

  Args:
    zone: The zone to resolve.

  Returns:
    The region computed by gcp_util when FLAGS.cloud is 'GCP'; otherwise the
    zone unchanged.
  """
  # only gcp is supported as cloudharmony metadata is exclusive to gcp runs.
  if FLAGS.cloud != 'GCP':
    return zone
  return gcp_util.GetRegionFromZone(zone)
def PrepareRunner(benchmark_spec, vm=None):
  """Install and set up MLPerf on the target vm.

  For TPU runs this prepares the GCS bucket (first VM only), runs the setup
  script and rewrites the flags in run_and_time.sh. When an NVIDIA GPU is
  found it builds the docker image and downloads the data of the selected
  benchmark.

  Args:
    benchmark_spec: The benchmark specification
    vm: The VM to work on; defaults to the first VM of the benchmark.

  Raises:
    ValueError: if the TPU accelerator type is not a supported one, or the
      benchmark name matches none of the known benchmarks.
  """
  vm = vm or benchmark_spec.vms[0]
  if benchmark_spec.tpus:
    # Bind the bucket name for every VM: the run script rewrite below needs
    # it even when this VM is not the one that creates the bucket.
    bucket = FLAGS.mlperf_bucket or f'pkb-{FLAGS.run_uri}'
    if vm == benchmark_spec.vms[0]:
      # Only the first VM creates the bucket and records it on the spec.
      storage_service = gcs.GoogleCloudStorageService()
      benchmark_spec.storage_service = storage_service
      if FLAGS.mlperf_bucket:
        # A user-provided bucket gets a per-run sub-directory.
        benchmark_spec.model_dir = f'gs://{bucket}/pkb-{FLAGS.run_uri}'
      else:
        benchmark_spec.model_dir = f'gs://{bucket}'

      benchmark_spec.bucket = bucket
      location = benchmark_spec.tpu_groups['train'].GetZone()
      storage_service.PrepareService(util.GetRegionFromZone(location))
      storage_service.MakeBucket(bucket)
      storage_service.AclBucket(benchmark_spec.gcp_service_account,
                                gcs.WRITER, bucket)

    # For MLPerf 1.0, the benchmake code of different hardware are different.
    accelerator_type = benchmark_spec.tpu_groups['train'].GetAcceleratorType()
    if accelerator_type not in ('v3-32', 'v3-128', 'v3-256', 'v3-512',
                                'v3-1024', 'v3-2048'):
      raise ValueError(
          'MLPerf configurations do not support the hardware in PKB. PKB may '
          'need to be updated if this is a new TPU type.')
    run_path = (
        '$HOME/training_results_{version}/Google/benchmarks/{model}/tpu-{tpus}'
        .format(
            version=MLPERF_VERSION,
            model=benchmark_spec.benchmark,
            tpus=accelerator_type))

    if MASK in benchmark_spec.benchmark:
      model = 'mask_rcnn'
    elif GNMT in benchmark_spec.benchmark:
      model = 'nmt'
    else:
      model = benchmark_spec.benchmark

    code_path = (
        '$HOME/training_results_{version}/Google/benchmarks/{model}/implementations/tpu-{tpus}-{model}'
        .format(
            version=MLPERF_VERSION,
            model=benchmark_spec.benchmark,
            tpus=accelerator_type))

    vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
    vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
    if (MASK in benchmark_spec.benchmark or SSD in benchmark_spec.benchmark):
      # Install the coco package, to load the coco dataset for Mask-RCNN
      # and SSD benchmarks.
      # TODO(user): coco whl package for python 3.5
      vm.RemoteCommand(
          'cd /tmp && '
          f'wget https://storage.cloud.google.com/mlperf_artifcats/{MLPERF_VERSION}_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
      )

    setup_script = posixpath.join(run_path, 'setup.sh')
    vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
    vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
    vm.RemoteCommand(
        'chmod 755 {script} && {script}'.format(script=setup_script))

    if MASK not in benchmark_spec.benchmark:
      vm.RemoteCommand(
          'pip3 uninstall -y tf-estimator-nightly && '
          'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')

    # First matching benchmark name decides which data directory is used.
    if RESNET in benchmark_spec.benchmark:
      data_dir = benchmark_spec.imagenet_data_dir
    elif TRANSFORMER in benchmark_spec.benchmark:
      data_dir = benchmark_spec.wmt_data_dir
    elif MASK in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif GNMT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.gnmt_data_dir
    elif SSD in benchmark_spec.benchmark:
      data_dir = benchmark_spec.coco_data_dir
    elif BERT in benchmark_spec.benchmark:
      data_dir = benchmark_spec.bert_data_dir
    else:
      raise ValueError(
          'Unknown operation, cannot find {} in benchmark'.format(
              benchmark_spec.benchmark))

    run_script = posixpath.join(run_path, 'run_and_time.sh')
    # Slashes are escaped before the values are handed to ReplaceText.
    data_dir = data_dir.replace('/', r'\/')
    checkpoint = FLAGS.mlperf_gcs_resnet_checkpoint.replace('/', r'\/')
    decode_dir = FLAGS.mlperf_transformer_decode_dir.replace('/', r'\/')
    tpu = benchmark_spec.tpu_groups['train'].GetName()
    vm_util.ReplaceText(vm, '--model_dir=.*',
                        r'--model_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--data_dir=.*',
                        r'--data_dir={} \\\\'.format(data_dir), run_script)
    vm_util.ReplaceText(
        vm, '--training_file_pattern=.*',
        r'--training_file_pattern={}\/train-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--validation_file_pattern=.*',
        r'--validation_file_pattern={}\/val-* \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--val_json_file=.*',
        r'--val_json_file={}\/instances_val2017.json \\\\'.format(data_dir),
        run_script)
    vm_util.ReplaceText(vm, '--resnet_checkpoint=.*',
                        r'--resnet_checkpoint={} \\\\'.format(checkpoint),
                        run_script)
    vm_util.ReplaceText(
        vm, '--decode_from_file=.*',
        r'--decode_from_file={}\/wmt14-en-de.src \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_reference=.*',
        r'--decode_reference={}\/wmt14-en-de.ref \\\\'.format(decode_dir),
        run_script)
    vm_util.ReplaceText(
        vm, '--decode_to_file=.*',
        r'--decode_to_file={}\/decode.transformer_mlperf_tpu.'
        r'translate_ende_wmt32k_packed.2x2_log_1018_2 \\\\'.format(bucket),
        run_script)
    vm_util.ReplaceText(vm, '--tpu=.*', r'--tpu={} \\\\'.format(tpu),
                        run_script)
    vm_util.ReplaceText(vm, '--output_dir=.*',
                        r'--output_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--cloud_tpu_name=.*',
                        r'--cloud_tpu_name={} \\\\'.format(tpu), run_script)
    vm_util.ReplaceText(vm, '--out_dir=.*',
                        r'--out_dir=gs:\/\/{} \\\\'.format(bucket),
                        run_script)
    vm_util.ReplaceText(vm, '--tpu_name=.*',
                        r'--tpu_name={} \\\\'.format(tpu), run_script)
    vm.RemoteCommand('chmod 755 {}'.format(run_script))

    if GNMT in benchmark_spec.benchmark:
      metric_script = posixpath.join(code_path, model, 'metric.py')
      vm_util.ReplaceText(vm, ' sacrebleu -t', ' python3 -m sacrebleu -t',
                          metric_script)
  else:
    benchmark_spec.model_dir = '/tmp'

  has_gpu = nvidia_driver.CheckNvidiaGpuExists(vm)
  if has_gpu:
    vm.Install('cuda_toolkit')
    vm.Install('nvidia_docker')
    vm.RemoteCommand(
        'if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

    if RESNET in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
          should_log=True)
      _DownloadData(benchmark_spec.imagenet_data_dir,
                    posixpath.join('/data', 'imagenet'), vm)

    if TRANSFORMER in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
          ' sudo docker build --network=host . -t mlperf-nvidia:translation',
          should_log=True)
      _DownloadData(benchmark_spec.wmt_data_dir,
                    posixpath.join('/data', 'wmt'), vm)

    if MINIGO in benchmark_spec.benchmark:
      build_path = f'training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/minigo/implementations/tensorflow'
      run_script = posixpath.join(build_path, 'run_and_time.sh')
      # Make get_data.py read from the configured minigo model directory.
      vm_util.ReplaceText(
          vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
              FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
      vm.RemoteCommand('cd {} && sudo docker build --network=host -t '
                       'mlperf-nvidia:minigo .'.format(build_path),
                       should_log=True)

    if MASK in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if GNMT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
          should_log=True)
      _DownloadData(benchmark_spec.gnmt_data_dir,
                    posixpath.join('/data', 'gnmt'), vm)

    if SSD in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/ssd/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
          should_log=True)
      _DownloadData(benchmark_spec.coco_data_dir,
                    posixpath.join('/data', 'coco2017'), vm)

    if BERT in benchmark_spec.benchmark:
      vm.RemoteCommand(
          f'cd training_results_{MLPERF_VERSION}/NVIDIA/benchmarks/bert/implementations/pytorch && '
          'sudo docker build --network=host -t mlperf-nvidia:language_model . ',
          should_log=True)
      _DownloadData(benchmark_spec.bert_data_dir,
                    posixpath.join('/data', 'bert_data'), vm)
def __init__(self, name, node_type, zone, project):
  """Stores the template's identity; only the zone's region is retained.

  Args:
    name: Name of the node template.
    node_type: Node type of the template.
    zone: Zone from which the template's region is derived.
    project: Project of the template.
  """
  super(GceSoleTenantNodeTemplate, self).__init__()
  self.name, self.node_type = name, node_type
  self.region = util.GetRegionFromZone(zone)
  self.project = project
def testGetRegionFromZone(self):
  """GetRegionFromZone maps 'us-central1-xyz' to 'us-central1'."""
  actual_region = util.GetRegionFromZone('us-central1-xyz')
  self.assertEqual(actual_region, 'us-central1')