def _CreateInstance(self):
  """Generates and executes the command for creating a Rackspace VM."""
  with tempfile.NamedTemporaryFile(dir=vm_util.GetTempDir(),
                                   prefix='user-data') as tf:
    with open(self.ssh_public_key) as f:
      public_key = f.read().rstrip('\n')
    tf.write(CLOUD_CONFIG_TEMPLATE.format(self.user_name, public_key))
    tf.flush()
    create_cmd = self._GetCreateCommand(tf)
    stdout, stderr, _ = create_cmd.Issue()
  if stderr:
    resp = json.loads(stderr)
    raise errors.Error(''.join((
        'Non-recoverable error has occurred: %s\n' % str(resp),
        'Following command caused the error: %s' % repr(create_cmd),
    )))
  resp = json.loads(stdout)
  self.id = resp['ID']
def CollectResultFile(vm, results):
  """Collects the result file on a vm.

  Args:
    vm: The target vm.
    results: A dictionary of lists. Each list contains results of a field
        defined in RESULTS_METRICS collected from each loader machine.
  """
  result_path = _ResultFilePath(vm)
  vm.PullFile(vm_util.GetTempDir(), result_path)
  resp, _ = vm.RemoteCommand('tail -n 20 ' + result_path)
  for metric in RESULTS_METRICS:
    value = regex_util.ExtractGroup(r'%s[\t ]+: ([\d\.:]+)' % metric, resp)
    if metric == RESULTS_METRICS[-1]:  # Total operation time
      value = value.split(':')
      results[metric].append(
          int(value[0]) * 3600 + int(value[1]) * 60 + int(value[2]))
    else:
      results[metric].append(float(value))
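For reference, a self-contained sketch of the metric parsing above, assuming a hypothetical RESULTS_METRICS list and a made-up `tail` output, with `re.search` standing in for `regex_util.ExtractGroup`:

import re

# Hypothetical tail of a results file; real metric names come from
# RESULTS_METRICS in the benchmark module.
resp = ('op rate              : 4821\n'
        'Total operation time : 0:01:30\n')

for metric in ['op rate', 'Total operation time']:
  value = re.search(r'%s[\t ]+: ([\d\.:]+)' % metric, resp).group(1)
  if metric == 'Total operation time':  # reported as H:M:S
    hours, minutes, seconds = (int(v) for v in value.split(':'))
    print(hours * 3600 + minutes * 60 + seconds)  # 90
  else:
    print(float(value))  # 4821.0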
def Register(parsed_flags):
  """Registers the dstat collector if FLAGS.dstat is set."""
  if not parsed_flags.dstat:
    return

  output_directory = (parsed_flags.dstat_output
                      if parsed_flags['dstat_output'].present
                      else vm_util.GetTempDir())

  logging.debug('Registering dstat collector with interval %s, output to %s.',
                parsed_flags.dstat_interval, output_directory)

  if not os.path.isdir(output_directory):
    os.makedirs(output_directory)
  collector = _DStatCollector(interval=parsed_flags.dstat_interval,
                              output_directory=output_directory)
  events.before_phase.connect(collector.Start, events.RUN_PHASE, weak=False)
  events.after_phase.connect(collector.Stop, events.RUN_PHASE, weak=False)
def __init__(self, interval=None, output_directory=None):
  """Runs dstat on 'vms'.

  Start dstat collection via `Start`. Stop via `Stop`.

  Args:
    interval: Optional int. Interval in seconds in which to collect samples.
    output_directory: Optional directory where to save collection output.

  Raises:
    IOError: If the output directory does not exist.
  """
  self.interval = interval
  self.output_directory = output_directory or vm_util.GetTempDir()
  self._lock = threading.Lock()
  self._pids = {}
  self._file_names = {}
  self._role_mapping = {}  # mapping vm role to dstat file
  self._start_time = 0

  if not os.path.isdir(self.output_directory):
    raise IOError('dstat output directory does not exist: {0}'.format(
        self.output_directory))
def test_initialize_beam_repo_beam_not_exists(self):
  FLAGS.beam_location = None

  with mock.patch.object(beam_benchmark_helper, '_PrebuildBeam') as mock_prebuild, \
      mock.patch.object(vm_util, 'GenTempDir'), \
      mock.patch.object(vm_util, 'GetTempDir'), \
      mock.patch.object(vm_util, 'IssueCommand') as mock_run:
    mock_spec = mock.MagicMock()
    mock_spec.dpb_service.SERVICE_TYPE = dpb_service.DATAFLOW

    beam_benchmark_helper.InitializeBeamRepo(mock_spec)

    expected_cmd = ['git', 'clone', 'https://github.com/apache/beam.git']
    mock_run.assert_called_once_with(expected_cmd, cwd=vm_util.GetTempDir())
    mock_prebuild.assert_called_once()
def GetSpecFromFile(cls, name):
  """Unpickles the spec and returns it.

  Args:
    name: The name of the benchmark (and the name of the pickled file).

  Returns:
    A BenchmarkSpec object.
  """
  file_name = '%s/%s' % (vm_util.GetTempDir(), name)
  try:
    with open(file_name, 'rb') as pickle_file:
      spec = pickle.load(pickle_file)
  except Exception as e:  # pylint: disable=broad-except
    logging.error('Unable to unpickle spec file for benchmark %s.', name)
    raise e
  # Always let the spec be deleted after being unpickled so that
  # it's possible to run cleanup even if cleanup has already run.
  spec.deleted = False
  return spec
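A minimal sketch of the pickle round-trip this function depends on, using only the standard library; the temp directory and payload below are placeholders, not PKB's real BenchmarkSpec:

import pickle
import tempfile

temp_dir = tempfile.mkdtemp()              # stand-in for vm_util.GetTempDir()
file_name = '%s/%s' % (temp_dir, 'iperf')  # same '%s/%s' join as above

with open(file_name, 'wb') as pickle_file:
  pickle.dump({'deleted': True}, pickle_file)  # placeholder payload

with open(file_name, 'rb') as pickle_file:
  spec = pickle.load(pickle_file)
print(spec)  # {'deleted': True}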
def _Create(self):
  """Create a GCE VM instance."""
  num_hosts = len(self.host_list)
  with open(self.ssh_public_key) as f:
    public_key = f.read().rstrip('\n')
  with vm_util.NamedTemporaryFile(mode='w', dir=vm_util.GetTempDir(),
                                  prefix='key-metadata') as tf:
    tf.write('%s:%s\n' % (self.user_name, public_key))
    tf.close()
    create_cmd = self._GenerateCreateCommand(tf.name)
    _, stderr, retcode = create_cmd.Issue(timeout=_GCE_VM_CREATE_TIMEOUT,
                                          raise_on_failure=False)

  if (self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr and not self.num_vms_per_host):
    logging.warning(
        'Creation failed due to insufficient host capacity. A new host will '
        'be created and instance creation will be retried.')
    with self._host_lock:
      if num_hosts == len(self.host_list):
        host = GceSoleTenantNodeGroup(self.node_template,
                                      self.zone, self.project)
        self.host_list.append(host)
        host.Create()
      self.node_group = self.host_list[-1]
    raise errors.Resource.RetryableCreationError()
  if (not self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr):
    logging.error(STOCKOUT_MESSAGE)
    raise errors.Benchmarks.InsufficientCapacityCloudFailure(STOCKOUT_MESSAGE)
  util.CheckGcloudResponseKnownFailures(stderr, retcode)
  if retcode:
    if (create_cmd.rate_limited and 'already exists' in stderr and
        FLAGS.gcp_retry_on_rate_limited):
      # Gcloud create commands may still create VMs despite being rate
      # limited.
      return
    if util.RATE_LIMITED_MESSAGE in stderr:
      raise errors.Benchmarks.QuotaFailure.RateLimitExceededError(stderr)
    raise errors.Resource.CreationError(
        'Failed to create VM: %s return code: %s' % (stderr, retcode))
def __init__(self, interval=None, output_directory=None):
  """Runs collector on 'vms'.

  Start collector collection via `Start`. Stop via `Stop`.

  Args:
    interval: Optional int. Interval in seconds in which to collect samples.
    output_directory: Optional directory where to save collection output.

  Raises:
    IOError: for when the output directory doesn't exist.
  """
  self.interval = interval
  self.output_directory = output_directory or vm_util.GetTempDir()
  self._lock = threading.Lock()
  self._pid_files = {}
  self._role_mapping = {}  # mapping vm role to output file
  self._start_time = 0

  if not os.path.isdir(self.output_directory):
    raise IOError('collector output directory does not exist: {0}'.format(
        self.output_directory))
def RunBenchmarks():
  """Runs all benchmarks in PerfKitBenchmarker.

  Returns:
    Exit status for the process.
  """
  benchmark_specs = _CreateBenchmarkSpecs()
  collector = SampleCollector()

  try:
    tasks = [(RunBenchmarkTask, (spec,), {}) for spec in benchmark_specs]
    spec_sample_tuples = background_tasks.RunParallelProcesses(
        tasks, FLAGS.run_processes)
    benchmark_specs, sample_lists = zip(*spec_sample_tuples)
    for sample_list in sample_lists:
      collector.samples.extend(sample_list)
  finally:
    if collector.samples:
      collector.PublishSamples()

    if benchmark_specs:
      logging.info(benchmark_status.CreateSummary(benchmark_specs))
    logging.info('Complete logs can be found at: %s',
                 vm_util.PrependTempDir(LOG_FILE_NAME))

  if stages.TEARDOWN not in FLAGS.run_stage:
    logging.info('To run again with this setup, please use --run_uri=%s',
                 FLAGS.run_uri)

  if FLAGS.archive_bucket:
    archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket,
                       gsutil_path=FLAGS.gsutil_path,
                       prefix=FLAGS.run_uri + '_')
  all_benchmarks_succeeded = all(spec.status == benchmark_status.SUCCEEDED
                                 for spec in benchmark_specs)
  return 0 if all_benchmarks_succeeded else 1
def Register(parsed_flags):
  """Registers the sar collector if FLAGS.sar is set."""
  if not parsed_flags.sar:
    return

  output_directory = (parsed_flags.sar_output
                      if parsed_flags['sar_output'].present
                      else vm_util.GetTempDir())

  logging.debug('Registering sar collector with interval %s, output to %s.',
                parsed_flags.sar_interval, output_directory)

  if not os.path.isdir(output_directory):
    os.makedirs(output_directory)
  collector = _SarCollector(interval=parsed_flags.sar_interval,
                            output_directory=output_directory)
  events.before_phase.connect(collector.Start, events.RUN_PHASE, weak=False)
  events.after_phase.connect(collector.Stop, events.RUN_PHASE, weak=False)
  if parsed_flags.sar_publish:
    events.samples_created.connect(
        collector.Analyze, events.RUN_PHASE, weak=False)
def _Create(self):
  """Create a GCE VM instance."""
  num_hosts = len(self.host_list)
  with open(self.ssh_public_key) as f:
    public_key = f.read().rstrip('\n')
  with vm_util.NamedTemporaryFile(dir=vm_util.GetTempDir(),
                                  prefix='key-metadata') as tf:
    tf.write('%s:%s\n' % (self.user_name, public_key))
    tf.close()
    create_cmd = self._GenerateCreateCommand(tf.name)
    _, stderr, retcode = create_cmd.Issue()

  if (self.use_dedicated_host and retcode and
      _INSUFFICIENT_HOST_CAPACITY in stderr and not self.num_vms_per_host):
    logging.warning(
        'Creation failed due to insufficient host capacity. A new host will '
        'be created and instance creation will be retried.')
    with self._host_lock:
      if num_hosts == len(self.host_list):
        host = GceSoleTenantHost(self.host_type, self.zone, self.project)
        self.host_list.append(host)
        host.Create()
      self.host = self.host_list[-1]
    raise errors.Resource.RetryableCreationError()
def __init__(self, benchmark_info):
  if (FLAGS.benchmark_config_pair and
      benchmark_info['name'] in FLAGS.benchmark_config_pair.keys()):
    # TODO(user): Unify naming between config_reader and
    # perfkitbenchmarker.
    self.config = config_reader.ConfigLoader(
        FLAGS.benchmark_config_pair[benchmark_info['name']])
  self.vms = []
  self.vm_dict = {'default': []}
  self.networks = {}
  self.benchmark_name = benchmark_info['name']
  if hasattr(self, 'config'):
    config_dict = {}
    for section in self.config._config.sections():
      config_dict[section] = self.config.GetSectionOptionsAsDictionary(
          section)
    self.cloud = config_dict['cluster']['type']
    self.project = config_dict['cluster']['project']
    self.zones = [config_dict['cluster']['zone']]
    self.image = []
    self.machine_type = []
    for node in self.config.node_sections:
      self.vm_dict[node.split(':')[1]] = []
    args = [((config_dict[node], node.split(':')[1]), {})
            for node in self.config.node_sections]
    vm_util.RunThreaded(self.CreateVirtualMachineFromNodeSection, args)
    self.num_vms = len(self.vms)
    self.image = ','.join(self.image)
    self.zones = ','.join(self.zones)
    self.machine_type = ','.join(self.machine_type)
  else:
    self.cloud = FLAGS.cloud
    self.project = FLAGS.project
    defaults = DEFAULTS[self.cloud]
    self.zones = FLAGS.zones or [defaults[ZONE]]
    self.image = FLAGS.image or defaults[IMAGE]
    self.machine_type = FLAGS.machine_type or defaults[MACHINE_TYPE]
    if benchmark_info['num_machines'] is None:
      self.num_vms = FLAGS.num_vms
    else:
      self.num_vms = benchmark_info['num_machines']
    self.scratch_disk = benchmark_info['scratch_disk']
    self.scratch_disk_size = FLAGS.scratch_disk_size
    self.scratch_disk_type = FLAGS.scratch_disk_type

    self.vms = [
        self.CreateVirtualMachine(
            self.zones[min(index, len(self.zones) - 1)])
        for index in range(self.num_vms)]
    self.vm_dict['default'] = self.vms
    for i in range(benchmark_info['scratch_disk']):
      disk_spec = disk.BaseDiskSpec(
          self.scratch_disk_size,
          DISK_TYPE[self.cloud][self.scratch_disk_type],
          '/scratch%d' % i)
      for vm in self.vms:
        vm.disk_specs.append(disk_spec)

  firewall_class = CLASSES[self.cloud][FIREWALL]
  self.firewall = firewall_class(self.project)
  self.file_name = '%s/%s' % (vm_util.GetTempDir(), benchmark_info['name'])
  self.deleted = False
def __init__(self, benchmark_info):
  if (FLAGS.benchmark_config_pair and
      benchmark_info['name'] in FLAGS.benchmark_config_pair.keys()):
    # TODO(user): Unify naming between config_reader and
    # perfkitbenchmarker.
    self.config = config_reader.ConfigLoader(
        FLAGS.benchmark_config_pair[benchmark_info['name']])
  self.networks = {}
  self.firewalls = {}
  self.vms = []
  self.vm_dict = {'default': []}
  self.benchmark_name = benchmark_info['name']
  if hasattr(self, 'config'):
    config_dict = {}
    for section in self.config._config.sections():
      config_dict[section] = self.config.GetSectionOptionsAsDictionary(
          section)
    self.cloud = config_dict['cluster']['type']
    self.project = config_dict['cluster']['project']
    self.zones = [config_dict['cluster']['zone']]
    self.image = []
    self.machine_type = []
    for node in self.config.node_sections:
      self.vm_dict[node.split(':')[1]] = []
    args = [((config_dict[node], node.split(':')[1]), {})
            for node in self.config.node_sections]
    vm_util.RunThreaded(self.CreateVirtualMachineFromNodeSection, args)
    self.num_vms = len(self.vms)
    self.image = ','.join(self.image)
    self.zones = ','.join(self.zones)
    self.machine_type = ','.join(self.machine_type)
  else:
    self.cloud = FLAGS.cloud
    self.project = FLAGS.project
    self.zones = FLAGS.zones
    self.image = FLAGS.image
    self.machine_type = FLAGS.machine_type
    if benchmark_info['num_machines'] is None:
      self.num_vms = FLAGS.num_vms
    else:
      self.num_vms = benchmark_info['num_machines']
    self.scratch_disk = benchmark_info['scratch_disk']
    self.scratch_disk_size = FLAGS.scratch_disk_size
    self.scratch_disk_type = FLAGS.scratch_disk_type
    self.scratch_disk_iops = FLAGS.scratch_disk_iops

    self.vms = [
        self.CreateVirtualMachine(
            self.zones[min(index, len(self.zones) - 1)])
        for index in range(self.num_vms)]
    self.vm_dict['default'] = self.vms
    for vm in self.vms:
      # If we are using local disks and num_striped_disks has not been
      # set, then we want to set it to stripe all local disks together.
      if (FLAGS.scratch_disk_type == disk.LOCAL and
          benchmark_info['scratch_disk'] and
          not FLAGS['num_striped_disks'].present):
        num_striped_disks = (vm.max_local_disks //
                             benchmark_info['scratch_disk'])
        if num_striped_disks == 0:
          raise errors.Error(
              'Not enough local disks to run benchmark "%s". It requires at '
              'least %d local disk(s). The specified machine type has %d '
              'local disk(s).' % (benchmark_info['name'],
                                  int(benchmark_info['scratch_disk']),
                                  vm.max_local_disks))
      else:
        num_striped_disks = FLAGS.num_striped_disks
      for i in range(benchmark_info['scratch_disk']):
        mount_point = '%s%d' % (FLAGS.scratch_dir, i)
        disk_spec = disk.BaseDiskSpec(
            self.scratch_disk_size, self.scratch_disk_type,
            mount_point, self.scratch_disk_iops, num_striped_disks)
        vm.disk_specs.append(disk_spec)

  self.file_name = '%s/%s' % (vm_util.GetTempDir(), benchmark_info['name'])
  self.deleted = False
  self.always_call_cleanup = False
def _GetBeamDir():
  # TODO: This is temporary, find a better way.
  return FLAGS.beam_location or os.path.join(vm_util.GetTempDir(), 'beam')
def PushHBaseSite(vm):
  conf_dir = posixpath.join(ycsb.YCSB_DIR, 'hbase10-binding', 'conf')
  vm.RemoteCommand('mkdir -p {}'.format(conf_dir))
  vm.PushFile(os.path.join(vm_util.GetTempDir(), HBASE_SITE),
              posixpath.join(conf_dir, HBASE_SITE))
def _GetPickleFilename(uid):
  """Returns the filename for the pickled BenchmarkSpec."""
  return os.path.join(vm_util.GetTempDir(), uid)
def RunBenchmarks():
  """Runs all benchmarks in PerfKitBenchmarker.

  Returns:
    Exit status for the process.
  """
  benchmark_specs = _CreateBenchmarkSpecs()
  if FLAGS.randomize_run_order:
    random.shuffle(benchmark_specs)
  if FLAGS.dry_run:
    print('PKB will run with the following configurations:')
    for spec in benchmark_specs:
      print(spec)
      print('')
    return 0

  collector = SampleCollector()
  try:
    tasks = [(RunBenchmarkTask, (spec,), {}) for spec in benchmark_specs]
    if FLAGS.run_with_pdb and FLAGS.run_processes == 1:
      spec_sample_tuples = RunBenchmarkTasksInSeries(tasks)
    else:
      spec_sample_tuples = background_tasks.RunParallelProcesses(
          tasks, FLAGS.run_processes, FLAGS.run_processes_delay)
    benchmark_specs, sample_lists = zip(*spec_sample_tuples)
    for sample_list in sample_lists:
      collector.samples.extend(sample_list)
  finally:
    if collector.samples:
      collector.PublishSamples()

    if benchmark_specs:
      logging.info(benchmark_status.CreateSummary(benchmark_specs))

    logging.info('Complete logs can be found at: %s',
                 vm_util.PrependTempDir(LOG_FILE_NAME))
    logging.info('Completion statuses can be found at: %s',
                 vm_util.PrependTempDir(COMPLETION_STATUS_FILE_NAME))

  if stages.TEARDOWN not in FLAGS.run_stage:
    logging.info('To run again with this setup, please use --run_uri=%s',
                 FLAGS.run_uri)

  if FLAGS.archive_bucket:
    archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket,
                       gsutil_path=FLAGS.gsutil_path,
                       prefix=FLAGS.run_uri + '_')

  # Write completion status file(s)
  completion_status_file_name = (
      vm_util.PrependTempDir(COMPLETION_STATUS_FILE_NAME))
  with open(completion_status_file_name, 'w') as status_file:
    _WriteCompletionStatusFile(benchmark_specs, status_file)
  if FLAGS.completion_status_file:
    with open(FLAGS.completion_status_file, 'w') as status_file:
      _WriteCompletionStatusFile(benchmark_specs, status_file)

  all_benchmarks_succeeded = all(spec.status == benchmark_status.SUCCEEDED
                                 for spec in benchmark_specs)
  return 0 if all_benchmarks_succeeded else 1
def Run(benchmark_spec):
  """Run Horovod on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vms = benchmark_spec.vms
  vm_util.RunThreaded(lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms)
  master_vm = vms[0]

  # GCP should work out of the box with the deep learning image but the AWS
  # image requires us to use the correct Tensorflow Python environment.
  if FLAGS.cloud == 'AWS':
    master_vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p36')
    python_interpreter = 'anaconda3/envs/tensorflow_p36/bin/python'
  else:
    python_interpreter = 'python3'

  nccl_params = [
      'TF_CPP_MIN_LOG_LEVEL=0',
      'NCCL_SOCKET_IFNAME=^lo,docker0',
  ]

  if benchmark_spec.timeline:
    nccl_params.extend([
        'HOROVOD_TIMELINE={}/timeline.json'.format(vm_util.VM_TMP_DIR),
        'HOROVOD_TIMELINE_MARK_CYCLES=1',
    ])

  if benchmark_spec.cuda_visible_devices:
    nccl_params.append('CUDA_VISIBLE_DEVICES={}'.format(
        benchmark_spec.cuda_visible_devices))

  if FLAGS.nccl_extra_params:
    for extra_param in FLAGS.nccl_extra_params:
      nccl_params.append(extra_param)

  run_command = ('mpirun -np {num_gpus} -hostfile {host_file} '
                 '-mca plm_rsh_no_tree_spawn 1 '
                 '--allow-run-as-root '
                 '-bind-to socket -map-by slot '
                 '{nccl_params} '
                 '-mca pml ob1 -mca btl ^openib '
                 '-mca btl_tcp_if_exclude lo,docker0 '
                 '{python} ').format(
                     num_gpus=benchmark_spec.total_gpus,
                     host_file=MACHINEFILE,
                     python=python_interpreter,
                     nccl_params=' '.join(
                         ['-x {}'.format(param) for param in nccl_params]))

  if benchmark_spec.model == 'resnet-50':
    resnet_dir = 'DeepLearningExamples/TensorFlow/Classification/RN50v1.5/'
    run_command += (
        'DeepLearningExamples/TensorFlow/Classification/RN50v1.5/main.py '
        '--mode=training_benchmark '
        '--warmup_steps 50 '
        '--precision {precision} '
        '--batch_size {batch_size} '
        '--results_dir /tmp/models '
        '--data_dir {data_dir} '
        '--iter_unit epoch '
        '--data_format NHWC '
        '--num_iter {num_epochs} ').format(
            precision=benchmark_spec.precision,
            batch_size=benchmark_spec.batch_size,
            num_epochs=benchmark_spec.num_epochs,
            data_dir='{}/imagenet'.format(resnet_dir))
  else:  # bert
    if not benchmark_spec.bert_finetune:
      raise NotImplementedError('BERT pretraining is not supported.')
    bert_dir = ('DeepLearningExamples/TensorFlow/LanguageModeling/BERT/'
                'data/download/google_pretrained_weights/{}').format(
                    'uncased_L-12_H-768_A-12'
                    if benchmark_spec.model == 'bert-base'
                    else 'uncased_L-24_H-1024_A-16')
    run_command += (
        'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/run_squad.py '
        '--vocab_file={vocab_file} '
        '--bert_config_file={bert_config} '
        '--init_checkpoint={init_ckpt} '
        '--do_train=True '
        '--train_file={train_file} '
        '--train_batch_size={batch_size} '
        '--learning_rate=5e-6 '
        '--num_train_epochs={num_epochs} '
        '--max_seq_length={max_seq_len} '
        '--doc_stride={doc_stride} '
        '--output_dir=/tmp/models '
        '--horovod '
        '{fp16} '
    ).format(
        batch_size=benchmark_spec.batch_size,
        num_epochs=benchmark_spec.num_epochs,
        fp16='--use_fp16' if benchmark_spec.precision == 'fp16' else '',
        vocab_file='{}/vocab.txt'.format(bert_dir),
        bert_config='{}/bert_config.json'.format(bert_dir),
        init_ckpt='{}/bert_model.ckpt'.format(bert_dir),
        max_seq_len=benchmark_spec.max_seq_len,
        doc_stride=64 if benchmark_spec.max_seq_len == 128 else 128,
        train_file='DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/squad/v1.1/train-v1.1.json',
    )

  stdout, stderr = master_vm.RobustRemoteCommand(run_command, should_log=True)

  if benchmark_spec.timeline:
    master_vm.PullFile(vm_util.GetTempDir(),
                       '{}/timeline.json'.format(vm_util.VM_TMP_DIR))

  return _MakeSamplesFromOutput(benchmark_spec, stdout, stderr)
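As a side note, a small sketch of how the NCCL/Horovod environment variables above are attached to the mpirun command line; the parameter values below are illustrative only:

nccl_params = [
    'TF_CPP_MIN_LOG_LEVEL=0',
    'NCCL_SOCKET_IFNAME=^lo,docker0',
    'HOROVOD_TIMELINE=/tmp/pkb/timeline.json',  # illustrative path
]

# Each VAR=value pair is exported to every MPI rank via mpirun's -x flag.
flags = ' '.join('-x {}'.format(param) for param in nccl_params)
print(flags)
# -x TF_CPP_MIN_LOG_LEVEL=0 -x NCCL_SOCKET_IFNAME=^lo,docker0 -x HOROVOD_TIMELINE=/tmp/pkb/timeline.json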
def RunWithExec(benchmark_spec, exec_path, remote_job_file_path,
                job_file_contents):
  """Spawn fio and gather the results.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.
    exec_path: string path to the fio executable.
    remote_job_file_path: path, on the vm, to the location of the job file.
    job_file_contents: string contents of the fio job file.

  Returns:
    A list of sample.Sample objects.
  """
  vm = benchmark_spec.vms[0]
  logging.info('FIO running on %s', vm)

  disk = vm.scratch_disks[0]
  mount_point = disk.mount_point

  job_file_string = GetOrGenerateJobFileString(
      FLAGS.fio_jobfile, FLAGS.fio_generate_scenarios, AgainstDevice(),
      disk, FLAGS.fio_io_depths, FLAGS.fio_num_jobs,
      FLAGS.fio_working_set_size, FLAGS.fio_blocksize, FLAGS.fio_runtime,
      FLAGS.fio_parameters, job_file_contents)
  job_file_path = vm_util.PrependTempDir(vm.name + LOCAL_JOB_FILE_SUFFIX)
  with open(job_file_path, 'w') as job_file:
    job_file.write(job_file_string)
    logging.info('Wrote fio job file at %s', job_file_path)
    logging.info(job_file_string)

  vm.PushFile(job_file_path, remote_job_file_path)

  if AgainstDevice():
    fio_command = '%s --output-format=json --filename=%s %s' % (
        exec_path, disk.GetDevicePath(), remote_job_file_path)
  else:
    fio_command = '%s --output-format=json --directory=%s %s' % (
        exec_path, mount_point, remote_job_file_path)

  collect_logs = any([
      FLAGS.fio_lat_log, FLAGS.fio_bw_log, FLAGS.fio_iops_log,
      FLAGS.fio_hist_log
  ])

  log_file_base = ''
  if collect_logs:
    log_file_base = '%s_%s' % (PKB_FIO_LOG_FILE_NAME, str(time.time()))
    fio_command = ' '.join([fio_command, GetLogFlags(log_file_base)])

  # TODO(user): This only gives results at the end of a job run
  # so the program pauses here with no feedback to the user.
  # This is a pretty lousy experience.
  logging.info('FIO Results:')

  stdout, _ = vm.RobustRemoteCommand(fio_command, should_log=True)
  bin_vals = []
  if collect_logs:
    vm.PullFile(vm_util.GetTempDir(), '%s*.log' % log_file_base)
    if FLAGS.fio_hist_log:
      num_logs = int(
          vm.RemoteCommand('ls %s_clat_hist.*.log | wc -l' % log_file_base)[0])
      bin_vals += [
          fio.ComputeHistogramBinVals(
              vm, '%s_clat_hist.%s.log' % (log_file_base, idx + 1))
          for idx in range(num_logs)
      ]
  samples = fio.ParseResults(job_file_string, json.loads(stdout),
                             log_file_base=log_file_base, bin_vals=bin_vals)
  return samples
def RunBenchmarks(publish=True):
  """Runs all benchmarks in PerfKitBenchmarker.

  Args:
    publish: A boolean indicating whether results should be published.

  Returns:
    Exit status for the process.
  """
  if FLAGS.version:
    print(version.VERSION)
    return

  _LogCommandLineFlags()

  if (FLAGS.os_type == benchmark_spec.WINDOWS and
      not vm_util.RunningOnWindows()):
    logging.error('In order to run benchmarks on Windows VMs, you must be '
                  'running on Windows.')
    return 1

  collector = SampleCollector()

  if FLAGS.static_vm_file:
    with open(FLAGS.static_vm_file) as fp:
      static_virtual_machine.StaticVirtualMachine.ReadStaticVirtualMachineFile(
          fp)

  run_status_lists = []
  benchmark_tuple_list = benchmark_sets.GetBenchmarksFromFlags()
  total_benchmarks = len(benchmark_tuple_list)
  benchmark_counts = collections.defaultdict(itertools.count)
  args = []
  for i, benchmark_tuple in enumerate(benchmark_tuple_list):
    benchmark_module, user_config = benchmark_tuple
    benchmark_name = benchmark_module.BENCHMARK_NAME
    benchmark_uid = benchmark_name + str(
        next(benchmark_counts[benchmark_name]))
    run_status_lists.append([benchmark_name, benchmark_uid,
                             benchmark_status.SKIPPED])
    args.append((benchmark_module, collector, i + 1, total_benchmarks,
                 benchmark_module.GetConfig(user_config), benchmark_uid))

  try:
    for run_args, run_status_list in zip(args, run_status_lists):
      benchmark_module, _, sequence_number, _, _, benchmark_uid = run_args
      benchmark_name = benchmark_module.BENCHMARK_NAME
      try:
        run_status_list[2] = benchmark_status.FAILED
        RunBenchmark(*run_args)
        run_status_list[2] = benchmark_status.SUCCEEDED
      except BaseException as e:
        msg = 'Benchmark {0}/{1} {2} (UID: {3}) failed.'.format(
            sequence_number, total_benchmarks, benchmark_name, benchmark_uid)
        if (isinstance(e, KeyboardInterrupt) or
            FLAGS.stop_after_benchmark_failure):
          logging.error('%s Execution will not continue.', msg)
          break
        else:
          logging.error('%s Execution will continue.', msg)
  finally:
    if collector.samples:
      collector.PublishSamples()

    if run_status_lists:
      logging.info(benchmark_status.CreateSummary(run_status_lists))
    logging.info('Complete logs can be found at: %s',
                 vm_util.PrependTempDir(LOG_FILE_NAME))

  if FLAGS.run_stage not in [STAGE_ALL, STAGE_TEARDOWN]:
    logging.info('To run again with this setup, please use --run_uri=%s',
                 FLAGS.run_uri)

  if FLAGS.archive_bucket:
    archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket,
                       gsutil_path=FLAGS.gsutil_path,
                       prefix=FLAGS.run_uri + '_')
  all_benchmarks_succeeded = all(r[2] == benchmark_status.SUCCEEDED
                                 for r in run_status_lists)
  return 0 if all_benchmarks_succeeded else 1
def _Run(vm):
  """See base method.

  Args:
    vm: The vm to run the benchmark on.

  Returns:
    A list of sample.Sample objects.
  """
  # Make changes e.g. compiler flags to spec config file.
  if 'gcc' in FLAGS.runspec_config:
    _OverwriteGccO3(vm)

  # swap only if necessary; free local node memory and avoid remote memory;
  # reset caches; set stack size to unlimited
  # Also consider setting enable_transparent_hugepages flag to true
  cmd = ('echo 1 | sudo tee /proc/sys/vm/swappiness && '
         'echo 1 | sudo tee /proc/sys/vm/zone_reclaim_mode && '
         'sync ; echo 3 | sudo tee /proc/sys/vm/drop_caches && '
         'ulimit -s unlimited && ')

  cmd += 'runcpu '
  if FLAGS.spec17_build_only:
    cmd += '--action build '
  if FLAGS.spec17_rebuild:
    cmd += '--rebuild '

  version_specific_parameters = []
  # rate runs require 2 GB minimum system main memory per copy,
  # not including os overhead. Refer to:
  # https://www.spec.org/cpu2017/Docs/system-requirements.html#memory
  copies = min(vm.NumCpusForBenchmark(),
               vm.total_free_memory_kb // (2 * KB_TO_GB_MULTIPLIER))
  version_specific_parameters.append(
      ' --copies=%s ' % (FLAGS.spec17_copies or copies))
  version_specific_parameters.append(
      ' --threads=%s ' % (FLAGS.spec17_threads or vm.NumCpusForBenchmark()))

  if FLAGS.spec17_fdo:
    version_specific_parameters.append('--feedback ')
    vm.RemoteCommand('cd /scratch/cpu2017; mkdir fdo_profiles')

  start_time = time.time()
  stdout, _ = speccpu.Run(vm, cmd, ' '.join(FLAGS.spec17_subset),
                          version_specific_parameters)

  if FLAGS.spec17_build_only:
    if 'Error' in stdout and 'Please review this file' in stdout:
      raise errors.Benchmarks.RunError('Error during SPEC compilation.')
    return [
        sample.Sample(
            'compilation_time',
            time.time() - start_time, 's', {
                'spec17_subset': FLAGS.spec17_subset,
                'gcc_version': build_tools.GetVersion(vm, 'gcc')
            })
    ]

  partial_results = True
  # Do not allow partial results if any benchmark subset is a full suite.
  for benchmark_subset in FLAGS.benchmark_subset:
    if benchmark_subset in ['intspeed', 'fpspeed', 'intrate', 'fprate']:
      partial_results = False

  log_files = set()
  for test in FLAGS.spec17_subset:
    if test in LOG_FILENAME:
      log_files.add(LOG_FILENAME[test])
    else:
      if test in INTSPEED_SUITE:
        log_files.add(LOG_FILENAME['intspeed'])
      elif test in INTRATE_SUITE:
        log_files.add(LOG_FILENAME['intrate'])
      elif test in FPSPEED_SUITE:
        log_files.add(LOG_FILENAME['fpspeed'])
      elif test in FPRATE_SUITE:
        log_files.add(LOG_FILENAME['fprate'])

  for log_file in log_files:
    vm.RemoteCommand(
        f'cp {vm.GetScratchDir()}/cpu2017/result/{log_file} ~/{log_file}.log')
    vm.PullFile(vm_util.GetTempDir(), f'~/{log_file}.log')

  samples = speccpu.ParseOutput(vm, log_files, partial_results, None)
  for item in samples:
    item.metadata['vm_name'] = vm.name
    item.metadata['spec17_gcc_flags'] = FLAGS.spec17_gcc_flags

  return samples
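A worked example of the copy-count arithmetic above, assuming KB_TO_GB_MULTIPLIER is 1000000 (kB per GB) and illustrative VM figures; a sketch, not PKB code:

KB_TO_GB_MULTIPLIER = 1000000      # assumed value; kB per GB

num_cpus = 64                      # stand-in for vm.NumCpusForBenchmark()
total_free_memory_kb = 256000000   # roughly 256 GB of free memory

# SPEC rate runs need at least 2 GB of main memory per copy, so the copy
# count is capped by both the vCPU count and the available memory.
copies = min(num_cpus, total_free_memory_kb // (2 * KB_TO_GB_MULTIPLIER))
print(copies)  # 64 -- memory would allow 128 copies; vCPUs cap it at 64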
def RunBenchmarks(publish=True):
  """Runs all benchmarks in PerfKitBenchmarker.

  Args:
    publish: A boolean indicating whether results should be published.

  Returns:
    Exit status for the process.
  """
  if FLAGS.version:
    print(version.VERSION)
    return

  for executable in REQUIRED_EXECUTABLES:
    if not vm_util.ExecutableOnPath(executable):
      logging.error('Could not find required executable "%s".' % executable)
      return 1

  if FLAGS.run_uri is None:
    if FLAGS.run_stage not in [STAGE_ALL, STAGE_PREPARE]:
      # Attempt to get the last modified run directory.
      run_uri = vm_util.GetLastRunUri()
      if run_uri:
        FLAGS.run_uri = run_uri
        logging.warning(
            'No run_uri specified. Attempting to run "%s" with --run_uri=%s.',
            FLAGS.run_stage, FLAGS.run_uri)
      else:
        logging.error('No run_uri specified. Could not run "%s".',
                      FLAGS.run_stage)
        return 1
    else:
      FLAGS.run_uri = str(uuid.uuid4())[-8:]
  elif not FLAGS.run_uri.isalnum() or len(FLAGS.run_uri) > MAX_RUN_URI_LENGTH:
    logging.error('run_uri must be alphanumeric and less than or equal '
                  'to 8 characters in length.')
    return 1

  vm_util.GenTempDir()
  log_util.ConfigureLogging(
      stderr_log_level=log_util.LOG_LEVELS[FLAGS.log_level],
      log_path=vm_util.PrependTempDir(LOG_FILE_NAME),
      run_uri=FLAGS.run_uri)
  logging.info('PerfKitBenchmarker version: %s', version.VERSION)
  _LogCommandLineFlags()

  if (FLAGS.os_type == benchmark_spec.WINDOWS and
      not vm_util.RunningOnWindows()):
    logging.error('In order to run benchmarks on Windows VMs, you must be '
                  'running on Windows.')
    return 1

  vm_util.SSHKeyGen()
  collector = SampleCollector()
  events.initialization_complete.send(parsed_flags=FLAGS)

  if FLAGS.static_vm_file:
    with open(FLAGS.static_vm_file) as fp:
      static_virtual_machine.StaticVirtualMachine.ReadStaticVirtualMachineFile(
          fp)

  if FLAGS.benchmark_config_pair:
    # Convert benchmark_config_pair into a {benchmark_name: file_name}
    # dictionary.
    tmp_dict = {}
    for config_pair in FLAGS.benchmark_config_pair:
      pair = config_pair.split(':')
      tmp_dict[pair[0]] = pair[1]
    FLAGS.benchmark_config_pair = tmp_dict

  try:
    benchmark_list = benchmark_sets.GetBenchmarksFromFlags()
    total_benchmarks = len(benchmark_list)
    if FLAGS.parallelism > 1:
      args = [((benchmark, collector, i + 1, total_benchmarks), {})
              for i, benchmark in enumerate(benchmark_list)]
      vm_util.RunThreaded(RunBenchmark, args,
                          max_concurrent_threads=FLAGS.parallelism)
    else:
      for i, benchmark in enumerate(benchmark_list):
        RunBenchmark(benchmark, collector, i + 1, total_benchmarks)
  finally:
    if collector.samples:
      collector.PublishSamples()
    logging.info('Complete logs can be found at: %s',
                 vm_util.PrependTempDir(LOG_FILE_NAME))

  if FLAGS.run_stage not in [STAGE_ALL, STAGE_CLEANUP]:
    logging.info('To run again with this setup, please use --run_uri=%s',
                 FLAGS.run_uri)

  if FLAGS.archive_bucket:
    archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket,
                       gsutil_path=FLAGS.gsutil_path,
                       prefix=FLAGS.run_uri + '_')
def RunBenchmarks():
  """Runs all benchmarks in PerfKitBenchmarker.

  Returns:
    Exit status for the process.
  """
  if FLAGS.version:
    print(version.VERSION)
    return

  _LogCommandLineFlags()

  if FLAGS.os_type == os_types.WINDOWS and not vm_util.RunningOnWindows():
    logging.error('In order to run benchmarks on Windows VMs, you must be '
                  'running on Windows.')
    return 1

  collector = SampleCollector()

  if FLAGS.static_vm_file:
    with open(FLAGS.static_vm_file) as fp:
      static_virtual_machine.StaticVirtualMachine.ReadStaticVirtualMachineFile(
          fp)

  benchmark_run_list = _CreateBenchmarkRunList()
  try:
    for run_args, run_status_list in benchmark_run_list:
      benchmark_module, sequence_number, _, _, benchmark_uid = run_args
      benchmark_name = benchmark_module.BENCHMARK_NAME
      try:
        run_status_list[2] = benchmark_status.FAILED
        RunBenchmark(*run_args, collector=collector)
        run_status_list[2] = benchmark_status.SUCCEEDED
      except BaseException as e:
        msg = 'Benchmark {0}/{1} {2} (UID: {3}) failed.'.format(
            sequence_number, len(benchmark_run_list),
            benchmark_name, benchmark_uid)
        if (isinstance(e, KeyboardInterrupt) or
            FLAGS.stop_after_benchmark_failure):
          logging.error('%s Execution will not continue.', msg)
          break
        else:
          logging.error('%s Execution will continue.', msg)
  finally:
    if collector.samples:
      collector.PublishSamples()

    if benchmark_run_list:
      run_status_lists = tuple(r for _, r in benchmark_run_list)
      logging.info(benchmark_status.CreateSummary(run_status_lists))
    logging.info('Complete logs can be found at: %s',
                 vm_util.PrependTempDir(LOG_FILE_NAME))

  if stages.TEARDOWN not in FLAGS.run_stage:
    logging.info('To run again with this setup, please use --run_uri=%s',
                 FLAGS.run_uri)

  if FLAGS.archive_bucket:
    archive.ArchiveRun(vm_util.GetTempDir(), FLAGS.archive_bucket,
                       gsutil_path=FLAGS.gsutil_path,
                       prefix=FLAGS.run_uri + '_')
  all_benchmarks_succeeded = all(r[2] == benchmark_status.SUCCEEDED
                                 for _, r in benchmark_run_list)
  return 0 if all_benchmarks_succeeded else 1
def Run(benchmark_spec):
  """Run Horovod on the cluster.

  Args:
    benchmark_spec: The benchmark specification. Contains all data that is
        required to run the benchmark.

  Returns:
    A list of sample.Sample objects.
  """
  _UpdateBenchmarkSpecWithFlags(benchmark_spec)
  vms = benchmark_spec.vms
  vm_util.RunThreaded(lambda vm: vm.RemoteCommand('rm -rf /tmp/models'), vms)
  master_vm = vms[0]

  # GCP should work out of the box with the deep learning image but the AWS
  # image requires us to use the correct Tensorflow Python environment.
  if FLAGS.cloud == 'AWS':
    master_vm.RobustRemoteCommand('. anaconda3/bin/activate tensorflow_p36')
    python_interpreter = 'anaconda3/envs/tensorflow_p36/bin/python'
  else:
    python_interpreter = '/opt/conda/bin/python'

  nccl_params = [
      'TF_CPP_MIN_LOG_LEVEL=0',
      'NCCL_SOCKET_IFNAME=^lo,docker0',
      'NCCL_DEBUG=INFO',
  ]

  if benchmark_spec.timeline:
    nccl_params.extend([
        'HOROVOD_TIMELINE={}/timeline.json'.format(vm_util.VM_TMP_DIR),
        'HOROVOD_TIMELINE_MARK_CYCLES=1',
    ])

  if benchmark_spec.cuda_visible_devices:
    nccl_params.append('CUDA_VISIBLE_DEVICES={}'.format(
        benchmark_spec.cuda_visible_devices))

  if FLAGS.nccl_extra_params:
    for extra_param in FLAGS.nccl_extra_params:
      nccl_params.append(extra_param)

  run_command = ('{mpi} -np {num_gpus} -hostfile {host_file} '
                 '-mca plm_rsh_no_tree_spawn 1 '
                 '--allow-run-as-root '
                 '-bind-to socket -map-by slot '
                 '{nccl_params} '
                 '-mca pml ob1 -mca btl ^openib '
                 '-mca btl_tcp_if_exclude lo,docker0 '
                 '{python} ').format(
                     mpi=FLAGS.nccl_mpi,
                     num_gpus=benchmark_spec.total_gpus,
                     host_file=MACHINEFILE,
                     python=python_interpreter,
                     nccl_params=' '.join(
                         ['-x {}'.format(param) for param in nccl_params]))

  if benchmark_spec.model == 'resnet-50':
    run_flags = {
        'arch': 'resnet50',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'use_static_loss_scaling': None,
        'loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'iter_unit': 'batch'
    }
    run_flags.update({
        'precision': benchmark_spec.precision,
        'batch_size': benchmark_spec.batch_size,
        'num_iter': benchmark_spec.num_steps,
    })
    # Load ImageNet training data from GCS if benchmark is not in synthetic
    # mode.
    if not benchmark_spec.synthetic:
      run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += 'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  elif benchmark_spec.model == 'resnext-101':
    run_flags = {
        'arch': 'resnext101-32x4d',
        'mode': 'training_benchmark',
        'warmup_steps': 101,
        'results_dir': '/tmp/models',
        'gpu_memory_fraction': 0.95,
        'use_static_loss_scaling': None,
        'loss_scale': 128,
        'lr_init': 0.016,
        'lr_warmup_epochs': 8,
        'momentum': 0.875,
        'weight_decay': 3.0517578125e-05,
        'weight_init': 'fan_in',
        'iter_unit': 'batch'
    }
    run_flags.update({
        'precision': benchmark_spec.precision,
        'batch_size': benchmark_spec.batch_size,
        'num_iter': benchmark_spec.num_steps,
    })
    # Load ImageNet training data from GCS if benchmark is not in synthetic
    # mode.
    if not benchmark_spec.synthetic:
      run_flags['data_dir'] = 'gs://cloud-ml-nas-public/classification/imagenet'

    run_command += 'DeepLearningExamples/TensorFlow/Classification/ConvNets/main.py '
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  elif benchmark_spec.model.startswith('bert'):  # bert
    if not benchmark_spec.bert_finetune:
      raise NotImplementedError('BERT pretraining is not supported.')
    bert_dir = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/google_pretrained_weights/{}'.format(
        'uncased_L-12_H-768_A-12'
        if benchmark_spec.model == 'bert-base' else 'uncased_L-24_H-1024_A-16')
    squad_train_file = 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/data/download/squad/v1.1/train-v1.1.json'
    run_flags = {
        'vocab_file': '{}/vocab.txt'.format(bert_dir),
        'bert_config_file': '{}/bert_config.json'.format(bert_dir),
        'init_checkpoint': '{}/bert_model.ckpt'.format(bert_dir),
        'do_train': None,
        'train_file': squad_train_file,
        'learning_rate': 5e-6,
        'output_dir': '/tmp/models',
        'horovod': None,
        'dllog_path': '/tmp/bert_dllog.json',
    }
    run_flags.update({
        'precision': benchmark_spec.precision,
        'train_batch_size': benchmark_spec.batch_size,
        'num_train_epochs': benchmark_spec.num_steps,
        'max_seq_length': benchmark_spec.max_seq_len,
        'doc_stride': 64 if benchmark_spec.max_seq_len == 128 else 128,
        'amp': benchmark_spec.precision == 'fp16'
    })

    run_command += 'DeepLearningExamples/TensorFlow/LanguageModeling/BERT/run_squad.py '
    run_command += ' '.join([
        '--{}'.format(key) if value is None else '--{}={}'.format(key, value)
        for key, value in sorted(run_flags.items())
    ])
  else:
    run_command += (
        'tensorpack/examples/FasterRCNN/train.py --config '
        'BACKBONE.WEIGHTS=ImageNet-R50-AlignPadding.npz '
        'DATA.BASEDIR=coco '
        'TRAINER=horovod '
        'TRAIN.EVAL_PERIOD=0 '
        # LR_SCHEDULE means equivalent steps when the total batch size is 8.
        'TRAIN.LR_SCHEDULE="[{step}, {step}, {step}]" '
        '--logdir {log_dir}/maskrcnn ').format(
            log_dir=vm_util.VM_TMP_DIR,
            step=benchmark_spec.num_steps * benchmark_spec.total_gpus // 8)

  stdout, stderr = master_vm.RobustRemoteCommand(run_command, should_log=True)

  if benchmark_spec.timeline:
    master_vm.PullFile(vm_util.GetTempDir(),
                       '{}/timeline.json'.format(vm_util.VM_TMP_DIR))

  return _MakeSamplesFromOutput(benchmark_spec, stdout, stderr)
def RunWithExec(vm, exec_path, remote_job_file_path, job_file_contents):
  """Spawn fio and gather the results.

  Args:
    vm: vm to run the benchmark on.
    exec_path: string path to the fio executable.
    remote_job_file_path: path, on the vm, to the location of the job file.
    job_file_contents: string contents of the fio job file.

  Returns:
    A list of sample.Sample objects.
  """
  logging.info('FIO running on %s', vm)

  disk = vm.scratch_disks[0]
  mount_point = disk.mount_point
  if FLAGS.fio_write_against_multiple_clients:
    mount_point = '%s/%s' % (disk.mount_point, vm.name)
    logging.info('FIO mount point changed to %s', mount_point)

  job_file_string = GetOrGenerateJobFileString(
      FLAGS.fio_jobfile, FLAGS.fio_generate_scenarios, AgainstDevice(),
      disk, FLAGS.fio_io_depths, FLAGS.fio_num_jobs,
      FLAGS.fio_working_set_size, FLAGS.fio_blocksize, FLAGS.fio_runtime,
      _DIRECT_IO.value, FLAGS.fio_parameters, job_file_contents)
  job_file_path = vm_util.PrependTempDir(vm.name + LOCAL_JOB_FILE_SUFFIX)
  with open(job_file_path, 'w') as job_file:
    job_file.write(job_file_string)
    logging.info('Wrote fio job file at %s', job_file_path)
    logging.info(job_file_string)

  vm.PushFile(job_file_path, remote_job_file_path)

  if AgainstDevice():
    fio_command = (f'{exec_path} --output-format=json '
                   f'--random_generator={FLAGS.fio_rng} '
                   f'--filename={disk.GetDevicePath()} {remote_job_file_path}')
  else:
    fio_command = (f'{exec_path} --output-format=json '
                   f'--random_generator={FLAGS.fio_rng} '
                   f'--directory={mount_point} {remote_job_file_path}')

  collect_logs = any([
      FLAGS.fio_lat_log, FLAGS.fio_bw_log, FLAGS.fio_iops_log,
      FLAGS.fio_hist_log
  ])

  log_file_base = ''
  if collect_logs:
    log_file_base = '%s_%s' % (PKB_FIO_LOG_FILE_NAME, str(time.time()))
    fio_command = ' '.join([fio_command, GetLogFlags(log_file_base)])

  # TODO(user): This only gives results at the end of a job run
  # so the program pauses here with no feedback to the user.
  # This is a pretty lousy experience.
  logging.info('FIO Results:')

  start_time = time.time()
  stdout, _ = vm.RobustRemoteCommand(fio_command, should_log=True,
                                     timeout=FLAGS.fio_command_timeout_sec)
  end_time = time.time()
  bin_vals = []
  if collect_logs:
    vm.PullFile(vm_util.GetTempDir(), '%s*.log' % log_file_base)
    if FLAGS.fio_hist_log:
      num_logs = int(
          vm.RemoteCommand('ls %s_clat_hist.*.log | wc -l' % log_file_base)[0])
      bin_vals += [
          fio.ComputeHistogramBinVals(
              vm, '%s_clat_hist.%s.log' % (log_file_base, idx + 1))
          for idx in range(num_logs)
      ]
  samples = fio.ParseResults(job_file_string, json.loads(stdout),
                             log_file_base=log_file_base, bin_vals=bin_vals)
  samples.append(
      sample.Sample('start_time', start_time, 'sec', samples[0].metadata))
  samples.append(
      sample.Sample('end_time', end_time, 'sec', samples[0].metadata))

  return samples