def test_register_benchmark():
    """Test interface BenchmarkRegistry.register_benchmark()."""
    # Register the benchmark for all platforms when the default platform is used.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)
    for platform in Platform:
        context = BenchmarkRegistry.create_benchmark_context('accumulation', platform=platform)
        assert (BenchmarkRegistry.is_benchmark_registered(context))

    # Register the benchmark only for the CUDA platform when platform=Platform.CUDA is specified.
    BenchmarkRegistry.register_benchmark('accumulation-cuda', AccumulationBenchmark, platform=Platform.CUDA)
    context = BenchmarkRegistry.create_benchmark_context('accumulation-cuda', platform=Platform.CUDA)
    assert (BenchmarkRegistry.is_benchmark_registered(context))
    context = BenchmarkRegistry.create_benchmark_context('accumulation-cuda', platform=Platform.ROCM)
    assert (BenchmarkRegistry.is_benchmark_registered(context) is False)
def test_get_benchmark_configurable_settings():
    """Test BenchmarkRegistry interface.

    BenchmarkRegistry.get_benchmark_configurable_settings().
    """
    # Register benchmarks for testing.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)

    context = BenchmarkRegistry.create_benchmark_context('accumulation', platform=Platform.CPU)
    settings = BenchmarkRegistry.get_benchmark_configurable_settings(context)

    expected = """optional arguments:
  --duration int     The elapsed time of benchmark in seconds.
  --log_raw_data     Log raw data into file instead of saving it into result object.
  --lower_bound int  The lower bound for accumulation.
  --run_count int    The run count of benchmark.
  --upper_bound int  The upper bound for accumulation."""
    assert (settings == expected)
def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model',
        FakeModelBenchmark,
        parameters='--hidden_size 2',
        platform=Platform.CUDA,
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, context.platform)
    assert (benchmark_class)
    return benchmark_class(name, predefine_params + ' ' + context.parameters)
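# Illustrative usage sketch (not part of the original tests): the helper above prepends the
# predefined '--hidden_size 2' parameters to the user-supplied ones before constructing the
# benchmark. The attribute accesses below assume FakeModelBenchmark parses both arguments
# during _preprocess(), as the other tests in this module do for their benchmarks.
def example_create_benchmark_usage():
    benchmark = create_benchmark('--num_steps 8')
    # Preprocessing should parse the merged parameter string.
    assert (benchmark._preprocess())
    assert (benchmark._args.hidden_size == 2)
    assert (benchmark._args.num_steps == 8)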
def test_get_benchmark_name():
    """Test interface BenchmarkRegistry.get_benchmark_name()."""
    # Register benchmarks for testing.
    benchmark_names = ['accumulation', 'pytorch-accumulation', 'tf1-accumulation', 'onnxruntime-accumulation']
    for name in benchmark_names:
        BenchmarkRegistry.register_benchmark(name, AccumulationBenchmark)

    # Test benchmark name for different frameworks.
    benchmark_frameworks = [Framework.NONE, Framework.PYTORCH, Framework.TENSORFLOW1, Framework.ONNXRUNTIME]
    for i in range(len(benchmark_names)):
        context = BenchmarkRegistry.create_benchmark_context(
            'accumulation', platform=Platform.CPU, framework=benchmark_frameworks[i]
        )
        name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
        assert (name == benchmark_names[i])
def test_pytorch_empty_cache():
    """Test PytorchBase class."""
    # Register mnist benchmark.
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Test that the cache is emptied by manually calling torch.cuda.empty_cache().
    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
    assert (benchmark._benchmark())
    del benchmark
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0)
    torch.cuda.empty_cache()
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)

    # Test automatic cache emptying.
    context = BenchmarkRegistry.create_benchmark_context(
        'pytorch-mnist', parameters='--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
""" precision = self._precision_need_to_run[cmd_idx] self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data) valid = True flops = list() content = raw_output.splitlines() try: for line in content: for item in self.__parse_logline: if item in line: flops.append(float(line.split(',')[-1])) except BaseException: valid = False finally: if valid is False or len(flops) == 0: logger.error( 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.' .format(self._curr_run_index, self._name, raw_output)) return False self._result.add_result(self._metric_map[precision], max(flops)) return True BenchmarkRegistry.register_benchmark('gemm-flops', CudaGemmFlopsBenchmark, platform=Platform.CUDA)
def test_pytorch_base():
    """Test PytorchBase class."""
    # Register mnist benchmark.
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Launch benchmark with --no_gpu for testing.
    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
    assert (benchmark._benchmark())
    assert (benchmark.name == 'pytorch-mnist')
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Test results.
    for metric in [
        'fp32_train_step_time', 'fp32_inference_step_time', 'fp32_train_throughput', 'fp32_inference_throughput'
    ]:
        assert (len(benchmark.raw_data[metric]) == 1)
        assert (len(benchmark.raw_data[metric][0]) == 64)
        assert (len(benchmark.result[metric]) == 1)
        assert (isinstance(benchmark.result[metric][0], numbers.Number))

    # Test _cal_params_count().
    assert (benchmark._cal_params_count() == 1199882)

    # Test _judge_gpu_availability().
    assert (benchmark._gpu_available is False)

    # Test _set_force_fp32().
    assert (benchmark._args.force_fp32 is True)

    # Test _init_distributed_setting().
    assert (benchmark._args.distributed_impl is None)
    assert (benchmark._args.distributed_backend is None)
    assert (benchmark._init_distributed_setting() is True)
    benchmark._args.distributed_impl = DistributedImpl.DDP
    benchmark._args.distributed_backend = DistributedBackend.NCCL
    assert (benchmark._init_distributed_setting() is False)
    benchmark._args.distributed_impl = DistributedImpl.MIRRORED
    assert (benchmark._init_distributed_setting() is False)

    # Test _init_dataloader().
    benchmark._args.distributed_impl = None
    assert (benchmark._init_dataloader() is True)
    benchmark._args.distributed_impl = DistributedImpl.DDP
    assert (benchmark._init_dataloader() is False)
    benchmark._args.distributed_impl = DistributedImpl.MIRRORED
    assert (benchmark._init_dataloader() is False)

    # Test _create_optimizer().
    assert (isinstance(benchmark._optimizer, transformers.AdamW))
    benchmark._optimizer_type = Optimizer.ADAM
    assert (benchmark._create_optimizer() is True)
    assert (isinstance(benchmark._optimizer, torch.optim.Adam))
    benchmark._optimizer_type = Optimizer.SGD
    assert (benchmark._create_optimizer() is True)
    assert (isinstance(benchmark._optimizer, torch.optim.SGD))
    benchmark._optimizer_type = None
    assert (benchmark._create_optimizer() is False)

    # Test _sync_result().
    step_time = [2.0, 2.0]
    benchmark._args.distributed_impl = DistributedImpl.DDP
    step_time = benchmark._sync_result(step_time)
    assert (not step_time)
    benchmark._args.distributed_impl = None

    # Test _postprocess().
    assert (benchmark._postprocess())
"""Do inference given the ORT inference session. Args: ort_sess (InferenceSession): inference session for ORT. Return: elapse_times (List[float]): latency of every iterations. """ precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32 input_tensor = np.random.randn(self._args.batch_size, 3, 224, 224).astype(dtype=precision) for i in range(self._args.num_warmup): ort_sess.run(None, {'input': input_tensor}) elapse_times = list() for i in range(self._args.num_steps): start = time.time() ort_sess.run(None, {'input': input_tensor}) end = time.time() elapse_times.append((end - start) * 1000) return elapse_times BenchmarkRegistry.register_benchmark( 'ort-inference', ORTInferenceBenchmark, platform=Platform.CUDA, )
        self._result.add_raw_data('raw_output', raw_output, self._args.log_raw_data)

        content = raw_output.splitlines(False)
        try:
            result_header = 'benchmark implementation mode config score'
            found = False
            for line in content:
                if result_header in line:
                    found = True
                elif found:
                    items = line.split(' ')
                    if len(items) == 7:
                        name = '_'.join(items[0:4] + [items[5]])
                        for char in ['-', ' ', '=', '/']:
                            name = name.replace(char, '_')
                        score = float(items[4])
                        self._result.add_result(name.lower(), score)
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, message: {}.'.format(
                    self._curr_run_index, self._name, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('fambench', FAMBenchBenchmark, platform=Platform.CUDA)
        mem_bw = -1
        value_index = -1
        valid = True
        content = raw_output.splitlines()
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
            for line in content:
                if parse_logline in line and value_index != -1:
                    line = line.split()
                    mem_bw = max(mem_bw, float(line[value_index]))
                elif 'mean' in line:
                    line = line.split()
                    value_index = line.index('mean')
        except BaseException:
            valid = False
        finally:
            if valid is False or mem_bw == -1:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False

        self._result.add_result(metric, mem_bw)

        return True


BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.ROCM)
                    res = line.split('|')
                    res = [result.strip() for result in res]
                    suc = int(res[labels.index('Successed')])
                    fail = int(res[labels.index('Failed')])
                    rate = float(res[labels.index('Success Rate')].strip('%'))
                    minimum = float(res[labels.index('Minimum')].strip('ms'))
                    maximum = float(res[labels.index('Maximum')].strip('ms'))
                    average = float(res[labels.index('Average')].strip('ms'))
                    self._result.add_result(host + '_successed_count', suc)
                    self._result.add_result(host + '_failed_count', fail)
                    self._result.add_result(host + '_success_rate', rate)
                    self._result.add_result(host + '_time_min', minimum)
                    self._result.add_result(host + '_time_max', maximum)
                    self._result.add_result(host + '_time_avg', average)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, address: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, host, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('tcp-connectivity', TCPConnectivityBenchmark)
            model = self._args.pytorch_models[cmd_idx]
            for line in raw_output.strip().splitlines():
                line = line.strip()
                if '[I] mean:' in line or '[I] percentile:' in line:
                    tag = 'mean' if '[I] mean:' in line else '99'
                    lats = re.findall(r'(\d+\.\d+) ms', line)
                    if len(lats) == 1:
                        self._result.add_result(f'{model}_gpu_time_{tag}', float(lats[0]))
                    elif len(lats) == 2:
                        self._result.add_result(f'{model}_host_time_{tag}', float(lats[0]))
                        self._result.add_result(f'{model}_end_to_end_time_{tag}', float(lats[1]))
            success = True
        except BaseException as e:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False
        return success


BenchmarkRegistry.register_benchmark(
    'tensorrt-inference',
    TensorRTInferenceBenchmark,
    platform=Platform.CUDA,
)
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, unit is millisecond.
                        duration.append((end - start) * 1000)
                        if self._is_finished(curr_step, end):
                            return duration


# Register CNN benchmarks.
# Reference: https://pytorch.org/vision/0.8/models.html
#            https://github.com/pytorch/vision/tree/v0.8.0/torchvision/models
MODELS = [
    'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'googlenet', 'inception_v3', 'mnasnet0_5',
    'mnasnet0_75', 'mnasnet1_0', 'mnasnet1_3', 'mobilenet_v2', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
    'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2', 'shufflenet_v2_x0_5',
    'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0', 'squeezenet1_0', 'squeezenet1_1', 'vgg11',
    'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19'
]

for model in MODELS:
    if hasattr(models, model):
        BenchmarkRegistry.register_benchmark('pytorch-' + model, PytorchCNN, parameters='--model_type ' + model)
    else:
        logger.warning('model missing in torchvision.models - model: {}'.format(model))
        metric_set = set()
        for line in content:
            try:
                values = list(filter(None, line.split()))
                if len(values) != 5:
                    continue
                # Extract the message size and average bandwidth from the line.
                size = int(values[0])
                avg_bw = float(values[-2]) / 8.0
                metric = f'{self.__support_ib_commands[self._args.commands[cmd_idx]]}_{size}:{self._args.ib_index}'
                # Filter out duplicate metrics in the client output.
                if metric not in metric_set:
                    metric_set.add(metric)
                    self._result.add_result(metric, avg_bw)
                    valid = True
            except BaseException:
                pass
        if valid is False:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False
        return True


BenchmarkRegistry.register_benchmark('ib-loopback', IBLoopbackBenchmark)
def test_launch_benchmark():
    """Test interface BenchmarkRegistry.launch_benchmark()."""
    # Register benchmarks for testing.
    BenchmarkRegistry.register_benchmark(
        'accumulation', AccumulationBenchmark, parameters='--upper_bound 5', platform=Platform.CPU
    )

    # Launch benchmark.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation', platform=Platform.CPU, parameters='--lower_bound 1'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.name == 'accumulation')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.raw_data == {'accumulation_result': ['1,3,6,10']})
    assert (benchmark.result == {'return_code': [0], 'accumulation_result': [10]})

    # Replace the timestamp with null.
    result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
    expected = (
        '{"name": "accumulation", "type": "micro", "run_count": 1, '
        '"return_code": 0, "start_time": null, "end_time": null, '
        '"raw_data": {"accumulation_result": ["1,3,6,10"]}, '
        '"result": {"return_code": [0], "accumulation_result": [10]}, '
        '"reduce_op": {"return_code": null, "accumulation_result": null}}'
    )
    assert (result == expected)

    # Launch benchmark with overridden parameters.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation', platform=Platform.CPU, parameters='--lower_bound 1 --upper_bound 4'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.name == 'accumulation')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.raw_data == {'accumulation_result': ['1,3,6']})
    assert (benchmark.result == {'return_code': [0], 'accumulation_result': [6]})

    # Replace the timestamp with null.
    result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null', benchmark.serialized_result)
    expected = (
        '{"name": "accumulation", "type": "micro", "run_count": 1, '
        '"return_code": 0, "start_time": null, "end_time": null, '
        '"raw_data": {"accumulation_result": ["1,3,6"]}, '
        '"result": {"return_code": [0], "accumulation_result": [6]}, '
        '"reduce_op": {"return_code": null, "accumulation_result": null}}'
    )
    assert (result == expected)

    # Failed to launch benchmark due to 'benchmark not found'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation-fail', Platform.CPU, parameters='--lower_bound 1 --upper_bound 4', framework=Framework.PYTORCH
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark is None)

    # Failed to launch benchmark due to 'unknown arguments'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation', platform=Platform.CPU, parameters='--lower_bound 1 --test 4'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

    # Failed to launch benchmark due to 'invalid arguments'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation', platform=Platform.CPU, parameters='--lower_bound 1 --upper_bound x'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)
                            and self._name == 'gpcnet-network-load-test':
                        name_prefix = items[1].replace(' ', '')
                        for i in range(2, len(items) - 1):
                            if labels[i] != 'Units':
                                self._result.add_result(
                                    self.__metrics_x[name_prefix] + '_' + labels[i].lower(),
                                    float(items[i].strip('X'))
                                )
            elif 'ERROR: this application must be run on at least' in raw_output:
                return True
            else:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('gpcnet-network-test', GPCNetBenchmark)
BenchmarkRegistry.register_benchmark('gpcnet-network-load-test', GPCNetBenchmark)
              such as float32, float16.

        Return:
            The latency list of every inference operation.
        """
        duration = []
        curr_step = 0
        with torch.no_grad():
            self._model.eval()
            while True:
                for idx, sample in enumerate(self._dataloader):
                    sample = sample.to(dtype=getattr(torch, precision.value))
                    start = self._timer()
                    if self._gpu_available:
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, unit is millisecond.
                        duration.append((end - start) * 1000)
                        if self._is_finished(curr_step, end):
                            return duration


# Register LSTM benchmark.
BenchmarkRegistry.register_benchmark(
    'pytorch-lstm', PytorchLSTM, parameters='--input_size=256 --hidden_size=1024 --num_layers=8'
)
                    raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):]
                    raw_data = raw_data.split(',')
                    raw_data.pop()
                    raw_data = [float(item) for item in raw_data]
                    self._result.add_result(metric.lower() + '_time', statistics.mean(raw_data))
                    self._result.add_raw_data(metric.lower() + '_time', raw_data, self._args.log_raw_data)
                if 'Error' in line:
                    error = True
        except BaseException as e:
            logger.error(
                'Cannot extract results from cublas functions - round: {}, index of cmd: {}, '
                'benchmark: {}, raw data: {}, message: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output, str(e)
                )
            )
            return False
        if error:
            logger.error(
                'Error in running cublas test - round: {}, index of cmd: {}, benchmark: {}, raw data: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output
                )
            )
            return False
        return True


BenchmarkRegistry.register_benchmark('cublas-function', CublasBenchmark, platform=Platform.CUDA)
                    start = self._timer()
                    if self._gpu_available:
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, unit is millisecond.
                        duration.append((end - start) * 1000)
                        if self._is_finished(curr_step, end):
                            return duration


# Register BERT Large benchmark.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-bert-large',
    PytorchBERT,
    parameters='--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16 --intermediate_size=4096'
)

# Register BERT Base benchmark.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-bert-base',
    PytorchBERT,
    parameters='--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12 --intermediate_size=3072'
)
        out_table = dict()
        for line in raw_output.splitlines():
            if line.strip() == '':
                continue
            # Only lines starting with a digit are of interest.
            if line.lstrip()[0].isdigit():
                vals = line.split()
                if len(vals) < 2:
                    continue
                numa_index = 'numa_%s' % vals[0]
                out_table[numa_index] = vals[1:]
        return out_table

    def _parse_max_bw(self, raw_output):
        """Parse the maximum bandwidth table from the raw output."""
        out_table = dict()
        # The very last line is empty and only the last 5 lines of the output are of interest.
        for line in raw_output.splitlines()[-6:]:
            if line.strip() == '':
                continue
            vals = line.split()
            if len(vals) < 2:
                continue
            key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
            # Make the value a list to be consistent with the _parse_bw_latency output.
            out_table[key] = [vals[-1]]
        return out_table


BenchmarkRegistry.register_benchmark('cpu-memory-bw-latency', CpuMemBwLatencyBenchmark)
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, unit is millisecond.
                        duration.append((end - start) * 1000)
                        if self._is_finished(curr_step, end):
                            return duration


# Register GPT2 benchmark with 117M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-small', PytorchGPT2, parameters='--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12'
)

# Register GPT2 benchmark with 345M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-medium', PytorchGPT2, parameters='--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16'
)

# Register GPT2 benchmark with 774M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-large',
                valid = False
            else:
                content = content[result_index:]
                for line_index, line in enumerate(content):
                    line_result = list(filter(None, line.strip().split(',')))
                    for pair_index, pair_result in enumerate(line_result):
                        rank_results = list(filter(None, pair_result.strip().split(' ')))
                        for rank_index, rank_result in enumerate(rank_results):
                            metric = f'{command}_{line_index}_{pair_index}:{self.__config[config_index]}:{rank_index}'
                            value = float(rank_result)
                            # Check that the value is valid before converting bandwidth from bits to bytes.
                            if 'bw' in command and value >= 0.0:
                                value = value / 8.0
                            self._result.add_result(metric, value)
                            valid = True
                        config_index += 1
        except Exception:
            valid = False
        if valid is False or config_index != len(self.__config):
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False
        return True


BenchmarkRegistry.register_benchmark('ib-traffic', IBBenchmark)
                    raw_data = line[line.index('[raw_data]: ') + len('[raw_data]: '):]
                    raw_data = raw_data.split(',')
                    raw_data.pop()
                    raw_data = [float(item) for item in raw_data]
                    self._result.add_result(metric.lower() + '_time', statistics.mean(raw_data) * 1000)
                    self._result.add_raw_data(metric.lower() + '_time', raw_data, self._args.log_raw_data)
                if 'Error' in line:
                    error = True
        except BaseException as e:
            logger.error(
                'Cannot extract results from cudnn functions - round: {}, index of cmd: {}, '
                'benchmark: {}, raw data: {}, message: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output, str(e)
                )
            )
            return False
        if error:
            logger.error(
                'Error in running cudnn test - round: {}, index of cmd: {}, benchmark: {}, raw data: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output
                )
            )
            return False
        return True


BenchmarkRegistry.register_benchmark('cudnn-function', CudnnBenchmark, platform=Platform.CUDA)
    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
                torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error(
                'Post process failed - benchmark: {}, mode: {}, message: {}.'.format(
                    self._name, self._args.mode, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('pytorch-sharding-matmul', ShardingMatmul, parameters='--mode allreduce allgather')
BenchmarkRegistry.register_benchmark('pytorch-matmul', ShardingMatmul, parameters='--mode nosharding')
                    kernel, (compute_end - start) * 1000 / self._args.num_steps))

        return True

    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error('Post process failed - benchmark: {}, message: {}.'.format(self._name, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark(
    'pytorch-computation-communication-overlap', ComputationCommunicationOverlap, parameters='--kernel mul matmul'
)
                iops = fio_output['jobs'][0][io_type]['iops']
                self._result.add_result('%s_iops' % io_type_prefix, float(iops))
                for lat_unit in lat_units:
                    if lat_unit in fio_output['jobs'][0][io_type] and \
                       'percentile' in fio_output['jobs'][0][io_type][lat_unit]:
                        lat_unit_prefix = '%s_%s' % (io_type_prefix, lat_unit)
                        for lat_percentile in ['95.000000', '99.000000', '99.900000']:
                            lat = fio_output['jobs'][0][io_type][lat_unit]['percentile'][lat_percentile]
                            self._result.add_result('%s_%s' % (lat_unit_prefix, lat_percentile[:-5]), float(lat))
                        break
        except BaseException as e:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('disk-benchmark', DiskBenchmark)
        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        try:
            output_lines = [x.strip() for x in raw_output.strip().splitlines()]
            for output_line in output_lines:
                tag, bw_str = output_line.split()
                self._result.add_result(tag + '_bw', float(bw_str))
        except BaseException as e:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('gpu-copy-bw', GpuCopyBwBenchmark)
        mem_bw = -1
        valid = True
        content = raw_output.splitlines()
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
            for line in content:
                if parse_logline in line:
                    line = line.split(',')[1]
                    value = re.search(r'(\d+\.\d+)', line)
                    if value:
                        mem_bw = max(mem_bw, float(value.group(0)))
        except BaseException:
            valid = False
        finally:
            if valid is False or mem_bw == -1:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False

        self._result.add_result(metric, mem_bw)

        return True


BenchmarkRegistry.register_benchmark('mem-bw', CudaMemBwBenchmark, platform=Platform.CUDA)
        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        pattern = r'\d+\.\d+'
        result = re.findall(pattern, raw_output)
        if len(result) != 2:
            logger.error(
                'Cannot extract kernel launch overhead in event and wall mode - round: {}, benchmark: {}, raw data: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        try:
            result = [float(item) for item in result]
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, result: {}, message: {}.'.format(
                    self._curr_run_index, self._name, result, str(e)
                )
            )
            return False

        self._result.add_result('event_time', result[0])
        self._result.add_result('wall_time', result[1])

        return True


BenchmarkRegistry.register_benchmark('kernel-launch', KernelLaunch)
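# Illustrative sketch (not part of the original source): the parser above expects the raw
# output to contain exactly two floating-point numbers, the event-mode and wall-mode kernel
# launch overheads. The sample string below is hypothetical and only demonstrates what
# re.findall(r'\d+\.\d+', ...) would extract from such output.
if __name__ == '__main__':
    import re

    sample_raw_output = 'Kernel launch overhead: event 0.00583 ms, wall 0.00892 ms'
    extracted = re.findall(r'\d+\.\d+', sample_raw_output)
    assert extracted == ['0.00583', '0.00892']
    event_time, wall_time = (float(item) for item in extracted)
    print(event_time, wall_time)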
                    algbw_out = float(line[algbw_index])
                    self._result.add_result(self._args.operation + '_' + str(size) + '_busbw', busbw_out)
                    self._result.add_result(self._args.operation + '_' + str(size) + '_algbw', algbw_out)
                    self._result.add_result(self._args.operation + '_' + str(size) + '_time', time_out)
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        if out_of_place_index == -1 or out_of_bound_index == -1 or busbw_out == -1:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('nccl-bw', CudaNcclBwBenchmark, platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('rccl-bw', CudaNcclBwBenchmark, platform=Platform.ROCM)