def test_fambench():
    """Verify FAMBench registration, its docker settings, and score parsing."""
    name = 'fambench'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CUDA)
    assert benchmark_cls

    benchmark = benchmark_cls(name)
    # Container settings exposed by the benchmark instance.
    assert benchmark._benchmark_type == BenchmarkType.DOCKER
    assert benchmark._image_uri == 'superbench/benchmark:cuda11.1.1-fambench'
    assert benchmark._container_name == 'fambench-benchmarks'
    assert benchmark._entrypoint == '/workspace/FAMBench/benchmarks/run_all_benchmarks.sh'
    assert benchmark._cmd is None

    # Set the result holder and arguments directly before parsing.
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)
    # NOTE(review): this sample reads like a table flattened onto one line;
    # confirm its whitespace against real FAMBench output.
    raw_output = """ benchmark implementation mode config score units batch_latency_95_sec DLRM OOTB eval tiny 152.800399 ex/s 0.515052 DLRM OOTB train tiny 35.483686 ex/s None DLRM UBENCH train linear_[(2,2,2,2,2)] 3.679281e-07 TF/s None XLMR OOTB eval default-config 1.015586 ex/s 16.463461 """
    assert benchmark._process_raw_result(0, raw_output)

    # Metric name paired with the first recorded score expected for it.
    expected_scores = (
        ('dlrm_ootb_eval_tiny_ex_s', 152.800399),
        ('dlrm_ootb_train_tiny_ex_s', 35.483686),
        ('dlrm_ubench_train_linear_[(2,2,2,2,2)]_tf_s', 3.679281e-07),
        ('xlmr_ootb_eval_default_config_ex_s', 1.015586),
    )
    for metric, score in expected_scores:
        assert benchmark.result[metric][0] == score
def test_tensorrt_inference_result_parsing(self, test_raw_log):
    """Parse a valid tensorrt-inference log, then reject an invalid one."""
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
    benchmark = benchmark_cls(self.benchmark_name, parameters='')
    benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
    benchmark._result = BenchmarkResult(
        self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1
    )

    # Valid log: parsing succeeds and adds six metrics on top of the defaults.
    self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
    self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
    self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
    expected_times = (('gpu_time', 0.5), ('host_time', 0.6), ('end_to_end_time', 1.0))
    for tag in ('mean', '99'):
        for kind, expected in expected_times:
            self.assertEqual(expected, benchmark.result[f'model_0_{kind}_{tag}'][0])

    # Invalid log: parsing reports failure.
    self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))
def _test_gpu_copy_bw_performance_result_parsing(self, platform, test_raw_output):
    """Test gpu-copy benchmark result parsing.

    Args:
        platform: Platform value passed to the registry to select the benchmark class.
        test_raw_output (str): Raw benchmark output. Each line of the stripped
            text is split on whitespace into a metric name and a float value.
    """
    benchmark_name = 'gpu-copy-bw'
    metric_suffix = '_bw'
    benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
    assert benchmark_class

    benchmark = benchmark_class(benchmark_name, parameters='')
    assert benchmark
    ret = benchmark._preprocess()
    assert ret is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == 'gpu-copy-bw'
    assert benchmark.type == BenchmarkType.MICRO

    # Positive case - valid raw output.
    assert benchmark._process_raw_result(0, test_raw_output)
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert 1 == len(benchmark.raw_data)
    print(test_raw_output.splitlines())
    test_raw_output_dict = {}
    for line in test_raw_output.strip().splitlines():
        fields = line.split()
        test_raw_output_dict[fields[0]] = float(fields[1])
    assert len(test_raw_output_dict) + benchmark.default_metric_count == len(benchmark.result)
    for output_key in benchmark.result:
        if output_key == 'return_code':
            assert benchmark.result[output_key] == [0]
            continue
        assert len(benchmark.result[output_key]) == 1
        assert isinstance(benchmark.result[output_key][0], numbers.Number)
        # Remove only a trailing '_bw'. str.strip('_bw') treats its argument as
        # a set of characters and trims '_', 'b' and 'w' from BOTH ends, which
        # would corrupt any name that starts or ends with one of them.
        if output_key.endswith(metric_suffix):
            raw_key = output_key[:-len(metric_suffix)]
        else:
            raw_key = output_key
        assert raw_key in test_raw_output_dict
        assert test_raw_output_dict[raw_key] == benchmark.result[output_key][0]

    # Negative case - invalid raw output.
    assert benchmark._process_raw_result(1, 'Invalid raw output') is False
    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
def test_disk_performance_benchmark_disabled(self, mock_is_block_device):
    """Check that no command is generated when every disk test runtime is zero."""
    mock_is_block_device.return_value = True

    name = 'disk-benchmark'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    devices = ['/dev/nvme0n1', '/dev/nvme1n1']
    options = [
        '--block_devices ' + ' '.join(devices),
        '--rand_precond_time=0',
        '--seq_read_runtime=0',
        '--rand_read_runtime=0',
    ]
    benchmark = benchmark_cls(name, parameters=' '.join(options))

    # Basic information after preprocessing.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == 'disk-benchmark'
    assert benchmark.type == BenchmarkType.MICRO

    # With all runtimes set to zero there is nothing to run.
    assert len(benchmark._commands) == 0
def test_tensorrt_inference_cls(self):
    """Check that the benchmark class is selectable on CUDA only."""
    for platform in Platform:
        benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
        # Only the CUDA platform is expected to yield a class.
        check = self.assertIsNotNone if platform is Platform.CUDA else self.assertIsNone
        check(benchmark_cls)
def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
    """Test ort-inference benchmark.

    The exported-model cache directory is removed in a ``finally`` clause so
    that a failing assertion does not leave it behind for later runs.

    Args:
        mock_ort_session_run: Mock supplied by the test's patching; it is not
            referenced in the body.
        mock_get_dir: Mock whose return value is set to '/tmp/superbench/'
            (presumably standing in for torch.hub.get_dir -- confirm against
            the patch decorators).
    """
    benchmark_name = 'ort-inference'
    benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
    assert benchmark_class

    mock_get_dir.return_value = '/tmp/superbench/'
    benchmark = benchmark_class(
        benchmark_name,
        parameters='--pytorch_models resnet50 --graph_opt_level 1 --precision float16'
        ' --batch_size 16 --num_warmup 128 --num_steps 512'
    )
    assert isinstance(benchmark, ORTInferenceBenchmark)
    assert benchmark._preprocess()

    try:
        # Check basic information.
        assert benchmark.name == 'ort-inference'
        assert benchmark.type == BenchmarkType.MICRO
        assert benchmark._ORTInferenceBenchmark__model_cache_path == Path(torch.hub.get_dir()) / 'checkpoints'
        for model in benchmark._args.pytorch_models:
            assert hasattr(torchvision.models, model)
            file_name = '{model}.{precision}.onnx'.format(model=model, precision=benchmark._args.precision)
            assert (benchmark._ORTInferenceBenchmark__model_cache_path / file_name).is_file()

        # Check parameters specified in BenchmarkContext.
        assert benchmark._args.pytorch_models == ['resnet50']
        assert benchmark._args.graph_opt_level == 1
        assert benchmark._args.precision == Precision.FLOAT16
        assert benchmark._args.batch_size == 16
        assert benchmark._args.num_warmup == 128
        assert benchmark._args.num_steps == 512

        # Check results and metrics.
        assert benchmark._benchmark()
    finally:
        # Clean up on success and on failure. The existence guard keeps a
        # cleanup error from hiding the assertion that actually failed.
        cache_dir = getattr(benchmark, '_ORTInferenceBenchmark__model_cache_path', None)
        if cache_dir is not None and Path(cache_dir).is_dir():
            shutil.rmtree(cache_dir)

    assert benchmark.return_code == ReturnCode.SUCCESS
    # Short precision names used in metric names; other values are used as-is.
    precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
    for model in benchmark._args.pytorch_models:
        precision_value = benchmark._args.precision.value
        precision = precision_metric.get(precision_value, precision_value)
        metric = '{}_{}_time'.format(precision, model)
        assert metric in benchmark.result
        assert metric in benchmark.raw_data
def test_gpcnet_network_test(self, raw_output, raw_output_no_execution):
    """Exercise gpcnet-network-test: registry lookup, command, and result parsing."""
    name = 'gpcnet-network-test'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    # Preprocessing must succeed and produce the bare binary invocation.
    benchmark = benchmark_cls(name)
    assert benchmark._preprocess()
    generated = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
    assert generated == 'network_test'

    # The "no execution" output parses successfully and leaves only the default metrics.
    assert benchmark._process_raw_result(0, raw_output_no_execution)
    assert len(benchmark.result) == benchmark.default_metric_count

    # Valid output: every metric gets an 'avg' and a '99%' entry with one numeric sample.
    assert benchmark._process_raw_result(0, raw_output)
    metric_prefixes = (
        'rr_two-sided_lat',
        'rr_get_lat',
        'rr_two-sided_bw',
        'rr_put_bw',
        'rr_two-sided+sync_bw',
        'nat_two-sided_bw',
        'multiple_allreduce_time',
        'multiple_alltoall_bw',
    )
    for prefix in metric_prefixes:
        for suffix in ('avg', '99%'):
            metric = '_'.join((prefix, suffix))
            assert metric in benchmark.result
            assert len(benchmark.result[metric]) == 1
            assert isinstance(benchmark.result[metric][0], numbers.Number)

    # Invalid output is rejected.
    assert benchmark._process_raw_result(0, 'ERROR') is False

    # Basic information.
    assert benchmark.name == 'gpcnet-network-test'
    assert benchmark.type == BenchmarkType.MICRO
    assert benchmark._bin_name == 'network_test'
def _test_gpu_copy_bw_performance_command_generation(self, platform):
    """Check the command that gpu-copy-bw builds from explicit parameters."""
    name = 'gpu-copy-bw'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, platform)
    assert benchmark_cls

    size = 1048576
    num_warm_up = 20
    num_loops = 10000
    mem_types = ['htod', 'dtoh', 'dtod']
    copy_types = ['sm', 'dma']
    parameters = (
        '--mem_type {} --copy_type {} --size {:d} '
        '--num_warm_up {:d} --num_loops {:d} --bidirectional --check_data'
    ).format(' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
    benchmark = benchmark_cls(name, parameters=parameters)

    # Basic information after preprocessing.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == name
    assert benchmark.type == BenchmarkType.MICRO

    # Parsed arguments mirror what was passed in.
    assert benchmark._args.mem_type == mem_types
    assert benchmark._args.copy_type == copy_types
    assert benchmark._args.size == size
    assert benchmark._args.num_warm_up == num_warm_up
    assert benchmark._args.num_loops == num_loops
    assert benchmark._args.bidirectional
    assert benchmark._args.check_data

    # Exactly one command, starting with the binary path and carrying every option.
    assert 1 == len(benchmark._commands)
    assert benchmark._commands[0].startswith(benchmark._GpuCopyBwBenchmark__bin_path)
    expected_fragments = ['--' + mem_type for mem_type in mem_types]
    expected_fragments += ['--' + copy_type + '_copy' for copy_type in copy_types]
    expected_fragments += [
        '--size ' + str(size),
        '--num_warm_up ' + str(num_warm_up),
        '--num_loops ' + str(num_loops),
        '--bidirectional',
        '--check_data',
    ]
    for fragment in expected_fragments:
        assert fragment in benchmark._commands[0]
def test_gpu_burn(self, results):
    """Check gpu-burn command generation and parsing of the supplied results."""
    name = 'gpu-burn'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CUDA)
    assert benchmark_cls

    burn_time = 10
    benchmark = benchmark_cls(name, parameters='--doubles --tensor_core --time ' + str(burn_time))

    # Basic information after preprocessing.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == name
    assert benchmark.type == BenchmarkType.MICRO

    # Parsed arguments.
    assert benchmark._args.time == burn_time
    assert benchmark._args.doubles
    assert benchmark._args.tensor_core

    # The single command copies compare.ptx in first and removes it again.
    copy_step = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./'
    remove_step = 'rm compare.ptx'
    assert 1 == len(benchmark._commands)
    assert benchmark._commands[0].startswith(copy_step)
    for fragment in ('-d', '-tc', str(burn_time), remove_step):
        assert fragment in benchmark._commands[0]

    # Parsed results: all eight GPUs pass and the run was not aborted.
    assert benchmark._process_raw_result(0, results)
    assert benchmark.result['return_code'][0] == 0
    assert benchmark.result['time'][0] == burn_time
    for device in range(8):
        assert benchmark.result['gpu_{}_pass'.format(device)][0] == 1
    assert benchmark.result['abort'][0] == 0
def test_disk_performance_empty_param(self):
    """Check that an empty parameter string preprocesses fine and yields no command."""
    name = 'disk-benchmark'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    benchmark = benchmark_cls(name, parameters='')

    # Basic information after preprocessing.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == 'disk-benchmark'
    assert benchmark.type == BenchmarkType.MICRO

    # No parameters given, so no command is expected.
    assert len(benchmark._commands) == 0
def test_cpu_mem_bw_latency_benchmark_empty_param(self):
    """Check the command generated for cpu-memory-bw-latency with no parameters."""
    name = 'cpu-memory-bw-latency'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    expected_test = 'bandwidth_matrix'
    benchmark = benchmark_cls(name, parameters='')

    # Basic information after preprocessing.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == 'cpu-memory-bw-latency'
    assert benchmark.type == BenchmarkType.MICRO

    # One command that runs the expected mlc test.
    assert 1 == len(benchmark._commands)
    assert 'mlc --' + expected_test + ';' in benchmark._commands[0]
def create_benchmark(params='--num_steps 8'):
    """Register FakeModelBenchmark and build an instance through the registry.

    Args:
        params (str): Parameter string placed into the benchmark context.

    Returns:
        An instance of the class selected by the registry, constructed with the
        predefined parameters followed by the context parameters.
    """
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model', FakeModelBenchmark, parameters='--hidden_size 2', platform=Platform.CUDA
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )

    # Resolve the registered name from the context, then the class and its predefined parameters.
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert name
    benchmark_cls, predefined = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, context.platform)
    assert benchmark_cls

    merged_parameters = ' '.join((predefined, context.parameters))
    return benchmark_cls(name, merged_parameters)
def test_cuda_memory_bw_performance(self, raw_output_h2d, raw_output_d2h, raw_output_d2d):
    """Check CUDA mem-bw commands in shmoo/pinned mode and parsing of each direction."""
    name = 'mem-bw'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CUDA)
    assert benchmark_cls

    benchmark = benchmark_cls(name, parameters='--shmoo_mode --memory=pinned')
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS

    # Basic information.
    assert benchmark
    assert benchmark.name == 'mem-bw'
    assert benchmark.type == BenchmarkType.MICRO

    # One command per copy direction, compared from the binary name onwards.
    expected_commands = [
        'bandwidthTest --htod mode=shmoo memory=pinned --csv',
        'bandwidthTest --dtoh mode=shmoo memory=pinned --csv',
        'bandwidthTest --dtod mode=shmoo memory=pinned --csv',
    ]
    for index, expected in enumerate(expected_commands):
        generated = benchmark._bin_name + benchmark._commands[index].split(benchmark._bin_name)[1]
        assert generated == expected

    # Each raw output yields one numeric sample for its metric.
    metric_outputs = (
        ('h2d_bw', raw_output_h2d),
        ('d2h_bw', raw_output_d2h),
        ('d2d_bw', raw_output_d2d),
    )
    for index, (metric, output) in enumerate(metric_outputs):
        assert benchmark._process_raw_result(index, output)
        assert metric in benchmark.result
        assert len(benchmark.result[metric]) == 1
        assert isinstance(benchmark.result[metric][0], numbers.Number)
def test_disk_performance_invalid_block_device(self, mock_is_block_device):
    """Check that preprocessing fails when the given path is not a block device."""
    mock_is_block_device.return_value = False

    name = 'disk-benchmark'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    devices = ['mock_block_device_0']
    benchmark = benchmark_cls(name, parameters='--block_devices ' + ' '.join(devices))

    # Preprocessing is rejected with INVALID_ARGUMENT; name and type are still set.
    assert benchmark
    assert benchmark._preprocess() is False
    assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT
    assert benchmark.name == 'disk-benchmark'
    assert benchmark.type == BenchmarkType.MICRO
def test_rocm_memory_bw_performance(self, raw_output_h2d, raw_output_d2h):
    """Check ROCm mem-bw commands and the bandwidth values parsed from raw output."""
    name = 'mem-bw'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.ROCM)
    assert benchmark_cls

    benchmark = benchmark_cls(name)
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS

    # Basic information.
    assert benchmark
    assert benchmark.name == 'mem-bw'
    assert benchmark.type == BenchmarkType.MICRO

    # One command per direction, compared from the binary name onwards.
    expected_commands = ['hipBusBandwidth --h2d', 'hipBusBandwidth --d2h']
    for index, expected in enumerate(expected_commands):
        generated = benchmark._bin_name + benchmark._commands[index].split(benchmark._bin_name)[1]
        assert generated == expected

    # Each raw output yields one numeric sample for its metric.
    metric_outputs = (('h2d_bw', raw_output_h2d), ('d2h_bw', raw_output_d2h))
    for index, (metric, output) in enumerate(metric_outputs):
        assert benchmark._process_raw_result(index, output)
        assert metric in benchmark.result
        assert len(benchmark.result[metric]) == 1
        assert isinstance(benchmark.result[metric][0], numbers.Number)

    # Exact values expected from the fixture outputs.
    assert benchmark.result['h2d_bw'][0] == 25.2351
    assert benchmark.result['d2h_bw'][0] == 27.9348
def test_ib_traffic_performance(self, mock_gpu):
    """Exercise ib-traffic: argument validation, command generation, and result parsing."""

    def generated_command(bench):
        """Return the first generated command, starting at the binary name."""
        return bench._bin_name + bench._commands[0].split(bench._bin_name)[1]

    def expected_command(bench, perftest_args, timeout, config_path):
        """Build the ib_validation command line expected for the given settings."""
        return (
            "ib_validation --cmd_prefix '" + bench._args.bin_dir + '/ib_write_bw ' + perftest_args +
            " --report_gbits' " + '--timeout {} --hostfile hostfile --input_config {}'.format(timeout, config_path)
        )

    # Check registry.
    benchmark_name = 'ib-traffic'
    benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
    assert benchmark_class

    # Negative case: the hostfile has not been written yet.
    parameters = '--ib_dev 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    assert benchmark._preprocess() is False
    assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT

    with open('hostfile', 'w') as f:
        f.writelines(['node0\n', 'node1\n', 'node2\n', 'node3\n'])

    # Negative case: message size is not a number.
    parameters = '--ib_dev 0 --msg_size invalid --pattern one-to-one --hostfile hostfile'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    assert benchmark._preprocess() is False
    assert benchmark.return_code == ReturnCode.INVALID_ARGUMENT

    # Positive case with a world size of 3.
    os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
    parameters = '--ib_dev 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    assert benchmark._preprocess() is True

    # Generated config: the command must point at config.txt in the working directory.
    parameters = '--ib_dev mlx5_0 --iters 2000 --msg_size 33554432 --hostfile hostfile'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
    ret = benchmark._preprocess()
    Path('config.txt').unlink()
    assert ret
    expect_command = expected_command(
        benchmark, '-F -n 2000 -d mlx5_0 -s 33554432', 120, '{}/config.txt'.format(os.getcwd())
    )
    assert generated_command(benchmark) == expect_command

    # GPU device given: the flag follows the vendor reported by the mock.
    parameters = '--ib_dev mlx5_0 --msg_size 0 --iters 2000 --pattern one-to-one --hostfile hostfile --gpu_dev 0'
    mock_gpu.return_value = 'nvidia'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    benchmark._preprocess()
    expect_command = expected_command(
        benchmark, '-F -n 2000 -d mlx5_0 -a --use_cuda=0', 120, '{}/config.txt'.format(os.getcwd())
    )
    assert generated_command(benchmark) == expect_command

    mock_gpu.return_value = 'amd'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    benchmark._preprocess()
    expect_command = expect_command.replace('cuda', 'rocm')
    assert generated_command(benchmark) == expect_command

    # Custom config file supplied by the caller.
    config_lines = ['0,1', '1,0;0,1', '0,1;1,0', '1,0;0,1']
    with open('test_config.txt', 'w') as f:
        f.writelines(line + '\n' for line in config_lines)
    parameters = (
        '--ib_dev mlx5_0 --timeout 180 --iters 2000 --msg_size 33554432 '
        '--config test_config.txt --hostfile hostfile'
    )
    benchmark = benchmark_class(benchmark_name, parameters=parameters)
    os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
    ret = benchmark._preprocess()
    Path('test_config.txt').unlink()
    assert ret
    expect_command = expected_command(benchmark, '-F -n 2000 -d mlx5_0 -s 33554432', 180, 'test_config.txt')
    assert generated_command(benchmark) == expect_command

    # NOTE(review): the samples below read like multi-line tool output flattened
    # onto one line; confirm their whitespace against real ib_validation output.
    # Every result is -1 (e.g. gpu driver mismatch or another traffic issue).
    raw_output_0 = """ The prefix of cmd to run is: ib_write_bw -a -d ibP257p0s0 Load the config file from: config.txt Output will be saved to: config: 0,1 1,0;0,1 0,1;1,0 1,0;0,1 config end results from rank ROOT_RANK: -1, -1,-1 -1,-1 -1,-1 """
    raw_output_1 = """ The prefix of cmd to run is: ib_write_bw -a -d ibP257p0s0 Load the config file from: config.txt Output will be saved to: config: 0,1 1,0;0,1 0,1;1,0 1,0;0,1 config end results from rank ROOT_RANK: 23452.6, 22212.6,22433 22798.8,23436.3 23435.3,22766.5 """
    raw_output_2 = """ The prefix of cmd to run is: ib_write_bw -F -n 2000 -d mlx5_0 -s 33554432 Load the config file from: config.txt Output will be saved to: config: 0,1 1,0;0,1 0,1;1,0 1,0;0,1 config end results from rank ROOT_RANK: 23452.6, 22212.6,22433, 22798.8,23436.3, """
    raw_output_3 = """ -------------------------------------------------------------------------- mpirun was unable to launch the specified application as it could not access or execute an executable: while attempting to start process rank 0. -------------------------------------------------------------------------- 2 total processes failed to start """

    # Positive cases - valid raw output.
    os.environ['OMPI_COMM_WORLD_RANK'] = '0'
    assert benchmark._process_raw_result(0, raw_output_0)
    for metric in benchmark.result:
        assert metric in benchmark.result
        assert len(benchmark.result[metric]) == 1
        assert isinstance(benchmark.result[metric][0], numbers.Number)
    # Everything after the first entry must be the -1 placeholder.
    remaining_values = list(benchmark.result.values())[1:]
    assert all(value == [-1.0] for value in remaining_values)

    assert benchmark._process_raw_result(0, raw_output_1)
    for index, metric in enumerate(benchmark.result):
        assert metric in benchmark.result
        # The first entry keeps one sample; the others now hold two.
        expected_samples = 1 if index == 0 else 2
        assert len(benchmark.result[metric]) == expected_samples
        assert isinstance(benchmark.result[metric][0], numbers.Number)

    # Negative cases - invalid raw output.
    assert benchmark._process_raw_result(0, raw_output_2) is False
    assert benchmark._process_raw_result(0, raw_output_3) is False
    os.environ.pop('OMPI_COMM_WORLD_RANK')

    # Check basic information.
    assert benchmark.name == 'ib-traffic'
    assert benchmark.type == BenchmarkType.MICRO
    assert benchmark._bin_name == 'ib_validation'

    # Check parameters specified in BenchmarkContext.
    assert benchmark._args.ib_dev == 'mlx5_0'
    assert benchmark._args.iters == 2000
    assert benchmark._args.msg_size == 33554432
    assert benchmark._args.command == 'ib_write_bw'
def test_disk_performance_result_parsing(self, test_raw_output):
    """Check the metrics parsed from disk-benchmark raw output."""
    name = 'disk-benchmark'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    benchmark = benchmark_cls(name, parameters='')
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == 'disk-benchmark'
    assert benchmark.type == BenchmarkType.MICRO

    # Positive case - valid raw output.
    jobname_prefix = 'nvme0n1_rand_read_write'
    assert benchmark._process_raw_result(0, test_raw_output)
    assert benchmark.return_code == ReturnCode.SUCCESS

    # Nine metrics: block size, plus iops and three latency percentiles for read and for write.
    assert 9 + benchmark.default_metric_count == len(benchmark.result.keys())
    expected_metrics = (
        ('_bs', 4096),
        ('_read_iops', 85138.890741),
        ('_write_iops', 85066.128925),
        ('_read_lat_ns_95.0', 1941504),
        ('_read_lat_ns_99.0', 2244608),
        ('_read_lat_ns_99.9', 3620864),
        ('_write_lat_ns_95.0', 1908736),
        ('_write_lat_ns_99.0', 2072576),
        ('_write_lat_ns_99.9', 2605056),
    )
    for suffix, expected in expected_metrics:
        assert 1 == len(benchmark.result[jobname_prefix + suffix])
        assert expected == benchmark.result[jobname_prefix + suffix][0]

    # Negative case - invalid raw output.
    assert benchmark._process_raw_result(1, 'Invalid raw output') is False
    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE
def test_ib_loopback_all_sizes(self, raw_output, mock_ib_devices, mock_numa_cores):
    """Exercise ib-loopback over all message sizes: failures, command, and parsing."""
    # Check registry.
    name = 'ib-loopback'
    benchmark_cls, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(name, Platform.CPU)
    assert benchmark_cls

    parameters = '--ib_index 0 --numa 0 --iters 2000'

    # Negative case: the IB device lookup returns nothing.
    benchmark = benchmark_cls(name, parameters=parameters)
    mock_ib_devices.return_value = None
    assert benchmark._preprocess() is False
    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE

    # Negative case: the NUMA core lookup returns nothing as well.
    benchmark = benchmark_cls(name, parameters=parameters)
    mock_numa_cores.return_value = None
    assert benchmark._preprocess() is False
    assert benchmark.return_code == ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE

    # Positive case: both lookups succeed and the rank environment is set.
    benchmark = benchmark_cls(name, parameters=parameters)
    mock_ib_devices.return_value = ['mlx5_0']
    mock_numa_cores.return_value = [0, 1, 2, 3]
    os.environ.update({'PROC_RANK': '0', 'IB_DEVICES': '0,2,4,6', 'NUMA_NODES': '1,0,3,2'})
    assert benchmark._preprocess()

    # The port in the command is read back from the last socket held by the benchmark.
    port = benchmark._IBLoopbackBenchmark__sock_fds[-1].getsockname()[1]
    expected = (
        'run_perftest_loopback 3 1 ' + benchmark._args.bin_dir +
        '/ib_write_bw -a -F --iters=2000 -d mlx5_0 -p {} -x 0 --report_gbits'.format(port)
    )
    generated = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
    assert generated == expected

    # Positive case - valid raw output: one numeric sample per command and size.
    assert benchmark._process_raw_result(0, raw_output)
    expected_metrics = [
        'ib_{}_bw_{}:{}'.format(ib_command, size, str(benchmark._args.ib_index))
        for ib_command in benchmark._args.commands
        for size in ('8388608', '4194304', '1024', '2')
    ]
    for metric in expected_metrics:
        assert metric in benchmark.result
        assert len(benchmark.result[metric]) == 1
        assert isinstance(benchmark.result[metric][0], numbers.Number)

    # Negative case - invalid raw output.
    assert benchmark._process_raw_result(0, 'Invalid raw output') is False

    # Check basic information.
    assert benchmark.name == 'ib-loopback'
    assert benchmark.type == BenchmarkType.MICRO
    assert benchmark._bin_name == 'run_perftest_loopback'

    # Check parsed arguments; numa ends up as 1 even though '--numa 0' was passed.
    assert benchmark._args.ib_index == 0
    assert benchmark._args.numa == 1
    assert benchmark._args.iters == 2000
    assert benchmark._args.commands == ['write']
def test_generate_config(self, tp_hosts, tp_expected_config):    # noqa: C901
    """Check the traffic-pattern config generators used by ib-traffic."""
    test_config_file = 'test_gen_config.txt'
    hostlist = []

    def read_config(filename):
        """Read a config file into a list of steps, each a list of 'a,b' pair strings."""
        with open(filename, 'r') as f:
            return [line.strip().split(';') for line in f.readlines()]

    expected_config = {
        'one-to-one': [['0,3', '1,2'], ['0,1', '2,3'], ['0,2', '3,1']],
        'many-to-one': [
            ['0,1', '0,2', '0,3'], ['1,0', '1,2', '1,3'], ['2,0', '2,1', '2,3'], ['3,0', '3,1', '3,2']
        ],
        'one-to-many': [
            ['1,0', '2,0', '3,0'], ['0,1', '2,1', '3,1'], ['0,2', '1,2', '3,2'], ['0,3', '1,3', '2,3']
        ],
    }

    benchmark_name = 'ib-traffic'
    benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
    assert benchmark_class
    benchmark = benchmark_class(benchmark_name)

    # Small scale test: compare against the literal expectations for four nodes.
    node_num = 4
    gen_hostlist(hostlist, node_num)
    for pattern in ('one-to-one', 'one-to-many', 'many-to-one'):
        benchmark.gen_traffic_pattern(hostlist, pattern, test_config_file)
        assert read_config(test_config_file) == expected_config[pattern]

    # Large scale test.
    node_num = 1000
    gen_hostlist(hostlist, node_num)

    # 'one-to-many' and 'many-to-one': in step N, node N appears node_num - 1
    # times on one side and every other node appears once on the opposite side.
    for pattern in ('one-to-many', 'many-to-one'):
        benchmark.gen_traffic_pattern(hostlist, pattern, test_config_file)
        config = read_config(test_config_file)
        assert len(config) == node_num
        assert len(config[0]) == node_num - 1
        for step in range(node_num):
            server = defaultdict(int)
            client = defaultdict(int)
            for pair in config[step]:
                ends = pair.split(',')
                server[int(ends[0])] += 1
                client[int(ends[1])] += 1
            # Pick which side holds the repeated node for this pattern.
            if pattern == 'many-to-one':
                repeated_side, single_side = server, client
            else:
                repeated_side, single_side = client, server
            for i in range(node_num):
                if i == step:
                    assert repeated_side[i] == node_num - 1
                else:
                    assert single_side[i] == 1

    # 'one-to-one': each index appears once per step and is paired exactly once
    # with every other index over all steps.
    benchmark.gen_traffic_pattern(hostlist, 'one-to-one', test_config_file)
    config = read_config(test_config_file)
    expected_steps = node_num if node_num % 2 == 1 else node_num - 1
    assert len(config) == expected_steps
    assert len(config[0]) == node_num // 2
    partners = defaultdict(list)
    for step_pairs in config:
        appearances = defaultdict(int)
        for pair in step_pairs:
            ends = pair.split(',')
            first, second = int(ends[0]), int(ends[1])
            appearances[first] += 1
            appearances[second] += 1
            partners[first].append(second)
            partners[second].append(first)
        for index in appearances:
            assert appearances[index] == 1
    for node_index in range(node_num):
        assert sorted(partners[node_index]) == [i for i in range(node_num) if i != node_index]

    # 'topo-aware': compare the generated config with the pre-saved expected one.
    tp_ibstat_path = 'tests/data/ib_traffic_topo_aware_ibstat.txt'
    tp_ibnetdiscover_path = 'tests/data/ib_traffic_topo_aware_ibnetdiscover.txt'
    hostlist = tp_hosts.split()
    expected_topo_config = tp_expected_config.split()
    config = gen_topo_aware_config(hostlist, tp_ibstat_path, tp_ibnetdiscover_path, 2, 6)
    assert config == expected_topo_config

    Path(test_config_file).unlink()
def test_rocm_flops_performance(self):
    """Test gemm-flops benchmark on the ROCm platform.

    Verifies preprocessing, the parsed arguments, the generated rocblas-bench
    command lines, and the metrics parsed from sample outputs.
    """
    benchmark_name = 'gemm-flops'
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
    assert (benchmark_class)

    # Positive case - preprocess is expected to succeed with these GEMM sizes.
    benchmark = benchmark_class(benchmark_name, parameters='--m 7680 --n 8192 --k 8192')

    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Check basic information.
    assert (benchmark.name == 'gemm-flops')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark._bin_name == 'rocblas-bench')

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.m == 7680)
    assert (benchmark._args.n == 8192)
    assert (benchmark._args.k == 8192)

    params = '--transposeA N --transposeB T -m 7680 -n 8192 -k 8192' + \
        ' --alpha 1 --beta 0 --lda 8384 --ldb 8384 --ldc 8384 --ldd 8384'
    # Check command list
    expected_command = [
        'rocblas-bench -r f64_r -f gemm ' + params,
        'rocblas-bench -r f32_r -f gemm_ex --compute_type f32_r ' + params,
        'rocblas-bench -r f16_r -f gemm_ex --compute_type f32_r ' + params,
        'rocblas-bench -r bf16_r -f gemm_ex --compute_type f32_r ' + params,
        'rocblas-bench --a_type i8_r --b_type i8_r --c_type i32_r --d_type i32_r -f gemm_ex --compute_type i32_r ' +
        params
    ]
    for i, expected in enumerate(expected_command):
        # Drop whatever precedes the binary name in the generated command
        # before comparing it with the expectation.
        command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
        assert (command == expected)

    # Check results and metrics.
    # NOTE(review): the sample outputs are kept character-for-character as found in
    # this file (each on one line); confirm the parser does not expect separate lines.
    raw_output_FP64 = """ transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,rocblas-Gflops,us N,T,7680,8192,8192,1,8384,0,8384,8384, 10037.5, 102694 """
    raw_output_FP32_X = """ transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us N,T,8640,8640,8640,1,8640,0,8640,8640,8640,1, 39441.6, 32705.2 """
    raw_output_FP16_X = """ transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us N,T,7680,8192,8192,1,8384,0,8384,8384,8384,1, 153728, 6705.3 """
    raw_output_BF16_X = """ transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us N,T,7680,8192,8192,1,8384,0,8384,8384,8384,1, 81374.3, 12667.3 """
    raw_output_INT8_X = """ transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us T,N,7680,8192,8192,1,8416,0,8416,8416,8416,1, 162675, 6336.5 """
    assert (benchmark._process_raw_result(0, raw_output_FP64))
    assert (benchmark._process_raw_result(1, raw_output_FP32_X))
    assert (benchmark._process_raw_result(2, raw_output_FP16_X))
    assert (benchmark._process_raw_result(3, raw_output_BF16_X))
    assert (benchmark._process_raw_result(4, raw_output_INT8_X))

    assert (benchmark.result['fp64_flops'][0] == 10037.5)
    assert (benchmark.result['fp32_xdlops_flops'][0] == 39441.6)
    assert (benchmark.result['fp16_xdlops_flops'][0] == 153728)
    assert (benchmark.result['bf16_xdlops_flops'][0] == 81374.3)
    assert (benchmark.result['int8_xdlops_iops'][0] == 162675)

    # Negative case - Add invalid raw output.
    assert (benchmark._process_raw_result(4, 'Invalid raw output') is False)
def test_tensorrt_inference_params(self):
    """Check tensorrt-inference preprocessing for several parameter combinations."""
    (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)

    test_cases = [
        {'precision': 'fp32'},
        {'pytorch_models': ['resnet50', 'mnasnet0_5'], 'precision': 'fp16'},
        {'pytorch_models': ['resnet50'], 'batch_size': 4},
        {
            'pytorch_models': ['lstm', 'bert-base', 'gpt2-small'],
            'batch_size': 4,
            'seq_length': 128,
            'iterations': 256,
        },
    ]
    # Options are emitted on the command line in this fixed order.
    option_order = ('pytorch_models', 'precision', 'batch_size', 'seq_length', 'iterations')

    for test_case in test_cases:
        with self.subTest(msg='Testing with case', test_case=test_case):
            cli_options = []
            for option in option_order:
                if option not in test_case:
                    continue
                value = test_case[option]
                if option == 'pytorch_models':
                    # The model list is passed as space-separated names.
                    value = ' '.join(value)
                cli_options.append(f'--{option} {value}')

            # Check basic information
            benchmark = benchmark_cls(self.benchmark_name, parameters=' '.join(cli_options))
            self.assertTrue(benchmark)

            # Limit model number
            benchmark._pytorch_models = benchmark._pytorch_models[:1]

            # Preprocess
            self.assertTrue(benchmark._preprocess())
            self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
            self.assertEqual(BenchmarkType.MICRO, benchmark.type)
            self.assertEqual(self.benchmark_name, benchmark.name)

            # Check parameters: explicit values win, otherwise the defaults apply.
            expected_models = test_case.get('pytorch_models', benchmark._pytorch_models)
            parsed_args = benchmark._args
            self.assertEqual(expected_models, parsed_args.pytorch_models)
            self.assertEqual(test_case.get('precision', 'int8'), parsed_args.precision)
            self.assertEqual(test_case.get('batch_size', 32), parsed_args.batch_size)
            self.assertEqual(test_case.get('iterations', 2048), parsed_args.iterations)

            # Check models
            for model in parsed_args.pytorch_models:
                onnx_file = self._model_path / f'{model}.onnx'
                self.assertTrue(onnx_file.is_file())

            # One command is expected per model.
            self.assertEqual(len(expected_models), len(benchmark._commands))
def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast, reducescatter, alltoall):
    """Check nccl-bw preprocessing, command generation and result parsing.

    Each parameter besides ``self`` is the raw output passed to the parser for
    the operation of the same name.
    """
    benchmark_name = 'nccl-bw'
    (benchmark_class,
     _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
    assert (benchmark_class)

    benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8')

    preprocess_ok = benchmark._preprocess()
    assert (preprocess_ok is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Basic information.
    assert (benchmark)
    assert (benchmark.name == 'nccl-bw')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Parsed arguments: only --ngpus was given, the rest are compared as-is.
    expected_args = {
        'operation': 'allreduce',
        'ngpus': 8,
        'minbytes': '8',
        'maxbytes': '8G',
        'stepfactor': 2,
        'check': 0,
        'iters': 20,
        'warmup_iters': 5,
    }
    for arg_name, arg_value in expected_args.items():
        assert (getattr(benchmark._args, arg_name) == arg_value)

    # Command list.
    bin_names = [
        'all_reduce_perf', 'all_gather_perf', 'broadcast_perf', 'reduce_perf', 'reduce_scatter_perf',
        'alltoall_perf'
    ]
    first_bin = bin_names[0]
    # Keep the binary name and everything after it in the generated command.
    command = first_bin + benchmark._commands[0].split(first_bin)[1]
    expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(first_bin)
    assert (command == expected_command)

    # Results and metrics.
    # Empty raw output must be rejected.
    assert (benchmark._process_raw_result(0, '') is False)

    # Valid raw output for every operation.
    raw_output = {
        'allgather': allgather,
        'allreduce': allreduce,
        'reduce': reduce,
        'broadcast': broadcast,
        'reducescatter': reducescatter,
        'alltoall': alltoall,
    }
    sizes = ['8589934592', '4294967296', '2147483648', '1073741824', '536870912', '32']

    for op, op_output in raw_output.items():
        benchmark._args.operation = op
        assert (benchmark._process_raw_result(0, op_output))

        for name in ['time', 'algbw', 'busbw']:
            for size in sizes:
                metric = '_'.join((op, size, name))
                assert (metric in benchmark.result)
                values = benchmark.result[metric]
                assert (len(values) == 1)
                assert (isinstance(values[0], numbers.Number))

    expected_values = {
        'allreduce_8589934592_time': 63896.0,
        'allreduce_8589934592_algbw': 134.44,
        'allreduce_8589934592_busbw': 235.26,
        'alltoall_8589934592_time': 33508.0,
        'alltoall_8589934592_algbw': 256.36,
        'alltoall_8589934592_busbw': 224.31,
    }
    for metric, value in expected_values.items():
        assert (benchmark.result[metric][0] == value)
def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
    """Check command generation and result parsing of cpu-memory-bw-latency."""
    benchmark_name = 'cpu-memory-bw-latency'
    (benchmark_class,
     _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
    assert (benchmark_class)

    all_mlc_tests = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
    param_str = '--tests %s' % ' '.join(all_mlc_tests)
    benchmark = benchmark_class(benchmark_name, parameters=param_str)

    # Basic information
    assert (benchmark)
    preprocess_ok = benchmark._preprocess()
    assert (preprocess_ok is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == 'cpu-memory-bw-latency')
    assert (benchmark.type == BenchmarkType.MICRO)

    # One command per requested MLC test, in the same order.
    assert (len(all_mlc_tests) == len(benchmark._commands))
    for mlc_test, command in zip(all_mlc_tests, benchmark._commands):
        assert ('mlc --%s;' % mlc_test in command)

    # NOTE(review): the sample outputs are kept character-for-character as found in
    # this file (each on one line); confirm the parser does not expect separate lines.
    bandwidth_matrix_output = """ Intel(R) Memory Latency Checker - v3.9a Command line parameters: --bandwidth_matrix Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes *** Unable to modify prefetchers (try executing 'modprobe msr') *** So, enabling random access for latency measurements Measuring Memory Bandwidths between nodes within system Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec) Using all the threads from each core if Hyper-threading is enabled Using Read-only traffic type Numa node Numa node 0 1 0 82542.2 76679.9 1 76536.0 82986.5 """
    latency_matrix_output = """ Intel(R) Memory Latency Checker - v3.9a Command line parameters: --latency_matrix Using buffer size of 600.000MiB *** Unable to modify prefetchers (try executing 'modprobe msr') *** So, enabling random access for latency measurements Measuring idle latencies (in ns)... Numa node Numa node 0 1 0 87.0 101.0 1 101.9 86.9 """
    max_bandwidth_output = """ Intel(R) Memory Latency Checker - v3.9a Command line parameters: --max_bandwidth Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes *** Unable to modify prefetchers (try executing 'modprobe msr') *** So, enabling random access for latency measurements Measuring Maximum Memory Bandwidths for the system Will take several minutes to complete as multiple injection rates will be tried to get the best bandwidth Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec) Using all the threads from each core if Hyper-threading is enabled Using traffic with the following read-write ratios ALL Reads : 165400.60 3:1 Reads-Writes : 154975.19 2:1 Reads-Writes : 158433.32 1:1 Reads-Writes : 157352.05 Stream-triad like: 157878.32 """

    # Positive cases - (command index, raw output, expected metrics), processed in order.
    positive_cases = [
        (
            0, bandwidth_matrix_output, {
                'mem_bandwidth_matrix_numa_0_0_bw': 82542.2,
                'mem_bandwidth_matrix_numa_0_1_bw': 76679.9,
                'mem_bandwidth_matrix_numa_1_0_bw': 76536.0,
                'mem_bandwidth_matrix_numa_1_1_bw': 82986.5,
            }
        ),
        (
            1, latency_matrix_output, {
                'mem_latency_matrix_numa_0_0_lat': 87.0,
                'mem_latency_matrix_numa_0_1_lat': 101.0,
                'mem_latency_matrix_numa_1_0_lat': 101.9,
                'mem_latency_matrix_numa_1_1_lat': 86.9,
            }
        ),
        (
            2, max_bandwidth_output, {
                'mem_max_bandwidth_all_reads_bw': 165400.60,
                'mem_max_bandwidth_3_1_reads-writes_bw': 154975.19,
                'mem_max_bandwidth_2_1_reads-writes_bw': 158433.32,
                'mem_max_bandwidth_1_1_reads-writes_bw': 157352.05,
                'mem_max_bandwidth_stream-triad_like_bw': 157878.32,
            }
        ),
    ]
    for cmd_idx, test_raw_output, expected_metrics in positive_cases:
        assert (benchmark._process_raw_result(cmd_idx, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        raw_key = 'raw_output_%d' % cmd_idx
        assert (raw_key in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data[raw_key])
        for metric, value in expected_metrics.items():
            assert ([value] == benchmark.result[metric])

    # Negative case - invalid raw output.
    assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
    assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
def test_flops_performance_cuda(self):
    """Check gemm-flops preprocessing and result parsing on the CUDA platform."""
    benchmark_name = 'gemm-flops'
    (benchmark_class,
     _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
    assert (benchmark_class)

    benchmark = benchmark_class(
        benchmark_name,
        parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision fp32 tf32_tc fp16_tc int8_tc'
    )

    preprocess_ok = benchmark._preprocess()
    capability = dm.device_manager.get_device_compute_capability()
    if capability not in benchmark._CudaGemmFlopsBenchmark__kernel_map:
        # Negative case - MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE: nothing else to check.
        assert (preprocess_ok is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
        return

    assert (preprocess_ok is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Basic information.
    assert (benchmark.name == 'gemm-flops')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark._bin_name == 'cutlass_profiler')

    # Parameters specified in BenchmarkContext.
    precisions = ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc']
    assert (benchmark._args.num_warmup == 200)
    assert (benchmark._args.n == 1024)
    assert (benchmark._args.k == 512)
    assert (benchmark._args.m == 2048)
    assert (benchmark._args.precision == precisions)

    # Set the list of precisions to run explicitly (a fresh list, not the one compared above).
    benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = list(precisions)

    # Results and metrics.
    # NOTE(review): the sample outputs are kept character-for-character as found in
    # this file; confirm the parser does not expect one CSV record per line.
    raw_output_fp32 = """ CSV Results: Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs 1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nn_align1,passed,success,universal,16384,16384,16384,f32:column,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.022,6.23672,18287.4 1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nt_align1,passed,success,universal,16384,16384,16384,f32:column,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,478.866,6.2648,18369.7 1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249 1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4 """
    raw_output_tf32_tc = """ CSV Results: Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs 1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nn_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,88.5764,33.8691,99311.2 1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nt_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,70.3503,42.6438,125040 
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676 1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677 """
    raw_output_fp16_tc = """ CSV Results: Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs 1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.1575,43.9142,257531 1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nt_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.6153,43.3334,254126 1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316 1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048 """

    # Feed the samples in command order: fp32, tf32_tc, fp16_tc.
    samples = (raw_output_fp32, raw_output_tf32_tc, raw_output_fp16_tc)
    for cmd_idx, sample in enumerate(samples):
        assert (benchmark._process_raw_result(cmd_idx, sample))

    assert (benchmark.result['fp32_flops'][0] == 18369.7)
    assert (benchmark.result['tf32_tc_flops'][0] == 128677)
    assert (benchmark.result['fp16_tc_flops'][0] == 281048)

    # Negative case - invalid raw output.
    assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)
def test_disk_performance_benchmark_enabled(self, mock_is_block_device):
    """Check disk-benchmark command generation when every sub-benchmark is enabled.

    Each numeric option is given a distinct, increasing "magic" value so the
    test can tell which option ended up in which generated command.
    """
    mock_is_block_device.return_value = True

    benchmark_name = 'disk-benchmark'
    (benchmark_class,
     _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
    assert (benchmark_class)

    block_devices = ['mock_block_device_0', 'mock_block_device_1']
    init_test_magic = 45
    magic = init_test_magic
    per_test_options = ('ramp_time', 'runtime', 'iodepth', 'numjobs')

    # Build the parameter string piece by piece.
    options = ['--block_devices ' + ' '.join(block_devices)]
    # Sequential precondition
    options.append('--enable_seq_precond')
    # Random precondition
    options.append('--rand_precond_time=%d' % magic)
    magic += 1
    # Seq/rand x read/write/readwrite
    for io_pattern in ['seq', 'rand']:
        for io_type in ['read', 'write', 'readwrite']:
            io_str = '%s_%s' % (io_pattern, io_type)
            for option in per_test_options:
                options.append('--%s_%s=%d' % (io_str, option, magic))
                magic += 1
    param_str = ' '.join(options)

    benchmark = benchmark_class(benchmark_name, parameters=param_str)

    # Basic information
    assert (benchmark)
    preprocess_ok = benchmark._preprocess()
    assert (preprocess_ok is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == 'disk-benchmark')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Command list:
    # 2 devices * (2 preconditions + 2 io_patterns * 3 io_types) = 16 commands
    assert (16 == len(benchmark._commands))

    # Parameter assignments, walking the commands in generation order.
    command_idx = 0
    default_rwmixread = 80
    for block_device in block_devices:
        filename_option = '--filename=%s' % block_device
        magic = init_test_magic

        # Sequential precondition
        assert (filename_option in benchmark._commands[command_idx])
        command_idx += 1

        # Random precondition
        command = benchmark._commands[command_idx]
        assert (filename_option in command)
        assert ('--runtime=%d' % magic in command)
        magic += 1
        command_idx += 1

        # Seq/rand x read/write/rw
        for io_pattern in ['seq', 'rand']:
            # Only the 'rand' pattern is prefixed to the expected --rw value.
            rw_prefix = io_pattern if io_pattern == 'rand' else ''
            for io_type in ['read', 'write', 'rw']:
                command = benchmark._commands[command_idx]
                assert (filename_option in command)
                fio_rw = '%s%s' % (rw_prefix, io_type)
                assert ('--rw=%s' % fio_rw in command)
                for option in per_test_options:
                    assert ('--%s=%d' % (option, magic) in command)
                    magic += 1
                if io_type == 'rw':
                    assert ('--rwmixread=%d' % default_rwmixread in command)
                command_idx += 1
def test_rocm_onnxruntime_performance():
    """Check the onnxruntime-ort-models docker benchmark settings and result parsing."""
    benchmark_name = 'onnxruntime-ort-models'
    (benchmark_class,
     _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)

    # Docker settings of the benchmark.
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri == 'superbench/benchmark:rocm4.3.1-onnxruntime1.9.0')
    assert (benchmark._container_name == 'rocm-onnxruntime-model-benchmarks')
    expected_entrypoint = '/stage/onnxruntime-training-examples/huggingface/azureml/run_benchmark.sh'
    assert (benchmark._entrypoint == expected_entrypoint)
    assert (benchmark._cmd is None)

    # Set the result object and parsed arguments by hand before calling the parser.
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    # NOTE(review): the sample output is kept character-for-character as found in
    # this file (on one line); confirm the parser does not expect separate lines.
    raw_output = """ __superbench__ begin bert-large-uncased ngpu=1 "samples_per_second": 21.829 __superbench__ begin bert-large-uncased ngpu=8 "samples_per_second": 147.181 __superbench__ begin distilbert-base-uncased ngpu=1 "samples_per_second": 126.827 __superbench__ begin distilbert-base-uncased ngpu=8 "samples_per_second": 966.796 __superbench__ begin gpt2 ngpu=1 "samples_per_second": 20.46 __superbench__ begin gpt2 ngpu=8 "samples_per_second": 151.089 __superbench__ begin facebook/bart-large ngpu=1 "samples_per_second": 66.171 __superbench__ begin facebook/bart-large ngpu=8 "samples_per_second": 370.343 __superbench__ begin roberta-large ngpu=1 "samples_per_second": 37.103 __superbench__ begin roberta-large ngpu=8 "samples_per_second": 274.455 """
    assert (benchmark._process_raw_result(0, raw_output))

    expected_throughput = {
        'bert_large_uncased_ngpu_1_throughput': 21.829,
        'bert_large_uncased_ngpu_8_throughput': 147.181,
        'distilbert_base_uncased_ngpu_1_throughput': 126.827,
        'distilbert_base_uncased_ngpu_8_throughput': 966.796,
        'gpt2_ngpu_1_throughput': 20.46,
        'gpt2_ngpu_8_throughput': 151.089,
        'facebook_bart_large_ngpu_1_throughput': 66.171,
        'facebook_bart_large_ngpu_8_throughput': 370.343,
        'roberta_large_ngpu_1_throughput': 37.103,
        'roberta_large_ngpu_8_throughput': 274.455,
    }
    for metric, value in expected_throughput.items():
        assert (benchmark.result[metric][0] == value)