Example no. 1
def test_fambench():
    """Test FAMBench benchmarks."""
    benchmark_name = 'fambench'
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri == 'superbench/benchmark:cuda11.1.1-fambench')
    assert (benchmark._container_name == 'fambench-benchmarks')
    assert (benchmark._entrypoint == '/workspace/FAMBench/benchmarks/run_all_benchmarks.sh')
    assert (benchmark._cmd is None)
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    raw_output = """
benchmark implementation mode config score units batch_latency_95_sec
DLRM OOTB eval tiny 152.800399 ex/s 0.515052
DLRM OOTB train tiny 35.483686 ex/s None
DLRM UBENCH train linear_[(2,2,2,2,2)] 3.679281e-07 TF/s None
XLMR OOTB eval default-config 1.015586 ex/s 16.463461
"""
    assert (benchmark._process_raw_result(0, raw_output))
    assert (benchmark.result['dlrm_ootb_eval_tiny_ex_s'][0] == 152.800399)
    assert (benchmark.result['dlrm_ootb_train_tiny_ex_s'][0] == 35.483686)
    assert (benchmark.result['dlrm_ubench_train_linear_[(2,2,2,2,2)]_tf_s'][0] == 3.679281e-07)
    assert (benchmark.result['xlmr_ootb_eval_default_config_ex_s'][0] == 1.015586)
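These excerpts omit their module-level imports, fixtures, and mock decorators. A minimal sketch of the imports they appear to rely on follows; the SuperBench module paths are an assumption here and should be checked against the actual package, and benchmark-specific helpers (e.g. gen_hostlist, gen_topo_aware_config, the device manager) are not shown.

import numbers
import os
import shutil
from collections import defaultdict
from pathlib import Path
from types import SimpleNamespace

import torch
import torchvision

# Assumed SuperBench import paths (not part of the original excerpts):
from superbench.benchmarks import (
    BenchmarkRegistry, BenchmarkType, Framework, Platform, Precision, ReturnCode
)
from superbench.benchmarks.result import BenchmarkResult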
    def test_tensorrt_inference_result_parsing(self, test_raw_log):
        """Test tensorrt-inference benchmark result parsing."""
        (benchmark_cls,
         _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             self.benchmark_name, Platform.CUDA)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(
            pytorch_models=['model_0', 'model_1'], log_raw_data=False)
        benchmark._result = BenchmarkResult(self.benchmark_name,
                                            BenchmarkType.MICRO,
                                            ReturnCode.SUCCESS,
                                            run_count=1)

        # Positive case - valid raw output
        self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

        self.assertEqual(6 + benchmark.default_metric_count,
                         len(benchmark.result))
        for tag in ['mean', '99']:
            self.assertEqual(0.5,
                             benchmark.result[f'model_0_gpu_time_{tag}'][0])
            self.assertEqual(0.6,
                             benchmark.result[f'model_0_host_time_{tag}'][0])
            self.assertEqual(
                1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])

        # Negative case - invalid raw output
        self.assertFalse(benchmark._process_raw_result(1,
                                                       'Invalid raw output'))
Example no. 3
    def _test_gpu_copy_bw_performance_result_parsing(self, platform, test_raw_output):
        """Test gpu-copy benchmark result parsing."""
        benchmark_name = 'gpu-copy-bw'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name, parameters='')
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'gpu-copy-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Positive case - valid raw output.
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        assert (1 == len(benchmark.raw_data))
        print(test_raw_output.splitlines())
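        # Build a {metric_name: value} dict from each 'name value' line of the raw output for comparison below.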
        test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
        assert (len(test_raw_output_dict) + benchmark.default_metric_count == len(benchmark.result))
        for output_key in benchmark.result:
            if output_key == 'return_code':
                assert (benchmark.result[output_key] == [0])
            else:
                assert (len(benchmark.result[output_key]) == 1)
                assert (isinstance(benchmark.result[output_key][0], numbers.Number))
                assert (output_key.strip('_bw') in test_raw_output_dict)
                assert (test_raw_output_dict[output_key.strip('_bw')] == benchmark.result[output_key][0])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
Example no. 4
    def test_disk_performance_benchmark_disabled(self, mock_is_block_device):
        """Test disk-performance benchmark command generation with all benchmarks disabled."""
        mock_is_block_device.return_value = True

        benchmark_name = 'disk-benchmark'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)

        block_devices = ['/dev/nvme0n1', '/dev/nvme1n1']
        block_device_option = '--block_devices ' + ' '.join(block_devices)

        param_str = block_device_option
        param_str += ' --rand_precond_time=0'
        param_str += ' --seq_read_runtime=0'
        param_str += ' --rand_read_runtime=0'
        benchmark = benchmark_class(benchmark_name, parameters=param_str)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'disk-benchmark')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Command list should be empty
        assert (0 == len(benchmark._commands))
    def test_tensorrt_inference_cls(self):
        """Test tensorrt-inference benchmark class."""
        for platform in Platform:
            (benchmark_cls,
             _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
                 self.benchmark_name, platform)
            if platform is Platform.CUDA:
                self.assertIsNotNone(benchmark_cls)
            else:
                self.assertIsNone(benchmark_cls)
def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
    """Test ort-inference benchmark."""
    benchmark_name = 'ort-inference'
    (benchmark_class, predefine_params
     ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
         benchmark_name, Platform.CUDA)
    assert (benchmark_class)

    mock_get_dir.return_value = '/tmp/superbench/'
    benchmark = benchmark_class(
        benchmark_name,
        parameters=
        '--pytorch_models resnet50 --graph_opt_level 1 --precision float16'
        ' --batch_size 16 --num_warmup 128 --num_steps 512')

    assert (isinstance(benchmark, ORTInferenceBenchmark))
    assert (benchmark._preprocess())

    # Check basic information.
    assert (benchmark.name == 'ort-inference')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark._ORTInferenceBenchmark__model_cache_path == Path(
        torch.hub.get_dir()) / 'checkpoints')
    for model in benchmark._args.pytorch_models:
        assert (hasattr(torchvision.models, model))
        file_name = '{model}.{precision}.onnx'.format(
            model=model, precision=benchmark._args.precision)
        assert ((benchmark._ORTInferenceBenchmark__model_cache_path /
                 file_name).is_file())

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.pytorch_models == ['resnet50'])
    assert (benchmark._args.graph_opt_level == 1)
    assert (benchmark._args.precision == Precision.FLOAT16)
    assert (benchmark._args.batch_size == 16)
    assert (benchmark._args.num_warmup == 128)
    assert (benchmark._args.num_steps == 512)

    # Check results and metrics.
    assert (benchmark._benchmark())
    shutil.rmtree(benchmark._ORTInferenceBenchmark__model_cache_path)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
    for model in benchmark._args.pytorch_models:
        if benchmark._args.precision.value in precision_metric:
            precision = precision_metric[benchmark._args.precision.value]
        else:
            precision = benchmark._args.precision.value
        metric = '{}_{}_time'.format(precision, model)
        assert (metric in benchmark.result)
        assert (metric in benchmark.raw_data)
Example no. 7
    def test_gpcnet_network_test(self, raw_output, raw_output_no_execution):
        """Test gpcnet-network-test benchmark."""
        # Check registry.
        benchmark_name = 'gpcnet-network-test'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        # Check preprocess
        benchmark = benchmark_class(benchmark_name)
        ret = benchmark._preprocess()
        assert (ret)

        expect_command = 'network_test'
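        # Reconstruct the command with the absolute bin directory prefix stripped, so it can be compared literally.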
        command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
        assert (command == expect_command)

        assert (benchmark._process_raw_result(0, raw_output_no_execution))
        assert (len(benchmark.result) == benchmark.default_metric_count)

        # Check function process_raw_data.
        # Positive case - valid raw output.
        assert (benchmark._process_raw_result(0, raw_output))
        metric_list = [
            'rr_two-sided_lat',
            'rr_get_lat',
            'rr_two-sided_bw',
            'rr_put_bw',
            'rr_two-sided+sync_bw',
            'nat_two-sided_bw',
            'multiple_allreduce_time',
            'multiple_alltoall_bw',
        ]
        for metric_medium in metric_list:
            for suffix in ['avg', '99%']:
                metric = metric_medium + '_' + suffix
                assert (metric in benchmark.result)
                assert (len(benchmark.result[metric]) == 1)
                assert (isinstance(benchmark.result[metric][0], numbers.Number))

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(0, 'ERROR') is False)

        # Check basic information.
        assert (benchmark.name == 'gpcnet-network-test')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'network_test')
Example no. 8
    def _test_gpu_copy_bw_performance_command_generation(self, platform):
        """Test gpu-copy benchmark command generation."""
        benchmark_name = 'gpu-copy-bw'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)

        size = 1048576
        num_warm_up = 20
        num_loops = 10000
        mem_types = ['htod', 'dtoh', 'dtod']
        copy_types = ['sm', 'dma']

        parameters = '--mem_type %s --copy_type %s --size %d ' \
            '--num_warm_up %d --num_loops %d --bidirectional --check_data' % \
            (' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.mem_type == mem_types)
        assert (benchmark._args.copy_type == copy_types)
        assert (benchmark._args.size == size)
        assert (benchmark._args.num_warm_up == num_warm_up)
        assert (benchmark._args.num_loops == num_loops)
        assert (benchmark._args.bidirectional)
        assert (benchmark._args.check_data)

        # Check command
        assert (1 == len(benchmark._commands))
        assert (benchmark._commands[0].startswith(benchmark._GpuCopyBwBenchmark__bin_path))
        for mem_type in mem_types:
            assert ('--%s' % mem_type in benchmark._commands[0])
        for copy_type in copy_types:
            assert ('--%s_copy' % copy_type in benchmark._commands[0])
        assert ('--size %d' % size in benchmark._commands[0])
        assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
        assert ('--num_loops %d' % num_loops in benchmark._commands[0])
        assert ('--bidirectional' in benchmark._commands[0])
        assert ('--check_data' in benchmark._commands[0])
Example no. 9
    def test_gpu_burn(self, results):
        """Test gpu-burn benchmark command generation."""
        benchmark_name = 'gpu-burn'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        time = 10

        parameters = '--doubles --tensor_core --time ' + str(time)
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.time == time)
        assert (benchmark._args.doubles)
        assert (benchmark._args.tensor_core)

        # Check command
        compare_copy = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./'
        compare_rm = 'rm ' + 'compare.ptx'
        assert (1 == len(benchmark._commands))
        assert (benchmark._commands[0].startswith(compare_copy))
        assert ('-d' in benchmark._commands[0])
        assert ('-tc' in benchmark._commands[0])
        assert (str(time) in benchmark._commands[0])
        assert (compare_rm in benchmark._commands[0])

        # Check results
        assert (benchmark._process_raw_result(0, results))
        assert (benchmark.result['return_code'][0] == 0)
        assert (benchmark.result['time'][0] == time)
        for device in range(8):
            assert (benchmark.result['gpu_' + str(device) + '_pass'][0] == 1)
        assert (benchmark.result['abort'][0] == 0)
Example no. 10
    def test_disk_performance_empty_param(self):
        """Test disk-performance benchmark command generation with empty parameter."""
        benchmark_name = 'disk-benchmark'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name, parameters='')

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'disk-benchmark')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Command list should be empty
        assert (0 == len(benchmark._commands))
    def test_cpu_mem_bw_latency_benchmark_empty_param(self):
        """Test cpu-memory-bw-latency benchmark command generation with empty parameter."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        default_mlc_test = 'bandwidth_matrix'
        benchmark = benchmark_class(benchmark_name, parameters='')

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (1 == len(benchmark._commands))
        assert ('mlc --%s;' % default_mlc_test in benchmark._commands[0])
Example no. 12
def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model',
        FakeModelBenchmark,
        parameters='--hidden_size 2',
        platform=Platform.CUDA,
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model',
        platform=Platform.CUDA,
        parameters=params,
        framework=Framework.PYTORCH)
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    (benchmark_class, predefine_params
     ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
         name, context.platform)
    assert (benchmark_class)
    return benchmark_class(name, predefine_params + ' ' + context.parameters)
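A brief usage sketch for this helper (hypothetical; it assumes FakeModelBenchmark accepts these arguments and parses them during _preprocess()):

benchmark = create_benchmark('--num_steps 4')
assert (benchmark._preprocess())
assert (benchmark._args.num_steps == 4)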
Example no. 13
    def test_cuda_memory_bw_performance(self, raw_output_h2d, raw_output_d2h,
                                        raw_output_d2d):
        """Test cuda mem-bw benchmark."""
        benchmark_name = 'mem-bw'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name,
                                    parameters='--shmoo_mode --memory=pinned')

        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark)
        assert (benchmark.name == 'mem-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check command list
        expected_command = [
            'bandwidthTest --htod mode=shmoo memory=pinned --csv',
            'bandwidthTest --dtoh mode=shmoo memory=pinned --csv',
            'bandwidthTest --dtod mode=shmoo memory=pinned --csv'
        ]
        for i in range(len(expected_command)):
            command = benchmark._bin_name + benchmark._commands[i].split(
                benchmark._bin_name)[1]
            assert (command == expected_command[i])

        # Check results and metrics.
        raw_output = [raw_output_h2d, raw_output_d2h, raw_output_d2d]
        for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
            assert (benchmark._process_raw_result(i, raw_output[i]))
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))
Example no. 14
    def test_disk_performance_invalid_block_device(self, mock_is_block_device):
        """Test disk-performance benchmark command generation with invalid block device."""
        mock_is_block_device.return_value = False

        benchmark_name = 'disk-benchmark'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)

        block_devices = ['mock_block_device_0']
        block_device_option = '--block_devices ' + ' '.join(block_devices)

        benchmark = benchmark_class(benchmark_name,
                                    parameters=block_device_option)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)
        assert (benchmark.name == 'disk-benchmark')
        assert (benchmark.type == BenchmarkType.MICRO)
Example no. 15
    def test_rocm_memory_bw_performance(self, raw_output_h2d, raw_output_d2h):
        """Test rocm mem-bw benchmark."""
        benchmark_name = 'mem-bw'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.ROCM)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name)

        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark)
        assert (benchmark.name == 'mem-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check command list
        expected_command = ['hipBusBandwidth --h2d', 'hipBusBandwidth --d2h']
        for i in range(len(expected_command)):
            command = benchmark._bin_name + benchmark._commands[i].split(
                benchmark._bin_name)[1]
            assert (command == expected_command[i])

        # Check results and metrics.
        raw_output = [raw_output_h2d, raw_output_d2h]
        for i, metric in enumerate(['h2d_bw', 'd2h_bw']):
            assert (benchmark._process_raw_result(i, raw_output[i]))
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        assert (benchmark.result['h2d_bw'][0] == 25.2351)
        assert (benchmark.result['d2h_bw'][0] == 27.9348)
Example no. 16
    def test_ib_traffic_performance(self, mock_gpu):
        """Test ib-traffic benchmark."""
        # Test without ib devices
        # Check registry.
        benchmark_name = 'ib-traffic'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)

        # Check preprocess
        # Negative cases
        parameters = '--ib_dev 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        ret = benchmark._preprocess()
        assert (ret is False)
        # no hostfile
        assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

        hosts = ['node0\n', 'node1\n', 'node2\n', 'node3\n']
        with open('hostfile', 'w') as f:
            f.writelines(hosts)

        parameters = '--ib_dev 0 --msg_size invalid --pattern one-to-one --hostfile hostfile'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

        # Positive cases
        os.environ['OMPI_COMM_WORLD_SIZE'] = '3'
        parameters = '--ib_dev 0 --iters 2000 --pattern one-to-one --hostfile hostfile'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        ret = benchmark._preprocess()
        assert (ret is True)

        # Generate config
        parameters = '--ib_dev mlx5_0 --iters 2000 --msg_size 33554432 --hostfile hostfile'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        os.environ['OMPI_COMM_WORLD_SIZE'] = '4'
        ret = benchmark._preprocess()
        Path('config.txt').unlink()
        assert (ret)
        expect_command = "ib_validation --cmd_prefix '" + benchmark._args.bin_dir + \
            "/ib_write_bw -F -n 2000 -d mlx5_0 -s 33554432 --report_gbits' " + \
            f'--timeout 120 --hostfile hostfile --input_config {os.getcwd()}/config.txt'
        command = benchmark._bin_name + benchmark._commands[0].split(
            benchmark._bin_name)[1]
        assert (command == expect_command)

        parameters = '--ib_dev mlx5_0 --msg_size 0 --iters 2000 --pattern one-to-one --hostfile hostfile --gpu_dev 0'
        mock_gpu.return_value = 'nvidia'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        ret = benchmark._preprocess()
        expect_command = "ib_validation --cmd_prefix '" + benchmark._args.bin_dir + \
            "/ib_write_bw -F -n 2000 -d mlx5_0 -a --use_cuda=0 --report_gbits' " + \
            f'--timeout 120 --hostfile hostfile --input_config {os.getcwd()}/config.txt'
        command = benchmark._bin_name + benchmark._commands[0].split(
            benchmark._bin_name)[1]
        assert (command == expect_command)
        mock_gpu.return_value = 'amd'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        ret = benchmark._preprocess()
        expect_command = expect_command.replace('cuda', 'rocm')
        command = benchmark._bin_name + benchmark._commands[0].split(
            benchmark._bin_name)[1]
        assert (command == expect_command)

        # Custom config
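        # Each config line is one run step; ';' separates the concurrent node-index pairs within that step.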
        config = ['0,1', '1,0;0,1', '0,1;1,0', '1,0;0,1']
        with open('test_config.txt', 'w') as f:
            for line in config:
                f.write(line + '\n')
        parameters = '--ib_dev mlx5_0 --timeout 180 --iters 2000 --msg_size 33554432 ' + \
            '--config test_config.txt --hostfile hostfile'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        os.environ['OMPI_COMM_WORLD_SIZE'] = '2'
        ret = benchmark._preprocess()
        Path('test_config.txt').unlink()
        assert (ret)
        expect_command = "ib_validation --cmd_prefix '" + benchmark._args.bin_dir + \
            "/ib_write_bw -F -n 2000 -d mlx5_0 -s 33554432 --report_gbits' " + \
            '--timeout 180 --hostfile hostfile --input_config test_config.txt'

        command = benchmark._bin_name + benchmark._commands[0].split(
            benchmark._bin_name)[1]
        assert (command == expect_command)
        # Suppose a GPU driver mismatch or another traffic issue causes -1 results
        raw_output_0 = """
The prefix of cmd to run is: ib_write_bw -a -d ibP257p0s0
Load the config file from: config.txt
Output will be saved to:
config:
0,1
1,0;0,1
0,1;1,0
1,0;0,1
config end
results from rank ROOT_RANK:
-1,
-1,-1
-1,-1
-1,-1
"""
        raw_output_1 = """
The prefix of cmd to run is: ib_write_bw -a -d ibP257p0s0
Load the config file from: config.txt
Output will be saved to:
config:
0,1
1,0;0,1
0,1;1,0
1,0;0,1
config end
results from rank ROOT_RANK:
23452.6,
22212.6,22433
22798.8,23436.3
23435.3,22766.5
"""
        raw_output_2 = """
The prefix of cmd to run is: ib_write_bw -F -n 2000 -d mlx5_0 -s 33554432
Load the config file from: config.txt
Output will be saved to:
config:
0,1
1,0;0,1
0,1;1,0
1,0;0,1
config end
results from rank ROOT_RANK:
23452.6,
22212.6,22433,
22798.8,23436.3,
"""
        raw_output_3 = """
--------------------------------------------------------------------------
mpirun was unable to launch the specified application as it could not access
or execute an executable:

while attempting to start process rank 0.
--------------------------------------------------------------------------
2 total processes failed to start
"""

        # Check function process_raw_data.
        # Positive cases - valid raw output.
        os.environ['OMPI_COMM_WORLD_RANK'] = '0'
        assert (benchmark._process_raw_result(0, raw_output_0))
        for metric in benchmark.result:
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))
        values = list(benchmark.result.values())[1:]
        assert (all(value == [-1.0] for value in values))

        assert (benchmark._process_raw_result(0, raw_output_1))
        for index, metric in enumerate(benchmark.result):
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1
                    if index == 0 else len(benchmark.result[metric]) == 2)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        # Negative cases - invalid raw output.
        assert (benchmark._process_raw_result(0, raw_output_2) is False)
        assert (benchmark._process_raw_result(0, raw_output_3) is False)
        os.environ.pop('OMPI_COMM_WORLD_RANK')

        # Check basic information.
        assert (benchmark.name == 'ib-traffic')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'ib_validation')

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.ib_dev == 'mlx5_0')
        assert (benchmark._args.iters == 2000)
        assert (benchmark._args.msg_size == 33554432)
        assert (benchmark._args.command == 'ib_write_bw')
Example no. 17
    def test_disk_performance_result_parsing(self, test_raw_output):
        """Test disk-performance benchmark result parsing."""
        benchmark_name = 'disk-benchmark'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name, parameters='')
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'disk-benchmark')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Positive case - valid raw output.
        jobname_prefix = 'nvme0n1_rand_read_write'
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # bs + <read, write> x <iops, 95th, 99th, 99.9th>
        assert (9 + benchmark.default_metric_count == len(
            benchmark.result.keys()))

        assert (1 == len(benchmark.result[jobname_prefix + '_bs']))
        assert (4096 == benchmark.result[jobname_prefix + '_bs'][0])

        assert (1 == len(benchmark.result[jobname_prefix + '_read_iops']))
        assert (85138.890741 == benchmark.result[jobname_prefix +
                                                 '_read_iops'][0])
        assert (1 == len(benchmark.result[jobname_prefix + '_write_iops']))
        assert (85066.128925 == benchmark.result[jobname_prefix +
                                                 '_write_iops'][0])

        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_read_lat_ns_95.0']))
        assert (1941504 == benchmark.result[jobname_prefix +
                                            '_read_lat_ns_95.0'][0])
        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_read_lat_ns_99.0']))
        assert (2244608 == benchmark.result[jobname_prefix +
                                            '_read_lat_ns_99.0'][0])
        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_read_lat_ns_99.9']))
        assert (3620864 == benchmark.result[jobname_prefix +
                                            '_read_lat_ns_99.9'][0])

        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_write_lat_ns_95.0']))
        assert (1908736 == benchmark.result[jobname_prefix +
                                            '_write_lat_ns_95.0'][0])
        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_write_lat_ns_99.0']))
        assert (2072576 == benchmark.result[jobname_prefix +
                                            '_write_lat_ns_99.0'][0])
        assert (1 == len(benchmark.result[jobname_prefix +
                                          '_write_lat_ns_99.9']))
        assert (2605056 == benchmark.result[jobname_prefix +
                                            '_write_lat_ns_99.9'][0])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(1, 'Invalid raw output') is
                False)
        assert (benchmark.return_code ==
                ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
Example no. 18
    def test_ib_loopback_all_sizes(self, raw_output, mock_ib_devices,
                                   mock_numa_cores):
        """Test ib-loopback benchmark for all sizes."""
        # Test without ib devices
        # Check registry.
        benchmark_name = 'ib-loopback'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)

        # Check preprocess
        # Negative case
        parameters = '--ib_index 0 --numa 0 --iters 2000'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        mock_ib_devices.return_value = None
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code ==
                ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
        parameters = '--ib_index 0 --numa 0 --iters 2000'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        mock_numa_cores.return_value = None
        ret = benchmark._preprocess()
        assert (ret is False)
        assert (benchmark.return_code ==
                ReturnCode.MICROBENCHMARK_DEVICE_GETTING_FAILURE)
        # Positive case
        parameters = '--ib_index 0 --numa 0 --iters 2000'
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        mock_ib_devices.return_value = ['mlx5_0']
        mock_numa_cores.return_value = [0, 1, 2, 3]
        os.environ['PROC_RANK'] = '0'
        os.environ['IB_DEVICES'] = '0,2,4,6'
        os.environ['NUMA_NODES'] = '1,0,3,2'
        ret = benchmark._preprocess()
        assert (ret)

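        # Read back the port of the socket the benchmark reserved, so the expected command string can include it.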
        port = benchmark._IBLoopbackBenchmark__sock_fds[-1].getsockname()[1]
        expect_command = 'run_perftest_loopback 3 1 ' + benchmark._args.bin_dir + \
            f'/ib_write_bw -a -F --iters=2000 -d mlx5_0 -p {port} -x 0 --report_gbits'
        command = benchmark._bin_name + benchmark._commands[0].split(
            benchmark._bin_name)[1]
        assert (command == expect_command)

        assert (benchmark._process_raw_result(0, raw_output))

        # Check function process_raw_data.
        # Positive case - valid raw output.
        metric_list = []
        for ib_command in benchmark._args.commands:
            for size in ['8388608', '4194304', '1024', '2']:
                metric = 'ib_{}_bw_{}:{}'.format(ib_command, size,
                                                 str(benchmark._args.ib_index))
                metric_list.append(metric)
        for metric in metric_list:
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(0, 'Invalid raw output') is
                False)

        # Check basic information.
        assert (benchmark.name == 'ib-loopback')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'run_perftest_loopback')

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.ib_index == 0)
        assert (benchmark._args.numa == 1)
        assert (benchmark._args.iters == 2000)
        assert (benchmark._args.commands == ['write'])
Example no. 19
    def test_generate_config(self, tp_hosts, tp_expected_config):  # noqa: C901
        """Test util functions ."""
        test_config_file = 'test_gen_config.txt'
        hostlist = []

        def read_config(filename):
            config = []
            with open(filename, 'r') as f:
                lines = f.readlines()
                for line in lines:
                    pairs = line.strip().split(';')
                    config.append(pairs)
            return config

        expected_config = {}
        expected_config['one-to-one'] = [['0,3', '1,2'], ['0,1', '2,3'],
                                         ['0,2', '3,1']]
        expected_config['many-to-one'] = [['0,1', '0,2', '0,3'],
                                          ['1,0', '1,2', '1,3'],
                                          ['2,0', '2,1', '2,3'],
                                          ['3,0', '3,1', '3,2']]
        expected_config['one-to-many'] = [['1,0', '2,0', '3,0'],
                                          ['0,1', '2,1', '3,1'],
                                          ['0,2', '1,2', '3,2'],
                                          ['0,3', '1,3', '2,3']]
        benchmark_name = 'ib-traffic'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CPU)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name)
        # Small scale test
        node_num = 4
        gen_hostlist(hostlist, node_num)
        for m in ['one-to-one', 'one-to-many', 'many-to-one']:
            benchmark.gen_traffic_pattern(hostlist, m, test_config_file)
            config = read_config(test_config_file)
            assert (config == expected_config[m])
        # Large scale test
        node_num = 1000
        gen_hostlist(hostlist, node_num)
        # check for 'one-to-many' and 'many-to-one'
        # In step N, node N appears (node_num - 1) times; every other node appears exactly once
        for m in ['one-to-many', 'many-to-one']:
            benchmark.gen_traffic_pattern(hostlist, m, test_config_file)
            config = read_config(test_config_file)
            assert (len(config) == node_num)
            assert (len(config[0]) == node_num - 1)
            for step in range(node_num):
                server = defaultdict(int)
                client = defaultdict(int)
                for pair in config[step]:
                    pair = pair.split(',')
                    server[int(pair[0])] += 1
                    client[int(pair[1])] += 1
                for i in range(node_num):
                    if m == 'many-to-one':
                        if i == step:
                            assert (server[i] == node_num - 1)
                        else:
                            assert (client[i] == 1)
                    elif m == 'one-to-many':
                        if i == step:
                            assert (client[i] == node_num - 1)
                        else:
                            assert (server[i] == 1)
        # check for 'one-to-one'
        # Each index appears exactly once in each step
        # Each index is paired exactly once with every other index
        benchmark.gen_traffic_pattern(hostlist, 'one-to-one', test_config_file)
        config = read_config(test_config_file)
        if node_num % 2 == 1:
            assert (len(config) == node_num)
            assert (len(config[0]) == node_num // 2)
        else:
            assert (len(config) == node_num - 1)
            assert (len(config[0]) == node_num // 2)
        test_pairs = defaultdict(list)
        for step in range(len(config)):
            node = defaultdict(int)
            for pair in config[step]:
                pair = pair.split(',')
                node[int(pair[0])] += 1
                node[int(pair[1])] += 1
                test_pairs[int(pair[0])].append(int(pair[1]))
                test_pairs[int(pair[1])].append(int(pair[0]))
            for index in node:
                assert (node[index] == 1)
        for node in range(node_num):
            assert (sorted(test_pairs[node]) == [(i) for i in range(node_num)
                                                 if i != node])

        # check for 'topo-aware'
        # compare generated config file with pre-saved expected config file
        tp_ibstat_path = 'tests/data/ib_traffic_topo_aware_ibstat.txt'
        tp_ibnetdiscover_path = 'tests/data/ib_traffic_topo_aware_ibnetdiscover.txt'
        hostlist = tp_hosts.split()
        expected_config = tp_expected_config.split()
        config = gen_topo_aware_config(hostlist, tp_ibstat_path,
                                       tp_ibnetdiscover_path, 2, 6)
        assert (config == expected_config)

        Path(test_config_file).unlink()
    def test_rocm_flops_performance(self):
        """Test gemm-flops benchmark."""
        benchmark_name = 'gemm-flops'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
        assert (benchmark_class)

        # Check preprocess.
        benchmark = benchmark_class(benchmark_name, parameters='--m 7680 --n 8192 --k 8192')

        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark.name == 'gemm-flops')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'rocblas-bench')

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.m == 7680)
        assert (benchmark._args.n == 8192)
        assert (benchmark._args.k == 8192)

        params = '--transposeA N --transposeB T -m 7680 -n 8192 -k 8192' + \
            ' --alpha 1 --beta 0 --lda 8384 --ldb 8384 --ldc 8384 --ldd 8384'
        # Check command list
        expected_command = [
            'rocblas-bench -r f64_r -f gemm ' + params,
            'rocblas-bench -r f32_r -f gemm_ex --compute_type f32_r ' + params,
            'rocblas-bench -r f16_r -f gemm_ex --compute_type f32_r ' + params,
            'rocblas-bench -r bf16_r -f gemm_ex --compute_type f32_r ' + params,
            'rocblas-bench --a_type i8_r --b_type i8_r --c_type i32_r --d_type i32_r -f gemm_ex --compute_type i32_r ' +
            params
        ]
        for i in range(len(expected_command)):
            command = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
            print(benchmark._commands)
            assert (command == expected_command[i])

        # Check results and metrics.
        raw_output_FP64 = """
transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,rocblas-Gflops,us
N,T,7680,8192,8192,1,8384,0,8384,8384, 10037.5, 102694
"""
        raw_output_FP32_X = """
transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us
N,T,8640,8640,8640,1,8640,0,8640,8640,8640,1, 39441.6, 32705.2
"""
        raw_output_FP16_X = """
transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us
N,T,7680,8192,8192,1,8384,0,8384,8384,8384,1, 153728, 6705.3
"""
        raw_output_BF16_X = """
transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us
N,T,7680,8192,8192,1,8384,0,8384,8384,8384,1, 81374.3, 12667.3
"""
        raw_output_INT8_X = """
transA,transB,M,N,K,alpha,lda,beta,ldb,ldc,ldd,batch_count,rocblas-Gflops,us
T,N,7680,8192,8192,1,8416,0,8416,8416,8416,1, 162675, 6336.5
"""
        assert (benchmark._process_raw_result(0, raw_output_FP64))
        assert (benchmark._process_raw_result(1, raw_output_FP32_X))
        assert (benchmark._process_raw_result(2, raw_output_FP16_X))
        assert (benchmark._process_raw_result(3, raw_output_BF16_X))
        assert (benchmark._process_raw_result(4, raw_output_INT8_X))

        assert (benchmark.result['fp64_flops'][0] == 10037.5)
        assert (benchmark.result['fp32_xdlops_flops'][0] == 39441.6)
        assert (benchmark.result['fp16_xdlops_flops'][0] == 153728)
        assert (benchmark.result['bf16_xdlops_flops'][0] == 81374.3)
        assert (benchmark.result['int8_xdlops_iops'][0] == 162675)

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(4, 'Invalid raw output') is False)
    def test_tensorrt_inference_params(self):
        """Test tensorrt-inference benchmark preprocess with different parameters."""
        (benchmark_cls,
         _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             self.benchmark_name, Platform.CUDA)

        test_cases = [
            {
                'precision': 'fp32',
            },
            {
                'pytorch_models': ['resnet50', 'mnasnet0_5'],
                'precision': 'fp16',
            },
            {
                'pytorch_models': ['resnet50'],
                'batch_size': 4,
            },
            {
                'pytorch_models': ['lstm', 'bert-base', 'gpt2-small'],
                'batch_size': 4,
                'seq_length': 128,
                'iterations': 256,
            },
        ]
        for test_case in test_cases:
            with self.subTest(msg='Testing with case', test_case=test_case):
                parameter_list = []
                if 'pytorch_models' in test_case:
                    parameter_list.append(
                        f'--pytorch_models {" ".join(test_case["pytorch_models"])}'
                    )
                if 'precision' in test_case:
                    parameter_list.append(
                        f'--precision {test_case["precision"]}')
                if 'batch_size' in test_case:
                    parameter_list.append(
                        f'--batch_size {test_case["batch_size"]}')
                if 'seq_length' in test_case:
                    parameter_list.append(
                        f'--seq_length {test_case["seq_length"]}')
                if 'iterations' in test_case:
                    parameter_list.append(
                        f'--iterations {test_case["iterations"]}')

                # Check basic information
                benchmark = benchmark_cls(self.benchmark_name,
                                          parameters=' '.join(parameter_list))
                self.assertTrue(benchmark)

                # Limit model number
                benchmark._pytorch_models = benchmark._pytorch_models[:1]

                # Preprocess
                ret = benchmark._preprocess()
                self.assertTrue(ret)
                self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
                self.assertEqual(BenchmarkType.MICRO, benchmark.type)
                self.assertEqual(self.benchmark_name, benchmark.name)

                # Check parameters
                self.assertEqual(
                    test_case.get('pytorch_models', benchmark._pytorch_models),
                    benchmark._args.pytorch_models,
                )
                self.assertEqual(
                    test_case.get('precision', 'int8'),
                    benchmark._args.precision,
                )
                self.assertEqual(
                    test_case.get('batch_size', 32),
                    benchmark._args.batch_size,
                )
                self.assertEqual(
                    test_case.get('iterations', 2048),
                    benchmark._args.iterations,
                )

                # Check models
                for model in benchmark._args.pytorch_models:
                    self.assertTrue(
                        (self._model_path / f'{model}.onnx').is_file())

                # Command list length should equal the number of models
                self.assertEqual(
                    len(
                        test_case.get('pytorch_models',
                                      benchmark._pytorch_models)),
                    len(benchmark._commands))
    def test_nccl_bw_performance(self, allgather, allreduce, reduce, broadcast,
                                 reducescatter, alltoall):
        """Test nccl-bw benchmark."""
        benchmark_name = 'nccl-bw'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name, parameters='--ngpus 8')

        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark)
        assert (benchmark.name == 'nccl-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.operation == 'allreduce')
        assert (benchmark._args.ngpus == 8)
        assert (benchmark._args.minbytes == '8')
        assert (benchmark._args.maxbytes == '8G')
        assert (benchmark._args.stepfactor == 2)
        assert (benchmark._args.check == 0)
        assert (benchmark._args.iters == 20)
        assert (benchmark._args.warmup_iters == 5)

        # Check command list
        bin_names = [
            'all_reduce_perf', 'all_gather_perf', 'broadcast_perf',
            'reduce_perf', 'reduce_scatter_perf', 'alltoall_perf'
        ]

        command = bin_names[0] + benchmark._commands[0].split(bin_names[0])[1]
        expected_command = '{} -b 8 -e 8G -f 2 -g 8 -c 0 -n 20 -w 5'.format(
            bin_names[0])
        assert (command == expected_command)

        # Check results and metrics.
        # Case with no raw_output
        assert (benchmark._process_raw_result(0, '') is False)

        # Case with valid raw_output
        raw_output = {
            'allgather': allgather,
            'allreduce': allreduce,
            'reduce': reduce,
            'broadcast': broadcast,
            'reducescatter': reducescatter,
            'alltoall': alltoall,
        }

        for op in raw_output.keys():
            benchmark._args.operation = op
            assert (benchmark._process_raw_result(0, raw_output[op]))

            for name in ['time', 'algbw', 'busbw']:
                for size in [
                        '8589934592', '4294967296', '2147483648', '1073741824',
                        '536870912', '32'
                ]:
                    metric = op + '_' + size + '_' + name
                    assert (metric in benchmark.result)
                    assert (len(benchmark.result[metric]) == 1)
                    assert (isinstance(benchmark.result[metric][0],
                                       numbers.Number))

        assert (benchmark.result['allreduce_8589934592_time'][0] == 63896.0)
        assert (benchmark.result['allreduce_8589934592_algbw'][0] == 134.44)
        assert (benchmark.result['allreduce_8589934592_busbw'][0] == 235.26)
        assert (benchmark.result['alltoall_8589934592_time'][0] == 33508.0)
        assert (benchmark.result['alltoall_8589934592_algbw'][0] == 256.36)
        assert (benchmark.result['alltoall_8589934592_busbw'][0] == 224.31)
    def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        """Test cpu-memory-bw-latency benchmark result parsing."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        all_mlc_tests = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
        param_str = '--tests %s' % ' '.join(all_mlc_tests)
        benchmark = benchmark_class(benchmark_name, parameters=param_str)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (len(all_mlc_tests) == len(benchmark._commands))
        for mlc_test, command in zip(all_mlc_tests, benchmark._commands):
            assert ('mlc --%s;' % mlc_test in command)

        # Positive case - valid bandwidth matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --bandwidth_matrix

Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring Memory Bandwidths between nodes within system
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using Read-only traffic type
                Numa node
Numa node            0       1
       0        82542.2 76679.9
       1        76536.0 82986.5
"""
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_0' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_0'])
        assert ([82542.2] == benchmark.result['mem_bandwidth_matrix_numa_0_0_bw'])
        assert ([76679.9] == benchmark.result['mem_bandwidth_matrix_numa_0_1_bw'])
        assert ([76536.0] == benchmark.result['mem_bandwidth_matrix_numa_1_0_bw'])
        assert ([82986.5] == benchmark.result['mem_bandwidth_matrix_numa_1_1_bw'])

        # Positive case - valid latency matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --latency_matrix

Using buffer size of 600.000MiB
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring idle latencies (in ns)...
                Numa node
Numa node            0       1
       0          87.0   101.0
       1         101.9    86.9
"""
        assert (benchmark._process_raw_result(1, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_1' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_1'])

        assert ([87.0] == benchmark.result['mem_latency_matrix_numa_0_0_lat'])
        assert ([101.0] == benchmark.result['mem_latency_matrix_numa_0_1_lat'])
        assert ([101.9] == benchmark.result['mem_latency_matrix_numa_1_0_lat'])
        assert ([86.9] == benchmark.result['mem_latency_matrix_numa_1_1_lat'])

        # Positive case - valid max bandwidth output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --max_bandwidth

Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements

Measuring Maximum Memory Bandwidths for the system
Will take several minutes to complete as multiple injection rates will be tried to get the best bandwidth
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using traffic with the following read-write ratios
ALL Reads        :      165400.60
3:1 Reads-Writes :      154975.19
2:1 Reads-Writes :      158433.32
1:1 Reads-Writes :      157352.05
Stream-triad like:      157878.32

"""
        assert (benchmark._process_raw_result(2, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_2' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_2'])
        assert ([165400.60] == benchmark.result['mem_max_bandwidth_all_reads_bw'])
        assert ([154975.19] == benchmark.result['mem_max_bandwidth_3_1_reads-writes_bw'])
        assert ([158433.32] == benchmark.result['mem_max_bandwidth_2_1_reads-writes_bw'])
        assert ([157352.05] == benchmark.result['mem_max_bandwidth_1_1_reads-writes_bw'])
        assert ([157878.32] == benchmark.result['mem_max_bandwidth_stream-triad_like_bw'])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
    def test_flops_performance_cuda(self):
        """Test gemm-flops benchmark."""
        benchmark_name = 'gemm-flops'
        (benchmark_class, predefine_params
         ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
             benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        # Preprocess may fail with MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE on unsupported GPU architectures.
        benchmark = benchmark_class(
            benchmark_name,
            parameters=
            '--num_warmup 200 --n 1024 --k 512 --m 2048 --precision fp32 tf32_tc fp16_tc int8_tc'
        )

        ret = benchmark._preprocess()
        if dm.device_manager.get_device_compute_capability() not in benchmark._CudaGemmFlopsBenchmark__kernel_map:
            assert (ret is False)
            assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
        else:
            assert (ret is True)
            assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Check basic information.
        assert (benchmark.name == 'gemm-flops')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'cutlass_profiler')

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.num_warmup == 200)
        assert (benchmark._args.n == 1024)
        assert (benchmark._args.k == 512)
        assert (benchmark._args.m == 2048)
        assert (benchmark._args.precision == ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc'])
        benchmark._CudaGemmFlopsBenchmark__precision_need_to_run = ['fp32', 'tf32_tc', 'fp16_tc', 'int8_tc']

        # Check results and metrics.
        raw_output_fp32 = """
CSV Results:

Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nn_align1,passed,success,universal,16384,16384,16384,f32:column,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.022,6.23672,18287.4
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nt_align1,passed,success,universal,16384,16384,16384,f32:column,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,478.866,6.2648,18369.7
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249
1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4
"""
        raw_output_tf32_tc = """
CSV Results:

Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nn_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,88.5764,33.8691,99311.2
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nt_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,70.3503,42.6438,125040
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676
1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677
"""
        raw_output_fp16_tc = """
CSV Results:

Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.1575,43.9142,257531
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nt_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.6153,43.3334,254126
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316
1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048
"""
        assert (benchmark._process_raw_result(0, raw_output_fp32))
        assert (benchmark._process_raw_result(1, raw_output_tf32_tc))
        assert (benchmark._process_raw_result(2, raw_output_fp16_tc))

        assert (benchmark.result['fp32_flops'][0] == 18369.7)
        assert (benchmark.result['tf32_tc_flops'][0] == 128677)
        assert (benchmark.result['fp16_tc_flops'][0] == 281048)
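
        # Illustrative sketch (an assumption, not the benchmark's actual parser): each raw
        # output is a cutlass_profiler CSV whose last column is GFLOPs, and the reported
        # value per precision matches the best (maximum) GFLOPs across the four layouts.
        def _best_gflops_sketch(raw_output):
            """Return the maximum GFLOPs value from a cutlass_profiler CSV dump."""
            rows = [line for line in raw_output.splitlines() if line.startswith('1,CUTLASS,gemm')]
            return max(float(row.split(',')[-1]) for row in rows)

        assert (_best_gflops_sketch(raw_output_fp32) == 18369.7)
        assert (_best_gflops_sketch(raw_output_tf32_tc) == 128677)
        assert (_best_gflops_sketch(raw_output_fp16_tc) == 281048)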

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)
Example n. 25
0
    def test_disk_performance_benchmark_enabled(self, mock_is_block_device):
        """Test disk-performance benchmark command generation with all benchmarks enabled."""
        mock_is_block_device.return_value = True

        benchmark_name = 'disk-benchmark'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        block_devices = ['mock_block_device_0', 'mock_block_device_1']
        block_device_option = '--block_devices ' + ' '.join(block_devices)

        init_test_magic = 45
        curr_test_magic = init_test_magic
        param_str = block_device_option
        # Sequential precondition
        param_str += ' --enable_seq_precond'
        # Random precondition
        param_str += ' --rand_precond_time=%d' % curr_test_magic
        curr_test_magic += 1
        # Seq/rand read/write
        for io_pattern in ['seq', 'rand']:
            for io_type in ['read', 'write', 'readwrite']:
                io_str = '%s_%s' % (io_pattern, io_type)
                param_str += ' --%s_ramp_time=%d' % (io_str, curr_test_magic)
                curr_test_magic += 1
                param_str += ' --%s_runtime=%d' % (io_str, curr_test_magic)
                curr_test_magic += 1
                param_str += ' --%s_iodepth=%d' % (io_str, curr_test_magic)
                curr_test_magic += 1
                param_str += ' --%s_numjobs=%d' % (io_str, curr_test_magic)
                curr_test_magic += 1
        benchmark = benchmark_class(benchmark_name, parameters=param_str)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'disk-benchmark')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check command list
        # 2 block devices * (2 preconditions + 2 io_patterns * 3 io_types) = 16 commands
        assert (16 == len(benchmark._commands))

        # Check parameter assignments
        command_idx = 0
        default_rwmixread = 80
        for block_device in block_devices:
            curr_test_magic = init_test_magic

            # Sequential precondition
            assert ('--filename=%s' % block_device
                    in benchmark._commands[command_idx])
            command_idx += 1
            # Random precondition
            assert ('--filename=%s' % block_device
                    in benchmark._commands[command_idx])
            assert ('--runtime=%d' % curr_test_magic
                    in benchmark._commands[command_idx])
            curr_test_magic += 1
            command_idx += 1
            # Seq/rand read/write
            for io_pattern in ['seq', 'rand']:
                for io_type in ['read', 'write', 'rw']:
                    assert ('--filename=%s' % block_device
                            in benchmark._commands[command_idx])
                    fio_rw = '%s%s' % (io_pattern if io_pattern == 'rand' else '', io_type)
                    assert ('--rw=%s' % fio_rw
                            in benchmark._commands[command_idx])
                    assert ('--ramp_time=%d' % curr_test_magic
                            in benchmark._commands[command_idx])
                    curr_test_magic += 1
                    assert ('--runtime=%d' % curr_test_magic
                            in benchmark._commands[command_idx])
                    curr_test_magic += 1
                    assert ('--iodepth=%d' % curr_test_magic
                            in benchmark._commands[command_idx])
                    curr_test_magic += 1
                    assert ('--numjobs=%d' % curr_test_magic
                            in benchmark._commands[command_idx])
                    curr_test_magic += 1
                    if io_type == 'rw':
                        assert ('--rwmixread=%d' % default_rwmixread
                                in benchmark._commands[command_idx])
                    command_idx += 1
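
        # Illustrative sketch (an assumption about the expected fio mapping, not taken from
        # the benchmark source): sequential patterns map directly to fio's read/write/rw
        # modes, while random patterns get a 'rand' prefix, matching the fio_rw expression above.
        expected_fio_rw = {
            ('seq', 'read'): 'read',
            ('seq', 'write'): 'write',
            ('seq', 'rw'): 'rw',
            ('rand', 'read'): 'randread',
            ('rand', 'write'): 'randwrite',
            ('rand', 'rw'): 'randrw',
        }
        for (io_pattern, io_type), fio_rw in expected_fio_rw.items():
            assert (fio_rw == '%s%s' % (io_pattern if io_pattern == 'rand' else '', io_type))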
def test_rocm_onnxruntime_performance():
    """Test onnxruntime model benchmark."""
    benchmark_name = 'onnxruntime-ort-models'
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri == 'superbench/benchmark:rocm4.3.1-onnxruntime1.9.0')
    assert (benchmark._container_name == 'rocm-onnxruntime-model-benchmarks')
    assert (benchmark._entrypoint == '/stage/onnxruntime-training-examples/huggingface/azureml/run_benchmark.sh')
    assert (benchmark._cmd is None)
    benchmark._result = BenchmarkResult(benchmark._name,
                                        benchmark._benchmark_type,
                                        ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    raw_output = """
__superbench__ begin bert-large-uncased ngpu=1
    "samples_per_second": 21.829
__superbench__ begin bert-large-uncased ngpu=8
    "samples_per_second": 147.181
__superbench__ begin distilbert-base-uncased ngpu=1
    "samples_per_second": 126.827
__superbench__ begin distilbert-base-uncased ngpu=8
    "samples_per_second": 966.796
__superbench__ begin gpt2 ngpu=1
    "samples_per_second": 20.46
__superbench__ begin gpt2 ngpu=8
    "samples_per_second": 151.089
__superbench__ begin facebook/bart-large ngpu=1
    "samples_per_second": 66.171
__superbench__ begin facebook/bart-large ngpu=8
    "samples_per_second": 370.343
__superbench__ begin roberta-large ngpu=1
    "samples_per_second": 37.103
__superbench__ begin roberta-large ngpu=8
    "samples_per_second": 274.455
"""
    assert (benchmark._process_raw_result(0, raw_output))
    assert (benchmark.result['bert_large_uncased_ngpu_1_throughput'][0] == 21.829)
    assert (benchmark.result['bert_large_uncased_ngpu_8_throughput'][0] == 147.181)
    assert (benchmark.result['distilbert_base_uncased_ngpu_1_throughput'][0] == 126.827)
    assert (benchmark.result['distilbert_base_uncased_ngpu_8_throughput'][0] == 966.796)
    assert (benchmark.result['gpt2_ngpu_1_throughput'][0] == 20.46)
    assert (benchmark.result['gpt2_ngpu_8_throughput'][0] == 151.089)
    assert (benchmark.result['facebook_bart_large_ngpu_1_throughput'][0] == 66.171)
    assert (benchmark.result['facebook_bart_large_ngpu_8_throughput'][0] == 370.343)
    assert (benchmark.result['roberta_large_ngpu_1_throughput'][0] == 37.103)
    assert (benchmark.result['roberta_large_ngpu_8_throughput'][0] == 274.455)
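
    # Illustrative sketch (an assumption, not the benchmark's actual parser) of how the
    # raw log above could map to metric names: each '__superbench__ begin <model> ngpu=<n>'
    # header is paired with the following samples_per_second value, and '-' / '/' in the
    # model name are normalized to '_'; the helper name and regex are hypothetical.
    import re

    def _parse_ort_log_sketch(output):
        """Return {metric_name: samples_per_second} parsed from the raw benchmark log."""
        metrics = {}
        metric = None
        for line in output.splitlines():
            header = re.match(r'^__superbench__ begin (\S+) ngpu=(\d+)', line)
            if header:
                model = header.group(1).replace('-', '_').replace('/', '_')
                metric = '%s_ngpu_%s_throughput' % (model, header.group(2))
            elif metric and 'samples_per_second' in line:
                metrics[metric] = float(line.split(':')[1])
                metric = None
        return metrics

    assert (_parse_ort_log_sketch(raw_output)['gpt2_ngpu_8_throughput'] == 151.089)
    assert (_parse_ort_log_sketch(raw_output)['facebook_bart_large_ngpu_1_throughput'] == 66.171)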