def test_kernel_launch_overhead():
    """Test kernel-launch benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'kernel-launch', parameters='--num_warmup 200 --num_steps 20000 --interval 100'
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (benchmark.name == 'kernel-launch')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.num_warmup == 200)
    assert (benchmark._args.num_steps == 20000)
    assert (benchmark._args.interval == 100)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert ('raw_output_0' in benchmark.raw_data)
    assert (len(benchmark.raw_data['raw_output_0']) == 1)
    assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))

    for metric in ['event_time', 'wall_time']:
        assert (metric in benchmark.result)
        assert (len(benchmark.result[metric]) == 1)
        assert (isinstance(benchmark.result[metric][0], numbers.Number))

def test_pytorch_computation_communication_overlap_normal():
    """Test pytorch-computation-communication-overlap benchmark on distributed normal case."""
    context = BenchmarkRegistry.create_benchmark_context(
        'computation-communication-overlap',
        parameters='--num_warmup 5 --num_steps 10 --ratio 5',
        framework=Framework.PYTORCH
    )
    world_size = 2
    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
    results = utils.simulated_ddp_distributed_benchmark(context, world_size)
    assert (results)
    for benchmark in results:
        # Check basic information.
        assert (benchmark)
        assert (isinstance(benchmark, ComputationCommunicationOverlap))
        assert (benchmark.name == 'pytorch-computation-communication-overlap')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check predefined parameters of computation-communication-overlap benchmark.
        assert (benchmark._args.kernel == [ComputationKernelType.MUL, ComputationKernelType.MATMUL])

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.num_steps == 10)

        # Check results and metrics.
        assert (benchmark.run_count == 1)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
        assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)

def test_pytorch_matmul():
    """Test pytorch-matmul benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'matmul', platform=Platform.CUDA, parameters='--run_count 2 --num_steps 20', framework=Framework.PYTORCH
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (benchmark.name == 'pytorch-matmul')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check predefined parameters of sharding-matmul benchmark.
    assert (benchmark._args.mode == [ShardingMode.NOSHARDING])

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.run_count == 2)
    assert (benchmark._args.num_steps == 20)

    # Check results and metrics.
    assert (benchmark.run_count == 2)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (len(benchmark.raw_data['nosharding_time']) == benchmark.run_count)
    assert (len(benchmark.raw_data['nosharding_time'][0]) == benchmark._args.num_steps)
    assert (len(benchmark.result['nosharding_time']) == benchmark.run_count)

def test_pytorch_computation_communication_overlap_fake_distributed():
    """Test pytorch-computation-communication-overlap benchmark on a single GPU."""
    context = BenchmarkRegistry.create_benchmark_context(
        'computation-communication-overlap',
        parameters='--num_warmup 5 --num_steps 10 --ratio 5',
        framework=Framework.PYTORCH
    )
    port = network.get_free_port()
    assert (port)
    utils.setup_simulated_ddp_distributed_env(1, 0, port)
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, ComputationCommunicationOverlap))
    assert (benchmark.name == 'pytorch-computation-communication-overlap')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check predefined parameters of computation-communication-overlap benchmark.
    assert (benchmark._args.kernel == [ComputationKernelType.MUL, ComputationKernelType.MATMUL])

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.num_steps == 10)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (len(benchmark.raw_data) == len(benchmark._args.kernel))
    assert (len(benchmark.result) == len(benchmark._args.kernel) + benchmark.default_metric_count)

    utils.clean_simulated_ddp_distributed_env()

def exec(self):
    """Run the SuperBench benchmarks locally."""
    for benchmark_name in self._sb_benchmarks:
        if benchmark_name not in self._sb_enabled:
            continue
        benchmark_config = self._sb_benchmarks[benchmark_name]
        benchmark_results = list()

        self.__create_benchmark_dir(benchmark_name)
        cwd = os.getcwd()
        os.chdir(self.__get_benchmark_dir(benchmark_name))

        monitor = None
        if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
            if self.__get_platform() == Platform.CUDA:
                monitor = Monitor(
                    None, int(self._sb_monitor_config.sample_duration or 10),
                    int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
                )
                monitor.start()
            else:
                logger.warning('Monitor can not support ROCM/CPU platform.')

        benchmark_real_name = benchmark_name.split(':')[0]
        for framework in benchmark_config.frameworks or [Framework.NONE.value]:
            if benchmark_real_name == 'model-benchmarks' or (
                ':' not in benchmark_name and benchmark_name.endswith('_models')
            ):
                for model in benchmark_config.models:
                    full_name = f'{benchmark_name}/{framework}-{model}'
                    logger.info('Executor is going to execute %s.', full_name)
                    context = BenchmarkRegistry.create_benchmark_context(
                        model,
                        platform=self.__get_platform(),
                        framework=Framework(framework.lower()),
                        parameters=self.__get_arguments(benchmark_config.parameters)
                    )
                    result = self.__exec_benchmark(full_name, context)
                    benchmark_results.append(result)
            else:
                full_name = benchmark_name
                logger.info('Executor is going to execute %s.', full_name)
                context = BenchmarkRegistry.create_benchmark_context(
                    benchmark_real_name,
                    platform=self.__get_platform(),
                    framework=Framework(framework.lower()),
                    parameters=self.__get_arguments(benchmark_config.parameters)
                )
                result = self.__exec_benchmark(full_name, context)
                benchmark_results.append(result)

        if monitor:
            monitor.stop()
        self.__write_benchmark_results(benchmark_name, benchmark_results)
        os.chdir(cwd)

def benchmark_list_params_command_handler(name=None):
    """List parameters for benchmarks which match the regular expression.

    Args:
        name (str, optional): Benchmark name or regular expression. Defaults to None.

    Raises:
        CLIError: If no matching benchmark can be found.
    """
    for benchmark_name in benchmark_list_command_handler(name):
        format_help = ''
        for platform in Platform:
            if platform in BenchmarkRegistry.benchmarks[benchmark_name]:
                format_help = BenchmarkRegistry.get_benchmark_configurable_settings(
                    BenchmarkRegistry.create_benchmark_context(benchmark_name, platform=platform)
                )
                break
        print(
            (
                f'=== {benchmark_name} ===\n\n'
                f'{format_help}\n\n'
                f'default values:\n'
                f'{pformat(BenchmarkRegistry.benchmarks[benchmark_name]["predefine_param"])}\n'
            )
        )

def test_is_benchmark_context_valid():
    """Test interface BenchmarkRegistry.is_benchmark_context_valid()."""
    # Positive case.
    context = BenchmarkRegistry.create_benchmark_context('accumulation', platform=Platform.CPU)
    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    # Negative case.
    context = 'context'
    assert (BenchmarkRegistry.is_benchmark_context_valid(context) is False)
    context = None
    assert (BenchmarkRegistry.is_benchmark_context_valid(context) is False)

def test_pytorch_bert_base():
    """Test pytorch-bert-base benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'bert-base',
        platform=Platform.CUDA,
        parameters='--batch_size 1 --num_classes 5 --seq_len 8 --num_warmup 2 --num_steps 4 '
        '--model_action train inference',
        framework=Framework.PYTORCH
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, PytorchBERT))
    assert (benchmark.name == 'pytorch-bert-base')
    assert (benchmark.type == BenchmarkType.MODEL)

    # Check predefined parameters of bert-base model.
    assert (benchmark._args.hidden_size == 768)
    assert (benchmark._args.num_hidden_layers == 12)
    assert (benchmark._args.num_attention_heads == 12)
    assert (benchmark._args.intermediate_size == 3072)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.batch_size == 1)
    assert (benchmark._args.num_classes == 5)
    assert (benchmark._args.seq_len == 8)
    assert (benchmark._args.num_warmup == 2)
    assert (benchmark._args.num_steps == 4)

    # Check dataset scale.
    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    for metric in [
        'fp32_train_step_time', 'fp32_train_throughput', 'fp16_train_step_time', 'fp16_train_throughput',
        'fp32_inference_step_time', 'fp32_inference_throughput', 'fp16_inference_step_time',
        'fp16_inference_throughput'
    ]:
        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
        assert (len(benchmark.result[metric]) == benchmark.run_count)

def test_tensorrt_inference_result_parsing(self, test_raw_log):
    """Test tensorrt-inference benchmark result parsing."""
    (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.CUDA)
    benchmark = benchmark_cls(self.benchmark_name, parameters='')
    benchmark._args = SimpleNamespace(pytorch_models=['model_0', 'model_1'], log_raw_data=False)
    benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)

    # Positive case - valid raw output
    self.assertTrue(benchmark._process_raw_result(0, test_raw_log))
    self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

    self.assertEqual(6 + benchmark.default_metric_count, len(benchmark.result))
    for tag in ['mean', '99']:
        self.assertEqual(0.5, benchmark.result[f'model_0_gpu_time_{tag}'][0])
        self.assertEqual(0.6, benchmark.result[f'model_0_host_time_{tag}'][0])
        self.assertEqual(1.0, benchmark.result[f'model_0_end_to_end_time_{tag}'][0])

    # Negative case - invalid raw output
    self.assertFalse(benchmark._process_raw_result(1, 'Invalid raw output'))

def test_fambench():
    """Test FAMBench benchmarks."""
    benchmark_name = 'fambench'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CUDA
    )
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name)
    assert (benchmark._benchmark_type == BenchmarkType.DOCKER)
    assert (benchmark._image_uri == 'superbench/benchmark:cuda11.1.1-fambench')
    assert (benchmark._container_name == 'fambench-benchmarks')
    assert (benchmark._entrypoint == '/workspace/FAMBench/benchmarks/run_all_benchmarks.sh')
    assert (benchmark._cmd is None)
    benchmark._result = BenchmarkResult(benchmark._name, benchmark._benchmark_type, ReturnCode.SUCCESS)
    benchmark._args = SimpleNamespace(log_raw_data=False)

    raw_output = """
benchmark implementation mode config score units batch_latency_95_sec
DLRM OOTB eval tiny 152.800399 ex/s 0.515052
DLRM OOTB train tiny 35.483686 ex/s None
DLRM UBENCH train linear_[(2,2,2,2,2)] 3.679281e-07 TF/s None
XLMR OOTB eval default-config 1.015586 ex/s 16.463461
"""
    assert (benchmark._process_raw_result(0, raw_output))
    assert (benchmark.result['dlrm_ootb_eval_tiny_ex_s'][0] == 152.800399)
    assert (benchmark.result['dlrm_ootb_train_tiny_ex_s'][0] == 35.483686)
    assert (benchmark.result['dlrm_ubench_train_linear_[(2,2,2,2,2)]_tf_s'][0] == 3.679281e-07)
    assert (benchmark.result['xlmr_ootb_eval_default_config_ex_s'][0] == 1.015586)

def _test_gpu_copy_bw_performance_result_parsing(self, platform, test_raw_output):
    """Test gpu-copy benchmark result parsing."""
    benchmark_name = 'gpu-copy-bw'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, platform
    )
    assert (benchmark_class)
    benchmark = benchmark_class(benchmark_name, parameters='')

    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == 'gpu-copy-bw')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Positive case - valid raw output.
    assert (benchmark._process_raw_result(0, test_raw_output))
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (1 == len(benchmark.raw_data))
    print(test_raw_output.splitlines())
    test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
    assert (len(test_raw_output_dict) + benchmark.default_metric_count == len(benchmark.result))
    for output_key in benchmark.result:
        if output_key == 'return_code':
            assert (benchmark.result[output_key] == [0])
        else:
            assert (len(benchmark.result[output_key]) == 1)
            assert (isinstance(benchmark.result[output_key][0], numbers.Number))
            assert (output_key.strip('_bw') in test_raw_output_dict)
            assert (test_raw_output_dict[output_key.strip('_bw')] == benchmark.result[output_key][0])

    # Negative case - invalid raw output.
    assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
    assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

def test_disk_performance_benchmark_disabled(self, mock_is_block_device):
    """Test disk-performance benchmark command generation with all benchmarks disabled."""
    mock_is_block_device.return_value = True

    benchmark_name = 'disk-benchmark'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CPU
    )
    assert (benchmark_class)

    block_devices = ['/dev/nvme0n1', '/dev/nvme1n1']
    block_device_option = '--block_devices ' + ' '.join(block_devices)

    param_str = block_device_option
    param_str += ' --rand_precond_time=0'
    param_str += ' --seq_read_runtime=0'
    param_str += ' --rand_read_runtime=0'
    benchmark = benchmark_class(benchmark_name, parameters=param_str)

    # Check basic information
    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == 'disk-benchmark')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Command list should be empty
    assert (0 == len(benchmark._commands))

def __exec_benchmark(self, benchmark_full_name, context):
    """Launch benchmark for context.

    Args:
        benchmark_full_name (str): Benchmark full name.
        context (BenchmarkContext): Benchmark context to launch.

    Return:
        dict: Benchmark result.
    """
    try:
        benchmark = BenchmarkRegistry.launch_benchmark(context)
        if benchmark:
            logger.info(
                'benchmark: %s, return code: %s, result: %s.', benchmark.name, benchmark.return_code,
                benchmark.result
            )
            if benchmark.return_code.value == 0:
                logger.info('Executor succeeded in %s.', benchmark_full_name)
            else:
                logger.error('Executor failed in %s.', benchmark_full_name)
            result = json.loads(benchmark.serialized_result)
            result['name'] = benchmark_full_name
            return result
        else:
            logger.error('Executor failed in %s, invalid context.', benchmark_full_name)
    except Exception as e:
        logger.error(e)
        logger.error('Executor failed in %s.', benchmark_full_name)
    return None

def test_tcp_connectivity(self):
    """Test tcp-connectivity benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'tcp-connectivity',
        parameters='--hostfile /tmp/superbench/hostfile.test --port 80 --parallel 2',
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, TCPConnectivityBenchmark))
    assert (benchmark.name == 'tcp-connectivity')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.hostfile == '/tmp/superbench/hostfile.test')
    assert (benchmark._args.port == 80)
    assert (benchmark._args.count == 10)
    assert (benchmark._args.timeout == 1)
    assert (benchmark._args.parallel == 2)

    print(benchmark.result)
    assert (benchmark.result)

    # Check results and metrics.
    assert (benchmark.result['api.github.com_successed_count'][0] == 10)
    assert (benchmark.result['api.github.com_failed_count'][0] == 0)
    assert (benchmark.result['api.github.com_success_rate'][0] == 100.0)
    assert (isinstance(benchmark.result['api.github.com_time_min'][0], numbers.Number))
    assert (isinstance(benchmark.result['api.github.com_time_max'][0], numbers.Number))
    assert (isinstance(benchmark.result['api.github.com_time_avg'][0], numbers.Number))
    assert (isinstance(benchmark.result['localhost_successed_count'][0], numbers.Number))
    assert (isinstance(benchmark.result['localhost_failed_count'][0], numbers.Number))
    assert (isinstance(benchmark.result['localhost_time_max'][0], numbers.Number))
    assert (isinstance(benchmark.result['localhost_time_min'][0], numbers.Number))
    assert (isinstance(benchmark.result['localhost_time_avg'][0], numbers.Number))
    assert (benchmark.return_code == ReturnCode.SUCCESS)

def benchmark_in_one_process(context, world_size, local_rank, port, queue):
    """Set up the environment for DDP initialization and run the benchmark in a single process."""
    setup_simulated_ddp_distributed_env(world_size, local_rank, port)
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # The parser object must be removed because it cannot be serialized.
    benchmark._parser = None
    queue.put(benchmark)
    clean_simulated_ddp_distributed_env()

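# In the tests above, this per-process helper is driven by
# utils.simulated_ddp_distributed_benchmark. The function below is only a hedged,
# minimal sketch of such a driver, added here for illustration: the name
# run_in_simulated_ddp and the use of torch.multiprocessing are assumptions, not
# the repository's actual implementation.
import torch.multiprocessing as mp


def run_in_simulated_ddp(context, world_size, port):
    """Spawn world_size processes, run benchmark_in_one_process in each, and collect the results."""
    ctx = mp.get_context('spawn')
    queue = ctx.Queue()
    processes = []
    for local_rank in range(world_size):
        p = ctx.Process(target=benchmark_in_one_process, args=(context, world_size, local_rank, port, queue))
        p.start()
        processes.append(p)
    # Drain one result per rank before joining so children are not blocked on a full queue.
    results = [queue.get() for _ in range(world_size)]
    for p in processes:
        p.join()
    return results
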
def test_tensorrt_inference_cls(self):
    """Test tensorrt-inference benchmark class."""
    for platform in Platform:
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
        if platform is Platform.CUDA:
            self.assertIsNotNone(benchmark_cls)
        else:
            self.assertIsNone(benchmark_cls)

def test_register_benchmark():
    """Test interface BenchmarkRegistry.register_benchmark()."""
    # Register the benchmark for all platforms when the default platform is used.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)
    for platform in Platform:
        context = BenchmarkRegistry.create_benchmark_context('accumulation', platform=platform)
        assert (BenchmarkRegistry.is_benchmark_registered(context))

    # Register the benchmark for the CUDA platform only when platform=Platform.CUDA is specified.
    BenchmarkRegistry.register_benchmark('accumulation-cuda', AccumulationBenchmark, platform=Platform.CUDA)
    context = BenchmarkRegistry.create_benchmark_context('accumulation-cuda', platform=Platform.CUDA)
    assert (BenchmarkRegistry.is_benchmark_registered(context))
    context = BenchmarkRegistry.create_benchmark_context('accumulation-cuda', platform=Platform.ROCM)
    assert (BenchmarkRegistry.is_benchmark_registered(context) is False)

def run_pytorch_lstm(parameters='', check_metrics=[]):
    """Test pytorch-lstm benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'lstm', platform=Platform.CUDA, parameters=parameters, framework=Framework.PYTORCH
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, PytorchLSTM))
    assert (benchmark.name == 'pytorch-lstm')
    assert (benchmark.type == BenchmarkType.MODEL)

    # Check predefined parameters of lstm model.
    assert (benchmark._args.input_size == 256)
    assert (benchmark._args.hidden_size == 1024)
    assert (benchmark._args.num_layers == 8)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.batch_size == 1)
    assert (benchmark._args.num_classes == 5)
    assert (benchmark._args.seq_len == 8)
    assert (benchmark._args.num_warmup == 2)
    assert (benchmark._args.num_steps == 4)

    # Check dataset scale.
    assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    for metric in check_metrics:
        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
        assert (len(benchmark.result[metric]) == benchmark.run_count)

def test_get_benchmark_configurable_settings():
    """Test BenchmarkRegistry interface.

    BenchmarkRegistry.get_benchmark_configurable_settings().
    """
    # Register benchmarks for testing.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)

    context = BenchmarkRegistry.create_benchmark_context('accumulation', platform=Platform.CPU)
    settings = BenchmarkRegistry.get_benchmark_configurable_settings(context)

    expected = """optional arguments:
  --duration int        The elapsed time of benchmark in seconds.
  --log_raw_data        Log raw data into file instead of saving it into result object.
  --lower_bound int     The lower bound for accumulation.
  --run_count int       The run count of benchmark.
  --upper_bound int     The upper bound for accumulation."""
    assert (settings == expected)

def run_pytorch_cnn(models=[], parameters='', check_metrics=[]):
    """Run pytorch cnn benchmarks."""
    for model in models:
        context = BenchmarkRegistry.create_benchmark_context(
            model, platform=Platform.CUDA, parameters=parameters, framework=Framework.PYTORCH
        )

        assert (BenchmarkRegistry.is_benchmark_context_valid(context))

        benchmark = BenchmarkRegistry.launch_benchmark(context)

        # Check basic information.
        assert (benchmark)
        assert (isinstance(benchmark, PytorchCNN))
        assert (benchmark.name == 'pytorch-' + model)
        assert (benchmark.type == BenchmarkType.MODEL)

        # Check predefined parameters of the CNN model.
        assert (benchmark._args.model_type == model)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.batch_size == 1)
        assert (benchmark._args.image_size == 224)
        assert (benchmark._args.num_classes == 5)
        assert (benchmark._args.num_warmup == 2)
        assert (benchmark._args.num_steps == 4)

        # Check dataset.
        assert (len(benchmark._dataset) == benchmark._args.sample_count * benchmark._world_size)

        # Check results and metrics.
        assert (benchmark.run_count == 1)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        for metric in check_metrics:
            assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
            assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
            assert (len(benchmark.result[metric]) == benchmark.run_count)

def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model',
        FakeModelBenchmark,
        parameters='--hidden_size 2',
        platform=Platform.CUDA,
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model', platform=Platform.CUDA, parameters=params, framework=Framework.PYTORCH
    )
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        name, context.platform
    )
    assert (benchmark_class)
    return benchmark_class(name, predefine_params + ' ' + context.parameters)

def test_pytorch_sharding_matmul():
    """Test pytorch-sharding-matmul benchmark."""
    context = BenchmarkRegistry.create_benchmark_context(
        'sharding-matmul', platform=Platform.CUDA, parameters='--run_count 2 --num_steps 20',
        framework=Framework.PYTORCH
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    port = network.get_free_port()
    assert (port)
    utils.setup_simulated_ddp_distributed_env(1, 0, port)
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (isinstance(benchmark, ShardingMatmul))
    assert (benchmark.name == 'pytorch-sharding-matmul')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check predefined parameters of sharding-matmul benchmark.
    assert (benchmark._args.mode == [ShardingMode.ALLREDUCE, ShardingMode.ALLGATHER])

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.run_count == 2)
    assert (benchmark._args.num_steps == 20)

    # Check results and metrics.
    assert (benchmark.run_count == 2)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    for metric in ['allreduce_time', 'allgather_time']:
        assert (len(benchmark.raw_data[metric]) == benchmark.run_count)
        assert (len(benchmark.raw_data[metric][0]) == benchmark._args.num_steps)
        assert (len(benchmark.result[metric]) == benchmark.run_count)

    utils.clean_simulated_ddp_distributed_env()

def test_ort_inference_performance(mock_ort_session_run, mock_get_dir):
    """Test ort-inference benchmark."""
    benchmark_name = 'ort-inference'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CUDA
    )
    assert (benchmark_class)
    mock_get_dir.return_value = '/tmp/superbench/'
    benchmark = benchmark_class(
        benchmark_name,
        parameters='--pytorch_models resnet50 --graph_opt_level 1 --precision float16'
        ' --batch_size 16 --num_warmup 128 --num_steps 512'
    )

    assert (isinstance(benchmark, ORTInferenceBenchmark))
    assert (benchmark._preprocess())

    # Check basic information.
    assert (benchmark.name == 'ort-inference')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark._ORTInferenceBenchmark__model_cache_path == Path(torch.hub.get_dir()) / 'checkpoints')
    for model in benchmark._args.pytorch_models:
        assert (hasattr(torchvision.models, model))
        file_name = '{model}.{precision}.onnx'.format(model=model, precision=benchmark._args.precision)
        assert ((benchmark._ORTInferenceBenchmark__model_cache_path / file_name).is_file())

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.pytorch_models == ['resnet50'])
    assert (benchmark._args.graph_opt_level == 1)
    assert (benchmark._args.precision == Precision.FLOAT16)
    assert (benchmark._args.batch_size == 16)
    assert (benchmark._args.num_warmup == 128)
    assert (benchmark._args.num_steps == 512)

    # Check results and metrics.
    assert (benchmark._benchmark())
    shutil.rmtree(benchmark._ORTInferenceBenchmark__model_cache_path)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    precision_metric = {'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8'}
    for model in benchmark._args.pytorch_models:
        if benchmark._args.precision.value in precision_metric:
            precision = precision_metric[benchmark._args.precision.value]
        else:
            precision = benchmark._args.precision.value
        metric = '{}_{}_time'.format(precision, model)
        assert (metric in benchmark.result)
        assert (metric in benchmark.raw_data)

def test_get_benchmark_name():
    """Test interface BenchmarkRegistry.get_benchmark_name()."""
    # Register benchmarks for testing.
    benchmark_names = ['accumulation', 'pytorch-accumulation', 'tf1-accumulation', 'onnxruntime-accumulation']
    for name in benchmark_names:
        BenchmarkRegistry.register_benchmark(name, AccumulationBenchmark)

    # Test benchmark name for different frameworks.
    benchmark_frameworks = [Framework.NONE, Framework.PYTORCH, Framework.TENSORFLOW1, Framework.ONNXRUNTIME]
    for i in range(len(benchmark_names)):
        context = BenchmarkRegistry.create_benchmark_context(
            'accumulation', platform=Platform.CPU, framework=benchmark_frameworks[i]
        )
        name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
        assert (name == benchmark_names[i])

def test_pytorch_empty_cache():
    """Test PytorchBase class."""
    # Register mnist benchmark.
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Test cache empty by manually calling torch.cuda.empty_cache().
    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
    assert (benchmark._benchmark())
    del benchmark
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0)
    torch.cuda.empty_cache()
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)

    # Test automatic cache empty.
    context = BenchmarkRegistry.create_benchmark_context(
        'pytorch-mnist', parameters='--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)

def _test_gpu_copy_bw_performance_command_generation(self, platform):
    """Test gpu-copy benchmark command generation."""
    benchmark_name = 'gpu-copy-bw'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, platform
    )
    assert (benchmark_class)

    size = 1048576
    num_warm_up = 20
    num_loops = 10000
    mem_types = ['htod', 'dtoh', 'dtod']
    copy_types = ['sm', 'dma']

    parameters = '--mem_type %s --copy_type %s --size %d ' \
        '--num_warm_up %d --num_loops %d --bidirectional --check_data' % \
        (' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Check basic information
    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == benchmark_name)
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.mem_type == mem_types)
    assert (benchmark._args.copy_type == copy_types)
    assert (benchmark._args.size == size)
    assert (benchmark._args.num_warm_up == num_warm_up)
    assert (benchmark._args.num_loops == num_loops)
    assert (benchmark._args.bidirectional)
    assert (benchmark._args.check_data)

    # Check command
    assert (1 == len(benchmark._commands))
    assert (benchmark._commands[0].startswith(benchmark._GpuCopyBwBenchmark__bin_path))
    for mem_type in mem_types:
        assert ('--%s' % mem_type in benchmark._commands[0])
    for copy_type in copy_types:
        assert ('--%s_copy' % copy_type in benchmark._commands[0])
    assert ('--size %d' % size in benchmark._commands[0])
    assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
    assert ('--num_loops %d' % num_loops in benchmark._commands[0])
    assert ('--bidirectional' in benchmark._commands[0])
    assert ('--check_data' in benchmark._commands[0])

def test_gpcnet_network_test(self, raw_output, raw_output_no_execution):
    """Test gpcnet-network-test benchmark."""
    # Check registry.
    benchmark_name = 'gpcnet-network-test'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CPU
    )
    assert (benchmark_class)

    # Check preprocess
    benchmark = benchmark_class(benchmark_name)
    ret = benchmark._preprocess()
    assert (ret)
    expect_command = 'network_test'
    command = benchmark._bin_name + benchmark._commands[0].split(benchmark._bin_name)[1]
    assert (command == expect_command)

    assert (benchmark._process_raw_result(0, raw_output_no_execution))
    assert (len(benchmark.result) == benchmark.default_metric_count)

    # Check function process_raw_data.
    # Positive case - valid raw output.
    assert (benchmark._process_raw_result(0, raw_output))
    metric_list = [
        'rr_two-sided_lat', 'rr_get_lat', 'rr_two-sided_bw', 'rr_put_bw', 'rr_two-sided+sync_bw',
        'nat_two-sided_bw', 'multiple_allreduce_time', 'multiple_alltoall_bw'
    ]
    for metric_medium in metric_list:
        for suffix in ['avg', '99%']:
            metric = metric_medium + '_' + suffix
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

    # Negative case - Add invalid raw output.
    assert (benchmark._process_raw_result(0, 'ERROR') is False)

    # Check basic information.
    assert (benchmark.name == 'gpcnet-network-test')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark._bin_name == 'network_test')

def test_get_all_benchmark_predefine_settings():
    """Test interface BenchmarkRegistry.get_all_benchmark_predefine_settings()."""
    benchmark_params = BenchmarkRegistry.get_all_benchmark_predefine_settings()

    # Choose benchmark 'pytorch-sharding-matmul' for testing.
    benchmark_name = 'pytorch-sharding-matmul'
    assert (benchmark_name in benchmark_params)
    assert (benchmark_params[benchmark_name]['run_count'] == 1)
    assert (benchmark_params[benchmark_name]['duration'] == 0)
    assert (benchmark_params[benchmark_name]['n'] == 12288)
    assert (benchmark_params[benchmark_name]['k'] == 12288)
    assert (benchmark_params[benchmark_name]['m'] == 16000)
    assert (benchmark_params[benchmark_name]['mode'] == [ShardingMode.ALLREDUCE, ShardingMode.ALLGATHER])
    assert (benchmark_params[benchmark_name]['num_warmup'] == 10)
    assert (benchmark_params[benchmark_name]['num_steps'] == 500)

def benchmark_list_command_handler(name=None):
    """List benchmarks which match the regular expression.

    Args:
        name (str, optional): Benchmark name or regular expression. Defaults to None.

    Raises:
        CLIError: If no matching benchmark can be found.

    Returns:
        list: Benchmark list.
    """
    benchmark_list = list(BenchmarkRegistry.get_all_benchmark_predefine_settings().keys())
    if name is None:
        return benchmark_list
    filter_list = list(filter(re.compile(name).match, benchmark_list))
    if not filter_list:
        raise CLIError('Benchmark {} does not exist.'.format(name))
    return filter_list

def test_gpu_burn(self, results):
    """Test gpu-burn benchmark command generation."""
    benchmark_name = 'gpu-burn'
    (benchmark_class, predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
        benchmark_name, Platform.CUDA
    )
    assert (benchmark_class)

    time = 10
    parameters = '--doubles --tensor_core --time ' + str(time)
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Check basic information
    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == benchmark_name)
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.time == time)
    assert (benchmark._args.doubles)
    assert (benchmark._args.tensor_core)

    # Check command
    compare_copy = 'cp ' + benchmark._args.bin_dir + '/compare.ptx ./'
    compare_rm = 'rm ' + 'compare.ptx'
    assert (1 == len(benchmark._commands))
    assert (benchmark._commands[0].startswith(compare_copy))
    assert ('-d' in benchmark._commands[0])
    assert ('-tc' in benchmark._commands[0])
    assert (str(time) in benchmark._commands[0])
    assert (compare_rm in benchmark._commands[0])

    # Check results
    assert (benchmark._process_raw_result(0, results))
    assert (benchmark.result['return_code'][0] == 0)
    assert (benchmark.result['time'][0] == time)
    for device in range(8):
        assert (benchmark.result['gpu_' + str(device) + '_pass'][0] == 1)
    assert (benchmark.result['abort'][0] == 0)