def _benchmark(self): """Implementation for benchmarking. Return: True if run benchmark successfully. """ for cmd_idx in range(len(self._commands)): logger.info( 'Execute command - round: {}, benchmark: {}, command: {}.'. format(self._curr_run_index, self._name, self._commands[cmd_idx])) output = run_command(self._commands[cmd_idx]) if output.returncode != 0: self._result.set_return_code( ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE) logger.error( 'DockerBenchmark execution failed - round: {}, benchmark: {}, error message: {}.' .format(self._curr_run_index, self._name, output.stdout)) return False else: if not self._process_raw_result(cmd_idx, output.stdout): self._result.set_return_code( ReturnCode.DOCKERBENCHMARK_RESULT_PARSING_FAILURE) return False return True
def parse_args(self, ignore_invalid=False):
    """Parse the arguments.

    Return:
        ret (bool): whether the parsing succeeded or not.
        args (argparse.Namespace): parsed arguments.
        unknown (list): unknown arguments.
    """
    try:
        args, unknown = self._parser.parse_known_args(self._argv)
    except BaseException as e:
        if ignore_invalid:
            logger.info('Missing or invalid parameters, will ignore the error and skip the args checking.')
            return True, None, []
        else:
            logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
            return False, None, []

    ret = True
    if len(unknown) > 0:
        logger.error(
            'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
        )
        ret = False

    return ret, args, unknown
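# For reference, a minimal standalone sketch (hypothetical parser and argv, not the benchmark's own
# self._parser) of how argparse's parse_known_args() separates recognized arguments from the unknown
# ones that parse_args() above reports as errors:
import argparse

_parser = argparse.ArgumentParser()
_parser.add_argument('--num_warmup', type=int, default=8)

# '--num_warmup 16' is recognized; '--bogus 1' comes back in the unknown list.
_args, _unknown = _parser.parse_known_args(['--num_warmup', '16', '--bogus', '1'])
assert _args.num_warmup == 16
assert _unknown == ['--bogus', '1']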
def run(self): """Method representing the process’s activity. Return: True if launching the process succeed. """ if self.__running.value == 0: if not self.__preprocess(): return False try: logger.info('Start monitoring.') self.__running.value = 1 self.__sample() self.__scheduler.run() except BaseException as e: logger.error( 'Failed to launch the monitor process - error message: {}'. format(str(e))) self.stop() return False else: logger.error('Monitor is still running') return True
def exec(self):
    """Run the SuperBench benchmarks locally."""
    for benchmark_name in self._sb_benchmarks:
        if benchmark_name not in self._sb_enabled:
            continue
        benchmark_config = self._sb_benchmarks[benchmark_name]
        benchmark_results = list()
        self.__create_benchmark_dir(benchmark_name)
        cwd = os.getcwd()
        os.chdir(self.__get_benchmark_dir(benchmark_name))

        monitor = None
        if self.__get_rank_id() == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
            if self.__get_platform() == Platform.CUDA:
                monitor = Monitor(
                    None, int(self._sb_monitor_config.sample_duration or 10),
                    int(self._sb_monitor_config.sample_interval or 1), self.__get_monitor_path(benchmark_name)
                )
                monitor.start()
            else:
                logger.warning('Monitor can not support ROCM/CPU platform.')

        benchmark_real_name = benchmark_name.split(':')[0]
        for framework in benchmark_config.frameworks or [Framework.NONE.value]:
            if benchmark_real_name == 'model-benchmarks' or (
                ':' not in benchmark_name and benchmark_name.endswith('_models')
            ):
                for model in benchmark_config.models:
                    full_name = f'{benchmark_name}/{framework}-{model}'
                    logger.info('Executor is going to execute %s.', full_name)
                    context = BenchmarkRegistry.create_benchmark_context(
                        model,
                        platform=self.__get_platform(),
                        framework=Framework(framework.lower()),
                        parameters=self.__get_arguments(benchmark_config.parameters)
                    )
                    result = self.__exec_benchmark(full_name, context)
                    benchmark_results.append(result)
            else:
                full_name = benchmark_name
                logger.info('Executor is going to execute %s.', full_name)
                context = BenchmarkRegistry.create_benchmark_context(
                    benchmark_real_name,
                    platform=self.__get_platform(),
                    framework=Framework(framework.lower()),
                    parameters=self.__get_arguments(benchmark_config.parameters)
                )
                result = self.__exec_benchmark(full_name, context)
                benchmark_results.append(result)

        if monitor:
            monitor.stop()
        self.__write_benchmark_results(benchmark_name, benchmark_results)
        os.chdir(cwd)
def get_sb_config(config_file):
    """Read SuperBench config yaml.

    Read config file, detect Azure SKU and use corresponding config if None is provided.

    Args:
        config_file (str): config file path.

    Returns:
        OmegaConf: Config object, None if file does not exist.
    """
    p = Path(str(config_file))
    if not config_file:
        config_path = (Path(__file__).parent / '../../config').resolve()
        p = config_path / 'default.yaml'
        vm_size = get_vm_size().lower()
        if vm_size:
            logger.info('Detected Azure SKU %s.', vm_size)
            for config in (config_path / 'azure').glob('**/*'):
                if config.name.startswith(vm_size):
                    p = config
                    break
        logger.info('No benchmark config provided, using config file %s.', str(p))
    if not p.is_file():
        return None
    with p.open() as fp:
        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
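# A hedged usage sketch of get_sb_config() (the file path below is purely illustrative): passing None
# triggers the default / Azure-SKU auto-detection above, while an explicit path is loaded directly and
# None is returned if that file does not exist.
default_config = get_sb_config(None)    # bundled default.yaml or a matching azure/*.yaml
custom_config = get_sb_config('/path/to/my-config.yaml')    # hypothetical path; None if it is missing
if custom_config is None:
    logger.warning('Config file not found, falling back to the auto-detected config.')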
def _benchmark(self): """Implementation for benchmarking.""" M = self._args.m K = self._args.k N = self._args.n for mode in self._args.mode: if mode == ShardingMode.NOSHARDING: elapse_times = self.__matmul_nosharding(M, K, N) elif mode == ShardingMode.ALLREDUCE: elapse_times = self.__matmul_allreduce(M, K, N) elif mode == ShardingMode.ALLGATHER: elapse_times = self.__matmul_allgather(M, K, N) else: logger.error( 'Unknown sharding mode - benchmark: {}, mode: {}.'.format( self._name, mode)) return False metric = '{}_time'.format(mode) if not self._process_numeric_result( metric, elapse_times, reduce_type=ReduceType.MAX): return False logger.info( 'Matmul sharding - round: {0}, name: {1}, shape: ({2}, {3}) * ({3}, {4}), mode: {5}, cost: {6} ms' .format(self._curr_run_index, self._name, M, K, N, mode, statistics.mean(elapse_times))) return True
def _inference_step(self, precision):
    """Define the inference process.

    Args:
        precision (Precision): precision of model and input data, such as float32, float16.

    Return:
        The latency list of every inference operation.
    """
    duration = []
    with torch.no_grad():
        self._model.eval()
        for idx, sample in enumerate(self._dataloader):
            sample = sample.to(dtype=getattr(torch, precision.value))
            start = self._timer()
            if self._gpu_available:
                sample = sample.cuda()
            self._model(sample)
            if self._gpu_available:
                torch.cuda.synchronize()
            end = self._timer()
            if idx % 10 == 0:
                logger.info(
                    'Inference step [{}/{} ({:.0f}%)]'.format(
                        idx, len(self._dataloader), 100. * idx / len(self._dataloader)
                    )
                )
            if idx >= self._args.num_warmup:
                duration.append((end - start) * 1000)
    return duration
def _train_step(self, precision):
    """Define the training process.

    Args:
        precision (Precision): precision of model and input data, such as float32, float16.

    Return:
        The step-time list of every training step.
    """
    duration = []
    for idx, sample in enumerate(self._dataloader):
        sample = sample.to(dtype=getattr(torch, precision.value))
        start = self._timer()
        if self._gpu_available:
            sample = sample.cuda()
        self._optimizer.zero_grad()
        output = self._model(sample)
        loss = self._loss_fn(output, self._target)
        loss.backward()
        self._optimizer.step()
        end = self._timer()
        if idx % 10 == 0:
            logger.info(
                'Train step [{}/{} ({:.0f}%)]'.format(
                    idx, len(self._dataloader), 100. * idx / len(self._dataloader)
                )
            )
        if idx >= self._args.num_warmup:
            duration.append((end - start) * 1000)
    return duration
def _benchmark(self): """Implementation for benchmarking. Return: True if run benchmark successfully. """ logger.info('TCP validation - round: {0}, name: {1}'.format( self._curr_run_index, self._name)) # Run TCPing on host in the hostfile in parallel try: outputs = Parallel( n_jobs=min(len(self.__hosts), self._args.parallel))( delayed(run_tcping)(self.__hosts[i], self._args.port, self._args.count, self._args.timeout) for i in (range(len(self.__hosts)))) except Exception as e: self._result.set_return_code( ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE) logger.error( 'Microbenchmark execution failed - round: {}, benchmark: {}, error message: {}.' .format(self._curr_run_index, self._name, str(e))) return False # Parse the output and get the results for host_index, out in enumerate(outputs): if not self._process_raw_result(host_index, out): self._result.set_return_code( ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE) return False return True
def __parse_and_check_args(cls, name, class_def, parameters):
    """Parse and check the predefined parameters.

    If ignore_invalid is True and 'required' arguments are not set when registering the benchmark,
    the arguments should be provided by the user in the config, and the argument checking is skipped.

    Args:
        name (str): internal name of benchmark.
        class_def (Benchmark): class object of benchmark.
        parameters (str): predefined parameters of benchmark.
    """
    benchmark = class_def(name, parameters)
    benchmark.add_parser_arguments()
    ret, args, unknown = benchmark.parse_args(ignore_invalid=True)
    if not ret or len(unknown) >= 1:
        logger.log_and_raise(
            TypeError,
            'Registered benchmark has invalid arguments - benchmark: {}, parameters: {}'.format(name, parameters)
        )
    elif args is not None:
        cls.benchmarks[name]['predefine_param'] = vars(args)
        logger.debug('Benchmark registration - benchmark: {}, predefine_parameters: {}'.format(name, vars(args)))
    else:
        cls.benchmarks[name]['predefine_param'] = dict()
        logger.info(
            'Benchmark registration - benchmark: {}, missing required parameters or invalid parameters, '
            'skip the arguments checking.'.format(name)
        )
def __train(self, precision):
    """Launch the training benchmark.

    Args:
        precision (Precision): precision of model and input data, such as float32, float16.

    Return:
        True if step_times list is not empty.
    """
    if not self._create_model(precision):
        self._result.set_return_code(ReturnCode.MODEL_CREATION_FAILURE)
        return False
    if not self._create_optimizer():
        self._result.set_return_code(ReturnCode.OPTIMIZER_CREATION_FAILURE)
        return False

    # The unit of step time should be millisecond.
    step_times = self._train_step(precision)
    step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
    if not step_times:
        self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
        return False

    logger.info(
        'Average train time - round: {}, model: {}, precision: {}, step time: {:.6f} ms.'.format(
            self._curr_run_index, self._name, precision, statistics.mean(step_times)
        )
    )
    return True
def _preprocess(self):
    """Preprocess/preparation operations before the benchmarking.

    Return:
        True if _preprocess() succeeds.
    """
    if not super()._preprocess():
        return False

    self._judge_gpu_availability()
    self._set_force_fp32()
    logger.info(
        'Model placement - model: {}, GPU availability: {}, pin memory: {}, force fp32: {}.'.format(
            self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32
        )
    )

    if not self._init_distributed_setting():
        self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
        return False

    # Set sample_count aligned with batch_size.
    self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size

    if not self._generate_dataset():
        self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
        return False

    if not self._init_dataloader():
        self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE)
        return False

    return True
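# The sample_count adjustment above simply rounds up to the next multiple of batch_size, so the last
# batch is never partial; a quick illustration with made-up numbers:
import math

batch_size = 32
sample_count = 100
aligned = math.ceil(sample_count / batch_size) * batch_size
assert aligned == 128    # the smallest multiple of 32 that is >= 100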
def __exec_benchmark(self, benchmark_full_name, context):
    """Launch benchmark for context.

    Args:
        benchmark_full_name (str): Benchmark full name.
        context (BenchmarkContext): Benchmark context to launch.

    Return:
        dict: Benchmark result.
    """
    try:
        benchmark = BenchmarkRegistry.launch_benchmark(context)
        if benchmark:
            logger.info(
                'benchmark: %s, return code: %s, result: %s.', benchmark.name, benchmark.return_code,
                benchmark.result
            )
            if benchmark.return_code.value == 0:
                logger.info('Executor succeeded in %s.', benchmark_full_name)
            else:
                logger.error('Executor failed in %s.', benchmark_full_name)
            result = json.loads(benchmark.serialized_result)
            result['name'] = benchmark_full_name
            return result
        else:
            logger.error('Executor failed in %s, invalid context.', benchmark_full_name)
    except Exception as e:
        logger.error(e)
        logger.error('Executor failed in %s.', benchmark_full_name)
    return None
def _init_distributed_setting(self):
    """Initialize the distributed library and bind the worker to GPU.

    Return:
        True if distributed library is initialized successfully.
    """
    if self._args.distributed_impl:
        logger.info(
            'Distributed training is enabled - model: {}, distributed implementation: {}.'.format(
                self._name, self._args.distributed_impl
            )
        )
        if self._args.distributed_impl == DistributedImpl.HOROVOD:
            import horovod.torch as hvd
            hvd.init()
            self._world_size = int(hvd.size())
            self._local_rank = int(hvd.local_rank())
            self._global_rank = int(hvd.rank())
        elif self._args.distributed_impl == DistributedImpl.DDP:
            if os.environ.get('WORLD_SIZE') is None or os.environ.get('LOCAL_RANK') is None:
                logger.error(
                    'Can not find WORLD_SIZE or LOCAL_RANK in env variables - model: {},'
                    ' distributed implementation: {}.'.format(self._name, self._args.distributed_impl)
                )
                return False

            # torch >= 1.9.0a0: torch.distributed.elastic is used by default.
            port = int(os.environ['MASTER_PORT']) + 1
            addr = os.environ['MASTER_ADDR']
            self._global_rank = int(os.environ['RANK'])
            self._local_rank = int(os.environ['LOCAL_RANK'])
            self._world_size = int(os.environ['WORLD_SIZE'])
            logger.debug('ip:{},port:{},rank:{},world:{}'.format(addr, port, self._global_rank, self._world_size))
            store = PrefixStore(
                self._name,
                TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
            )
            torch.distributed.init_process_group(
                backend=self._args.distributed_backend.value,
                timeout=timedelta(seconds=300),
                rank=self._global_rank,
                world_size=self._world_size,
                store=store
            )
        else:
            logger.error(
                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
                    self._name, self._args.distributed_impl
                )
            )
            return False

        if self._gpu_available:
            torch.cuda.set_device(self._local_rank)

    return True
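# The DDP branch above assumes the standard torch.distributed launcher variables are already set.
# A minimal, illustrative pre-flight check (the check itself is not part of the benchmark):
import os

_required = ['MASTER_ADDR', 'MASTER_PORT', 'WORLD_SIZE', 'RANK', 'LOCAL_RANK']
_missing = [name for name in _required if os.environ.get(name) is None]
if _missing:
    raise RuntimeError('Missing distributed launcher variables: {}'.format(', '.join(_missing)))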
def get_shell_config(self, cmd):
    """Get ansible config for shell module.

    Args:
        cmd (str): Shell command for config.

    Returns:
        dict: Ansible config dict.
    """
    logger.info('Run {} on remote ...'.format(cmd))
    ansible_config = {
        **self._config,
        'module': 'shell',
        'module_args': cmd,
    }
    return ansible_config
def __init__(self, config):
    """Initialize.

    Args:
        config (DictConfig): Ansible config object.
    """
    self._playbook_path = Path(__file__).parent / 'playbooks'
    self._config = {
        'host_pattern': 'localhost',
        'cmdline': '--forks 128',
    }
    self._head_host = None
    if config:
        inventory_file = getattr(config, 'host_file', None)
        inventory_list = getattr(config, 'host_list', None)
        if inventory_list:
            inventory_list = inventory_list.strip(',')
        if inventory_file or inventory_list:
            self._config['host_pattern'] = 'all'
            inventory = InventoryManager(loader=DataLoader(), sources=inventory_file or f'{inventory_list},')
            host_list = inventory.get_hosts(pattern='all', order='sorted')
            if len(host_list) > 0:
                self._config['cmdline'] = '--forks {}'.format(len(host_list))
                self._head_host = host_list[0].get_name()
            if inventory_list in ['localhost', '127.0.0.1']:
                self._config['cmdline'] += ' --connection local'
            self._config['cmdline'] += ' --inventory {}'.format(inventory_file or f'{inventory_list},')
        username = getattr(config, 'host_username', None)
        if username:
            self._config['cmdline'] += ' --user {}'.format(username)
        password = getattr(config, 'host_password', None)
        if password:
            self._config['passwords'] = {
                'password': password,
                'passphrase': password,
            }
        key_file = getattr(config, 'private_key', None)
        if key_file:
            self._config['cmdline'] += ' --private-key {}'.format(key_file)
        elif password:
            self._config['cmdline'] += ' --ask-pass --ask-become-pass'
    logger.info(self._config)
def get_playbook_config(self, playbook, extravars=None):
    """Get ansible config for playbook.

    Args:
        playbook (str): Playbook file name.
        extravars (dict): Extra variables in playbook. Defaults to None.

    Returns:
        dict: Ansible config dict.
    """
    logger.info('Run playbook {} ...'.format(playbook))
    ansible_config = {
        **self._config,
        'extravars': extravars,
        'playbook': str(self._playbook_path / playbook),
    }
    return ansible_config
def deploy(self):    # pragma: no cover
    """Deploy SuperBench environment."""
    logger.info('Preparing SuperBench environment.')
    extravars = {
        'ssh_port': random.randint(1 << 14, (1 << 15) - 1),
        'output_dir': str(self._output_path),
        'docker_image': self._docker_config.image,
    }
    if bool(self._docker_config.username) and bool(self._docker_config.password):
        extravars.update(
            {
                'docker_registry': self._docker_config.registry,
                'docker_username': self._docker_config.username,
                'docker_password': self._docker_config.password,
            }
        )
    self._ansible_client.run(self._ansible_client.get_playbook_config('deploy.yaml', extravars=extravars))
def check_env(self):    # pragma: no cover
    """Check SuperBench environment."""
    logger.info('Checking SuperBench environment.')
    OmegaConf.save(config=self._sb_config, f=str(self._output_path / 'sb.config.yaml'))
    self._ansible_client.run(
        self._ansible_client.get_playbook_config(
            'check_env.yaml',
            extravars={
                'no_docker': bool(self._docker_config.skip),
                'output_dir': str(self._output_path),
                'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
            }
        )
    )
def run(self, raw_data_file, rule_file, output_dir, output_format, round=2):
    """Run the main process of result summary.

    Args:
        raw_data_file (str): the path of the raw data jsonl file.
        rule_file (str): the path of the rule yaml file.
        output_dir (str): the directory of the output file.
        output_format (str): the format of the output, 'excel', 'md' or 'html'.
        round (int): the number of decimal digits.
    """
    try:
        rules = self._preprocess(raw_data_file, rule_file)
        # parse rules for result summary
        if not self._parse_rules(rules):
            return
        # generate result summary for each category
        summary = self._generate_summary(round)
        # output result summary to file
        output_path = ''
        if output_format == 'excel':
            output_path = str(Path(output_dir) / 'results-summary.xlsx')
            summary_df = self._merge_summary(summary)
            self.output_summary_in_excel(self._raw_data_df, summary_df, output_path)
        elif output_format == 'md':
            output_path = str(Path(output_dir) / 'results-summary.md')
            lines = self.generate_md_lines(summary)
            file_handler.output_lines_in_md(lines, output_path)
        elif output_format == 'html':
            output_path = str(Path(output_dir) / 'results-summary.html')
            lines = self.generate_md_lines(summary)
            file_handler.output_lines_in_html(lines, output_path)
        else:
            logger.error('ResultSummary: output failed - unsupported output format')
        logger.info('ResultSummary: Output results to {}'.format(output_path))
    except Exception as e:
        logger.error('ResultSummary: run failed - {}'.format(str(e)))
def run(self, ansible_config, sudo=False):    # pragma: no cover
    """Run Ansible runner.

    Args:
        ansible_config (dict): Ansible config dict.
        sudo (bool): Run as sudo or not. Defaults to False.

    Returns:
        int: Ansible return code.
    """
    if sudo:
        logger.info('Run as sudo ...')
        ansible_config['cmdline'] += ' --become'
    with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
        r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
        logger.debug(r.stats)
    if r.rc == 0:
        logger.info('Run succeeded, return code {}.'.format(r.rc))
    else:
        logger.warning('Run failed, return code {}.'.format(r.rc))
    return r.rc
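# A hedged end-to-end sketch of the AnsibleClient pieces above (host file path, user name, command and
# extra variables are illustrative; OmegaConf is used the same way the runner builds its config):
from omegaconf import OmegaConf

_ansible_cfg = OmegaConf.create({'host_file': '/path/to/hostfile', 'host_username': 'user'})    # hypothetical
_client = AnsibleClient(_ansible_cfg)

# Run a shell command on every host as sudo, then a bundled playbook with extra variables.
_rc = _client.run(_client.get_shell_config('nvidia-smi'), sudo=True)
_rc = _client.run(_client.get_playbook_config('deploy.yaml', extravars={'output_dir': '/tmp/outputs'}))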
def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
    """Initialize.

    Args:
        sb_config (DictConfig): SuperBench config object.
        docker_config (DictConfig): Docker config object.
        ansible_config (DictConfig): Ansible config object.
        sb_output_dir (str): SuperBench output directory.
    """
    self._sb_config = sb_config
    self._docker_config = docker_config
    self._ansible_config = ansible_config
    self._sb_output_dir = sb_output_dir
    self._output_path = Path(sb_output_dir).expanduser().resolve()
    self._ansible_client = AnsibleClient(ansible_config)

    self.__set_logger('sb-run.log')
    logger.info('Runner uses config: %s.', pformat(OmegaConf.to_container(self._sb_config, resolve=True)))
    logger.info('Runner writes to: %s.', str(self._output_path))

    self._sb_benchmarks = self._sb_config.superbench.benchmarks
    self.__validate_sb_config()
    self._sb_enabled_benchmarks = self.__get_enabled_benchmarks()
    logger.info('Runner will run: %s', self._sb_enabled_benchmarks)
def _run_proc(self, benchmark_name, mode, vars):
    """Run the process.

    Args:
        benchmark_name (str): Benchmark name.
        mode (DictConfig): Runner mode.
        vars (dict): Process variables.

    Returns:
        int: Process return code.
    """
    mode.update(vars)
    logger.info('Runner is going to run %s in %s mode, proc rank %d.', benchmark_name, mode.name, mode.proc_rank)

    timeout = self._sb_benchmarks[benchmark_name].timeout
    env_list = '--env-file /tmp/sb.env'
    if self._docker_config.skip:
        env_list = 'set -o allexport && source /tmp/sb.env && set +o allexport'
    for k, v in mode.env.items():
        if isinstance(v, str):
            envvar = f'{k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
            env_list += f' -e {envvar}' if not self._docker_config.skip else f' && export {envvar}'

    fcmd = "docker exec {env_list} sb-workspace bash -c '{command}'"
    if self._docker_config.skip:
        fcmd = "bash -c '{env_list} && cd $SB_WORKSPACE && {command}'"
    ansible_runner_config = self._ansible_client.get_shell_config(
        fcmd.format(env_list=env_list, command=self.__get_mode_command(benchmark_name, mode, timeout))
    )
    if mode.name == 'mpi' and mode.node_num != 1:
        ansible_runner_config = self._ansible_client.update_mpi_config(ansible_runner_config)

    ansible_runner_config['timeout'] = timeout

    rc = self._ansible_client.run(ansible_runner_config, sudo=(not self._docker_config.skip))
    return rc
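# For clarity, a sketch of what the formatted shell command looks like in the Docker (non-skip) case;
# the env var and the inner 'sb exec ...' command below are hypothetical placeholders for what
# __get_mode_command() would return:
_env_list = '--env-file /tmp/sb.env -e PROC_RANK=0'
_command = 'sb exec ...'    # placeholder for the real mode command
_fcmd = "docker exec {env_list} sb-workspace bash -c '{command}'"
print(_fcmd.format(env_list=_env_list, command=_command))
# docker exec --env-file /tmp/sb.env -e PROC_RANK=0 sb-workspace bash -c 'sb exec ...'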
def _benchmark(self): """Implementation for benchmarking.""" import onnxruntime as ort precision_metric = { 'float16': 'fp16', 'float32': 'fp32', 'int8': 'int8' } for model in self._args.pytorch_models: sess_options = ort.SessionOptions() sess_options.graph_optimization_level = self.__graph_opt_level[ self._args.graph_opt_level] file_name = '{model}.{precision}.onnx'.format( model=model, precision=self._args.precision) ort_sess = ort.InferenceSession( f'{self.__model_cache_path / file_name}', sess_options, providers=['CUDAExecutionProvider']) elapse_times = self.__inference(ort_sess) if self._args.precision.value in precision_metric: precision = precision_metric[self._args.precision.value] else: precision = self._args.precision.value metric = '{}_{}_time'.format(precision, model) if not self._process_numeric_result( metric, elapse_times, cal_percentile=True): return False logger.info( 'ORT Inference - round: {}, name: {}, model: {}, precision: {}, latency: {} ms' .format(self._curr_run_index, self._name, model, self._args.precision, statistics.mean(elapse_times))) return True
def __inference(self, precision):
    """Launch the inference benchmark.

    Args:
        precision (Precision): precision of model and input data, such as float32, float16.

    Return:
        True if step_times list is not empty.
    """
    self._create_model(precision)
    # The unit of step time should be millisecond.
    step_times = self._inference_step(precision)
    step_times = self.__process_model_result(ModelAction.INFERENCE, precision, step_times)
    if not step_times:
        self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
        return False

    logger.info(
        'Average inference time - round: {}, model: {}, precision: {}, step time: {:.6f} ms.'.format(
            self._curr_run_index, self._name, precision, statistics.mean(step_times)
        )
    )
    return True
def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel', output_all=False, round=2):
    """Run the data diagnosis and output the results.

    Args:
        raw_data_file (str): the path of the raw data jsonl file.
        rule_file (str): the path of the rule yaml file.
        baseline_file (str): the path of the baseline json file.
        output_dir (str): the directory of the output file.
        output_format (str): the format of the output, 'excel', 'json', 'md' or 'html'.
        output_all (bool): output diagnosis results for all nodes.
        round (int): the number of decimal digits.
    """
    try:
        rules = self._preprocess(raw_data_file, rule_file)
        # read baseline
        baseline = file_handler.read_baseline(baseline_file)
        logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
        data_not_accept_df, label_df = self.run_diagnosis_rules(rules, baseline)
        logger.info('DataDiagnosis: Processing finished')
        output_path = ''
        # generate all nodes' info
        if output_all:
            output_path = str(Path(output_dir) / 'diagnosis_summary.json')
            data_not_accept_df = self.output_all_nodes_results(self._raw_data_df, data_not_accept_df)
        # output in the requested format
        if output_format == 'excel':
            output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
            self.output_diagnosis_in_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
        elif output_format == 'json':
            if output_all:
                output_path = str(Path(output_dir) / 'diagnosis_summary.json')
                self.output_diagnosis_in_json(data_not_accept_df, output_path)
            else:
                output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
                self.output_diagnosis_in_jsonl(data_not_accept_df, output_path)
        elif output_format == 'md' or output_format == 'html':
            lines = self.generate_md_lines(data_not_accept_df, self._sb_rules, round)
            if output_format == 'md':
                output_path = str(Path(output_dir) / 'diagnosis_summary.md')
                file_handler.output_lines_in_md(lines, output_path)
            else:
                output_path = str(Path(output_dir) / 'diagnosis_summary.html')
                file_handler.output_lines_in_html(lines, output_path)
        else:
            logger.error('DataDiagnosis: output failed - unsupported output format')
        logger.info('DataDiagnosis: Output results to {}'.format(output_path))
    except Exception as e:
        logger.error('DataDiagnosis: run failed - {}'.format(str(e)))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for ONNXRuntime inference performance.

Commands to run:
    python3 examples/benchmarks/ort_inference_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
        'ort-inference', platform=Platform.CUDA, parameters='--pytorch_models resnet50 resnet101 --precision float16'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
def _benchmark(self): """Implementation for benchmarking.""" M = self._args.m K = self._args.k N = self._args.n P = self._args.p Q = self._args.q kernels = self._args.kernel if self.__local_rank == 0: logger.info('Computation Communication Overlap - using {} GPUs,\ matrix shape for computation: M={} K={} N={},\ message tensor shape of NCCL = [{},{}],\ ratio between computation kernel and NCCL kernel={}'.format( self.__world_size, M, K, N, P, Q, self._args.ratio)) MatA = list() MatB = list() # Matrix A for _ in range(self._args.ratio): MatA.append(torch.randn(M, K).cuda()) # Matrix B MatB = torch.randn(K, N).cuda() # message for NCCL to transport shape = [P, Q] message = torch.randn(*shape).cuda() for kernel in kernels: # warm up for i in range(self._args.num_warmup): if not self.__kernel_nccl_pipeline( kernel, MatA, MatB, self._args.ratio, message, times=100): return False torch.cuda.synchronize() # run and collect results start = time.perf_counter() for i in range(self._args.num_steps): self.__kernel_nccl_pipeline(kernel, MatA, MatB, self._args.ratio, message, times=100) compute_end = time.perf_counter() torch.cuda.synchronize() compute_metric = '{}_time'.format(kernel) compute_elapse_times = [ (compute_end - start) * 1000 / self._args.num_steps ] if not self._process_numeric_result(compute_metric, compute_elapse_times): return False logger.info( 'Computation_communication_overlap - round: {0}, name: {1}, gpu: {2} kernel: {3}, cost: {4} ms' .format(self._curr_run_index, self._name, self.__local_rank, kernel, (compute_end - start) * 1000 / self._args.num_steps)) return True