Example #1
    def _benchmark(self):
        """Implementation for benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        for cmd_idx in range(len(self._commands)):
            logger.info(
                'Execute command - round: {}, benchmark: {}, command: {}.'.
                format(self._curr_run_index, self._name,
                       self._commands[cmd_idx]))
            output = run_command(self._commands[cmd_idx])
            if output.returncode != 0:
                self._result.set_return_code(
                    ReturnCode.DOCKERBENCHMARK_EXECUTION_FAILURE)
                logger.error(
                    'DockerBenchmark execution failed - round: {}, benchmark: {}, error message: {}.'
                    .format(self._curr_run_index, self._name, output.stdout))
                return False
            else:
                if not self._process_raw_result(cmd_idx, output.stdout):
                    self._result.set_return_code(
                        ReturnCode.DOCKERBENCHMARK_RESULT_PARSING_FAILURE)
                    return False

        return True
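
A note on the helper used above: run_command() is not shown in this snippet, but the returncode/stdout attributes it exposes line up with what subprocess.run returns when output is captured. A rough, hypothetical stand-in (the echoed command is only a placeholder):

import subprocess

# Capture output as text and do not raise on a non-zero exit code.
output = subprocess.run('echo hello', shell=True, capture_output=True, text=True)
print(output.returncode)       # 0 on success
print(output.stdout.strip())   # 'hello'
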
Example #2
    def parse_args(self, ignore_invalid=False):
        """Parse the arguments.

        Return:
            ret (bool): whether parsing succeeded or not.
            args (argparse.Namespace): parsed arguments.
            unknown (list): unknown arguments.
        """
        try:
            args, unknown = self._parser.parse_known_args(self._argv)
        except BaseException as e:
            if ignore_invalid:
                logger.info(
                    'Missing or invalid parameters, will ignore the error and skip the args checking.'
                )
                return True, None, []
            else:
                logger.error(
                    'Invalid argument - benchmark: {}, message: {}.'.format(
                        self._name, str(e)))
                return False, None, []

        ret = True
        if len(unknown) > 0:
            logger.error(
                'Unknown arguments - benchmark: {}, unknown arguments: {}'.
                format(self._name, ' '.join(unknown)))
            ret = False

        return ret, args, unknown
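
The method above is a thin wrapper around argparse.ArgumentParser.parse_known_args. A standalone sketch of that call (the --num_warmup flag is only illustrative) shows where the unknown list comes from:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--num_warmup', type=int, default=8)
args, unknown = parser.parse_known_args(['--num_warmup', '16', '--bogus', '1'])
print(args.num_warmup)   # 16
print(unknown)           # ['--bogus', '1'] -> would trigger the "Unknown arguments" branch above
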
Example #3
    def run(self):
        """Method representing the process’s activity.

        Return:
            True if launching the process succeeds.
        """
        if self.__running.value == 0:
            if not self.__preprocess():
                return False

            try:
                logger.info('Start monitoring.')
                self.__running.value = 1
                self.__sample()
                self.__scheduler.run()
            except BaseException as e:
                logger.error(
                    'Failed to launch the monitor process - error message: {}'.
                    format(str(e)))
                self.stop()
                return False
        else:
            logger.error('Monitor is still running')

        return True
Example #4
    def exec(self):
        """Run the SuperBench benchmarks locally."""
        for benchmark_name in self._sb_benchmarks:
            if benchmark_name not in self._sb_enabled:
                continue
            benchmark_config = self._sb_benchmarks[benchmark_name]
            benchmark_results = list()
            self.__create_benchmark_dir(benchmark_name)
            cwd = os.getcwd()
            os.chdir(self.__get_benchmark_dir(benchmark_name))

            monitor = None
            if self.__get_rank_id(
            ) == 0 and self._sb_monitor_config and self._sb_monitor_config.enable:
                if self.__get_platform() == Platform.CUDA:
                    monitor = Monitor(
                        None, int(self._sb_monitor_config.sample_duration
                                  or 10),
                        int(self._sb_monitor_config.sample_interval or 1),
                        self.__get_monitor_path(benchmark_name))
                    monitor.start()
                else:
                    logger.warning(
                        'Monitor is not supported on the ROCm/CPU platform.')

            benchmark_real_name = benchmark_name.split(':')[0]
            for framework in benchmark_config.frameworks or [
                    Framework.NONE.value
            ]:
                if benchmark_real_name == 'model-benchmarks' or (
                        ':' not in benchmark_name
                        and benchmark_name.endswith('_models')):
                    for model in benchmark_config.models:
                        full_name = f'{benchmark_name}/{framework}-{model}'
                        logger.info('Executor is going to execute %s.',
                                    full_name)
                        context = BenchmarkRegistry.create_benchmark_context(
                            model,
                            platform=self.__get_platform(),
                            framework=Framework(framework.lower()),
                            parameters=self.__get_arguments(
                                benchmark_config.parameters))
                        result = self.__exec_benchmark(full_name, context)
                        benchmark_results.append(result)
                else:
                    full_name = benchmark_name
                    logger.info('Executor is going to execute %s.', full_name)
                    context = BenchmarkRegistry.create_benchmark_context(
                        benchmark_real_name,
                        platform=self.__get_platform(),
                        framework=Framework(framework.lower()),
                        parameters=self.__get_arguments(
                            benchmark_config.parameters))
                    result = self.__exec_benchmark(full_name, context)
                    benchmark_results.append(result)

            if monitor:
                monitor.stop()
            self.__write_benchmark_results(benchmark_name, benchmark_results)
            os.chdir(cwd)
Example #5
def get_sb_config(config_file):
    """Read SuperBench config yaml.

    Read config file, detect Azure SKU and use corresponding config if None is provided.

    Args:
        config_file (str): config file path.

    Returns:
        OmegaConf: Config object, None if file does not exist.
    """
    p = Path(str(config_file))
    if not config_file:
        config_path = (Path(__file__).parent / '../../config').resolve()
        p = config_path / 'default.yaml'
        vm_size = get_vm_size().lower()
        if vm_size:
            logger.info('Detected Azure SKU %s.', vm_size)
            for config in (config_path / 'azure').glob('**/*'):
                if config.name.startswith(vm_size):
                    p = config
                    break
        logger.info('No benchmark config provided, using config file %s.',
                    str(p))
    if not p.is_file():
        return None
    with p.open() as fp:
        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
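
A minimal sketch of the yaml-plus-OmegaConf loading pattern used in the last two lines, with a toy inline config standing in for a file on disk (the keys mirror the superbench.* layout seen in the other examples):

import yaml
from omegaconf import OmegaConf

yaml_text = 'superbench:\n  enable: null\n  benchmarks: {}\n'
config = OmegaConf.create(yaml.load(yaml_text, Loader=yaml.SafeLoader))
print(config.superbench.benchmarks)   # {} (accessed with attribute syntax)
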
Example #6
    def _benchmark(self):
        """Implementation for benchmarking."""
        M = self._args.m
        K = self._args.k
        N = self._args.n
        for mode in self._args.mode:
            if mode == ShardingMode.NOSHARDING:
                elapse_times = self.__matmul_nosharding(M, K, N)
            elif mode == ShardingMode.ALLREDUCE:
                elapse_times = self.__matmul_allreduce(M, K, N)
            elif mode == ShardingMode.ALLGATHER:
                elapse_times = self.__matmul_allgather(M, K, N)
            else:
                logger.error(
                    'Unknown sharding mode - benchmark: {}, mode: {}.'.format(
                        self._name, mode))
                return False

            metric = '{}_time'.format(mode)
            if not self._process_numeric_result(
                    metric, elapse_times, reduce_type=ReduceType.MAX):
                return False

            logger.info(
                'Matmul sharding - round: {0}, name: {1}, shape: ({2}, {3}) * ({3}, {4}), mode: {5}, cost: {6} ms'
                .format(self._curr_run_index, self._name, M, K, N, mode,
                        statistics.mean(elapse_times)))

        return True
Example #7
    def _inference_step(self, precision):
        """Define the inference process.

        Args:
            precision (Precision): precision of model and input data,
              such as float32, float16.

        Return:
            The latency list of every inference operation.
        """
        duration = []
        with torch.no_grad():
            self._model.eval()
            for idx, sample in enumerate(self._dataloader):
                sample = sample.to(dtype=getattr(torch, precision.value))
                start = self._timer()
                if self._gpu_available:
                    sample = sample.cuda()
                self._model(sample)
                if self._gpu_available:
                    torch.cuda.synchronize()
                end = self._timer()
                if idx % 10 == 0:
                    logger.info(
                        'Inference step [{}/{} ({:.0f}%)]'.format(
                            idx, len(self._dataloader), 100. * idx / len(self._dataloader)
                        )
                    )
                if idx >= self._args.num_warmup:
                    duration.append((end - start) * 1000)
        return duration
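
A self-contained condensation of the same timing pattern: warm-up iterations are discarded and torch.cuda.synchronize() runs before the clock is read so the GPU work has actually finished. The model and tensor shapes here are illustrative only:

import statistics
import time

import torch


def time_forward_ms(model, batch, num_warmup=8, num_steps=32):
    """Per-step forward latency in milliseconds, skipping warm-up steps."""
    durations = []
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model, batch = model.cuda(), batch.cuda()
    model.eval()
    with torch.no_grad():
        for idx in range(num_warmup + num_steps):
            start = time.perf_counter()
            model(batch)
            if use_cuda:
                torch.cuda.synchronize()   # wait for queued kernels before stopping the timer
            end = time.perf_counter()
            if idx >= num_warmup:
                durations.append((end - start) * 1000)
    return durations


print(statistics.mean(time_forward_ms(torch.nn.Linear(256, 256), torch.randn(64, 256))))
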
Example #8
    def _train_step(self, precision):
        """Define the training process.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.

        Return:
            The step-time list of every training step.
        """
        duration = []
        for idx, sample in enumerate(self._dataloader):
            sample = sample.to(dtype=getattr(torch, precision.value))
            start = self._timer()
            if self._gpu_available:
                sample = sample.cuda()
            self._optimizer.zero_grad()
            output = self._model(sample)
            loss = self._loss_fn(output, self._target)
            loss.backward()
            self._optimizer.step()
            end = self._timer()
            if idx % 10 == 0:
                logger.info(
                    'Train step [{}/{} ({:.0f}%)]'.format(
                        idx, len(self._dataloader), 100. * idx / len(self._dataloader)
                    )
                )
            if idx >= self._args.num_warmup:
                duration.append((end - start) * 1000)

        return duration
Example #9
    def _benchmark(self):
        """Implementation for benchmarking.

        Return:
            True if the benchmark runs successfully.
        """
        logger.info('TCP validation - round: {0}, name: {1}'.format(
            self._curr_run_index, self._name))

        # Run TCPing on each host in the hostfile in parallel
        try:
            outputs = Parallel(
                n_jobs=min(len(self.__hosts), self._args.parallel))(
                    delayed(run_tcping)(self.__hosts[i], self._args.port,
                                        self._args.count, self._args.timeout)
                    for i in (range(len(self.__hosts))))
        except Exception as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
            logger.error(
                'Microbenchmark execution failed - round: {}, benchmark: {}, error message: {}.'
                .format(self._curr_run_index, self._name, str(e)))
            return False

        # Parse the output and get the results
        for host_index, out in enumerate(outputs):
            if not self._process_raw_result(host_index, out):
                self._result.set_return_code(
                    ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
                return False

        return True
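
The fan-out above relies on joblib, which returns results in the same order as the inputs; that is what lets host_index line up with self.__hosts when parsing. A standalone sketch (probe() is a stand-in for run_tcping):

from joblib import Parallel, delayed


def probe(host, port):
    """Stand-in worker; run_tcping would return the raw tcping output here."""
    return '{}:{} ok'.format(host, port)


hosts = ['10.0.0.1', '10.0.0.2', '10.0.0.3']
outputs = Parallel(n_jobs=min(len(hosts), 2))(delayed(probe)(host, 22) for host in hosts)
print(outputs)   # ['10.0.0.1:22 ok', '10.0.0.2:22 ok', '10.0.0.3:22 ok'], input order preserved
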
Example #10
    def __parse_and_check_args(cls, name, class_def, parameters):
        """Parse and check the predefined parameters.

        If ignore_invalid is True and the 'required' arguments are not set when registering the benchmark,
        the arguments should be provided by the user in the config and the argument checking is skipped.

        Args:
            name (str): internal name of benchmark.
            class_def (Benchmark): class object of benchmark.
            parameters (str): predefined parameters of benchmark.
        """
        benchmark = class_def(name, parameters)
        benchmark.add_parser_arguments()
        ret, args, unknown = benchmark.parse_args(ignore_invalid=True)
        if not ret or len(unknown) >= 1:
            logger.log_and_raise(
                TypeError,
                'Registered benchmark has invalid arguments - benchmark: {}, parameters: {}'.format(name, parameters)
            )
        elif args is not None:
            cls.benchmarks[name]['predefine_param'] = vars(args)
            logger.debug('Benchmark registration - benchmark: {}, predefine_parameters: {}'.format(name, vars(args)))
        else:
            cls.benchmarks[name]['predefine_param'] = dict()
            logger.info(
                'Benchmark registration - benchmark: {}, missing required parameters or invalid parameters, '
                'skip the arguments checking.'.format(name)
            )
Example #11
    def __train(self, precision):
        """Launch the training benchmark.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.

        Return:
            True if step_times list is not empty.
        """
        if not self._create_model(precision):
            self._result.set_return_code(ReturnCode.MODEL_CREATION_FAILURE)
            return False

        if not self._create_optimizer():
            self._result.set_return_code(ReturnCode.OPTIMIZER_CREATION_FAILURE)
            return False

        # The unit of step time should be millisecond.
        step_times = self._train_step(precision)
        step_times = self.__process_model_result(ModelAction.TRAIN, precision, step_times)
        if not step_times:
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
            return False

        logger.info(
            'Average train time - round: {}, model: {}, precision: {}, step time: {:.6f} ms.'.format(
                self._curr_run_index, self._name, precision, statistics.mean(step_times)
            )
        )

        return True
Example #12
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        self._judge_gpu_availability()
        self._set_force_fp32()
        logger.info(
            'Model placement - model: {}, GPU availability: {}, pin memory: {}, force fp32: {}.'.format(
                self._name, self._gpu_available, self._args.pin_memory, self._args.force_fp32
            )
        )

        if not self._init_distributed_setting():
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
            return False

        # Set sample_count aligned with batch_size.
        self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size

        if not self._generate_dataset():
            self._result.set_return_code(ReturnCode.DATASET_GENERATION_FAILURE)
            return False

        if not self._init_dataloader():
            self._result.set_return_code(ReturnCode.DATALOADER_INIT_FAILURE)
            return False

        return True
Example #13
    def __exec_benchmark(self, benchmark_full_name, context):
        """Launch benchmark for context.

        Args:
            benchmark_full_name (str): Benchmark full name.
            context (BenchmarkContext): Benchmark context to launch.

        Return:
            dict: Benchmark result.
        """
        try:
            benchmark = BenchmarkRegistry.launch_benchmark(context)
            if benchmark:
                logger.info('benchmark: %s, return code: %s, result: %s.',
                            benchmark.name, benchmark.return_code,
                            benchmark.result)
                if benchmark.return_code.value == 0:
                    logger.info('Executor succeeded in %s.',
                                benchmark_full_name)
                else:
                    logger.error('Executor failed in %s.', benchmark_full_name)
                result = json.loads(benchmark.serialized_result)
                result['name'] = benchmark_full_name
                return result
            else:
                logger.error('Executor failed in %s, invalid context.',
                             benchmark_full_name)
        except Exception as e:
            logger.error(e)
            logger.error('Executor failed in %s.', benchmark_full_name)
        return None
Example #14
    def _init_distributed_setting(self):
        """Initialize the distributed library and bind the worker to GPU.

        Return:
            True if distributed library is initialized successfully.
        """
        if self._args.distributed_impl:
            logger.info(
                'Distributed training is enabled - model: {}, distributed implementation: {}.'
                .format(self._name, self._args.distributed_impl))
            if self._args.distributed_impl == DistributedImpl.HOROVOD:
                import horovod.torch as hvd
                hvd.init()
                self._world_size = int(hvd.size())
                self._local_rank = int(hvd.local_rank())
                self._global_rank = int(hvd.rank())
            elif self._args.distributed_impl == DistributedImpl.DDP:
                if os.environ.get('WORLD_SIZE') is None or os.environ.get(
                        'LOCAL_RANK') is None:
                    logger.error(
                        'Can not find WORLD_SIZE or LOCAL_RANK in env variables - model: {},'
                        ' distributed implementation: {}.'.format(
                            self._name, self._args.distributed_impl))
                    return False
                # For torch >= 1.9.0a0, torch.distributed.elastic is used by default
                port = int(os.environ['MASTER_PORT']) + 1
                addr = os.environ['MASTER_ADDR']
                self._global_rank = int(os.environ['RANK'])
                self._local_rank = int(os.environ['LOCAL_RANK'])
                self._world_size = int(os.environ['WORLD_SIZE'])
                logger.debug('ip:{},port:{},rank:{},world:{}'.format(
                    addr, port, self._global_rank, self._world_size))
                store = PrefixStore(
                    self._name,
                    TCPStore(addr, port, self._world_size,
                             self._global_rank == 0, timedelta(seconds=300)))
                torch.distributed.init_process_group(
                    backend=self._args.distributed_backend.value,
                    timeout=timedelta(seconds=300),
                    rank=self._global_rank,
                    world_size=self._world_size,
                    store=store)

            else:
                logger.error(
                    'Unsupported distributed implementation - model: {}, distributed implementation: {}.'
                    .format(self._name, self._args.distributed_impl))
                return False

            if self._gpu_available:
                torch.cuda.set_device(self._local_rank)

        return True
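
The DDP branch above only checks that the torchrun/elastic environment variables are present before building its own TCPStore. A minimal single-process sketch of that environment contract, using the gloo backend with env:// initialization so no GPU is required (the defaults are only for running the sketch by hand; torchrun normally sets these variables):

import os

import torch.distributed as dist

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
os.environ.setdefault('RANK', '0')
os.environ.setdefault('WORLD_SIZE', '1')
os.environ.setdefault('LOCAL_RANK', '0')

dist.init_process_group(backend='gloo', init_method='env://')
print(dist.get_rank(), dist.get_world_size())   # 0 1
dist.destroy_process_group()
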
Example #15
    def get_shell_config(self, cmd):
        """Get ansible config for shell module.

        Args:
            cmd (str): Shell command for config.

        Returns:
            dict: Ansible config dict.
        """
        logger.info('Run {} on remote ...'.format(cmd))
        ansible_config = {
            **self._config,
            'module': 'shell',
            'module_args': cmd,
        }
        return ansible_config
Example #16
    def __init__(self, config):
        """Initialize.

        Args:
            config (DictConfig): Ansible config object.
        """
        self._playbook_path = Path(__file__).parent / 'playbooks'
        self._config = {
            'host_pattern': 'localhost',
            'cmdline': '--forks 128',
        }
        self._head_host = None
        if config:
            inventory_file = getattr(config, 'host_file', None)
            inventory_list = getattr(config, 'host_list', None)
            if inventory_list:
                inventory_list = inventory_list.strip(',')
            if inventory_file or inventory_list:
                self._config['host_pattern'] = 'all'
                inventory = InventoryManager(loader=DataLoader(),
                                             sources=inventory_file
                                             or f'{inventory_list},')
                host_list = inventory.get_hosts(pattern='all', order='sorted')
                if len(host_list) > 0:
                    self._config['cmdline'] = '--forks {}'.format(
                        len(host_list))
                    self._head_host = host_list[0].get_name()
                if inventory_list in ['localhost', '127.0.0.1']:
                    self._config['cmdline'] += ' --connection local'
                self._config['cmdline'] += ' --inventory {}'.format(
                    inventory_file or f'{inventory_list},')
            username = getattr(config, 'host_username', None)
            if username:
                self._config['cmdline'] += ' --user {}'.format(username)
            password = getattr(config, 'host_password', None)
            if password:
                self._config['passwords'] = {
                    'password': password,
                    'passphrase': password,
                }
            key_file = getattr(config, 'private_key', None)
            if key_file:
                self._config['cmdline'] += ' --private-key {}'.format(key_file)
            elif password:
                self._config['cmdline'] += ' --ask-pass --ask-become-pass'
        logger.info(self._config)
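
The trailing comma in f'{inventory_list},' is what makes Ansible treat the string as an inline host list instead of an inventory file path. A standalone sketch of the InventoryManager call used above (the host addresses are placeholders, and ansible must be installed):

from ansible.inventory.manager import InventoryManager
from ansible.parsing.dataloader import DataLoader

inventory = InventoryManager(loader=DataLoader(), sources='10.0.0.1,10.0.0.2,')
print([h.get_name() for h in inventory.get_hosts(pattern='all', order='sorted')])
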
Example #17
    def get_playbook_config(self, playbook, extravars=None):
        """Get ansible config for playbook.

        Args:
            playbook (str): Playbook file name.
            extravars (dict): Extra variables in playbook. Defaults to None.

        Returns:
            dict: Ansible config dict.
        """
        logger.info('Run playbook {} ...'.format(playbook))
        ansible_config = {
            **self._config,
            'extravars': extravars,
            'playbook': str(self._playbook_path / playbook),
        }
        return ansible_config
Example #18
    def deploy(self):  # pragma: no cover
        """Deploy SuperBench environment."""
        logger.info('Preparing SuperBench environment.')
        extravars = {
            'ssh_port': random.randint(1 << 14, (1 << 15) - 1),
            'output_dir': str(self._output_path),
            'docker_image': self._docker_config.image,
        }
        if bool(self._docker_config.username) and bool(
                self._docker_config.password):
            extravars.update({
                'docker_registry': self._docker_config.registry,
                'docker_username': self._docker_config.username,
                'docker_password': self._docker_config.password,
            })
        self._ansible_client.run(
            self._ansible_client.get_playbook_config('deploy.yaml',
                                                     extravars=extravars))
Example #19
    def check_env(self):  # pragma: no cover
        """Check SuperBench environment."""
        logger.info('Checking SuperBench environment.')
        OmegaConf.save(config=self._sb_config,
                       f=str(self._output_path / 'sb.config.yaml'))
        self._ansible_client.run(
            self._ansible_client.get_playbook_config(
                'check_env.yaml',
                extravars={
                    'no_docker':
                    bool(self._docker_config.skip),
                    'output_dir':
                    str(self._output_path),
                    'env':
                    '\n'.join(
                        f'{k}={v}'
                        for k, v in self._sb_config.superbench.env.items()),
                }))
Example #20
    def run(self,
            raw_data_file,
            rule_file,
            output_dir,
            output_format,
            round=2):
        """Run the main process of result summary.

        Args:
            raw_data_file (str): the path of the raw data jsonl file.
            rule_file (str): the path of the rules yaml file.
            output_dir (str): the directory of the output file.
            output_format (str): the format of the output, 'excel', 'md', or 'html'.
            round (int): the number of decimal digits.
        """
        try:
            rules = self._preprocess(raw_data_file, rule_file)
            # parse rules for result summary
            if not self._parse_rules(rules):
                return
            # generate result summary for each category
            summary = self._generate_summary(round)
            # output result summary to file
            output_path = ''
            if output_format == 'excel':
                output_path = str(Path(output_dir) / 'results-summary.xlsx')
                summary_df = self._merge_summary(summary)
                self.output_summary_in_excel(self._raw_data_df, summary_df,
                                             output_path)
            elif output_format == 'md':
                output_path = str(Path(output_dir) / 'results-summary.md')
                lines = self.generate_md_lines(summary)
                file_handler.output_lines_in_md(lines, output_path)
            elif output_format == 'html':
                output_path = str(Path(output_dir) / 'results-summary.html')
                lines = self.generate_md_lines(summary)
                file_handler.output_lines_in_html(lines, output_path)
            else:
                logger.error(
                    'ResultSummary: output failed - unsupported output format')
            logger.info(
                'ResultSummary: Output results to {}'.format(output_path))
        except Exception as e:
            logger.error('ResultSummary: run failed - {}'.format(str(e)))
Example #21
    def run(self, ansible_config, sudo=False):  # pragma: no cover
        """Run Ansible runner.

        Args:
            ansible_config (dict): Ansible config dict.
            sudo (bool): Run as sudo or not. Defaults to False.

        Returns:
            int: Ansible return code.
        """
        if sudo:
            logger.info('Run as sudo ...')
            ansible_config['cmdline'] += ' --become'
        with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
            r = ansible_runner.run(private_data_dir=tmpdir, **ansible_config)
            logger.debug(r.stats)
        if r.rc == 0:
            logger.info('Run succeeded, return code {}.'.format(r.rc))
        else:
            logger.warning('Run failed, return code {}.'.format(r.rc))
        return r.rc
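
Putting the shell config from get_shell_config() together with this runner, a minimal localhost-only sketch (assumes ansible and ansible-runner are installed; the echoed command is a placeholder):

import tempfile

import ansible_runner

config = {
    'host_pattern': 'localhost',
    'cmdline': '--forks 1 --connection local',
    'module': 'shell',
    'module_args': 'echo hello',
}
with tempfile.TemporaryDirectory(prefix='ansible') as tmpdir:
    r = ansible_runner.run(private_data_dir=tmpdir, **config)
print(r.rc, r.status)   # 0 successful
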
Example #22
    def __init__(self, sb_config, docker_config, ansible_config,
                 sb_output_dir):
        """Initialize.

        Args:
            sb_config (DictConfig): SuperBench config object.
            docker_config (DictConfig): Docker config object.
            ansible_config (DictConfig): Ansible config object.
            sb_output_dir (str): SuperBench output directory.
        """
        self._sb_config = sb_config
        self._docker_config = docker_config
        self._ansible_config = ansible_config
        self._sb_output_dir = sb_output_dir
        self._output_path = Path(sb_output_dir).expanduser().resolve()
        self._ansible_client = AnsibleClient(ansible_config)

        self.__set_logger('sb-run.log')
        logger.info(
            'Runner uses config: %s.',
            pformat(OmegaConf.to_container(self._sb_config, resolve=True)))
        logger.info('Runner writes to: %s.', str(self._output_path))

        self._sb_benchmarks = self._sb_config.superbench.benchmarks
        self.__validate_sb_config()
        self._sb_enabled_benchmarks = self.__get_enabled_benchmarks()
        logger.info('Runner will run: %s', self._sb_enabled_benchmarks)
Example #23
    def _run_proc(self, benchmark_name, mode, vars):
        """Run the process.

        Args:
            benchmark_name (str): Benchmark name.
            mode (DictConfig): Runner mode.
            vars (dict): Process variables.

        Returns:
            int: Process return code.
        """
        mode.update(vars)
        logger.info('Runner is going to run %s in %s mode, proc rank %d.',
                    benchmark_name, mode.name, mode.proc_rank)

        timeout = self._sb_benchmarks[benchmark_name].timeout
        env_list = '--env-file /tmp/sb.env'
        if self._docker_config.skip:
            env_list = 'set -o allexport && source /tmp/sb.env && set +o allexport'
        for k, v in mode.env.items():
            if isinstance(v, str):
                envvar = f'{k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
                env_list += f' -e {envvar}' if not self._docker_config.skip else f' && export {envvar}'

        fcmd = "docker exec {env_list} sb-workspace bash -c '{command}'"
        if self._docker_config.skip:
            fcmd = "bash -c '{env_list} && cd $SB_WORKSPACE && {command}'"
        ansible_runner_config = self._ansible_client.get_shell_config(
            fcmd.format(env_list=env_list,
                        command=self.__get_mode_command(
                            benchmark_name, mode, timeout)))
        if mode.name == 'mpi' and mode.node_num != 1:
            ansible_runner_config = self._ansible_client.update_mpi_config(
                ansible_runner_config)

        ansible_runner_config['timeout'] = timeout

        rc = self._ansible_client.run(ansible_runner_config,
                                      sudo=(not self._docker_config.skip))
        return rc
Example #24
    def _benchmark(self):
        """Implementation for benchmarking."""
        import onnxruntime as ort
        precision_metric = {
            'float16': 'fp16',
            'float32': 'fp32',
            'int8': 'int8'
        }

        for model in self._args.pytorch_models:
            sess_options = ort.SessionOptions()
            sess_options.graph_optimization_level = self.__graph_opt_level[
                self._args.graph_opt_level]
            file_name = '{model}.{precision}.onnx'.format(
                model=model, precision=self._args.precision)
            ort_sess = ort.InferenceSession(
                f'{self.__model_cache_path / file_name}',
                sess_options,
                providers=['CUDAExecutionProvider'])

            elapse_times = self.__inference(ort_sess)

            if self._args.precision.value in precision_metric:
                precision = precision_metric[self._args.precision.value]
            else:
                precision = self._args.precision.value
            metric = '{}_{}_time'.format(precision, model)
            if not self._process_numeric_result(
                    metric, elapse_times, cal_percentile=True):
                return False

            logger.info(
                'ORT Inference - round: {}, name: {}, model: {}, precision: {}, latency: {} ms'
                .format(self._curr_run_index, self._name, model,
                        self._args.precision, statistics.mean(elapse_times)))

        return True
Example #25
    def __inference(self, precision):
        """Launch the inference benchmark.

        Args:
            precision (Precision): precision of model and input data, such as float32, float16.

        Return:
            True if step_times list is not empty.
        """
        self._create_model(precision)
        # The unit of step time should be millisecond.
        step_times = self._inference_step(precision)
        step_times = self.__process_model_result(ModelAction.INFERENCE, precision, step_times)
        if not step_times:
            self._result.set_return_code(ReturnCode.INVALID_BENCHMARK_RESULT)
            return False

        logger.info(
            'Average inference time - round: {}, model: {}, precision: {}, step time: {:.6f} ms.'.format(
                self._curr_run_index, self._name, precision, statistics.mean(step_times)
            )
        )

        return True
Example #26
    def run(
        self, raw_data_file, rule_file, baseline_file, output_dir, output_format='excel', output_all=False, round=2
    ):
        """Run the data diagnosis and output the results.

        Args:
            raw_data_file (str): the path of the raw data jsonl file.
            rule_file (str): the path of the rules yaml file.
            baseline_file (str): the path of the baseline json file.
            output_dir (str): the directory of the output file.
            output_format (str): the format of the output, 'excel', 'json', 'md', or 'html'.
            output_all (bool): output diagnosis results for all nodes.
            round (int): the number of decimal digits.
        """
        try:
            rules = self._preprocess(raw_data_file, rule_file)
            # read baseline
            baseline = file_handler.read_baseline(baseline_file)
            logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
            data_not_accept_df, label_df = self.run_diagnosis_rules(rules, baseline)
            logger.info('DataDiagnosis: Processing finished')
            output_path = ''
            # generate all nodes' info
            if output_all:
                output_path = str(Path(output_dir) / 'diagnosis_summary.json')
                data_not_accept_df = self.output_all_nodes_results(self._raw_data_df, data_not_accept_df)
            # output in the requested format
            if output_format == 'excel':
                output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
                self.output_diagnosis_in_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
            elif output_format == 'json':
                if output_all:
                    output_path = str(Path(output_dir) / 'diagnosis_summary.json')
                    self.output_diagnosis_in_json(data_not_accept_df, output_path)
                else:
                    output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
                    self.output_diagnosis_in_jsonl(data_not_accept_df, output_path)
            elif output_format == 'md' or output_format == 'html':
                lines = self.generate_md_lines(data_not_accept_df, self._sb_rules, round)
                if output_format == 'md':
                    output_path = str(Path(output_dir) / 'diagnosis_summary.md')
                    file_handler.output_lines_in_md(lines, output_path)
                else:
                    output_path = str(Path(output_dir) / 'diagnosis_summary.html')
                    file_handler.output_lines_in_html(lines, output_path)
            else:
                logger.error('DataDiagnosis: output failed - unsupported output format')
            logger.info('DataDiagnosis: Output results to {}'.format(output_path))
        except Exception as e:
            logger.error('DataDiagnosis: run failed - {}'.format(str(e)))
Example #27
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for ONNXRuntime inference performance.

Commands to run:
    python3 examples/benchmarks/ort_inference_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
        'ort-inference',
        platform=Platform.CUDA,
        parameters='--pytorch_models resnet50 resnet101 --precision float16')
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info('benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result))
Example #28
    def _benchmark(self):
        """Implementation for benchmarking."""
        M = self._args.m
        K = self._args.k
        N = self._args.n
        P = self._args.p
        Q = self._args.q
        kernels = self._args.kernel
        if self.__local_rank == 0:
            logger.info('Computation Communication Overlap - using {} GPUs,\
                matrix shape for computation: M={} K={} N={},\
                message tensor shape of NCCL = [{},{}],\
                ratio between computation kernel and NCCL kernel={}'.format(
                self.__world_size, M, K, N, P, Q, self._args.ratio))

        MatA = list()
        MatB = list()
        # Matrix A
        for _ in range(self._args.ratio):
            MatA.append(torch.randn(M, K).cuda())
        # Matrix B
        MatB = torch.randn(K, N).cuda()
        # message for NCCL to transport
        shape = [P, Q]
        message = torch.randn(*shape).cuda()

        for kernel in kernels:
            # warm up
            for i in range(self._args.num_warmup):
                if not self.__kernel_nccl_pipeline(
                        kernel, MatA, MatB, self._args.ratio, message,
                        times=100):
                    return False
            torch.cuda.synchronize()

            # run and collect results
            start = time.perf_counter()
            for i in range(self._args.num_steps):
                self.__kernel_nccl_pipeline(kernel,
                                            MatA,
                                            MatB,
                                            self._args.ratio,
                                            message,
                                            times=100)
            compute_end = time.perf_counter()
            torch.cuda.synchronize()

            compute_metric = '{}_time'.format(kernel)
            compute_elapse_times = [
                (compute_end - start) * 1000 / self._args.num_steps
            ]

            if not self._process_numeric_result(compute_metric,
                                                compute_elapse_times):
                return False

            logger.info(
                'Computation_communication_overlap - round: {0}, name: {1}, gpu: {2} kernel: {3}, cost: {4} ms'
                .format(self._curr_run_index, self._name, self.__local_rank,
                        kernel,
                        (compute_end - start) * 1000 / self._args.num_steps))
        return True