Code example #1
def test_register_benchmark():
    """Test interface BenchmarkRegistry.register_benchmark()."""
    # Register the benchmark for all platforms when using the default platform.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)
    for platform in Platform:
        context = BenchmarkRegistry.create_benchmark_context('accumulation',
                                                             platform=platform)
        assert (BenchmarkRegistry.is_benchmark_registered(context))

    # Register the benchmark only for the CUDA platform when platform=Platform.CUDA is specified.
    BenchmarkRegistry.register_benchmark('accumulation-cuda',
                                         AccumulationBenchmark,
                                         platform=Platform.CUDA)
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation-cuda', platform=Platform.CUDA)
    assert (BenchmarkRegistry.is_benchmark_registered(context))
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation-cuda', platform=Platform.ROCM)
    assert (BenchmarkRegistry.is_benchmark_registered(context) is False)
Code example #2
def test_get_benchmark_configurable_settings():
    """Test BenchmarkRegistry interface.

    BenchmarkRegistry.get_benchmark_configurable_settings().
    """
    # Register benchmarks for testing.
    BenchmarkRegistry.register_benchmark('accumulation', AccumulationBenchmark)

    context = BenchmarkRegistry.create_benchmark_context('accumulation',
                                                         platform=Platform.CPU)
    settings = BenchmarkRegistry.get_benchmark_configurable_settings(context)

    expected = """optional arguments:
  --duration int     The elapsed time of benchmark in seconds.
  --log_raw_data     Log raw data into file instead of saving it into result
                     object.
  --lower_bound int  The lower bound for accumulation.
  --run_count int    The run count of benchmark.
  --upper_bound int  The upper bound for accumulation."""
    assert (settings == expected)
Code example #3
def create_benchmark(params='--num_steps 8'):
    """Register and create benchmark."""
    # Register the FakeModelBenchmark benchmark.
    BenchmarkRegistry.register_benchmark(
        'pytorch-fake-model',
        FakeModelBenchmark,
        parameters='--hidden_size 2',
        platform=Platform.CUDA,
    )
    context = BenchmarkRegistry.create_benchmark_context(
        'fake-model',
        platform=Platform.CUDA,
        parameters=params,
        framework=Framework.PYTORCH)
    name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(context)
    assert (name)
    (benchmark_class, predefine_params
     ) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
         name, context.platform)
    assert (benchmark_class)
    return benchmark_class(name, predefine_params + ' ' + context.parameters)
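
A sketch of how the helper above might be exercised in a test; it is illustrative only and assumes FakeModelBenchmark parses --num_steps and that its _preprocess() succeeds, mirroring the model benchmarks in the other examples here.

def test_create_benchmark():
    """Exercise the create_benchmark() helper defined above (illustrative sketch)."""
    benchmark = create_benchmark(params='--num_steps 8')
    assert (benchmark)
    # Assumes the fake model benchmark parses its arguments successfully.
    assert (benchmark._preprocess())
    assert (benchmark.name == 'pytorch-fake-model')
    assert (benchmark._args.num_steps == 8)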
Code example #4
def test_get_benchmark_name():
    """Test interface BenchmarkRegistry.get_benchmark_name()."""
    # Register benchmarks for testing.
    benchmark_names = [
        'accumulation', 'pytorch-accumulation', 'tf1-accumulation',
        'onnxruntime-accumulation'
    ]
    for name in benchmark_names:
        BenchmarkRegistry.register_benchmark(name, AccumulationBenchmark)

    # Test benchmark name for different Frameworks.
    benchmark_frameworks = [
        Framework.NONE, Framework.PYTORCH, Framework.TENSORFLOW1,
        Framework.ONNXRUNTIME
    ]
    for i in range(len(benchmark_names)):
        context = BenchmarkRegistry.create_benchmark_context(
            'accumulation',
            platform=Platform.CPU,
            framework=benchmark_frameworks[i])
        name = BenchmarkRegistry._BenchmarkRegistry__get_benchmark_name(
            context)
        assert (name == benchmark_names[i])
Code example #5
def test_pytorch_empty_cache():
    """Test PytorchBase class."""
    # Register mnist benchmark.
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Test emptying the cache by manually calling torch.cuda.empty_cache().
    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
    assert (benchmark._benchmark())
    del benchmark
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0)
    torch.cuda.empty_cache()
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)

    # Test automatic cache emptying.
    context = BenchmarkRegistry.create_benchmark_context(
        'pytorch-mnist', parameters='--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0)
Code example #6
        """
        precision = self._precision_need_to_run[cmd_idx]
        self._result.add_raw_data('raw_output_' + precision, raw_output,
                                  self._args.log_raw_data)

        valid = True
        flops = list()
        content = raw_output.splitlines()
        try:
            for line in content:
                for item in self.__parse_logline:
                    if item in line:
                        flops.append(float(line.split(',')[-1]))
        except BaseException:
            valid = False
        finally:
            if valid is False or len(flops) == 0:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'
                    .format(self._curr_run_index, self._name, raw_output))
                return False

        self._result.add_result(self._metric_map[precision], max(flops))

        return True


BenchmarkRegistry.register_benchmark('gemm-flops',
                                     CudaGemmFlopsBenchmark,
                                     platform=Platform.CUDA)
Code example #7
def test_pytorch_base():
    """Test PytorchBase class."""
    # Register mnist benchmark.
    BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST)

    # Launch benchmark with --no_gpu for testing.
    parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train inference --no_gpu --force_fp32'
    benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters)
    assert (benchmark)
    assert (benchmark._preprocess())
    assert (benchmark._benchmark())
    assert (benchmark.name == 'pytorch-mnist')
    assert (benchmark.return_code == ReturnCode.SUCCESS)

    # Test results.
    for metric in [
        'fp32_train_step_time', 'fp32_inference_step_time', 'fp32_train_throughput', 'fp32_inference_throughput'
    ]:
        assert (len(benchmark.raw_data[metric]) == 1)
        assert (len(benchmark.raw_data[metric][0]) == 64)
        assert (len(benchmark.result[metric]) == 1)
        assert (isinstance(benchmark.result[metric][0], numbers.Number))

    # Test _cal_params_count().
    assert (benchmark._cal_params_count() == 1199882)

    # Test _judge_gpu_availability().
    assert (benchmark._gpu_available is False)

    # Test _set_force_fp32().
    assert (benchmark._args.force_fp32 is True)

    # Test _init_distributed_setting().
    assert (benchmark._args.distributed_impl is None)
    assert (benchmark._args.distributed_backend is None)
    assert (benchmark._init_distributed_setting() is True)
    benchmark._args.distributed_impl = DistributedImpl.DDP
    benchmark._args.distributed_backend = DistributedBackend.NCCL
    assert (benchmark._init_distributed_setting() is False)
    benchmark._args.distributed_impl = DistributedImpl.MIRRORED
    assert (benchmark._init_distributed_setting() is False)

    # Test _init_dataloader().
    benchmark._args.distributed_impl = None
    assert (benchmark._init_dataloader() is True)
    benchmark._args.distributed_impl = DistributedImpl.DDP
    assert (benchmark._init_dataloader() is False)
    benchmark._args.distributed_impl = DistributedImpl.MIRRORED
    assert (benchmark._init_dataloader() is False)

    # Test _create_optimizer().
    assert (isinstance(benchmark._optimizer, transformers.AdamW))
    benchmark._optimizer_type = Optimizer.ADAM
    assert (benchmark._create_optimizer() is True)
    assert (isinstance(benchmark._optimizer, torch.optim.Adam))
    benchmark._optimizer_type = Optimizer.SGD
    assert (benchmark._create_optimizer() is True)
    assert (isinstance(benchmark._optimizer, torch.optim.SGD))
    benchmark._optimizer_type = None
    assert (benchmark._create_optimizer() is False)

    # Test _sync_result().
    step_time = [2.0, 2.0]
    benchmark._args.distributed_impl = DistributedImpl.DDP
    step_time = benchmark._sync_result(step_time)
    assert (not step_time)
    benchmark._args.distributed_impl = None

    # Test _postprocess().
    assert (benchmark._postprocess())
Code example #8
        """Do inference given the ORT inference session.

        Args:
            ort_sess (InferenceSession): inference session for ORT.

        Return:
            elapse_times (List[float]): latency of each iteration, in milliseconds.
        """
        precision = np.float16 if self._args.precision == Precision.FLOAT16 else np.float32
        input_tensor = np.random.randn(self._args.batch_size, 3, 224,
                                       224).astype(dtype=precision)

        for i in range(self._args.num_warmup):
            ort_sess.run(None, {'input': input_tensor})

        elapse_times = list()
        for i in range(self._args.num_steps):
            start = time.time()
            ort_sess.run(None, {'input': input_tensor})
            end = time.time()
            elapse_times.append((end - start) * 1000)

        return elapse_times


BenchmarkRegistry.register_benchmark(
    'ort-inference',
    ORTInferenceBenchmark,
    platform=Platform.CUDA,
)
Code example #9
        self._result.add_raw_data('raw_output', raw_output,
                                  self._args.log_raw_data)

        content = raw_output.splitlines(False)
        try:
            result_header = 'benchmark implementation mode config score'
            found = False
            for line in content:
                if result_header in line:
                    found = True
                elif found:
                    items = line.split(' ')
                    if len(items) == 7:
                        name = '_'.join(items[0:4] + [items[5]])
                        for char in ['-', ' ', '=', '/']:
                            name = name.replace(char, '_')
                        score = float(items[4])
                        self._result.add_result(name.lower(), score)
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, message: {}.'
                .format(self._curr_run_index, self._name, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('fambench',
                                     FAMBenchBenchmark,
                                     platform=Platform.CUDA)
Code example #10
        mem_bw = -1
        value_index = -1
        valid = True
        content = raw_output.splitlines()
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
            for line in content:
                if parse_logline in line and value_index != -1:
                    line = line.split()
                    mem_bw = max(mem_bw, float(line[value_index]))
                elif 'mean' in line:
                    line = line.split()
                    value_index = line.index('mean')
        except BaseException:
            valid = False
        finally:
            if valid is False or mem_bw == -1:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False
        self._result.add_result(metric, mem_bw)
        return True


BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.ROCM)
Code example #11
                        res = line.split('|')
                        res = [result.strip() for result in res]
                        suc = int(res[labels.index('Successed')])
                        fail = int(res[labels.index('Failed')])
                        rate = float(
                            res[labels.index('Success Rate')].strip('%'))
                        minimum = float(
                            res[labels.index('Minimum')].strip('ms'))
                        maximum = float(
                            res[labels.index('Maximum')].strip('ms'))
                        average = float(
                            res[labels.index('Average')].strip('ms'))
            self._result.add_result(host + '_successed_count', suc)
            self._result.add_result(host + '_failed_count', fail)
            self._result.add_result(host + '_success_rate', rate)
            self._result.add_result(host + '_time_min', minimum)
            self._result.add_result(host + '_time_max', maximum)
            self._result.add_result(host + '_time_avg', average)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, address: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, host, raw_output,
                        str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('tcp-connectivity',
                                     TCPConnectivityBenchmark)
Code example #12
            model = self._args.pytorch_models[cmd_idx]
            for line in raw_output.strip().splitlines():
                line = line.strip()
                if '[I] mean:' in line or '[I] percentile:' in line:
                    tag = 'mean' if '[I] mean:' in line else '99'
                    lats = re.findall(r'(\d+\.\d+) ms', line)
                    if len(lats) == 1:
                        self._result.add_result(f'{model}_gpu_time_{tag}',
                                                float(lats[0]))
                    elif len(lats) == 2:
                        self._result.add_result(f'{model}_host_time_{tag}',
                                                float(lats[0]))
                        self._result.add_result(
                            f'{model}_end_to_end_time_{tag}', float(lats[1]))
                    success = True
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, raw_output, str(e)))
            return False
        return success


BenchmarkRegistry.register_benchmark(
    'tensorrt-inference',
    TensorRTInferenceBenchmark,
    platform=Platform.CUDA,
)
Code example #13
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, in milliseconds.
                        duration.append((end - start) * 1000)
                    if self._is_finished(curr_step, end):
                        return duration


# Register CNN benchmarks.
# Reference: https://pytorch.org/vision/0.8/models.html
#            https://github.com/pytorch/vision/tree/v0.8.0/torchvision/models
MODELS = [
    'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161',
    'googlenet', 'inception_v3', 'mnasnet0_5', 'mnasnet0_75', 'mnasnet1_0',
    'mnasnet1_3', 'mobilenet_v2', 'resnet18', 'resnet34', 'resnet50',
    'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
    'wide_resnet50_2', 'wide_resnet101_2', 'shufflenet_v2_x0_5',
    'shufflenet_v2_x1_0', 'shufflenet_v2_x1_5', 'shufflenet_v2_x2_0',
    'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn',
    'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19'
]

for model in MODELS:
    if hasattr(models, model):
        BenchmarkRegistry.register_benchmark('pytorch-' + model,
                                             PytorchCNN,
                                             parameters='--model_type ' +
                                             model)
    else:
        logger.warning(
            'model missing in torchvision.models - model: {}'.format(model))
Code example #14
        metric_set = set()
        for line in content:
            try:
                values = list(filter(None, line.split()))
                if len(values) != 5:
                    continue
                # Extract value from the line
                size = int(values[0])
                avg_bw = float(values[-2]) / 8.0
                metric = f'{self.__support_ib_commands[self._args.commands[cmd_idx]]}_{size}:{self._args.ib_index}'
                # Skip duplicate metrics in the client output
                if metric not in metric_set:
                    metric_set.add(metric)
                    self._result.add_result(metric, avg_bw)
                    valid = True
            except BaseException:
                pass
        if valid is False:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('ib-loopback', IBLoopbackBenchmark)
Code example #15
def test_launch_benchmark():
    """Test interface BenchmarkRegistry.launch_benchmark()."""
    # Register benchmarks for testing.
    BenchmarkRegistry.register_benchmark('accumulation',
                                         AccumulationBenchmark,
                                         parameters='--upper_bound 5',
                                         platform=Platform.CPU)

    # Launch benchmark.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation', platform=Platform.CPU, parameters='--lower_bound 1')

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.name == 'accumulation')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.raw_data == {'accumulation_result': ['1,3,6,10']})
    assert (benchmark.result == {
        'return_code': [0],
        'accumulation_result': [10]
    })

    # Replace the timestamps with null.
    result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null',
                    benchmark.serialized_result)
    expected = (
        '{"name": "accumulation", "type": "micro", "run_count": 1, '
        '"return_code": 0, "start_time": null, "end_time": null, '
        '"raw_data": {"accumulation_result": ["1,3,6,10"]}, '
        '"result": {"return_code": [0], "accumulation_result": [10]}, '
        '"reduce_op": {"return_code": null, "accumulation_result": null}}')
    assert (result == expected)

    # Launch benchmark with overridden parameters.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation',
        platform=Platform.CPU,
        parameters='--lower_bound 1 --upper_bound 4')
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.name == 'accumulation')
    assert (benchmark.type == BenchmarkType.MICRO)
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.raw_data == {'accumulation_result': ['1,3,6']})
    assert (benchmark.result == {
        'return_code': [0],
        'accumulation_result': [6]
    })

    # Replace the timestamps with null.
    result = re.sub(r'\"\d+-\d+-\d+ \d+:\d+:\d+\"', 'null',
                    benchmark.serialized_result)
    expected = (
        '{"name": "accumulation", "type": "micro", "run_count": 1, '
        '"return_code": 0, "start_time": null, "end_time": null, '
        '"raw_data": {"accumulation_result": ["1,3,6"]}, '
        '"result": {"return_code": [0], "accumulation_result": [6]}, '
        '"reduce_op": {"return_code": null, "accumulation_result": null}}')
    assert (result == expected)

    # Failed to launch benchmark due to 'benchmark not found'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation-fail',
        Platform.CPU,
        parameters='--lower_bound 1 --upper_bound 4',
        framework=Framework.PYTORCH)
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark is None)

    # Failed to launch benchmark due to 'unknown arguments'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation',
        platform=Platform.CPU,
        parameters='--lower_bound 1 --test 4')
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)

    # Failed to launch benchmark due to 'invalid arguments'.
    context = BenchmarkRegistry.create_benchmark_context(
        'accumulation',
        platform=Platform.CPU,
        parameters='--lower_bound 1 --upper_bound x')
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    assert (benchmark)
    assert (benchmark.return_code == ReturnCode.INVALID_ARGUMENT)
Code example #16
                                and self._name == 'gpcnet-network-load-test':
                            name_prefix = items[1].replace(' ', '')
                            for i in range(2, len(items) - 1):
                                if labels[i] != 'Units':
                                    self._result.add_result(
                                        self.__metrics_x[name_prefix] + '_' + labels[i].lower(),
                                        float(items[i].strip('X'))
                                    )
            elif 'ERROR: this application must be run on at least' in raw_output:
                return True
            else:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('gpcnet-network-test', GPCNetBenchmark)
BenchmarkRegistry.register_benchmark('gpcnet-network-load-test', GPCNetBenchmark)
Code example #17
              such as float32, float16.

        Return:
            The latency list of every inference operation.
        """
        duration = []
        curr_step = 0
        with torch.no_grad():
            self._model.eval()
            while True:
                for idx, sample in enumerate(self._dataloader):
                    sample = sample.to(dtype=getattr(torch, precision.value))
                    start = self._timer()
                    if self._gpu_available:
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, in milliseconds.
                        duration.append((end - start) * 1000)
                    if self._is_finished(curr_step, end):
                        return duration


# Register LSTM benchmark.
BenchmarkRegistry.register_benchmark(
    'pytorch-lstm',
    PytorchLSTM,
    parameters='--input_size=256 --hidden_size=1024 --num_layers=8')
Code example #18
                    raw_data = line[line.index('[raw_data]: ') +
                                    len('[raw_data]: '):]
                    raw_data = raw_data.split(',')
                    raw_data.pop()
                    raw_data = [float(item) for item in raw_data]
                    self._result.add_result(metric.lower() + '_time',
                                            statistics.mean(raw_data))
                    self._result.add_raw_data(metric.lower() + '_time',
                                              raw_data,
                                              self._args.log_raw_data)
                if 'Error' in line:
                    error = True
        except BaseException as e:
            logger.error(
                'Cannot extract results from cublas functions - round: {}, index of cmd: {}, '
                'benchmark: {}, raw data: {}, message: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output,
                    str(e)))
            return False
        if error:
            logger.error(
                'Error in running cublas test - round: {}, index of cmd: {}, benchmark: {}, raw data: {}'
                .format(self._curr_run_index, cmd_idx, self._name, raw_output))
            return False
        return True


BenchmarkRegistry.register_benchmark('cublas-function',
                                     CublasBenchmark,
                                     platform=Platform.CUDA)
Code example #19
                    start = self._timer()
                    if self._gpu_available:
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, in milliseconds.
                        duration.append((end - start) * 1000)
                    if self._is_finished(curr_step, end):
                        return duration


# Register BERT Large benchmark.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-bert-large',
    PytorchBERT,
    parameters=
    '--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16 --intermediate_size=4096'
)

# Register BERT Base benchmark.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-bert-base',
    PytorchBERT,
    parameters=
    '--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12 --intermediate_size=3072'
)
Code example #20
        out_table = dict()
        for line in raw_output.splitlines():
            if line.strip() == '':
                continue
            # only lines starting with a digit are of interest
            if line.lstrip()[0].isdigit():
                vals = line.split()
                if len(vals) < 2:
                    continue
                numa_index = 'numa_%s' % vals[0]
                out_table[numa_index] = vals[1:]
        return out_table

    def _parse_max_bw(self, raw_output):
        out_table = dict()
        # the very last line is empty and only the last 5 lines of the output are of interest
        for line in raw_output.splitlines()[-6:]:
            if line.strip() == '':
                continue
            vals = line.split()
            if len(vals) < 2:
                continue
            key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
            # making a list to be consistent with the _parse_bw_latency output
            out_table[key] = [vals[-1]]
        return out_table


BenchmarkRegistry.register_benchmark('cpu-memory-bw-latency',
                                     CpuMemBwLatencyBenchmark)
Code example #21
                        sample = sample.cuda()
                    self._model(sample)
                    end = self._timer()
                    curr_step += 1
                    if curr_step > self._args.num_warmup:
                        # Save the step time of every training/inference step, in milliseconds.
                        duration.append((end - start) * 1000)
                    if self._is_finished(curr_step, end):
                        return duration


# Register GPT2 benchmark with 117M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-small',
    PytorchGPT2,
    parameters=
    '--hidden_size=768 --num_hidden_layers=12 --num_attention_heads=12')

# Register GPT2 benchmark with 345M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-medium',
    PytorchGPT2,
    parameters=
    '--hidden_size=1024 --num_hidden_layers=24 --num_attention_heads=16')

# Register GPT2 benchmark with 774M parameters.
# Reference: https://huggingface.co/transformers/pretrained_models.html
BenchmarkRegistry.register_benchmark(
    'pytorch-gpt2-large',
    PytorchGPT2,
    parameters=
    '--hidden_size=1280 --num_hidden_layers=36 --num_attention_heads=20')
Code example #22
                valid = False
            else:
                content = content[result_index:]
                for line_index, line in enumerate(content):
                    line_result = list(filter(None, line.strip().split(',')))
                    for pair_index, pair_result in enumerate(line_result):
                        rank_results = list(
                            filter(None,
                                   pair_result.strip().split(' ')))
                        for rank_index, rank_result in enumerate(rank_results):
                            metric = f'{command}_{line_index}_{pair_index}:{self.__config[config_index]}:{rank_index}'
                            value = float(rank_result)
                            # Check that the value is valid before the divide-by-8 unit conversion
                            if 'bw' in command and value >= 0.0:
                                value = value / 8.0
                            self._result.add_result(metric, value)
                            valid = True
                        config_index += 1
        except Exception:
            valid = False
        if valid is False or config_index != len(self.__config):
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'
                .format(self._curr_run_index, self._name, raw_output))
            return False

        return True


BenchmarkRegistry.register_benchmark('ib-traffic', IBBenchmark)
Code example #23
                    raw_data = line[line.index('[raw_data]: ') +
                                    len('[raw_data]: '):]
                    raw_data = raw_data.split(',')
                    raw_data.pop()
                    raw_data = [float(item) for item in raw_data]
                    self._result.add_result(metric.lower() + '_time',
                                            statistics.mean(raw_data) * 1000)
                    self._result.add_raw_data(metric.lower() + '_time',
                                              raw_data,
                                              self._args.log_raw_data)
                if 'Error' in line:
                    error = True
        except BaseException as e:
            logger.error(
                'Cannot extract results from cudnn functions - round: {}, index of cmd: {}, '
                'benchmark: {}, raw data: {}, message: {}'.format(
                    self._curr_run_index, cmd_idx, self._name, raw_output,
                    str(e)))
            return False
        if error:
            logger.error(
                'Error in running cudnn test - round: {}, index of cmd: {}, benchmark: {}, raw data: {}'
                .format(self._curr_run_index, cmd_idx, self._name, raw_output))
            return False
        return True


BenchmarkRegistry.register_benchmark('cudnn-function',
                                     CudnnBenchmark,
                                     platform=Platform.CUDA)
Code example #24
    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
                torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error(
                'Post process failed - benchmark: {}, mode: {}, message: {}.'.
                format(self._name, self._args.mode, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('pytorch-sharding-matmul',
                                     ShardingMatmul,
                                     parameters='--mode allreduce allgather')
BenchmarkRegistry.register_benchmark('pytorch-matmul',
                                     ShardingMatmul,
                                     parameters='--mode nosharding')
Code example #25
                        kernel,
                        (compute_end - start) * 1000 / self._args.num_steps))
        return True

    def _postprocess(self):
        """Postprocess/cleanup operations after the benchmarking.

        Return:
            True if _postprocess() succeeds.
        """
        if not super()._postprocess():
            return False

        try:
            torch.distributed.destroy_process_group()
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.DISTRIBUTED_SETTING_DESTROY_FAILURE)
            logger.error(
                'Post process failed - benchmark: {}, message: {}.'.format(
                    self._name, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark(
    'pytorch-computation-communication-overlap',
    ComputationCommunicationOverlap,
    parameters='--kernel mul matmul')
Code example #26
                iops = fio_output['jobs'][0][io_type]['iops']
                self._result.add_result('%s_iops' % io_type_prefix,
                                        float(iops))

                for lat_unit in lat_units:
                    if lat_unit in fio_output['jobs'][0][io_type] and \
                       'percentile' in fio_output['jobs'][0][io_type][lat_unit]:
                        lat_unit_prefix = '%s_%s' % (io_type_prefix, lat_unit)
                        for lat_percentile in [
                                '95.000000', '99.000000', '99.900000'
                        ]:
                            lat = fio_output['jobs'][0][io_type][lat_unit][
                                'percentile'][lat_percentile]
                            self._result.add_result(
                                '%s_%s' %
                                (lat_unit_prefix, lat_percentile[:-5]),
                                float(lat))
                        break
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, raw_output, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('disk-benchmark', DiskBenchmark)
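
For context, the parser above indexes into fio's JSON output. The fragment below is made up and heavily trimmed (real fio output contains many more fields, and the latency key depends on the unit reported); it only shows the shape the parsing logic relies on.

fio_output = {
    'jobs': [{
        'read': {
            'iops': 123456.0,
            'clat_ns': {
                'percentile': {'95.000000': 250000, '99.000000': 310000, '99.900000': 400000}
            }
        }
    }]
}
# Values the parser above would read for the 'read' io_type:
print(fio_output['jobs'][0]['read']['iops'])
print(fio_output['jobs'][0]['read']['clat_ns']['percentile']['99.000000'])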
Code example #27
          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output,
                                  self._args.log_raw_data)

        try:
            output_lines = [x.strip() for x in raw_output.strip().splitlines()]
            for output_line in output_lines:
                tag, bw_str = output_line.split()
                self._result.add_result(tag + '_bw', float(bw_str))
        except BaseException as e:
            self._result.set_return_code(
                ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, raw_output, str(e)))
            return False

        return True


BenchmarkRegistry.register_benchmark('gpu-copy-bw', GpuCopyBwBenchmark)
Code example #28
        mem_bw = -1
        valid = True
        content = raw_output.splitlines()
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
            for line in content:
                if parse_logline in line:
                    line = line.split(',')[1]
                    value = re.search(r'(\d+\.\d+)', line)
                    if value:
                        mem_bw = max(mem_bw, float(value.group(0)))

        except BaseException:
            valid = False
        finally:
            if valid is False or mem_bw == -1:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False

        self._result.add_result(metric, mem_bw)

        return True


BenchmarkRegistry.register_benchmark('mem-bw', CudaMemBwBenchmark, platform=Platform.CUDA)
Code example #29
        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output,
                                  self._args.log_raw_data)

        pattern = r'\d+\.\d+'
        result = re.findall(pattern, raw_output)
        if len(result) != 2:
            logger.error(
                'Cannot extract kernel launch overhead in event and wall mode - round: {}, benchmark: {}, raw data: {}.'
                .format(self._curr_run_index, self._name, raw_output))
            return False

        try:
            result = [float(item) for item in result]
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, result: {}, message: {}.'
                .format(self._curr_run_index, self._name, result, str(e)))
            return False

        self._result.add_result('event_time', result[0])
        self._result.add_result('wall_time', result[1])

        return True


BenchmarkRegistry.register_benchmark('kernel-launch', KernelLaunch)
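
To illustrate the extraction step above: re.findall() pulls the two floating-point numbers out of the raw output. The sample string below is made up, since the exact kernel-launch output format is not shown here.

import re

sample_output = 'Kernel launch overhead: event 0.00583 ms, wall 0.00621 ms'  # hypothetical output
times = [float(x) for x in re.findall(r'\d+\.\d+', sample_output)]
assert len(times) == 2  # interpreted as event_time and wall_time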
Code example #30
                        algbw_out = float(line[algbw_index])
                        self._result.add_result(
                            self._args.operation + '_' + str(size) + '_busbw',
                            busbw_out)
                        self._result.add_result(
                            self._args.operation + '_' + str(size) + '_algbw',
                            algbw_out)
                        self._result.add_result(
                            self._args.operation + '_' + str(size) + '_time',
                            time_out)
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'
                .format(self._curr_run_index, self._name, raw_output, str(e)))
            return False
        if out_of_place_index == -1 or out_of_bound_index == -1 or busbw_out == -1:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'
                .format(self._curr_run_index, self._name, raw_output))
            return False

        return True


BenchmarkRegistry.register_benchmark('nccl-bw',
                                     CudaNcclBwBenchmark,
                                     platform=Platform.CUDA)
BenchmarkRegistry.register_benchmark('rccl-bw',
                                     CudaNcclBwBenchmark,
                                     platform=Platform.ROCM)