Example 1
    def benchmark(self, events: List[_ProfilerEvent]):
        def closest_multiple(shapes, multiple):
            return [multiple * math.ceil(shape / multiple) for shape in shapes]

        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float16)
            matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float16)
            not_aligned_dim_timer = benchmark.Timer(
                stmt='torch.mm(matrixA, matrixB)',
                globals={
                    "matrixA": matrixA,
                    "matrixB": matrixB
                })
            matrixA = torch.randn(closest_multiple(shape[0], 8),
                                  device="cuda",
                                  dtype=torch.float16)
            matrixB = torch.randn(closest_multiple(shape[1], 8),
                                  device="cuda",
                                  dtype=torch.float16)
            aligned_dim_timer = benchmark.Timer(
                stmt='torch.mm(matrixA, matrixB)',
                globals={
                    "matrixA": matrixA,
                    "matrixB": matrixB
                })
            not_aligned_dim_time = not_aligned_dim_timer.timeit(10).mean
            aligned_dim_time = aligned_dim_timer.timeit(10).mean
            shapes_factor_map[shape] = aligned_dim_time / not_aligned_dim_time
        return shapes_factor_map
Example 2
    def test_timer(self):
        timer = benchmark_utils.Timer(
            stmt="torch.ones(())",
        )
        sample = timer.timeit(5).median
        self.assertIsInstance(sample, float)

        median = timer.blocked_autorange(min_run_time=0.01).median
        self.assertIsInstance(median, float)

        # We set a very high threshold to avoid flakiness in CI.
        # The internal algorithm is tested in `test_adaptive_timer`
        median = timer.adaptive_autorange(threshold=0.5).median

        # Test that multi-line statements work properly.
        median = benchmark_utils.Timer(
            stmt="""
                with torch.no_grad():
                    y = x + 1""",
            setup="""
                x = torch.ones((1,), requires_grad=True)
                for _ in range(5):
                    x = x + 1.0""",
        ).timeit(5).median
        self.assertIsInstance(median, float)
Example 3
def compare_optimize_resnet18_to_torchscript():
    results = []
    for i in range(20):
        test_input = torch.rand(1, 3, 224, 224).half().cuda()
        sub_label = f"[test {i}]"
        results.append(
            benchmark.Timer(
                stmt="meta_module_resnet18(test_input)",
                setup="from __main__ import meta_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by meta",
            ).blocked_autorange()
        )
        results.append(
            benchmark.Timer(
                stmt="jit_module_resnet18(test_input)",
                setup="from __main__ import jit_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by jit",
            ).blocked_autorange()
        )
    compare = benchmark.Compare(results)
    compare.print()
Example 4
    def test_collect_callgrind(self):
        with self.assertRaisesRegex(
                ValueError,
                r"`collect_callgrind` requires that globals be wrapped "
                r"in `CopyIfCallgrind` so that serialization is explicit."):
            benchmark_utils.Timer("pass", globals={
                "x": 1
            }).collect_callgrind(collect_baseline=False)

        with self.assertRaisesRegex(
                # Subprocess raises AttributeError (from pickle),
                # _ValgrindWrapper re-raises as generic OSError.
                OSError,
                "AttributeError: Can't get attribute 'MyModule'"):
            benchmark_utils.Timer("model(1)",
                                  globals={
                                      "model":
                                      benchmark_utils.CopyIfCallgrind(
                                          MyModule())
                                  }).collect_callgrind(collect_baseline=False)

        @torch.jit.script
        def add_one(x):
            return x + 1

        timer = benchmark_utils.Timer(
            "y = add_one(x) + k",
            setup="x = torch.ones((1,))",
            globals={
                "add_one":
                benchmark_utils.CopyIfCallgrind(add_one),
                "k":
                benchmark_utils.CopyIfCallgrind(5),
                "model":
                benchmark_utils.CopyIfCallgrind(MyModule(),
                                                setup=f"""\
                    import sys
                    sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
                    from test_benchmark_utils import MyModule
                    """)
            })

        stats = timer.collect_callgrind(number=1000)
        counts = stats.counts(denoise=False)

        self.assertIsInstance(counts, int)
        self.assertGreater(counts, 0)

        stats = timer.collect_callgrind(number=1000, repeats=10)
        assert isinstance(stats, tuple)

        # Check that the repeats are at least somewhat repeatable.
        counts = collections.Counter([s.counts(denoise=True) for s in stats])
        self.assertGreater(
            max(counts.values()), 1,
            f"Every instruction count total was unique: {counts}")

        from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
        self.assertIsNone(wrapper_singleton()._bindings_module,
                          "JIT'd bindings are only for back testing.")
Example 5
def generate_callgrind_artifacts() -> None:
    """Regenerate `callgrind_artifacts.json`

    Unlike the expect tests, regenerating callgrind counts will produce a
    large diff since build directories and conda/pip directories are included
    in the instruction string. It is also not 100% deterministic (due to jitter
    from Python) and takes over a minute to run. As a result, running this
    function is manual.
    """
    print("Regenerating callgrind artifact.")

    stats_no_data = benchmark_utils.Timer(
        "y = torch.ones(())"
    ).collect_callgrind(number=1000)

    stats_with_data = benchmark_utils.Timer(
        "y = torch.ones((1,))"
    ).collect_callgrind(number=1000)

    user = os.getenv("USER")

    def to_entry(fn_counts):
        return [f"{c} {fn.replace(f'/{user}/', '/test_user/')}" for c, fn in fn_counts]

    artifacts = {
        "baseline_inclusive": to_entry(stats_no_data.baseline_inclusive_stats),
        "baseline_exclusive": to_entry(stats_no_data.baseline_exclusive_stats),
        "ones_no_data_inclusive": to_entry(stats_no_data.stmt_inclusive_stats),
        "ones_no_data_exclusive": to_entry(stats_no_data.stmt_exclusive_stats),
        "ones_with_data_inclusive": to_entry(stats_with_data.stmt_inclusive_stats),
        "ones_with_data_exclusive": to_entry(stats_with_data.stmt_exclusive_stats),
    }

    with open(CALLGRIND_ARTIFACTS, "wt") as f:
        json.dump(artifacts, f, indent=4)
Example 6
    def test_collect_callgrind(self):
        with self.assertRaisesRegex(
            ValueError,
            r"`collect_callgrind` requires that globals be wrapped "
            r"in `CopyIfCallgrind` so that serialization is explicit."
        ):
            benchmark_utils.Timer(
                "pass",
                globals={"x": 1}
            ).collect_callgrind(collect_baseline=False)

        with self.assertRaisesRegex(
            # Subprocess raises AttributeError (from pickle),
            # _ValgrindWrapper re-raises as generic OSError.
            OSError, "AttributeError: Can't get attribute 'MyModule'"
        ):
            benchmark_utils.Timer(
                "model(1)",
                globals={"model": benchmark_utils.CopyIfCallgrind(MyModule())}
            ).collect_callgrind(collect_baseline=False)


        @torch.jit.script
        def add_one(x):
            return x + 1

        timer = benchmark_utils.Timer(
            "y = add_one(x) + k",
            setup="x = torch.ones((1,))",
            globals={
                "add_one": benchmark_utils.CopyIfCallgrind(add_one),
                "k": benchmark_utils.CopyIfCallgrind(5),
                "model": benchmark_utils.CopyIfCallgrind(
                    MyModule(),
                    setup=f"""\
                    import sys
                    sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
                    from test_benchmark_utils import MyModule
                    """
                )
            }
        )

        # Don't collect baseline to speed up unit test by ~30 seconds.
        stats = timer.collect_callgrind(number=1000, collect_baseline=False)
        counts = stats.counts(denoise=False)

        self.assertIsInstance(counts, int)
        self.assertGreater(counts, 0)

        from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
        self.assertIsNone(
            wrapper_singleton()._bindings_module,
            "JIT'd bindings are only for back testing."
        )
Example 7
    def benchmark(self, events: List[_ProfilerEvent]):
        shapes_factor_map = {input_shapes(event)[0]: 0.0 for event in events}
        for shape in shapes_factor_map:
            to_timer = benchmark.Timer(stmt='torch.ones(shape).to("cuda")',
                                       globals={'shape': shape})
            de_timer = benchmark.Timer(stmt='torch.ones(shape, device="cuda")',
                                       globals={'shape': shape})
            to_time = to_timer.timeit(10).mean
            de_time = de_timer.timeit(10).mean
            shapes_factor_map[shape] = de_time / to_time
        return shapes_factor_map
Example 8
def bench2():
    cases = [
        # input size, pad_width_new, pad_width_old, constant_values_new, constant_values_old
        ((10, ), 10, (10, 10), None, None),
        ((100, ), 100, (100, 100), None, None),
        ((1000, ), 1000, (1000, 1000), None, None),
        ((10000, ), 10000, (10000, 10000), None, None),
        ((10000, ), 10, (10, 10), None, None),
        ((10, 10, 10), 10, (10, 10, 10, 10, 10, 10), None, None),
        ((10, 10, 10), ((1000, ), (0, ), (0, )), (1000, 1000, 0, 0, 0, 0),
         None, None),
        ((20, 10, 10), 10, (10, 10), None, None),
        ((30, 10, 10), 10, (10, 10), None, None),
        ((100, 10, 10), 10, (10, 10), None, None),
    ]

    num_iters = 10000

    print('====================================')
    print('compare with torch.nn.functional.pad')
    print()
    print('device dtype case_idx time_new time_old new_speedup')
    print()

    for device, dtype, (case_idx,
                        (input_size, pad_width_new, pad_width_old,
                         constant_values_new, constant_values_old)) in product(
                             devices, dtypes, enumerate(cases)):
        time_old = benchmark.Timer(
            setup=
            f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=
            f'torch.nn.functional.pad(a, {pad_width_old}, value={0 if constant_values_old is None else constant_values_old})'
        ).timeit(num_iters).mean

        time_new = benchmark.Timer(
            setup=
            f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=
            f'torch.pad(a, {pad_width_new}, constant_values={constant_values_new})'
        ).timeit(num_iters).mean

        new_speedup = time_old / time_new

        print(
            f'{device} {dtype} {case_idx} {time_new:.2e} {time_old:.2e} {new_speedup:.2f}'
        )
        if case_idx == (len(cases) - 1):
            print()
Example 9
def bench1():
    cases = [
        # input size, pad_width, constant_values
        ((10, ), 10, None),
        ((100, ), 100, None),
        ((1000, ), 1000, None),
        ((10000, ), 10000, None),
        ((10000, ), 10, None),
        ((10, 10, 10), 10, None),
        ((10, 10, 10), ((1000, ), (0, ), (0, )), None),
        ((20, 10, 10), 10, None),
        ((30, 10, 10), 10, None),
        ((100, 10, 10), 10, None),
        ((10, 10, 10), 10, 10.0),
        ((10, 10, 10), ((10, 10), (10, 10), (10, 10)), 123),
        ((100, 100, 100), ((10, 10), (10, 10), (10, 10)), None),
    ]

    num_iters = 10000

    print('====================================')
    print('compare with torch.pad')
    print()
    print('device dtype case_idx time_torch time_numpy torch_speedup')
    print()

    for device, dtype, (case_idx,
                        (input_size, pad_width,
                         constant_values)) in product(devices, dtypes,
                                                      enumerate(cases)):
        time_numpy = benchmark.Timer(
            setup=
            f'import numpy as np; a = torch.randn({input_size}, dtype={dtype}).numpy()',
            stmt=f'np.pad(a, {pad_width}, constant_values={constant_values})'
        ).timeit(num_iters).mean

        time_torch = benchmark.Timer(
            setup=
            f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=f'torch.pad(a, {pad_width}, constant_values={constant_values})'
        ).timeit(num_iters).mean

        torch_speedup = time_numpy / time_torch

        print(
            f'{device} {dtype} {case_idx} {time_torch:.2e} {time_numpy:.2e} {torch_speedup:.2f}'
        )
        if case_idx == (len(cases) - 1):
            print()
Example 10
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int, device: str, samples: int,
                  probability_regular: float):
    cuda = device == 'cuda'
    spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
                                       probability_regular=probability_regular)
    results = []
    for tensors, tensor_params, params in spectral_fuzzer.take(samples):
        shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
        str_shape = ' x '.join(["{:<4}".format(s) for s in shape])
        sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
        for dim in _dim_options(params['ndim']):
            for nthreads in (1, 4, 16) if not cuda else (1,):
                measurement = benchmark.Timer(
                    stmt='func(x, dim=dim)',
                    globals={'func': function, 'x': tensors['x'], 'dim': dim},
                    label=f"{name}_{device}",
                    sub_label=sub_label,
                    description=f"dim={dim}",
                    num_threads=nthreads,
                ).blocked_autorange(min_run_time=1)
                measurement.metadata = {
                    'name': name,
                    'device': device,
                    'dim': dim,
                    'shape': shape,
                }
                measurement.metadata.update(tensor_params['x'])
                results.append(measurement)
    return results
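A hypothetical call site for the function above (the argument values here are assumptions, not taken from the original script): collect CPU measurements for torch.fft.fftn and print them as a table.

import torch
import torch.utils.benchmark as benchmark

# Hypothetical usage sketch; run_benchmark is the function defined above.
results = run_benchmark(
    name="fftn",
    function=torch.fft.fftn,
    dtype=torch.float32,
    seed=0,
    device="cpu",
    samples=5,
    probability_regular=0.75,
)
benchmark.Compare(results).print()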
Example 11
    def test_collect_cpp_callgrind(self):
        timer = benchmark_utils.Timer(
            "x += 1;",
            setup="torch::Tensor x = torch::ones({1});",
            timer=timeit.default_timer,
            language="c++",
        )
        stats = [timer.collect_callgrind() for _ in range(3)]
        counts = [s.counts() for s in stats]

        self.assertGreater(min(counts), 0, "No stats were collected")
        self.assertEqual(min(counts), max(counts),
                         "C++ Callgrind should be deterministic")

        for s in stats:
            self.assertEqual(s.counts(denoise=True), s.counts(denoise=False),
                             "De-noising should not apply to C++.")

        stats = timer.collect_callgrind(number=1000, repeats=10)
        assert isinstance(stats, tuple)

        # NB: Unlike the example above, there is no expectation that all
        #     repeats will be identical.
        counts = collections.Counter([s.counts(denoise=True) for s in stats])
        self.assertGreater(max(counts.values()), 1, repr(counts))
Example 12
def prof(dtype, op, nl, hidden_size_max):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('s', minval=1000, maxval=6000, distribution='uniform'),    # seq_length
            benchmark.FuzzedParameter('b', minval=1, maxval=64, distribution='uniform'),   # batch_size
            benchmark.FuzzedParameter('i', minval=16, maxval=512, distribution='uniform'),   # input_size
            benchmark.FuzzedParameter('h', minval=16, maxval=hidden_size_max, distribution='uniform'),   # hidden_size
            benchmark.FuzzedParameter('n', minval=1, maxval=4, distribution='uniform'),   # num_layer
        ],
        tensors=[
            benchmark.FuzzedTensor('x',
                                   size='sbi',
                                   min_elements=12,
                                   max_elements=10000000,
                                   cuda=True,
                                   dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42,
        constraints=[
            lambda params: params['i'] % 8 == 0,
            lambda params: params['h'] % 8 == 0
        ])

    res = []

    for tensors, tensor_params, params in fuzzer.take(20):
        s = params['s']
        b = params['b']
        i = params['i']
        h = params['h']
        n = params['n']
        sub_label = f'x=({s}, {b}, {i}),'.ljust(20) + f'op=({i}, {h}, {n})'
        # sub_label = str(tensors['x'].size())

        if nl is None:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n})'
        else:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n}, nonlinearity="{nl}")'
        setup += f'.to(device="cuda", dtype={d_dtype[dtype]})'

        res.append(
            benchmark.Timer(stmt=f'rnn(x)',
                            setup=setup,
                            globals=tensors,
                            label=f"{op=}, nonlinearity='{nl}', {dtype=}",
                            sub_label=sub_label,
                            description=f'{torch.__version__}')
                        .blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
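    # Assumes a torch build whose version string contains a '+<suffix>'
    # (e.g. a local git build); str.index raises ValueError otherwise.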
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

    with open(f'{torch_git_ver}-{op}-{nl}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
Example 13
    def test_cpp_timer(self):
        timer = benchmark_utils.Timer(
            "torch::Tensor y = x + 1;",
            setup="torch::Tensor x = torch::empty({1});",
            language=benchmark_utils.Language.CPP,
        )
        t = timer.timeit(10)
        self.assertIsInstance(t.median, float)
Example 14
    def test_collect_callgrind(self):
        timer = benchmark_utils.Timer("y = torch.ones((1,)) + 1")

        # Don't collect baseline to speed up unit test by ~30 seconds.
        stats = timer.collect_callgrind(number=1000, collect_baseline=False)

        self.assertIsInstance(stats.counts(include_lookdict_unicode=False),
                              int)
Example 15
    def test_timer_tiny_fast_snippet(self):
        timer = benchmark_utils.Timer(
            'auto x = 1;',
            timer=timeit.default_timer,
            language=benchmark_utils.Language.CPP,
        )
        median = timer.blocked_autorange().median
        self.assertIsInstance(median, float)
Example 16
def run_lobpcg_comparison(label,
                          generator,
                          generator_settings,
                          k=5,
                          largest=True,
                          tol=1e-5):
    label = '{} {} (k={}, largest={})'.format(args.format.upper(), label, k,
                                              largest)

    results = []
    for kwargs in generator_settings:
        # generate input matrix
        a_pt, a_sp = generator(**kwargs)

        # use same initial eigenvectors for both scipy and pytorch
        x_pt = torch.randn(a_pt.size(0), k)
        x_sp = x_pt.numpy()

        description = '{:.4e}'.format(a_pt.size(0))

        t1 = benchmark.Timer(
            stmt="torch.lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="import torch",
            globals=dict(a=a_pt, x=x_pt, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='torch_lobpcg',
            description=description,
        )

        t2 = benchmark.Timer(
            stmt="lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="from scipy.sparse.linalg import lobpcg",
            globals=dict(a=a_sp, x=x_sp, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='scipy_lobpcg',
            description=description,
        )

        results.append(t1.blocked_autorange(min_run_time=1.))
        results.append(t2.blocked_autorange(min_run_time=1.))

    compare = benchmark.Compare(results)
    compare.print()
Example 17
def prof(dtype, op):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('n', minval=4, maxval=16, distribution='uniform'),
            benchmark.FuzzedParameter('c', minval=4, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('h', minval=8, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('w', minval=8, maxval=256, distribution='uniform'),
        ],
        tensors=[
            benchmark.FuzzedTensor('x',
                                   size='nchw',
                                   min_elements=12,
                                   max_elements=10000000,
                                   cuda=True,
                                   dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42)

    res = []

    for kernel_size in [2, 3, 5]:
        for tensors, tensor_params, params in fuzzer.take(20):
            sub_label = str(tensors['x'].size())
            res.append(
                benchmark.Timer(
                    stmt=f'torch.nn.functional.{op}(x, {kernel_size})',
                    setup='',
                    globals=tensors,
                    label=f'{op}, {dtype=}, {kernel_size=}',
                    sub_label=sub_label,
                    description=f'{torch.__version__}').blocked_autorange(
                        min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

    with open(f'{torch_git_ver}-{op}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
Example 18
    def test_timer(self):
        timer = benchmark_utils.Timer(stmt="torch.ones(())")
        sample = timer.timeit(5).median
        self.assertIsInstance(sample, float)

        median = timer.blocked_autorange(min_run_time=0.01).median
        self.assertIsInstance(median, float)

        # We set a very high threshold to avoid flakiness in CI.
        # The internal algorithm is tested in `test_adaptive_timer`
        median = timer.adaptive_autorange(threshold=0.5).median
Example 19
def main():
    timer = benchmark_utils.Timer(
        stmt="x + y",
        globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
        label="Broadcasting add (4x8)",
    )

    for i in range(3):
        print(f"Run: {i}\n{'-' * 40}")
        print(f"timeit:\n{timer.timeit(10000)}\n")
        print(f"autorange:\n{timer.blocked_autorange()}\n\n")
Example 20
    def benchmark(self, events: List[_ProfilerEvent]):
        shapes_factor_map = {input_shapes(event): 0.0 for event in events}
        for shape in shapes_factor_map:
            matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float32)
            matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float32)
            fp32_timer = benchmark.Timer(stmt='torch.mm(matrixA, matrixB)',
                                         globals={
                                             "matrixA": matrixA,
                                             "matrixB": matrixB
                                         })
            tf32_timer = benchmark.Timer(
                stmt='torch.mm(matrixA, matrixB)',
                setup='torch.backends.cuda.matmul.allow_tf32 = True',
                globals={
                    "matrixA": matrixA,
                    "matrixB": matrixB
                })
            torch.backends.cuda.matmul.allow_tf32 = False
            fp32_time = fp32_timer.timeit(10).mean
            tf32_time = tf32_timer.timeit(10).mean
            shapes_factor_map[shape] = tf32_time / fp32_time
        return shapes_factor_map
Example 21
def benchMark(sizes):
    results = []
    if len(sizes) == 0:
        print("Parameter 'sizes' must contain at least one size")
        return

    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={
                    'x': x
                },
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={
                    'x': x
                },
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())
    compare = benchmark.Compare(results)
    compare.print()
    return compare
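The two statements above import batched_dot_mul_sum and batched_dot_bmm from __main__, but their definitions are not part of the snippet. Plausible definitions, following the batched-dot implementations in the PyTorch benchmark tutorial (an assumption about the original script):

import torch

# Assumed helpers (not shown in the example above).
def batched_dot_mul_sum(a, b):
    """Computes batched dot by multiplying and summing."""
    return a.mul(b).sum(-1)

def batched_dot_bmm(a, b):
    """Computes batched dot by reducing to bmm."""
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, b.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)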
Example 22
def main():
    tasks = [
        ("add", "add", "torch.add(x, y)"),
        ("add", "add (extra +0)", "torch.add(x, y + zero)"),
    ]

    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
                "x": torch.ones((size, 4)),
                "y": torch.ones((1, 4)),
                "zero": torch.zeros(()),
            },
            label=label,
            sub_label=sub_label,
            description=f"size: {size}",
            env=branch,
            num_threads=num_threads,
        )
        for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)]
        for label, sub_label, stmt in tasks
        for size in [1, 10, 100, 1000, 10000, 50000]
        for num_threads in [1, 4]
    ]

    for i, timer in enumerate(timers * repeats):
        serialized_results.append(
            pickle.dumps(timer.blocked_autorange(min_run_time=0.05)))
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare(
        [pickle.loads(i) for i in serialized_results])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()
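FauxTorch is not shown in the example above. A plausible sketch, assuming it simply proxies a couple of operations to the real module while adding an artificial per-element delay so the "my_branch" and "severe_regression" environments show up as slowdowns in the Compare table:

import time
import torch

class FauxTorch:
    """Sketch (assumption): wrap real torch and add artificial overhead
    so that Compare shows an apparent regression between 'branches'."""

    def __init__(self, real_torch, extra_ns_per_element):
        self._torch = real_torch
        self._extra_ns_per_element = extra_ns_per_element

    def _add_overhead(self, result):
        # Fake a per-element cost; only noticeable for larger tensors.
        time.sleep(result.numel() * self._extra_ns_per_element * 1e-9)
        return result

    def add(self, *args, **kwargs):
        return self._add_overhead(self._torch.add(*args, **kwargs))

    def mul(self, *args, **kwargs):
        return self._add_overhead(self._torch.mul(*args, **kwargs))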
Example 23
def main():
    tasks = [
        ("matmul", "x @ y", "torch.sparse.mm(x, y)"),
        ("matmul", "x @ y + 0", "torch.sparse.mm(x, y) + zero"),
    ]

    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns),
                "x": gen_sparse(size=size, density=density, dtype=torch.float32),
                "y": torch.rand(size, dtype=torch.float32),
                "zero": torch.zeros(()),
            },
            label=label,
            sub_label=sub_label,
            description=f"size: {size}",
            env=branch,
            num_threads=num_threads,
        )
        for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 10)]
        for label, sub_label, stmt in tasks
        for density in [0.05, 0.1]
        for size in [(8, 8), (32, 32), (64, 64), (128, 128)]
        for num_threads in [1, 4]
    ]

    for i, timer in enumerate(timers * repeats):
        serialized_results.append(pickle.dumps(
            timer.blocked_autorange(min_run_time=0.05)
        ))
        print(f"\r{i + 1} / {len(timers) * repeats}", end="")
        sys.stdout.flush()
    print()

    comparison = benchmark_utils.Compare([
        pickle.loads(i) for i in serialized_results
    ])

    print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n")
    comparison.print()

    print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n")
    comparison.trim_significant_figures()
    comparison.colorize()
    comparison.print()
Example 24
    def test_collect_cpp_callgrind(self):
        timer = benchmark_utils.Timer(
            "x += 1;",
            setup="torch::Tensor x = torch::ones({1});",
            language="c++",
        )
        stats = [timer.collect_callgrind() for _ in range(3)]
        counts = [s.counts() for s in stats]

        self.assertGreater(min(counts), 0, "No stats were collected")
        self.assertEqual(min(counts), max(counts),
                         "C++ Callgrind should be deterministic")

        for s in stats:
            self.assertEqual(s.counts(denoise=True), s.counts(denoise=False),
                             "De-noising should not apply to C++.")
Example 25
    def test_cpp_timer(self):
        timer = benchmark_utils.Timer(
            """
                #ifndef TIMER_GLOBAL_CHECK
                static_assert(false);
                #endif

                torch::Tensor y = x + 1;
            """,
            setup="torch::Tensor x = torch::empty({1});",
            global_setup="#define TIMER_GLOBAL_CHECK",
            timer=timeit.default_timer,
            language=benchmark_utils.Language.CPP,
        )
        t = timer.timeit(10)
        self.assertIsInstance(t.median, float)
Example 26
def run_bench(model_names, bench_args):
    results = []
    for model_name in model_names:
        model_creator = MODELS[model_name]
        inputs, model = model_creator(bench_args)

        print("Benchmarking RecordFunction overhead for", model_name)
        print("Running warmup...", end=" ")
        sys.stdout.flush()
        for _ in range(bench_args.warmup):
            model(*inputs)
        print("finished")

        for num_threads in NUM_THREADS:
            for with_rec_fn in [True, False]:
                torch.autograd._enable_record_function(with_rec_fn)
                torch.autograd._clear_callbacks()
                if with_rec_fn:
                    torch.autograd._set_empty_test_observer(True, 0.0001)

                print("Running {} RecordFunction, num threads {} ...".format(
                    "with" if with_rec_fn else "without", num_threads),
                      end=" ")
                sys.stdout.flush()
                timer = benchmark_utils.Timer(
                    stmt="model(*inputs)",
                    globals={
                        "model": model,
                        "inputs": inputs
                    },
                    description=model_name,
                    label="Record function overhead",
                    sub_label=f"with{'' if with_rec_fn else 'out'}_rec_fn, num_threads {num_threads}",
                    num_threads=num_threads)
                result = timer.blocked_autorange(
                    min_run_time=bench_args.timer_min_run_time)
                print("finished")
                print(result)
                sys.stdout.flush()
                results.append(result)

    comparison = benchmark_utils.Compare(results)
    comparison.trim_significant_figures()
    comparison.highlight_warnings()
    comparison.print()
Example 27
    bbox = np.array([250, 250, 300, 300])
    out_size = 125

    x = np.random.randn(600, 600, 3)
    y: torch.Tensor = kornia.image_to_tensor(x, keepdim=False)  # .to('cuda')  # BxCxHxW

    # a = crop_chw(x, bbox, out_size)
    # b = crop_chw_torch(x, bbox, out_size)
    # print(a.shape)
    # print(b.shape)

    import torch.utils.benchmark as benchmark

    t0 = benchmark.Timer(
        stmt='crop_chw(x, box, 125)',
        setup='from __main__ import crop_chw',
        globals={'x': x, 'box': np.array([250, 250, 300, 300])})

    t1 = benchmark.Timer(
        stmt='crop_chw_torch(x, box, 125, device="cpu")',
        setup='from __main__ import crop_chw_torch',
        globals={'x': y, 'box': np.array([250, 250, 300, 300])})

    print(t0.timeit(1000))
    print(t1.timeit(1000))

    # assert a.allclose(b)
Example 28
def benchmark_multihead_attention(
    label="",
    attn_dtype=torch.uint8,
    key_padding_dtype=torch.uint8,
    add_bias_kv=False,
    add_zero_attn=False,
    static_kv=False,
    batch_size=20,
    embedding=EMB,
    seq_len=SEQ,
    num_heads=HEADS,
):

    results = []
    # device = torch.device("cuda")

    xformers_att_config = '{"name": "scaled_dot_product"}'

    attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len)
    key_padding_mask = _get_mask(to_dtype=key_padding_dtype,
                                 dim0=batch_size,
                                 dim1=seq_len)

    q = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    k = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    v = torch.rand(seq_len, batch_size, embedding, requires_grad=True)

    _reset_seeds()

    original_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=None,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    xformers_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=xformers_att_config,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )

    def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = original_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = xformers_mha(
            query=q,
            key=k,
            value=v,
            key_padding_mask=key_padding_mask,
            attn_mask=attn_mask,
            static_kv=static_kv,
        )
        loss = torch.norm(output)
        loss.backward()

    fns = [
        original_bench_fw,
        xformers_bench_fw,
        original_bench_fw_bw,
        xformers_bench_fw_bw,
    ]

    for fn in fns:
        results.append(
            benchmark.Timer(
                stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)",
                globals={
                    "q": q,
                    "k": k,
                    "v": v,
                    "key_padding_mask": key_padding_mask,
                    "attn_mask": attn_mask,
                    "static_kv": static_kv,
                    "fn": fn,
                },
                label="multihead fw + bw",
                sub_label=f"{fn.__name__}",
                description=label,
            ).blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()
Example 29
    ]
    serialized_results = []
    repeats = 2
    timers = [
        benchmark_utils.Timer(
            stmt=stmt,
            globals={
                "scipy_coo_matmul": scipy_coo_matmul,
                "torch_backward": torch_backward,
                "sparse_torch_backward": sparse_torch_backward,
                "scipy_varx": to_coo_scipy(x),
                "scipy_vary": to_coo_scipy(y),
                "tx": x,
                "ty": y,
                "tx_cuda": x.cuda(),
                "ty_cuda": y.cuda(),
                "dense_cuda_x": x.to_dense().cuda(),
                "dense_cuda_y": y.to_dense().cuda(),
                "dense_x": x.to_dense(),
                "dense_y": y.to_dense(),
            },
            label=label,
            sub_label=sub_label,
            description=f"{sparsity}",
            env=device,
            # num_threads=num_threads,
        )
        for hidden_size in [512]
        for sparsity in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]
        for label, device, sub_label, stmt in tasks
        for num_threads in [1, 4, 8, 16]
        for x, y in load_dataset(dataset_path, hidden_size, sparsity)
    ]
Example 30
                                                     min_elements=12,
                                                     max_elements=10000000,
                                                     cuda=True,
                                                     dtype=torch.half,
                                                     max_allocation_bytes=1_000_000_000)
                          ],
                          seed=42)

res = []

for kernel_size in [2, 3, 5]:
    for tensors, tensor_params, params in fuzzer.take(20):
        sub_label = str(tensors['x'].size())
        res.append(
            benchmark.Timer(stmt=f'torch.nn.functional.max_pool3d(x, {kernel_size})',
                            setup='',
                            globals=tensors,
                            label=f'max_pool3d, {kernel_size=}',
                            sub_label=sub_label,
                            description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

torch_ver = str(torch.__version__)
torch_git_ver = torch_ver[torch_ver.index('+') + 1:]

with open(f'{torch_git_ver}.pkl', 'wb') as f:
    pickle.dump(res, f)

compare = benchmark.Compare(res)
# compare.colorize()
compare.print()
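The snippet above is cut off before the benchmark.Fuzzer(...) call that owns the FuzzedTensor arguments shown at the top. A minimal reconstruction is sketched below; it is modeled on Example 17, so the parameter names and ranges are assumptions rather than the original script.

import torch
import torch.utils.benchmark as benchmark

# Hypothetical reconstruction of the truncated Fuzzer construction above.
# Parameter names and ranges are assumptions; only the FuzzedTensor arguments
# (min_elements, max_elements, cuda, dtype, max_allocation_bytes) and seed=42
# come from the visible fragment.
fuzzer = benchmark.Fuzzer(
    parameters=[
        benchmark.FuzzedParameter(name, minval=4, maxval=64, distribution='uniform')
        for name in ('n', 'c', 'd', 'h', 'w')
    ],
    tensors=[
        benchmark.FuzzedTensor('x',
                               size='ncdhw',  # 5-D input for max_pool3d
                               min_elements=12,
                               max_elements=10000000,
                               cuda=True,
                               dtype=torch.half,
                               max_allocation_bytes=1_000_000_000)
    ],
    seed=42)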