def benchmark(self, events: List[_ProfilerEvent]):
    def closest_multiple(shapes, multiple):
        return [multiple * math.ceil(shape / multiple) for shape in shapes]

    shapes_factor_map = {input_shapes(event): 0.0 for event in events}
    for shape in shapes_factor_map:
        matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float16)
        matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float16)
        not_aligned_dim_timer = benchmark.Timer(
            stmt='torch.mm(matrixA, matrixB)',
            globals={"matrixA": matrixA, "matrixB": matrixB})
        matrixA = torch.randn(
            closest_multiple(shape[0], 8), device="cuda", dtype=torch.float16)
        matrixB = torch.randn(
            closest_multiple(shape[1], 8), device="cuda", dtype=torch.float16)
        aligned_dim_timer = benchmark.Timer(
            stmt='torch.mm(matrixA, matrixB)',
            globals={"matrixA": matrixA, "matrixB": matrixB})
        not_aligned_dim_time = not_aligned_dim_timer.timeit(10).mean
        aligned_dim_time = aligned_dim_timer.timeit(10).mean
        shapes_factor_map[shape] = aligned_dim_time / not_aligned_dim_time
    return shapes_factor_map
def test_timer(self):
    timer = benchmark_utils.Timer(
        stmt="torch.ones(())",
    )
    sample = timer.timeit(5).median
    self.assertIsInstance(sample, float)

    median = timer.blocked_autorange(min_run_time=0.01).median
    self.assertIsInstance(median, float)

    # We set a very high threshold to avoid flakiness in CI.
    # The internal algorithm is tested in `test_adaptive_timer`
    median = timer.adaptive_autorange(threshold=0.5).median

    # Test that multi-line statements work properly.
    median = benchmark_utils.Timer(
        stmt="""
            with torch.no_grad():
                y = x + 1""",
        setup="""
            x = torch.ones((1,), requires_grad=True)
            for _ in range(5):
                x = x + 1.0""",
    ).timeit(5).median
    self.assertIsInstance(median, float)
def compare_optimize_resnet18_to_torchscript():
    results = []
    for i in range(20):
        test_input = torch.rand(1, 3, 224, 224).half().cuda()
        sub_label = f"[test {i}]"
        results.append(
            benchmark.Timer(
                stmt="meta_module_resnet18(test_input)",
                setup="from __main__ import meta_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by meta",
            ).blocked_autorange()
        )
        results.append(
            benchmark.Timer(
                stmt="jit_module_resnet18(test_input)",
                setup="from __main__ import jit_module_resnet18",
                globals={"test_input": test_input},
                sub_label=sub_label,
                description="tuning by jit",
            ).blocked_autorange()
        )
    compare = benchmark.Compare(results)
    compare.print()
def test_collect_callgrind(self):
    with self.assertRaisesRegex(
        ValueError,
        r"`collect_callgrind` requires that globals be wrapped "
        r"in `CopyIfCallgrind` so that serialization is explicit."
    ):
        benchmark_utils.Timer(
            "pass", globals={"x": 1}
        ).collect_callgrind(collect_baseline=False)

    with self.assertRaisesRegex(
        # Subprocess raises AttributeError (from pickle),
        # _ValgrindWrapper re-raises as generic OSError.
        OSError, "AttributeError: Can't get attribute 'MyModule'"
    ):
        benchmark_utils.Timer(
            "model(1)",
            globals={"model": benchmark_utils.CopyIfCallgrind(MyModule())}
        ).collect_callgrind(collect_baseline=False)

    @torch.jit.script
    def add_one(x):
        return x + 1

    timer = benchmark_utils.Timer(
        "y = add_one(x) + k",
        setup="x = torch.ones((1,))",
        globals={
            "add_one": benchmark_utils.CopyIfCallgrind(add_one),
            "k": benchmark_utils.CopyIfCallgrind(5),
            "model": benchmark_utils.CopyIfCallgrind(
                MyModule(),
                setup=f"""\
                import sys
                sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
                from test_benchmark_utils import MyModule
                """
            )
        }
    )

    stats = timer.collect_callgrind(number=1000)
    counts = stats.counts(denoise=False)

    self.assertIsInstance(counts, int)
    self.assertGreater(counts, 0)

    stats = timer.collect_callgrind(number=1000, repeats=10)
    assert isinstance(stats, tuple)

    # Check that the repeats are at least somewhat repeatable.
    counts = collections.Counter([s.counts(denoise=True) for s in stats])
    self.assertGreater(
        max(counts.values()), 1,
        f"Every instruction count total was unique: {counts}")

    from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
    self.assertIsNone(
        wrapper_singleton()._bindings_module,
        "JIT'd bindings are only for back testing.")
def generate_callgrind_artifacts() -> None:
    """Regenerate `callgrind_artifacts.json`

    Unlike the expect tests, regenerating callgrind counts will produce a
    large diff since build directories and conda/pip directories are included
    in the instruction string. It is also not 100% deterministic (due to
    jitter from Python) and takes over a minute to run. As a result, running
    this function is manual.
    """
    print("Regenerating callgrind artifact.")

    stats_no_data = benchmark_utils.Timer(
        "y = torch.ones(())"
    ).collect_callgrind(number=1000)

    stats_with_data = benchmark_utils.Timer(
        "y = torch.ones((1,))"
    ).collect_callgrind(number=1000)

    user = os.getenv("USER")

    def to_entry(fn_counts):
        return [f"{c} {fn.replace(f'/{user}/', '/test_user/')}" for c, fn in fn_counts]

    artifacts = {
        "baseline_inclusive": to_entry(stats_no_data.baseline_inclusive_stats),
        "baseline_exclusive": to_entry(stats_no_data.baseline_exclusive_stats),
        "ones_no_data_inclusive": to_entry(stats_no_data.stmt_inclusive_stats),
        "ones_no_data_exclusive": to_entry(stats_no_data.stmt_exclusive_stats),
        "ones_with_data_inclusive": to_entry(stats_with_data.stmt_inclusive_stats),
        "ones_with_data_exclusive": to_entry(stats_with_data.stmt_exclusive_stats),
    }

    with open(CALLGRIND_ARTIFACTS, "wt") as f:
        json.dump(artifacts, f, indent=4)
def test_collect_callgrind(self):
    with self.assertRaisesRegex(
        ValueError,
        r"`collect_callgrind` requires that globals be wrapped "
        r"in `CopyIfCallgrind` so that serialization is explicit."
    ):
        benchmark_utils.Timer(
            "pass", globals={"x": 1}
        ).collect_callgrind(collect_baseline=False)

    with self.assertRaisesRegex(
        # Subprocess raises AttributeError (from pickle),
        # _ValgrindWrapper re-raises as generic OSError.
        OSError, "AttributeError: Can't get attribute 'MyModule'"
    ):
        benchmark_utils.Timer(
            "model(1)",
            globals={"model": benchmark_utils.CopyIfCallgrind(MyModule())}
        ).collect_callgrind(collect_baseline=False)

    @torch.jit.script
    def add_one(x):
        return x + 1

    timer = benchmark_utils.Timer(
        "y = add_one(x) + k",
        setup="x = torch.ones((1,))",
        globals={
            "add_one": benchmark_utils.CopyIfCallgrind(add_one),
            "k": benchmark_utils.CopyIfCallgrind(5),
            "model": benchmark_utils.CopyIfCallgrind(
                MyModule(),
                setup=f"""\
                import sys
                sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
                from test_benchmark_utils import MyModule
                """
            )
        }
    )

    # Don't collect baseline to speed up unit test by ~30 seconds.
    stats = timer.collect_callgrind(number=1000, collect_baseline=False)
    counts = stats.counts(denoise=False)

    self.assertIsInstance(counts, int)
    self.assertGreater(counts, 0)

    from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
    self.assertIsNone(
        wrapper_singleton()._bindings_module,
        "JIT'd bindings are only for back testing."
    )
def benchmark(self, events: List[_ProfilerEvent]):
    shapes_factor_map = {input_shapes(event)[0]: 0.0 for event in events}
    for shape in shapes_factor_map:
        to_timer = benchmark.Timer(
            stmt='torch.ones(shape).to("cuda")',
            globals={'shape': shape})
        de_timer = benchmark.Timer(
            stmt='torch.ones(shape, device="cuda")',
            globals={'shape': shape})
        to_time = to_timer.timeit(10).mean
        de_time = de_timer.timeit(10).mean
        shapes_factor_map[shape] = de_time / to_time
    return shapes_factor_map
def bench2():
    cases = [
        # input size, pad_width_new, pad_width_old, constant_values_new, constant_values_old
        ((10, ), 10, (10, 10), None, None),
        ((100, ), 100, (100, 100), None, None),
        ((1000, ), 1000, (1000, 1000), None, None),
        ((10000, ), 10000, (10000, 10000), None, None),
        ((10000, ), 10, (10, 10), None, None),
        ((10, 10, 10), 10, (10, 10, 10, 10, 10, 10), None, None),
        ((10, 10, 10), ((1000, ), (0, ), (0, )), (1000, 1000, 0, 0, 0, 0), None, None),
        ((20, 10, 10), 10, (10, 10), None, None),
        ((30, 10, 10), 10, (10, 10), None, None),
        ((100, 10, 10), 10, (10, 10), None, None),
    ]
    num_iters = 10000

    print('====================================')
    print('compare with torch.nn.functional.pad')
    print()
    print('device dtype case_idx time_new time_old new_speedup')
    print()

    for device, dtype, (case_idx, (input_size, pad_width_new, pad_width_old,
                                   constant_values_new, constant_values_old)) in product(
            devices, dtypes, enumerate(cases)):
        time_old = benchmark.Timer(
            setup=f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=f'torch.nn.functional.pad(a, {pad_width_old}, value={0 if constant_values_old is None else constant_values_old})'
        ).timeit(num_iters).mean

        time_new = benchmark.Timer(
            setup=f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=f'torch.pad(a, {pad_width_new}, constant_values={constant_values_new})'
        ).timeit(num_iters).mean

        new_speedup = time_old / time_new
        print(f'{device} {dtype} {case_idx} {time_new:.2e} {time_old:.2e} {new_speedup:.2f}')

        if case_idx == (len(cases) - 1):
            print()
def bench1():
    cases = [
        # input size, pad_width, constant_values
        ((10, ), 10, None),
        ((100, ), 100, None),
        ((1000, ), 1000, None),
        ((10000, ), 10000, None),
        ((10000, ), 10, None),
        ((10, 10, 10), 10, None),
        ((10, 10, 10), ((1000, ), (0, ), (0, )), None),
        ((20, 10, 10), 10, None),
        ((30, 10, 10), 10, None),
        ((100, 10, 10), 10, None),
        ((10, 10, 10), 10, 10.0),
        ((10, 10, 10), ((10, 10), (10, 10), (10, 10)), 123),
        ((100, 100, 100), ((10, 10), (10, 10), (10, 10)), None),
    ]
    num_iters = 10000

    print('====================================')
    print('compare with torch.pad')
    print()
    print('device dtype case_idx time_torch time_numpy torch_speedup')
    print()

    for device, dtype, (case_idx, (input_size, pad_width, constant_values)) in product(
            devices, dtypes, enumerate(cases)):
        time_numpy = benchmark.Timer(
            setup=f'import numpy as np; a = torch.randn({input_size}, dtype={dtype}).numpy()',
            stmt=f'np.pad(a, {pad_width}, constant_values={constant_values})'
        ).timeit(num_iters).mean

        time_torch = benchmark.Timer(
            setup=f'a = torch.randn({input_size}, dtype={dtype}, device="{device}")',
            stmt=f'torch.pad(a, {pad_width}, constant_values={constant_values})'
        ).timeit(num_iters).mean

        torch_speedup = time_numpy / time_torch
        print(f'{device} {dtype} {case_idx} {time_torch:.2e} {time_numpy:.2e} {torch_speedup:.2f}')

        if case_idx == (len(cases) - 1):
            print()
def run_benchmark(name: str, function: object, dtype: torch.dtype, seed: int,
                  device: str, samples: int, probability_regular: float):
    cuda = device == 'cuda'
    spectral_fuzzer = SpectralOpFuzzer(seed=seed, dtype=dtype, cuda=cuda,
                                       probability_regular=probability_regular)
    results = []
    for tensors, tensor_params, params in spectral_fuzzer.take(samples):
        shape = [params['k0'], params['k1'], params['k2']][:params['ndim']]
        str_shape = ' x '.join(["{:<4}".format(s) for s in shape])
        sub_label = f"{str_shape} {'' if tensor_params['x']['is_contiguous'] else '(discontiguous)'}"
        for dim in _dim_options(params['ndim']):
            for nthreads in (1, 4, 16) if not cuda else (1,):
                measurement = benchmark.Timer(
                    stmt='func(x, dim=dim)',
                    globals={'func': function, 'x': tensors['x'], 'dim': dim},
                    label=f"{name}_{device}",
                    sub_label=sub_label,
                    description=f"dim={dim}",
                    num_threads=nthreads,
                ).blocked_autorange(min_run_time=1)
                measurement.metadata = {
                    'name': name,
                    'device': device,
                    'dim': dim,
                    'shape': shape,
                }
                measurement.metadata.update(tensor_params['x'])
                results.append(measurement)
    return results
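# Note: `_dim_options` above comes from the surrounding spectral-ops benchmark and is
# not reproduced in this excerpt. The sketch below is a hypothetical stand-in, assuming
# it merely enumerates which `dim` arguments to benchmark for each rank; the exact
# choices are an assumption, not the original helper.
def _dim_options(ndim):
    # Candidate `dim` arguments for a transform of rank `ndim`;
    # `None` means "transform over all dimensions".
    if ndim == 1:
        return [None]
    if ndim == 2:
        return [0, 1, None]
    if ndim == 3:
        return [0, 1, 2, (0, 1), (0, 2), None]
    raise ValueError(f"Unsupported ndim: {ndim}")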
def test_collect_cpp_callgrind(self):
    timer = benchmark_utils.Timer(
        "x += 1;",
        setup="torch::Tensor x = torch::ones({1});",
        timer=timeit.default_timer,
        language="c++",
    )
    stats = [timer.collect_callgrind() for _ in range(3)]
    counts = [s.counts() for s in stats]

    self.assertGreater(min(counts), 0, "No stats were collected")
    self.assertEqual(min(counts), max(counts), "C++ Callgrind should be deterministic")

    for s in stats:
        self.assertEqual(s.counts(denoise=True), s.counts(denoise=False),
                         "De-noising should not apply to C++.")

    stats = timer.collect_callgrind(number=1000, repeats=10)
    assert isinstance(stats, tuple)

    # NB: Unlike the example above, there is no expectation that all
    #     repeats will be identical.
    counts = collections.Counter([s.counts(denoise=True) for s in stats])
    self.assertGreater(max(counts.values()), 1, repr(counts))
def prof(dtype, op, nl, hidden_size_max):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('s', minval=1000, maxval=6000, distribution='uniform'),  # seq_length
            benchmark.FuzzedParameter('b', minval=1, maxval=64, distribution='uniform'),       # batch_size
            benchmark.FuzzedParameter('i', minval=16, maxval=512, distribution='uniform'),     # input_size
            benchmark.FuzzedParameter('h', minval=16, maxval=hidden_size_max, distribution='uniform'),  # hidden_size
            benchmark.FuzzedParameter('n', minval=1, maxval=4, distribution='uniform'),        # num_layer
        ],
        tensors=[
            benchmark.FuzzedTensor('x', size='sbi', min_elements=12, max_elements=10000000,
                                   cuda=True, dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42,
        constraints=[
            lambda params: params['i'] % 8 == 0,
            lambda params: params['h'] % 8 == 0,
        ])

    res = []
    for tensors, tensor_params, params in fuzzer.take(20):
        s = params['s']
        b = params['b']
        i = params['i']
        h = params['h']
        n = params['n']
        sub_label = f'x=({s}, {b}, {i}),'.ljust(20) + f'op=({i}, {h}, {n})'
        # sub_label = str(tensors['x'].size())

        if nl is None:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n})'
        else:
            setup = f'rnn=torch.nn.{op}({i}, {h}, {n}, nonlinearity="{nl}")'
        setup += f'.to(device="cuda", dtype={d_dtype[dtype]})'

        res.append(
            benchmark.Timer(stmt='rnn(x)',
                            setup=setup,
                            globals=tensors,
                            label=f"{op=}, nonlinearity='{nl}', {dtype=}",
                            sub_label=sub_label,
                            description=f'{torch.__version__}')
            .blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
    with open(f'{torch_git_ver}-{op}-{nl}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
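# Note: this RNN fuzzer (and the pooling fuzzer below) looks tensor dtypes up in a
# `d_dtype` mapping that is not part of the snippet. A minimal sketch, assuming it
# simply maps the dtype strings passed in to torch dtypes; the exact keys are an
# assumption.
d_dtype = {
    'float': torch.float32,
    'half': torch.float16,
    'bfloat16': torch.bfloat16,
}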
def test_cpp_timer(self):
    timer = benchmark_utils.Timer(
        "torch::Tensor y = x + 1;",
        setup="torch::Tensor x = torch::empty({1});",
        language=benchmark_utils.Language.CPP,
    )
    t = timer.timeit(10)
    self.assertIsInstance(t.median, float)
def test_collect_callgrind(self):
    timer = benchmark_utils.Timer("y = torch.ones((1,)) + 1")

    # Don't collect baseline to speed up unit test by ~30 seconds.
    stats = timer.collect_callgrind(number=1000, collect_baseline=False)

    self.assertIsInstance(stats.counts(include_lookdict_unicode=False), int)
def test_timer_tiny_fast_snippet(self):
    timer = benchmark_utils.Timer(
        'auto x = 1;',
        timer=timeit.default_timer,
        language=benchmark_utils.Language.CPP,
    )
    median = timer.blocked_autorange().median
    self.assertIsInstance(median, float)
def run_lobpcg_comparison(label, generator, generator_settings, k=5, largest=True, tol=1e-5):
    label = '{} {} (k={}, largest={})'.format(args.format.upper(), label, k, largest)

    results = []
    for kwargs in generator_settings:
        # generate input matrix
        a_pt, a_sp = generator(**kwargs)

        # use same initial eigenvectors for both scipy and pytorch
        x_pt = torch.randn(a_pt.size(0), k)
        x_sp = x_pt.numpy()

        description = '{:.4e}'.format(a_pt.size(0))

        t1 = benchmark.Timer(
            stmt="torch.lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="import torch",
            globals=dict(a=a_pt, x=x_pt, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='torch_lobpcg',
            description=description,
        )
        t2 = benchmark.Timer(
            stmt="lobpcg(a, X=x, largest=largest, tol=tol)",
            setup="from scipy.sparse.linalg import lobpcg",
            globals=dict(a=a_sp, x=x_sp, largest=largest, tol=tol),
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label='scipy_lobpcg',
            description=description,
        )

        results.append(t1.blocked_autorange(min_run_time=1.))
        results.append(t2.blocked_autorange(min_run_time=1.))

    compare = benchmark.Compare(results)
    compare.print()
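# Note: `run_lobpcg_comparison` expects each `generator(**kwargs)` call to return the
# same symmetric matrix twice: once as a PyTorch (sparse) tensor and once as a SciPy
# sparse matrix. The generators themselves are not shown here; the sketch below is a
# hypothetical example of that interface (the name, the density default, and the
# symmetrization step are assumptions). It does not touch the script-level
# `args.format` that the function reads for its label.
import numpy as np
import scipy.sparse
import torch


def random_coo_generator(n=1000, density=0.01):
    # Build one random symmetric matrix and return it both as a torch sparse
    # COO tensor and as a scipy CSR matrix.
    a = scipy.sparse.random(n, n, density=density, dtype=np.float32)
    a = ((a + a.T) * 0.5).tocoo()  # symmetrize so lobpcg applies
    indices = torch.tensor(np.vstack([a.row, a.col]), dtype=torch.int64)
    values = torch.tensor(a.data, dtype=torch.float32)
    a_pt = torch.sparse_coo_tensor(indices, values, (n, n)).coalesce()
    a_sp = a.tocsr()
    return a_pt, a_sp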
def prof(dtype, op):
    fuzzer = benchmark.Fuzzer(
        parameters=[
            benchmark.FuzzedParameter('n', minval=4, maxval=16, distribution='uniform'),
            benchmark.FuzzedParameter('c', minval=4, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('h', minval=8, maxval=256, distribution='uniform'),
            benchmark.FuzzedParameter('w', minval=8, maxval=256, distribution='uniform'),
        ],
        tensors=[
            benchmark.FuzzedTensor('x', size='nchw', min_elements=12, max_elements=10000000,
                                   cuda=True, dtype=d_dtype[dtype],
                                   max_allocation_bytes=1_000_000_000)
        ],
        seed=42)

    res = []
    for kernel_size in [2, 3, 5]:
        for tensors, tensor_params, params in fuzzer.take(20):
            sub_label = str(tensors['x'].size())
            res.append(
                benchmark.Timer(
                    stmt=f'torch.nn.functional.{op}(x, {kernel_size})',
                    setup='',
                    globals=tensors,
                    label=f'{op}, {dtype=}, {kernel_size=}',
                    sub_label=sub_label,
                    description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

    torch_ver = str(torch.__version__)
    torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
    with open(f'{torch_git_ver}-{op}-{dtype}.pkl', 'wb') as f:
        pickle.dump(res, f)

    compare = benchmark.Compare(res)
    # compare.colorize()
    compare.print()
def test_timer(self):
    timer = benchmark_utils.Timer(
        stmt="torch.ones(())",
    )
    sample = timer.timeit(5).median
    self.assertIsInstance(sample, float)

    median = timer.blocked_autorange(min_run_time=0.01).median
    self.assertIsInstance(median, float)

    # We set a very high threshold to avoid flakiness in CI.
    # The internal algorithm is tested in `test_adaptive_timer`
    median = timer.adaptive_autorange(threshold=0.5).median
def main():
    timer = benchmark_utils.Timer(
        stmt="x + y",
        globals={"x": torch.ones((4, 8)), "y": torch.ones((1, 8))},
        label="Broadcasting add (4x8)",
    )

    for i in range(3):
        print(f"Run: {i}\n{'-' * 40}")
        print(f"timeit:\n{timer.timeit(10000)}\n")
        print(f"autorange:\n{timer.blocked_autorange()}\n\n")
def benchmark(self, events: List[_ProfilerEvent]):
    shapes_factor_map = {input_shapes(event): 0.0 for event in events}
    for shape in shapes_factor_map:
        matrixA = torch.randn(shape[0], device="cuda", dtype=torch.float32)
        matrixB = torch.randn(shape[1], device="cuda", dtype=torch.float32)
        fp32_timer = benchmark.Timer(
            stmt='torch.mm(matrixA, matrixB)',
            globals={"matrixA": matrixA, "matrixB": matrixB})
        tf32_timer = benchmark.Timer(
            stmt='torch.mm(matrixA, matrixB)',
            setup='torch.backends.cuda.matmul.allow_tf32 = True',
            globals={"matrixA": matrixA, "matrixB": matrixB})
        torch.backends.cuda.matmul.allow_tf32 = False
        fp32_time = fp32_timer.timeit(10).mean
        tf32_time = tf32_timer.timeit(10).mean
        shapes_factor_map[shape] = tf32_time / fp32_time
    return shapes_factor_map
def benchMark(sizes):
    results = []
    if len(sizes) == 0:
        print("Parameter 'sizes' has to have a minimum of 1 element")
        return

    for n in sizes:
        # label and sub_label are the rows
        # description is the column
        label = 'Batched dot'
        sub_label = f'[{n}, {n}]'
        x = torch.ones((n, n))
        results.append(
            benchmark.Timer(
                stmt='batched_dot_mul_sum(x, x)',
                setup='from __main__ import batched_dot_mul_sum',
                globals={'x': x},
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='mul/sum',
            ).blocked_autorange())
        results.append(
            benchmark.Timer(
                stmt='batched_dot_bmm(x, x)',
                setup='from __main__ import batched_dot_bmm',
                globals={'x': x},
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=sub_label,
                description='bmm',
            ).blocked_autorange())

    compare = benchmark.Compare(results)
    compare.print()
    return compare
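# Note: `benchMark` imports `batched_dot_mul_sum` and `batched_dot_bmm` from __main__,
# but their definitions are not included in this snippet. The sketch below follows the
# batched-dot example from the PyTorch benchmark tutorial; treat the exact bodies as an
# assumption rather than the originals used with this function.
import torch


def batched_dot_mul_sum(a, b):
    # Batched dot product via an elementwise multiply and a sum over the last dim.
    return a.mul(b).sum(-1)


def batched_dot_bmm(a, b):
    # Batched dot product expressed as a batched matrix multiply.
    a = a.reshape(-1, 1, a.shape[-1])
    b = b.reshape(-1, a.shape[-1], 1)
    return torch.bmm(a, b).flatten(-3)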
def main(): tasks = [ ("add", "add", "torch.add(x, y)"), ("add", "add (extra +0)", "torch.add(x, y + zero)"), ] serialized_results = [] repeats = 2 timers = [ benchmark_utils.Timer( stmt=stmt, globals={ "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns), "x": torch.ones((size, 4)), "y": torch.ones((1, 4)), "zero": torch.zeros(()), }, label=label, sub_label=sub_label, description=f"size: {size}", env=branch, num_threads=num_threads, ) for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 5)] for label, sub_label, stmt in tasks for size in [1, 10, 100, 1000, 10000, 50000] for num_threads in [1, 4] ] for i, timer in enumerate(timers * repeats): serialized_results.append( pickle.dumps(timer.blocked_autorange(min_run_time=0.05))) print(f"\r{i + 1} / {len(timers) * repeats}", end="") sys.stdout.flush() print() comparison = benchmark_utils.Compare( [pickle.loads(i) for i in serialized_results]) print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n") comparison.print() print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n") comparison.trim_significant_figures() comparison.colorize() comparison.print()
def main(): tasks = [ ("matmul", "x @ y", "torch.sparse.mm(x, y)"), ("matmul", "x @ y + 0", "torch.sparse.mm(x, y) + zero"), ] serialized_results = [] repeats = 2 timers = [ benchmark_utils.Timer( stmt=stmt, globals={ "torch": torch if branch == "master" else FauxTorch(torch, overhead_ns), "x": gen_sparse(size=size, density=density, dtype=torch.float32), "y": torch.rand(size, dtype=torch.float32), "zero": torch.zeros(()), }, label=label, sub_label=sub_label, description=f"size: {size}", env=branch, num_threads=num_threads, ) for branch, overhead_ns in [("master", None), ("my_branch", 1), ("severe_regression", 10)] for label, sub_label, stmt in tasks for density in [0.05, 0.1] for size in [(8, 8), (32, 32), (64, 64), (128, 128)] for num_threads in [1, 4] ] for i, timer in enumerate(timers * repeats): serialized_results.append(pickle.dumps( timer.blocked_autorange(min_run_time=0.05) )) print(f"\r{i + 1} / {len(timers) * repeats}", end="") sys.stdout.flush() print() comparison = benchmark_utils.Compare([ pickle.loads(i) for i in serialized_results ]) print("== Unformatted " + "=" * 80 + "\n" + "/" * 95 + "\n") comparison.print() print("== Formatted " + "=" * 80 + "\n" + "/" * 93 + "\n") comparison.trim_significant_figures() comparison.colorize() comparison.print()
def test_collect_cpp_callgrind(self):
    timer = benchmark_utils.Timer(
        "x += 1;",
        setup="torch::Tensor x = torch::ones({1});",
        language="c++",
    )
    stats = [timer.collect_callgrind() for _ in range(3)]
    counts = [s.counts() for s in stats]

    self.assertGreater(min(counts), 0, "No stats were collected")
    self.assertEqual(min(counts), max(counts), "C++ Callgrind should be deterministic")

    for s in stats:
        self.assertEqual(s.counts(denoise=True), s.counts(denoise=False),
                         "De-noising should not apply to C++.")
def test_cpp_timer(self): timer = benchmark_utils.Timer( """ #ifndef TIMER_GLOBAL_CHECK static_assert(false); #endif torch::Tensor y = x + 1; """, setup="torch::Tensor x = torch::empty({1});", global_setup="#define TIMER_GLOBAL_CHECK", timer=timeit.default_timer, language=benchmark_utils.Language.CPP, ) t = timer.timeit(10) self.assertIsInstance(t.median, float)
def run_bench(model_names, bench_args):
    results = []
    for model_name in model_names:
        model_creator = MODELS[model_name]
        inputs, model = model_creator(bench_args)

        print("Benchmarking RecordFunction overhead for", model_name)
        print("Running warmup...", end=" ")
        sys.stdout.flush()
        for _ in range(bench_args.warmup):
            model(*inputs)
        print("finished")

        for num_threads in NUM_THREADS:
            for with_rec_fn in [True, False]:
                torch.autograd._enable_record_function(with_rec_fn)
                torch.autograd._clear_callbacks()
                if with_rec_fn:
                    torch.autograd._set_empty_test_observer(True, 0.0001)

                print("Running {} RecordFunction, num threads {} ...".format(
                    "with" if with_rec_fn else "without", num_threads), end=" ")
                sys.stdout.flush()

                timer = benchmark_utils.Timer(
                    stmt="model(*inputs)",
                    globals={"model": model, "inputs": inputs},
                    description=model_name,
                    label="Record function overhead",
                    sub_label=f"with{'' if with_rec_fn else 'out'}_rec_fn, num_threads {num_threads}",
                    num_threads=num_threads)
                result = timer.blocked_autorange(min_run_time=bench_args.timer_min_run_time)
                print("finished")
                print(result)
                sys.stdout.flush()
                results.append(result)

    comparison = benchmark_utils.Compare(results)
    comparison.trim_significant_figures()
    comparison.highlight_warnings()
    comparison.print()
bbox = np.array([250, 250, 300, 300])
out_size = 125
x = np.random.randn(600, 600, 3)
y: torch.Tensor = kornia.image_to_tensor(x, keepdim=False)  # .to('cuda')  # BxCxHxW

# a = crop_chw(x, bbox, out_size)
# b = crop_chw_torch(x, bbox, out_size)
# print(a.shape)
# print(b.shape)

import torch.utils.benchmark as benchmark

t0 = benchmark.Timer(
    stmt='crop_chw(x, box, 125)',
    setup='from __main__ import crop_chw',
    globals={'x': x, 'box': np.array([250, 250, 300, 300])})

t1 = benchmark.Timer(
    stmt='crop_chw_torch(x, box, 125, device="cpu")',
    setup='from __main__ import crop_chw_torch',
    globals={'x': y, 'box': np.array([250, 250, 300, 300])})

print(t0.timeit(1000))
print(t1.timeit(1000))

# assert a.allclose(b)
def benchmark_multihead_attention(
    label="",
    attn_dtype=torch.uint8,
    key_padding_dtype=torch.uint8,
    add_bias_kv=False,
    add_zero_attn=False,
    static_kv=False,
    batch_size=20,
    embedding=EMB,
    seq_len=SEQ,
    num_heads=HEADS,
):
    results = []
    # device = torch.device("cuda")

    xformers_att_config = '{"name": "scaled_dot_product"}'

    attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len)
    key_padding_mask = _get_mask(to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len)

    q = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    k = torch.rand(seq_len, batch_size, embedding, requires_grad=True)
    v = torch.rand(seq_len, batch_size, embedding, requires_grad=True)

    _reset_seeds()

    original_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=None,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    xformers_mha = MultiheadAttention(
        embedding,
        num_heads,
        dropout=0.0,
        xformers_att_config=xformers_att_config,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
    )

    def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        original_mha(query=q, key=k, value=v, key_padding_mask=key_padding_mask,
                     attn_mask=attn_mask, static_kv=static_kv)

    def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv):
        xformers_mha(query=q, key=k, value=v, key_padding_mask=key_padding_mask,
                     attn_mask=attn_mask, static_kv=static_kv)

    def original_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = original_mha(query=q, key=k, value=v,
                                 key_padding_mask=key_padding_mask,
                                 attn_mask=attn_mask, static_kv=static_kv)
        loss = torch.norm(output)
        loss.backward()

    def xformers_bench_fw_bw(q, k, v, key_padding_mask, attn_mask, static_kv):
        output, _ = xformers_mha(query=q, key=k, value=v,
                                 key_padding_mask=key_padding_mask,
                                 attn_mask=attn_mask, static_kv=static_kv)
        loss = torch.norm(output)
        loss.backward()

    fns = [
        original_bench_fw,
        xformers_bench_fw,
        original_bench_fw_bw,
        xformers_bench_fw_bw,
    ]

    for fn in fns:
        results.append(
            benchmark.Timer(
                stmt="fn(q, k, v, key_padding_mask, attn_mask, static_kv)",
                globals={
                    "q": q,
                    "k": k,
                    "v": v,
                    "key_padding_mask": key_padding_mask,
                    "attn_mask": attn_mask,
                    "static_kv": static_kv,
                    "fn": fn,
                },
                label="multihead fw + bw",
                sub_label=f"{fn.__name__}",
                description=label,
            ).blocked_autorange(min_run_time=1))

    compare = benchmark.Compare(results)
    compare.print()
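# Note: `_get_mask` and `_reset_seeds` are helpers from the surrounding benchmark
# script and are not reproduced here. The stand-ins below are purely illustrative
# assumptions: a fixed seed and a 50% random mask cast to the requested dtype, which
# may differ from the original logic.
import torch


def _reset_seeds(seed=0):
    # Pin RNG state so both attention variants are initialized identically.
    torch.manual_seed(seed)


def _get_mask(to_dtype, dim0, dim1):
    # Random binary mask of shape (dim0, dim1), cast to e.g. torch.uint8 or torch.bool.
    return (torch.rand(dim0, dim1) > 0.5).to(to_dtype)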
]

serialized_results = []
repeats = 2
timers = [
    benchmark_utils.Timer(
        stmt=stmt,
        globals={
            "scipy_coo_matmul": scipy_coo_matmul,
            "torch_backward": torch_backward,
            "sparse_torch_backward": sparse_torch_backward,
            "scipy_varx": to_coo_scipy(x),
            "scipy_vary": to_coo_scipy(y),
            "tx": x,
            "ty": y,
            "tx_cuda": x.cuda(),
            "ty_cuda": y.cuda(),
            "dense_cuda_x": x.to_dense().cuda(),
            "dense_cuda_y": y.to_dense().cuda(),
            "dense_x": x.to_dense(),
            "dense_y": y.to_dense(),
        },
        label=label,
        sub_label=sub_label,
        description=f"{sparsity}",
        env=device,
        # num_threads=num_threads,
    )
    for hidden_size in [512]
    for sparsity in [0.5, 0.7, 0.8, 0.9, 0.95, 0.98]
    for label, device, sub_label, stmt in tasks
    for num_threads in [1, 4, 8, 16]
    for x, y in load_dataset(dataset_path, hidden_size, sparsity)
            min_elements=12,
            max_elements=10000000,
            cuda=True,
            dtype=torch.half,
            max_allocation_bytes=1_000_000_000)
    ],
    seed=42)

res = []
for kernel_size in [2, 3, 5]:
    for tensors, tensor_params, params in fuzzer.take(20):
        sub_label = str(tensors['x'].size())
        res.append(
            benchmark.Timer(
                stmt=f'torch.nn.functional.max_pool3d(x, {kernel_size})',
                setup='',
                globals=tensors,
                label=f'max_pool3d, {kernel_size=}',
                sub_label=sub_label,
                description=f'{torch.__version__}').blocked_autorange(min_run_time=0.1))

torch_ver = str(torch.__version__)
torch_git_ver = torch_ver[torch_ver.index('+') + 1:]
with open(f'{torch_git_ver}.pkl', 'wb') as f:
    pickle.dump(res, f)

compare = benchmark.Compare(res)
# compare.colorize()
compare.print()