def test_collect_callgrind(self):
    """End-to-end test of `Timer.collect_callgrind`.

    Checks, in order:
      1. Un-wrapped globals are rejected with a ValueError.
      2. A global that cannot be re-imported in the subprocess surfaces as
         an OSError wrapping the subprocess's pickle AttributeError.
      3. A well-formed invocation (TorchScript fn, int, and an nn.Module with
         an explicit import `setup`) produces a positive integer count.
      4. `repeats=10` returns a tuple of stats whose totals are at least
         somewhat repeatable.
      5. The JIT'd bindings module is unused in normal operation.
    """
    with self.assertRaisesRegex(
        ValueError,
        r"`collect_callgrind` requires that globals be wrapped "
        r"in `CopyIfCallgrind` so that serialization is explicit."
    ):
        benchmark_utils.Timer(
            "pass", globals={"x": 1}
        ).collect_callgrind(collect_baseline=False)

    with self.assertRaisesRegex(
        # Subprocess raises AttributeError (from pickle),
        # _ValgrindWrapper re-raises as generic OSError.
        OSError, "AttributeError: Can't get attribute 'MyModule'"
    ):
        benchmark_utils.Timer(
            "model(1)",
            globals={"model": benchmark_utils.CopyIfCallgrind(MyModule())}
        ).collect_callgrind(collect_baseline=False)

    @torch.jit.script
    def add_one(x):
        return x + 1

    timer = benchmark_utils.Timer(
        "y = add_one(x) + k",
        setup="x = torch.ones((1,))",
        globals={
            "add_one": benchmark_utils.CopyIfCallgrind(add_one),
            "k": benchmark_utils.CopyIfCallgrind(5),
            # nn.Modules travel by pickle, so the subprocess needs an import
            # path for MyModule; supply it via the per-global `setup`.
            "model": benchmark_utils.CopyIfCallgrind(MyModule(), setup=f"""\
import sys
sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
from test_benchmark_utils import MyModule
""")
        })

    stats = timer.collect_callgrind(number=1000)
    counts = stats.counts(denoise=False)

    self.assertIsInstance(counts, int)
    self.assertGreater(counts, 0)

    stats = timer.collect_callgrind(number=1000, repeats=10)
    assert isinstance(stats, tuple)

    # Check that the repeats are at least somewhat repeatable.
    counts = collections.Counter([s.counts(denoise=True) for s in stats])
    # BUG FIX: the original passed the bound method `counts.values` (missing
    # call parentheses) to `max`, which raises TypeError at runtime. It must
    # be `counts.values()` as in the sibling reproducibility tests.
    self.assertGreater(
        max(counts.values()), 1,
        f"Every instruction count total was unique: {counts}")

    from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
    self.assertIsNone(
        wrapper_singleton()._bindings_module,
        "JIT'd bindings are only for back testing.")
def collect_callgrind(self, number=100, collect_baseline=True):
    """Collect Callgrind instruction counts for `stmt` in a subprocess.

    Validates that the measurement can be serialized across the process
    boundary (string `stmt`, no user globals, real `torch`), smoke-tests the
    statement locally, then delegates to the valgrind wrapper singleton.
    """
    if not isinstance(self._task_spec.stmt, str):
        raise ValueError(
            "`collect_callgrind` currently only supports string `stmt`")

    # The only globals we tolerate are the ones injected automatically:
    # __init__ adds torch, and Timer adds __builtins__. Anything user-supplied
    # would have to cross the process boundary, which is not supported here.
    permitted = {"torch", "__builtins__"}
    unexpected = [key for key in self._globals if key not in permitted]
    if unexpected:
        raise ValueError(
            "`collect_callgrind` does not currently support passing globals. "
            "Please define a `setup` str instead.")

    if self._globals.get("torch", torch) is not torch:
        raise ValueError(
            "`collect_callgrind` does not support mocking out `torch`.")

    # Run the statement once locally. This doesn't guarantee the subprocess
    # will succeed, but a faulty `stmt` or `setup` fails faster and with a
    # clearer error here than inside the valgrind subprocess.
    self._timer.timeit(1)

    wrapper = valgrind_timer_interface.wrapper_singleton()
    return wrapper.collect_callgrind(
        stmt=self._task_spec.stmt,
        setup=self._task_spec.setup,
        number=number,
        num_threads=self._task_spec.num_threads,
        collect_baseline=collect_baseline)
def collect_callgrind(
    self,
    number: int = 100,
    *,
    collect_baseline: bool = True,
    retain_out_file: bool = False,
) -> valgrind_timer_interface.CallgrindStats:
    """Collect instruction counts using Callgrind.

    Unlike wall times, instruction counts are deterministic (modulo
    non-determinism in the program itself and small amounts of jitter from
    the Python interpreter.) This makes them ideal for detailed performance
    analysis. This method runs `stmt` in a separate process so that Valgrind
    can instrument the program. Performance is severely degraded due to the
    instrumentation, however this is ameliorated by the fact that a small
    number of iterations is generally sufficient to obtain good measurements.

    In order to use this method `valgrind`, `callgrind_control`, and
    `callgrind_annotate` must be installed.

    Because there is a process boundary between the caller (this process)
    and the `stmt` execution, `globals` cannot contain arbitrary in-memory
    data structures. (Unlike timing methods) Instead, globals are restricted
    to builtins, `nn.Modules`'s, and TorchScripted functions/modules to
    reduce the surprise factor from serialization and subsequent
    deserialization. The `GlobalsBridge` class provides more detail on this
    subject. Take particular care with nn.Modules: they rely on pickle and
    you may need to add an import to `setup` for them to transfer properly.

    By default, a profile for an empty statement will be collected and
    cached to indicate how many instructions are from the Python loop which
    drives `stmt`.

    Returns:
        A `CallgrindStats` object which provides instruction counts and
        some basic facilities for analyzing and manipulating results.
    """
    if not isinstance(self._task_spec.stmt, str):
        raise ValueError(
            "`collect_callgrind` currently only supports string `stmt`")

    # Smoke test the statement locally. This doesn't guarantee success, but
    # it's much simpler and quicker to raise an exception for a faulty
    # `stmt` or `setup` in the parent process rather than the valgrind
    # subprocess.
    self._timer.timeit(1)

    is_python = (self._language == Language.PYTHON)
    # Only Python measurements may carry globals across the boundary.
    assert is_python or not self._globals

    wrapper = valgrind_timer_interface.wrapper_singleton()
    return wrapper.collect_callgrind(
        task_spec=self._task_spec,
        globals=self._globals,
        number=number,
        # The empty-statement baseline is a Python-loop artifact; it is
        # meaningless (and skipped) for C++ measurements.
        collect_baseline=collect_baseline and is_python,
        is_python=is_python,
        retain_out_file=retain_out_file,
    )
def test_collect_callgrind(self):
    """Exercise `Timer.collect_callgrind`: rejection of raw globals,
    serialization failure of an un-importable module, and a successful
    count collection over wrapped globals."""
    # Raw (un-wrapped) globals must be rejected.
    with self.assertRaisesRegex(
            ValueError,
            r"`collect_callgrind` requires that globals be wrapped "
            r"in `CopyIfCallgrind` so that serialization is explicit."):
        benchmark_utils.Timer("pass", globals={"x": 1}).collect_callgrind(
            collect_baseline=False)

    # Subprocess raises AttributeError (from pickle),
    # _ValgrindWrapper re-raises as generic OSError.
    with self.assertRaisesRegex(
            OSError, "AttributeError: Can't get attribute 'MyModule'"):
        benchmark_utils.Timer(
            "model(1)",
            globals={"model": benchmark_utils.CopyIfCallgrind(MyModule())},
        ).collect_callgrind(collect_baseline=False)

    @torch.jit.script
    def add_one(x):
        return x + 1

    wrapped_globals = {
        "add_one": benchmark_utils.CopyIfCallgrind(add_one),
        "k": benchmark_utils.CopyIfCallgrind(5),
        # The module travels by pickle, so the subprocess needs an import
        # path for MyModule; provide it via the per-global `setup`.
        "model": benchmark_utils.CopyIfCallgrind(MyModule(), setup=f"""\
import sys
sys.path.append({repr(os.path.split(os.path.abspath(__file__))[0])})
from test_benchmark_utils import MyModule
"""),
    }
    timer = benchmark_utils.Timer(
        "y = add_one(x) + k",
        setup="x = torch.ones((1,))",
        globals=wrapped_globals,
    )

    # Don't collect baseline to speed up unit test by ~30 seconds.
    stats = timer.collect_callgrind(number=1000, collect_baseline=False)
    counts = stats.counts(denoise=False)
    self.assertIsInstance(counts, int)
    self.assertGreater(counts, 0)

    from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton
    self.assertIsNone(
        wrapper_singleton()._bindings_module,
        "JIT'd bindings are only for back testing.",
    )
"x += 1", setup="x = torch.ones((1,))", ) stats = timer.collect_callgrind(number=1000, repeats=20) assert isinstance(stats, tuple) # Check that the repeats are at least somewhat repeatable. (within 10 instructions per iter) counts = collections.Counter( [s.counts(denoise=True) // 10_000 * 10_000 for s in stats]) self.assertGreater( max(counts.values()), 1, f"Every instruction count total was unique: {counts}") from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton self.assertIsNone(wrapper_singleton()._bindings_module, "JIT'd bindings are only for back testing.") @slowTest @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") def test_collect_cpp_callgrind(self): timer = benchmark_utils.Timer( "x += 1;", setup="torch::Tensor x = torch::ones({1});", timer=timeit.default_timer, language="c++", ) stats = [timer.collect_callgrind() for _ in range(3)] counts = [s.counts() for s in stats]
# test reproducibility. timer = benchmark_utils.Timer( "x += 1", setup="x = torch.ones((1,))", ) stats = timer.collect_callgrind(number=1000, repeats=20) assert isinstance(stats, tuple) # Check that the repeats are at least somewhat repeatable. (within 10 instructions per iter) counts = collections.Counter([s.counts(denoise=True) // 10_000 * 10_000 for s in stats]) self.assertGreater(max(counts.values()), 1, f"Every instruction count total was unique: {counts}") from torch.utils.benchmark.utils.valgrind_wrapper.timer_interface import wrapper_singleton self.assertIsNone( wrapper_singleton()._bindings_module, "JIT'd bindings are only for back testing." ) @slowTest @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") @unittest.skipIf(True, "Failing on clang, see 74398") def test_collect_cpp_callgrind(self): timer = benchmark_utils.Timer( "x += 1;", setup="torch::Tensor x = torch::ones({1});", timer=timeit.default_timer, language="c++", ) stats = [