def test_utils_compute_queue_depth_when_no_cuda_events(self):
    """Check that a trace without CUDA events yields an empty queue depth list."""
    # The tensor is created without a device argument, so the profiled
    # matmuls are expected to produce CPU events only.
    mat = torch.ones((1024, 1024))
    with profile() as prof:
        for _ in range(5):
            mat = mat @ mat
    evaluation = _utils.BasicEvaluation(prof.profiler)
    queue_depth = evaluation.compute_queue_depth()
    # An empty (falsy) result means no queue depth entries were produced.
    self.assertFalse(queue_depth)
def test_utils_compute_self_time(self):
    """Check that each event's self time is its duration minus its children's."""
    # Profile a small forward/backward pass so the trace contains nested events.
    with profile() as prof:
        lhs = torch.ones(1, requires_grad=True)
        rhs = torch.ones(1, requires_grad=True)
        logits = torch.add(lhs, rhs)
        target = torch.ones(1)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, target)
        loss.backward()

    evaluation = _utils.BasicEvaluation(prof.profiler)
    metrics = evaluation.metrics
    self.assertTrue(len(metrics) > 0)

    for key, event_metrics in metrics.items():
        event = key.event
        # Time spent in direct children is excluded from the event's self time.
        children_ns = sum(child.duration_time_ns for child in event.children)
        self.assertEqual(event_metrics.self_time_ns,
                         event.duration_time_ns - children_ns)
def test_utils_compute_queue_depth(self):
    """Check queue depths for a compute-bound phase followed by an overhead-bound one.

    Allocates its input with ``device="cuda"``, so it needs a CUDA device.

    NOTE(review): a later definition in this file reuses this method name. If
    both live in the same class, the later one replaces this one and this test
    never runs -- confirm which version is intended to be kept.
    """
    x = torch.ones((8096, 8096), device="cuda")
    with profile() as prof:
        # First half: we want it to be compute bound.
        for _ in range(5):
            y = torch.mm(x, x)
        # Second half: we want it to be overhead bound,
        # so we synchronize and then sleep between dispatches.
        torch.cuda.synchronize()
        for _ in range(3):
            y[0] += 1
            time.sleep(0.1)
    basic_evaluation = _utils.BasicEvaluation(prof.profiler)
    # We can assume this golden list because mm is compute intensive,
    # so kernels will queue up.
    # The later tensor indexing is overhead bound, and the sleep makes
    # sure each kernel has finished before the next dispatch.
    golden_queue_depth_list = [1, 2, 3, 4, 5, 1, 1, 1]
    queue_depth_list = basic_evaluation.compute_queue_depth()
    # zip() stops at the shorter input, so an empty result would skip every
    # comparison below and let the test pass without checking anything.
    self.assertTrue(queue_depth_list)
    for entry, golden in zip(queue_depth_list, golden_queue_depth_list):
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(entry.queue_depth, golden)
def test_utils_compute_queue_depth(self):
    """Check queue depth results computed from a mock profile."""

    def format_queue_depth(queue_depth_list, events):
        # Emit one "<queue depth> [<event name>]" line per (entry, event) pair.
        return "".join(
            f"{data.queue_depth} [{event.name()}]\n"
            for data, event in zip(queue_depth_list, events))

    # We have to use Mock because time series data is too flaky to test
    profiler = self.generate_mock_profile()
    basic_evaluation = _utils.BasicEvaluation(profiler)

    # Queue depth entries paired with the CUDA events.
    cuda_report = format_queue_depth(basic_evaluation.queue_depth_list,
                                     basic_evaluation.cuda_events)
    self.assertExpectedInline(cuda_report, """\
1 [cudaLaunchKernel]
2 [cudaLaunchKernel]
3 [cudaLaunchKernel]
2 [GPU]
1 [GPU]
0 [GPU]
""")

    # Per-event metrics, looked up in event_keys order, paired with the events.
    metrics_by_key = [
        basic_evaluation.metrics[key] for key in basic_evaluation.event_keys
    ]
    event_report = format_queue_depth(metrics_by_key, basic_evaluation.events)
    self.assertExpectedInline(event_report, """\
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
1 [CPU (After cudaLaunchKernel)]
2 [CPU (After cudaLaunchKernel)]
3 [CPU (After cudaLaunchKernel)]
2 [CPU (After GPU)]
1 [CPU (After GPU)]
0 [CPU (After GPU)]
0 [CPU (No Event)]
""")