Example #1
 def test_utils_compute_queue_depth_when_no_cuda_events(self):
     # For traces with only CPU events, we expect an empty queue depth list.
     x = torch.ones((1024, 1024))
     with profile() as prof:
         for _ in range(5):
             x = x @ x
     basic_evaluation = _utils.BasicEvaluation(prof.profiler)
     self.assertFalse(basic_evaluation.compute_queue_depth())
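Note: each example is a unittest method taken from a test class, so it will not run as pasted. The scaffold below is only a sketch of the assumed surroundings: the class name is made up, the imports mirror what the snippets actually use, and torch's internal TestCase is chosen because Example #4 relies on assertExpectedInline. Example #3 also needs a CUDA device.

import time

import torch
from torch.profiler import _utils, profile
from torch.testing._internal.common_utils import TestCase, run_tests


class ProfilerUtilsExamples(TestCase):  # hypothetical class name
    # Paste the example test methods from this section into this class.
    pass


if __name__ == "__main__":
    run_tests()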
Example #2
 def test_utils_compute_self_time(self):
     with profile() as prof:
         t1 = torch.ones(1, requires_grad=True)
         t2 = torch.ones(1, requires_grad=True)
         z = torch.add(t1, t2)
         y = torch.ones(1)
         loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
         loss.backward()
     basic_eval = _utils.BasicEvaluation(prof.profiler)
     metrics = basic_eval.metrics
     self.assertGreater(len(metrics), 0)
     for event_key, event_metrics in metrics.items():
         self.assertEqual(
             event_metrics.self_time_ns,
             event_key.event.duration_time_ns - sum([
                 child.duration_time_ns
                 for child in event_key.event.children
             ]))
Example #3
 def test_utils_compute_queue_depth(self):
     x = torch.ones((8096, 8096), device="cuda")
     with profile() as prof:
         # First half: we want it to be compute bound.
         for _ in range(5):
             y = torch.mm(x, x)
         # Second half: we want it to be overhead bound,
         # so we synchronize and sleep.
         torch.cuda.synchronize()
         for _ in range(3):
             y[0] += 1
             time.sleep(0.1)
     basic_evaluation = _utils.BasicEvaluation(prof.profiler)
     # We can assume golden values because mm is compute intensive,
     # so kernels will queue up.
     # The later tensor indexing is overhead bound, and the sleep
     # makes sure each kernel finishes before the next dispatch.
     golden_queue_depth_list = [1, 2, 3, 4, 5, 1, 1, 1]
     for entry, golden in zip(basic_evaluation.compute_queue_depth(),
                              golden_queue_depth_list):
         self.assertEqual(entry.queue_depth, golden)
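The same evaluation object can also be inspected outside a test. A minimal sketch follows, assuming the queue-depth entries line up one-to-one with cuda_events (the same pairing Example #4 below uses); print_queue_depth is just an illustrative name, not part of the API.

def print_queue_depth(prof):
    # Pair each queue-depth entry with the CUDA-related event it belongs to,
    # mirroring how Example #4 zips queue_depth_list with cuda_events.
    evaluation = _utils.BasicEvaluation(prof.profiler)
    for entry, event in zip(evaluation.compute_queue_depth(),
                            evaluation.cuda_events):
        print(f"{entry.queue_depth:4d}  {event.name()}")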
Example #4
    def test_utils_compute_queue_depth(self):
        def format_queue_depth(queue_depth_list, events):
            res = ""
            for data, event in zip(queue_depth_list, events):
                res += f"{data.queue_depth} [{event.name()}]\n"
            return res

        # We have to use a mock because real time-series data is too flaky to test.
        # generate_mock_profile() is a helper defined elsewhere in the test class
        # that builds a synthetic trace of CPU and GPU events.
        profiler = self.generate_mock_profile()
        basic_evaluation = _utils.BasicEvaluation(profiler)
        self.assertExpectedInline(
            format_queue_depth(basic_evaluation.queue_depth_list,
                               basic_evaluation.cuda_events), """\
1 [cudaLaunchKernel]
2 [cudaLaunchKernel]
3 [cudaLaunchKernel]
2 [GPU]
1 [GPU]
0 [GPU]
""")
        self.assertExpectedInline(
            format_queue_depth([
                basic_evaluation.metrics[k]
                for k in basic_evaluation.event_keys
            ], basic_evaluation.events), """\
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
0 [CPU (Before cudaLaunchKernel)]
1 [CPU (After cudaLaunchKernel)]
2 [CPU (After cudaLaunchKernel)]
3 [CPU (After cudaLaunchKernel)]
2 [CPU (After GPU)]
1 [CPU (After GPU)]
0 [CPU (After GPU)]
0 [CPU (No Event)]
""")