Example 1
    def test_kineto_profiler_api(self):
        called_num = [0]

        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        with profile(activities=supported_activities()):
            self.payload(use_cuda=use_cuda)

        def trace_handler(p):
            output = p.key_averages().table(
                sort_by="self_cuda_time_total"
                if use_cuda else "self_cpu_time_total",
                row_limit=-1)
            # print(output)
            # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
            called_num[0] += 1

        with profile(activities=supported_activities(),
                     schedule=torch.profiler.schedule(wait=1,
                                                      warmup=1,
                                                      active=2),
                     on_trace_ready=trace_handler) as p:
            for idx in range(8):
                self.payload(use_cuda=use_cuda)
                p.step()

        self.assertEqual(called_num[0], 2)

        # case without schedule
        with profile(activities=supported_activities()) as p:
            self.payload(use_cuda=use_cuda)
            self.payload(use_cuda=use_cuda)
        output = p.key_averages().table(sort_by="self_cuda_time_total"
                                        if use_cuda else "self_cpu_time_total",
                                        row_limit=-1)
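A minimal sketch (assuming only that torch is installed) of the schedule arithmetic behind the assertEqual(called_num[0], 2) check above: torch.profiler.schedule(wait=1, warmup=1, active=2) defines a 4-step cycle, and on_trace_ready fires once per completed active phase, so 8 steps yield 2 handler calls.

# Print the ProfilerAction chosen for each of the 8 steps.
# With wait=1, warmup=1, active=2 the per-cycle sequence is
# NONE, WARMUP, RECORD, RECORD_AND_SAVE; the handler runs after each
# RECORD_AND_SAVE step, hence 2 calls over 8 steps.
import torch

sched = torch.profiler.schedule(wait=1, warmup=1, active=2)
for step in range(8):
    print(step, sched(step))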
Example 2
    def test_kineto_profiler_api(self):
        called_num = [0]

        with _profile(use_cuda=True, use_kineto=True):
            self.payload()

        def trace_handler(p):
            print(p.key_averages().table(sort_by="self_cuda_time_total",
                                         row_limit=-1))
            # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
            called_num[0] += 1

        with profile(activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
        ],
                     schedule=torch.profiler.schedule(wait=1,
                                                      warmup=1,
                                                      active=2),
                     on_trace_ready=trace_handler) as p:
            for idx in range(8):
                self.payload()
                p.step()

        self.assertEqual(called_num[0], 2)

        # case without schedule
        with profile(activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
        ]) as p:
            self.payload()
            self.payload()
        print(p.key_averages().table(sort_by="self_cuda_time_total",
                                     row_limit=-1))
Example 3
    def test_tensorboard_trace_handler(self):
        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        with _profile(use_cuda=use_cuda, use_kineto=True):
            self.payload(use_cuda=use_cuda)

        with TemporaryDirectoryName() as dname:
            with profile(
                    activities=[torch.profiler.ProfilerActivity.CPU] +
                ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
                    schedule=torch.profiler.schedule(wait=1,
                                                     warmup=1,
                                                     active=2,
                                                     repeat=3),
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        dname)) as p:
                for _ in range(18):
                    self.payload(use_cuda=use_cuda)
                    p.step()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
                file_num += 1
            self.assertEqual(file_num, 3)

        # test case for gzip file format
        with TemporaryDirectoryName() as dname:
            p = profile(
                activities=[torch.profiler.ProfilerActivity.CPU] +
                ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
                schedule=torch.profiler.schedule(wait=1,
                                                 warmup=1,
                                                 active=2,
                                                 repeat=3),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    dname, use_gzip=True))
            p.start()
            for _ in range(18):
                self.payload(use_cuda=use_cuda)
                p.step()
            p.stop()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-5].isdigit() and int(parts[-5]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-4:], ['pt', 'trace', 'json', 'gz'])
                file_num += 1
            self.assertEqual(file_num, 3)
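The filename assertions above depend on the naming pattern that torch.profiler.tensorboard_trace_handler uses, roughly "{worker_name}.{timestamp}.pt.trace.json", with ".gz" appended when use_gzip=True. A minimal sketch (the matmul payload and step count are illustrative assumptions):

import os
import tempfile

import torch
from torch.profiler import (ProfilerActivity, profile, schedule,
                            tensorboard_trace_handler)

with tempfile.TemporaryDirectory() as dname:
    with profile(activities=[ProfilerActivity.CPU],
                 schedule=schedule(wait=1, warmup=1, active=2, repeat=1),
                 on_trace_ready=tensorboard_trace_handler(dname)) as p:
        for _ in range(8):
            torch.ones((64, 64)) @ torch.ones((64, 64))
            p.step()
    # e.g. ['myhost_12345.1690000000000.pt.trace.json']
    print(os.listdir(dname))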
Example 4
    def test_extra_fields(self):
        with profile(with_stack=True, profile_memory=True) as p:
            _ = torch.ones((1, ))

        def find_ones(nodes):
            for n in nodes:
                if n.name() == "aten::ones":
                    return n
                result = find_ones(n.children)
                if result:
                    return result

        node = find_ones(p.profiler.kineto_results.experimental_event_tree())
        self.assertIsNotNone(node)

        self.assertIsInstance(node.extra_fields,
                              torch._C._autograd._ExtraFields_TorchOp)

        self.assertIsInstance(node.parent.extra_fields,
                              torch._C._autograd._ExtraFields_PyCCall)

        self.assertEqual(node.children[0].name(), "aten::empty")
        self.assertEqual(node.children[0].children[0].name(), "[memory]")
        self.assertIsInstance(node.children[0].children[0].extra_fields,
                              torch._C._autograd._ExtraFields_Allocation)
Example 5
    def test_kineto_multigpu(self):
        with profile(activities=[ProfilerActivity.CPU,
                                 ProfilerActivity.CUDA]) as prof:
            for gpu_id in [0, 1]:
                x = torch.randn(10, 10).cuda(gpu_id)
                y = torch.randn(10, 10).cuda(gpu_id)
                z = x.matmul(y)

        found_gemm_0 = False
        found_gemm_1 = False
        found_cuda = False
        for evt in prof.events():
            if "gemm" in evt.name.lower(
            ) and evt.device_type == DeviceType.CUDA:
                if evt.device_index == 0:
                    found_gemm_0 = True
                elif evt.device_index == 1:
                    found_gemm_1 = True
            if "cuda" in evt.name.lower(
            ) and evt.device_type == DeviceType.CPU:
                found_cuda = True

        self.assertTrue(found_gemm_0)
        self.assertTrue(found_gemm_1)
        self.assertTrue(found_cuda)
Example 6
    def test_flops(self):
        model = torch.nn.Sequential(
            nn.Conv2d(16, 33, 18),
            nn.ReLU(),
            nn.Linear(243, 243),
            nn.ReLU(),
        )
        inputs = torch.randn(40, 16, 18, 260)
        with _profile(record_shapes=True,
                      with_flops=True,
                      use_kineto=kineto_available()) as prof:
            model(inputs)
        profiler_output = prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=10)
        self.assertIn("FLOPS", profiler_output)

        if not (kineto_available() and torch.cuda.is_available()):
            return

        with profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA
                ],
                record_shapes=True,
                with_flops=True,
        ) as kineto_profiler:
            model(inputs)
        profiler_output = kineto_profiler.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1)
        self.assertIn("FLOPS", profiler_output)
Example 7
    def test_tensorboard_trace_handler(self):
        with _profile(use_cuda=True, use_kineto=True):
            self.payload()

        with TemporaryDirectoryName() as dname:
            with profile(
                    activities=[
                        torch.profiler.ProfilerActivity.CPU,
                        torch.profiler.ProfilerActivity.CUDA
                    ],
                    schedule=torch.profiler.schedule(wait=1,
                                                     warmup=1,
                                                     active=2,
                                                     repeat=3),
                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
                        dname)) as p:
                for _ in range(18):
                    self.payload()
                    p.step()

            self.assertTrue(os.path.exists(dname))
            file_num = 0
            for file_name in os.listdir(dname):
                parts = file_name.split('.')
                self.assertTrue(len(parts) > 4)
                self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                                "Wrong tracing file name pattern")
                self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
                file_num += 1
            self.assertEqual(file_num, 3)
Example 8
    def inference(self, model, device):
        with torch.no_grad():
            # for data, target in zip(self.x_train, self.y_train):
            if self.params["profile_pytorch"]:
                try:
                    from torch.profiler import profile

                    with profile(record_shapes=True) as prof:
                        self.inner_loop(model)
                    profile_data = prof.key_averages().table(
                        sort_by="cpu_time_total",
                        row_limit=20,
                    )
                    print(profile_data)
                except Exception:
                    from .torchprof import Profile

                    # Profile using torchprof (TODO:profile_per_batch for all batches and epochs)
                    profile_cuda = self.device.type == "cuda"
                    with Profile(model, use_cuda=profile_cuda) as prof:
                        self.inner_loop(model)
                    data = prof.display(show_events=False)
                    profile_data = json.dumps(data,
                                              indent=4,
                                              separators=(",", ": "))

                self.params["profile_data"] = profile_data
            else:
                self.inner_loop(model)

        if self.params["nb_gpus"] > 0:
            torch.cuda.synchronize()
Example 9
 def test_utils_compute_queue_depth_when_no_cuda_events(self):
     # For traces with only cpu events, we expect empty queue depth list
     x = torch.ones((1024, 1024))
     with profile() as prof:
         for _ in range(5):
             x = x @ x
     basic_evaluation = _utils.BasicEvaluation(prof.profiler)
     self.assertFalse(basic_evaluation.compute_queue_depth())
Example 10
def profile_cuda_kernels(fn, args, string_id="Model time"):
    print("################################################")
    print(f"#### Profiling for {string_id} starts #########")
    print("################################################")
    warmup = 50
    old_args = args[:]
    n_repeats = 1
    n_layers = 1
    ref = fn(*old_args)
    gO = torch.rand_like(ref)
    for _ in range(0, warmup // n_layers):
        args = list(old_args[:])
        ref = fn(*args)
        ref.backward(gO)

    torch.cuda.synchronize()

    # Forward profile
    def fwd_run():
        for _ in range(0, n_repeats // n_layers):
            args = list(old_args[:])
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg.grad = None
            ref = fn(*args)

    print(f"###### Forward profile for {string_id} starts #####")
    with profile(activities=[ProfilerActivity.CUDA],
                 record_shapes=True) as prof:
        with record_function("baseline"):
            fwd_run()
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
    print(f"###### Forward profile for {string_id} ends #####")

    # Backward profile
    def bwd_run():
        for _ in range(0, n_repeats // n_layers):
            args = list(old_args[:])
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg.grad = None
            ref = fn(*args)

            print(f"###### Backward profile for {string_id} starts #####")
            torch.cuda.synchronize()
            with profile(activities=[ProfilerActivity.CUDA],
                         record_shapes=True) as prof:
                with record_function("baseline"):
                    ref.backward(gO)
            print(prof.key_averages().table(sort_by="cuda_time_total",
                                            row_limit=30))
            torch.cuda.synchronize()
            print(f"###### Backward profile for {string_id} ends #####")

    bwd_run()
    print("################################################")
    print(f"#### Profiling for {string_id} ends #########")
    print("################################################\n\n\n\n")
Example 11
def dump_chrome_trace(f,
                      input,
                      trace_filename,
                      optimize_ctx,
                      activities,
                      num_runs=1,
                      devices=None,
                      kwargs_for_f=None,
                      kwargs_for_profiler=None):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
    Return total runtime without the profiler

    Outputs to trace_filename
    """

    if devices is None:
        devices = ["cuda"]

    global synchronize
    # `synchronize` is expected to be defined at module level (e.g. a no-op
    # for CPU-only runs); bind it to torch.cuda.synchronize when timing CUDA.
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
            synchronize()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
            synchronize()
        t1 = time.perf_counter()
    timing = t1 - t0

    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
                synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing
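A hypothetical usage sketch for dump_chrome_trace (the profiled function, tensors, and output path below are assumptions; a CUDA device is assumed so that the module-level synchronize gets bound to torch.cuda.synchronize):

import contextlib

import torch
from torch.profiler import ProfilerActivity

elapsed = dump_chrome_trace(
    f=lambda x: torch.relu(x).sum(),
    input=torch.randn(1024, 1024, device="cuda"),
    trace_filename="relu_trace.json",
    optimize_ctx=contextlib.nullcontext(),  # no optimization context
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    num_runs=10,
)
print(f"runtime without profiler: {elapsed:.4f} s")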
Example 12
    def test_profiler_type(self):
        profiler_type = torch._C._autograd._profiler_type
        ActiveProfilerType = torch._C._autograd.ActiveProfilerType
        self.assertEqual(profiler_type(), ActiveProfilerType.NONE)

        # Autograd profiler
        with _profile_legacy():
            self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY)

        # Kineto profiler
        with profile():
            self.assertEqual(profiler_type(), ActiveProfilerType.KINETO)
Example 13
def profile_it(f, inp):
    for _ in range(5):
        f(inp)

    itr = 5
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
        for _ in range(itr):
            f(inp)

    timing = prof.key_averages()
    cuda_time_total = 0
    for e in timing:
        cuda_time_total = cuda_time_total + e.cuda_time_total
    return cuda_time_total / itr
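A hypothetical usage sketch for profile_it (the model and input are assumptions; a CUDA device is required, and cuda_time_total is reported by the event API in microseconds, so the return value is the average CUDA time per iteration in microseconds):

import torch

model = torch.nn.Linear(1024, 1024).cuda()
inp = torch.randn(64, 1024, device="cuda")

avg_cuda_us = profile_it(lambda x: model(x), inp)
print(f"~{avg_cuda_us:.1f} us of CUDA time per iteration")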
Example 14
    def test_profiler_metadata(self):
        t1, t2 = torch.ones(1), torch.ones(1)
        with profile() as prof:
            torch.add(t1, t2)
            prof.add_metadata("test_key1", "test_value1")
            prof.add_metadata_json("test_key2", "[1,2,3]")

        with TemporaryFileName(mode="w+") as fname:
            prof.export_chrome_trace(fname)
            with io.open(fname, 'r') as f:
                trace = json.load(f)
                assert "test_key1" in trace
                assert trace["test_key1"] == "test_value1"
                assert "test_key2" in trace
                assert trace["test_key2"] == [1, 2, 3]
Example 15
 def __init__(self,
              record_func_name='inference',
              activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
              record_shapes=False,
              profile_memory=True,
              scheduler=schedule(wait=1, warmup=1, active=2),
              trace_handler=tensorboard_trace_handler('./log')):
     self.activities = activities
     self.profile = profile(activities=activities,
                            record_shapes=record_shapes,
                            profile_memory=profile_memory,
                            with_flops=True,
                            schedule=scheduler,
                            on_trace_ready=trace_handler)
     self.record_function = record_function(record_func_name)
Example 16
    def test_execution_graph_with_kineto(self):
        trace_called_num = 0

        def trace_handler(p):
            nonlocal trace_called_num
            trace_called_num += 1

        use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
        # Create a temp file to save execution graph data.
        fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
        fp.close()
        expected_loop_events = 0
        eg = ExecutionGraphObserver()
        eg.register_callback(fp.name)
        with profile(
                activities=supported_activities(),
                schedule=torch.profiler.schedule(skip_first=3,
                                                 wait=1,
                                                 warmup=1,
                                                 active=2),
                on_trace_ready=trace_handler,
        ) as p:
            eg.start()
            for idx in range(10):
                expected_loop_events += 1
                with record_function(f"## LOOP {idx} ##"):
                    self.payload(use_cuda=use_cuda)
                p.step()
            eg.stop()

        eg.unregister_callback()

        assert trace_called_num == 2
        assert fp.name == eg.get_output_file_path()
        nodes = self.get_execution_graph_root(fp.name)
        found_root_node = False
        loop_count = 0
        for n in nodes:
            assert "name" in n
            if "[pytorch|profiler|execution_graph|process]" in n["name"]:
                found_root_node = True
            if n["name"].startswith("## LOOP "):
                loop_count += 1
        assert found_root_node
        assert loop_count == expected_loop_events
Example 17
    def bwd_run():
        for _ in range(0, n_repeats // n_layers):
            args = list(old_args[:])
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg.grad = None
            ref = fn(*args)

            print(f"###### Backward profile for {string_id} starts #####")
            torch.cuda.synchronize()
            with profile(activities=[ProfilerActivity.CUDA],
                         record_shapes=True) as prof:
                with record_function("baseline"):
                    ref.backward(gO)
            print(prof.key_averages().table(sort_by="cuda_time_total",
                                            row_limit=30))
            torch.cuda.synchronize()
            print(f"###### Backward profile for {string_id} ends #####")
Example 18
 def test_utils_compute_self_time(self):
     with profile() as prof:
         t1, t2 = torch.ones(1, requires_grad=True), torch.ones(
             1, requires_grad=True)
         z = torch.add(t1, t2)
         y = torch.ones(1)
         loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
         loss.backward()
     basic_eval = _utils.BasicEvaluation(prof.profiler)
     metrics = basic_eval.metrics
     self.assertTrue(len(metrics) > 0)
     for event_key, event_metrics in metrics.items():
         self.assertEqual(
             event_metrics.self_time_ns,
             event_key.event.duration_time_ns - sum([
                 child.duration_time_ns
                 for child in event_key.event.children
             ]))
Example 19
def profile_conv_runtimes(model, filename):
    model = model.cuda()
    inputs = torch.randn(32, 3, 224, 224).cuda()
    with profile(activities=[ProfilerActivity.CUDA],
                 profile_memory=True,
                 record_shapes=True) as prof:
        with record_function("model_inference"):
            model(inputs)
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=10))
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cuda_time_total", row_limit=10))
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cuda_memory_usage", row_limit=10))
    prof.export_chrome_trace(filename + '.json')
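A hypothetical usage sketch for profile_conv_runtimes (torchvision and the output prefix are assumptions; the helper moves the model to CUDA, so a GPU is required):

import torchvision.models as models

# Profiles one forward pass of ResNet-18 and writes resnet18_profile.json,
# which can be opened in chrome://tracing or Perfetto.
profile_conv_runtimes(models.resnet18(), "resnet18_profile")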
Example 20
    def train_func():
        from ray.train.torch import TorchWorkerProfiler
        from torch.profiler import profile, record_function, schedule

        twp = TorchWorkerProfiler()
        with profile(
                activities=[],
                schedule=schedule(wait=0, warmup=0, active=1),
                on_trace_ready=twp.trace_handler,
        ) as p:

            for epoch in range(num_epochs):
                with record_function("test_function"):
                    pass

                p.step()

                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
Example 21
def train_func():
    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:

        # Setup model.
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)
        loss_fn = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Setup data.
        input = torch.randn(1000, 1)
        labels = input * 2
        dataset = torch.utils.data.TensorDataset(input, labels)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        dataloader = train.torch.prepare_data_loader(dataloader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in dataloader:
                    pred = model(X)
                    loss = loss_fn(pred, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            with record_function("train_checkpoint"):
                state_dict = model.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)

            p.step()

            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
Example 22
 def test_utils_compute_queue_depth(self):
     x = torch.ones((8096, 8096), device="cuda")
     with profile() as prof:
         # First half we want it to be compute bound
         for _ in range(5):
             y = torch.mm(x, x)
         # Second half we want it to be overhead bound,
         # so we synchronize and sleep.
         torch.cuda.synchronize()
         for _ in range(3):
             y[0] += 1
             time.sleep(0.1)
     basic_evaluation = _utils.BasicEvaluation(prof.profiler)
     # We can assume a golden value because mm is compute intensive,
     # so kernels will queue up.
     # The later tensor indexing is overhead bound, and the sleep
     # ensures each kernel finishes before the next dispatch.
     golden_queue_depth_list = [1, 2, 3, 4, 5, 1, 1, 1]
     for entry, golden in zip(basic_evaluation.compute_queue_depth(),
                              golden_queue_depth_list):
         self.assertTrue(entry.queue_depth == golden)
Example 23
            def wrap_forward(*args, **kwargs):
                """The forward pass is decorated and profiled here
                """
                # only torch 1.8.1+ is supported (note: this plain string
                # comparison of versions is fragile, e.g. '1.10' < '1.8.1')
                torch_version = torch.__version__
                if torch_version <= '1.8.1':
                    raise NotImplementedError(
                        "Profiler requires at least torch 1.8.1")

                # profile the forward pass
                with torch_profiler.profile(
                        activities=[
                            torch.profiler.ProfilerActivity.CPU,
                            torch.profiler.ProfilerActivity.CUDA,
                        ], profile_memory=self.profile_memory) as prof:
                    res = _forward(*args, **kwargs)

                event_list = prof.events()

                # each profile call should be contained in its own list
                self.trace_profile_events[path].append(event_list)
                return res
Example 24
 def test_profiler_correlation_id(self):
     '''
     We expect the correlation_id to be unique across multiple invocations of
     the profiler, so we reuse id_uniqueness_set.
     '''
     id_uniqueness_set = set()
     model = torch.nn.Sequential(
         nn.Conv2d(16, 33, 18),
         nn.ReLU(),
         nn.Linear(243, 243),
         nn.ReLU(),
     )
     inputs = torch.randn(40, 16, 18, 260)
     uint32_max = 2**32 - 1
     for i in range(5):
         with profile() as prof:
             model(inputs)
         for event in prof.profiler.kineto_results.events():
             corr_id = event.correlation_id()
             if (corr_id):
                 self.assertTrue(corr_id not in id_uniqueness_set)
                 id_uniqueness_set.add(corr_id)
                 self.assertTrue(corr_id < uint32_max)
Example 25
def profile(args, model, model_info, device):
    """
    Profile.
    :param model:
    :param model_info:
    :return:
    """
    import copy
    from torch.profiler import profile, record_function, ProfilerActivity

    model = copy.deepcopy(model)
    model = model.to(device)
    model.eval()

    inputs = tuple(
        torch.ones((args.batch_size, ) + model_info['input_shapes'][k][1:],
                   dtype=torch.float32).to(device)
        for k in model_info['input_names'])
    for x in inputs:
        print(x.shape, x.device)

    def trace_handler(p):
        output = p.key_averages().table(sort_by="self_cuda_time_total",
                                        row_limit=50)
        print(output)
        p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 schedule=torch.profiler.schedule(wait=2,
                                                  warmup=2,
                                                  active=6,
                                                  repeat=2),
                 on_trace_ready=trace_handler) as p:
        for idx in range(100):
            model(*inputs)
            p.step()
Example 26
#p.outShape = (1, 64, 1088, 1920)

getMemUsed = lambda i: torch.cuda.memory_stats(i)['reserved_bytes.all.peak']
t = torch.randn(shape, dtype=config.dtype(), device=config.device()) # pylint: disable=E1101
load = shape[-1] * shape[-2] * shape[0]
m = getMemUsed(config.device()) if config.cuda else None
print(config.dtype(), config.device(), m)
if config.cuda:
  p(t)
  #doCrop(p, t)
  getMemUsed(config.device())
  start = perf_counter()
  p(t)
  #doCrop(p, t).mean().cpu()
  print('time elapsed: {}'.format(perf_counter() - start))
  m = getMemUsed(config.device())
else:
  schedule1 = schedule(
    wait=1,
    warmup=1,
    active=1)
  with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule1, profile_memory=True) as pro:
    for _ in range(3):
      p(t)
      pro.step()
    avg = pro.key_averages()
    avg.sort(key=lambda o: o.cpu_memory_usage, reverse=True)
    m = avg[0].cpu_memory_usage
print(m, m / load, load)
Example 27
    def test_memory_profiler(self):
        def run_profiler(tensor_creation_fn):
            # collecting allocs / deallocs
            with _profile(profile_memory=True,
                          record_shapes=True,
                          use_kineto=kineto_available()) as prof:
                x = None
                with record_function("test_user_scope_alloc"):
                    x = tensor_creation_fn()
                with record_function("test_user_scope_dealloc"):
                    del x
            return prof.key_averages(group_by_input_shape=True)

        def check_metrics(stats, metric, allocs=None, deallocs=None):
            stat_metrics = {}
            for stat in stats:
                stat_metrics[stat.key] = getattr(stat, metric)
            if allocs is not None:
                for alloc_fn in allocs:
                    self.assertTrue(alloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[alloc_fn] > 0)
            if deallocs is not None:
                for dealloc_fn in deallocs:
                    self.assertTrue(dealloc_fn in stat_metrics)
                    self.assertTrue(stat_metrics[dealloc_fn] < 0)

        def create_cpu_tensor():
            return torch.rand(10, 10)

        def create_cuda_tensor():
            return torch.rand(10, 10).cuda()

        def create_mkldnn_tensor():
            return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()

        stats = run_profiler(create_cpu_tensor)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=[
                          "aten::empty",
                          "aten::rand",
                          "test_user_scope_alloc",
                      ],
                      deallocs=[
                          "test_user_scope_dealloc",
                      ])

        if kineto_available():
            with TemporaryFileName(mode="w+") as fname:
                with profile(profile_memory=True) as prof:
                    x = None
                    with record_function("test_user_scope_alloc"):
                        x = create_cpu_tensor()
                    with record_function("test_user_scope_dealloc"):
                        del x
                prof.export_chrome_trace(fname)
                with io.open(fname, 'r') as f:
                    trace = json.load(f)
                    assert "traceEvents" in trace
                    events = trace["traceEvents"]
                    found_memory_events = False
                    for evt in events:
                        assert "name" in evt
                        if evt["name"] == "[memory]":
                            found_memory_events = True
                            assert "args" in evt
                            assert "Device Type" in evt["args"]
                            assert "Device Id" in evt["args"]
                            assert "Bytes" in evt["args"]
                    assert found_memory_events

        if torch.cuda.is_available():
            create_cuda_tensor()
            stats = run_profiler(create_cuda_tensor)
            check_metrics(stats,
                          "cuda_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::to",
                              "aten::empty_strided",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "aten::rand",
                              "aten::empty",
                          ])

        if torch._C.has_mkldnn:
            create_mkldnn_tensor()
            stats = run_profiler(create_mkldnn_tensor)
            check_metrics(stats,
                          "cpu_memory_usage",
                          allocs=[
                              "test_user_scope_alloc",
                              "aten::rand",
                              "aten::empty",
                              "aten::to_mkldnn",
                          ],
                          deallocs=[
                              "test_user_scope_dealloc",
                          ])

        # check top-level memory events
        with _profile(profile_memory=True,
                      use_kineto=kineto_available()) as prof:
            x = torch.rand(10, 10)
            del x
            if torch.cuda.is_available():
                y = torch.rand(10, 10).cuda()
                del y
            gc.collect()
        stats = prof.key_averages(group_by_input_shape=True)
        check_metrics(stats,
                      "cpu_memory_usage",
                      allocs=["aten::rand", "aten::empty"],
                      deallocs=["[memory]"])
        if torch.cuda.is_available():
            check_metrics(stats, "cuda_memory_usage", deallocs=["[memory]"])
Example 28
def inference(model, dataloader, datatype, args):
    batch_time = AverageMeter('Time', ':6.3f')
    batch_size = args.batch_size
    warmup_iters = args.warmup_iterations
    max_iters = args.max_iterations if dataloader is None else len(dataloader)
    model.eval()
    coco = get_coco_api_from_dataset(dataloader.dataset)
    iou_types = ["bbox"]
    iou_types.append("segm")
    coco_evaluator = CocoEvaluator(coco, iou_types)
    if args.ipex:
        import intel_extension_for_pytorch as ipex
        model = model.to(memory_format=torch.channels_last)
        model = ipex.optimize(model,
                              dtype=datatype,
                              level="O1",
                              conv_bn_folding=False,
                              replace_dropout_with_identity=False)
        model.backbone = ipex.optimize(model.backbone,
                                       dtype=datatype,
                                       level="O1")
    else:
        if args.jit:
            model = model.to(memory_format=torch.channels_last)
        else:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model, dtype=datatype)
    if args.jit:
        x = torch.randn(batch_size, 3, 1200,
                        1200).to(memory_format=torch.channels_last)
        if args.precision == "bf16":
            with torch.cpu.amp.autocast(), torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone,
                                                 x,
                                                 strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
        else:
            with torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone,
                                                 x,
                                                 strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
    with torch.no_grad():
        if dataloader is None:
            print(
                "Models for detection tasks need a real dataset. You need to specify the COCO dataset."
            )
            exit(1)
        else:
            for i, batch in enumerate(dataloader):
                images = batch[0]
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        if i == warmup_iters:
                            with profile(
                                    activities=[ProfilerActivity.CPU],
                                    record_shapes=True
                            ) as prof, record_function("model_inference"):
                                output = model(images)
                        else:
                            output = model(images)
                else:
                    if i == warmup_iters:
                        with profile(
                                activities=[ProfilerActivity.CPU],
                                record_shapes=True) as prof, record_function(
                                    "model_inference"):
                            output = model(images)
                    else:
                        output = model(images)
                if i > warmup_iters:
                    break
            for i, batch in enumerate(dataloader):
                images = batch[0]
                end = time.time()
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        output = model(images)
                else:
                    output = model(images)
                batch_time.update(time.time() - end)
                output = [{k: v.to(torch.float32)
                           for k, v in t.items()} for t in output]
                res = {
                    target["image_id"].item(): output
                    for target, output in zip(batch[1], output)
                }
                coco_evaluator.update(res)
                if max_iters != -1 and i >= max_iters:
                    break
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))
    latency = batch_time.avg / batch_size * 1000
    perf = batch_size / batch_time.avg
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    print("Bbox AP: {:.5f} ".format(coco_evaluator.coco_eval['bbox'].stats[0]))
    print("Segm AP: {:.5f} ".format(coco_evaluator.coco_eval['segm'].stats[0]))
    print('Latency: %.3f ms' % latency)
    print("Throughput: {:.3f} fps".format(perf))
Example 29
def train(args):
    local_rank = int(os.environ['LOCAL_RANK'])
    verbose = local_rank == 0
    if verbose:
        print('Using PyTorch version:', torch.__version__)
        print(torch.__config__.show())

    dist.init_process_group(backend='nccl')
    world_size = dist.get_world_size()

    torch.manual_seed(0)
    torch.cuda.set_device(local_rank)

    # Set up standard model.
    if verbose:
        print(f'Using {args.model} model')
    model = getattr(models, args.model)()
    model = model.cuda()

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)

    model = DistributedDataParallel(model, device_ids=[local_rank])

    train_dataset = dataset_from_datadir(args.datadir, verbose=verbose)
    train_sampler = DistributedSampler(train_dataset)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batchsize,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    if args.profiler:
        th = None
        if args.profiler_format == 'tb':
            th = torch.profiler.tensorboard_trace_handler('./logs/profiler')
        prof = profile(
            schedule=torch.profiler.schedule(
                wait=1,  # number of steps not active
                warmup=1,  # warmup steps (tracing, but results discarded)
                active=10,  # tracing steps
                repeat=1),  # repeat procedure this many times
            on_trace_ready=th,
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            with_stack=True)
        prof.start()

    total_step = args.steps if args.steps is not None else len(train_loader)

    # For each block of printed steps
    last_start = datetime.now()
    last_images = 0

    # For final average
    avg_images = 0
    avg_start = None
    tot_steps = 0

    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if args.profiler:
                prof.step()

            li = len(images)
            last_images += li

            tot_steps += 1
            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if (i + 1) % args.print_steps == 0 and verbose:
                now = datetime.now()
                last_secs = (now - last_start).total_seconds()

                print(
                    f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
                    f'Loss: {loss.item():.4f}, '
                    f'Images/sec: {last_images*world_size/last_secs:.2f} '
                    f'(last {args.print_steps} steps)')

                last_start = now
                last_images = 0

            if args.steps is not None and tot_steps >= args.steps:
                break

    if args.profiler:
        if args.profiler_format == 'json' and verbose:
            trace_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
            trace_fname = f"profiler-trace-{trace_datetime}.json"
            print(f'Writing profiler trace to {trace_fname}')
            prof.export_chrome_trace(trace_fname)

        prof.stop()

    if verbose:
        if avg_start is None:
            print(
                "WARNING: stopped before warmup steps done, not printing stats."
            )
        else:
            dur = datetime.now() - avg_start
            print(f"Training completed in: {dur}")
            print(
                f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
                f"(average, skipping {args.warmup_steps} warmup steps)")
Example 30
def main(args):
    _logger.info('args:\n - %s',
                 '\n - '.join(str(it) for it in args.__dict__.items()))

    if args.file_fraction < 1:
        _logger.warning(
            'Use of `file-fraction` is not recommended in general -- prefer using `data-fraction` instead.'
        )

    # classification/regression mode
    if args.regression_mode:
        _logger.info('Running in regression mode')
        from utils.nn.tools import train_regression as train
        from utils.nn.tools import evaluate_regression as evaluate
    else:
        _logger.info('Running in classification mode')
        from utils.nn.tools import train_classification as train
        from utils.nn.tools import evaluate_classification as evaluate

    # training/testing mode
    training_mode = not args.predict

    # device
    if args.gpus:
        gpus = [int(i) for i in args.gpus.split(',')]
        dev = torch.device(gpus[0])
    else:
        gpus = None
        dev = torch.device('cpu')

    # load data
    if training_mode:
        train_loader, val_loader, data_config, train_input_names, train_label_names = train_load(
            args)
    else:
        test_loaders, data_config = test_load(args)

    if args.io_test:
        data_loader = train_loader if training_mode else list(
            test_loaders.values())[0]()
        iotest(args, data_loader)
        return

    model, model_info, network_module, network_options = model_setup(
        args, data_config)

    if args.print:
        return

    if args.profile:
        profile(args, model, model_info, device=dev)
        return

    # export to ONNX
    if args.export_onnx:
        onnx(args, model, data_config, model_info)
        return

    if args.tensorboard:
        from utils.nn.tools import TensorboardHelper
        tb = TensorboardHelper(tb_comment=args.tensorboard,
                               tb_custom_fn=args.tensorboard_custom_fn)
    else:
        tb = None

    # note: we should always save/load the state_dict of the original model, not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    orig_model = model

    if training_mode:
        model = orig_model.to(dev)
        # loss function
        try:
            loss_func = network_module.get_loss(data_config, **network_options)
            _logger.info('Using loss function %s with options %s' %
                         (loss_func, network_options))
        except AttributeError:
            loss_func = torch.nn.CrossEntropyLoss()
            _logger.warning(
                'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
                args.network_config)

        # optimizer & learning rate
        opt, scheduler = optim(args, model, dev)

        # multi-gpu
        if gpus is not None and len(gpus) > 1:
            # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
            model = torch.nn.DataParallel(model, device_ids=gpus)
        model = model.to(dev)

        # lr finder: keep it after all other setups
        if args.lr_finder is not None:
            start_lr, end_lr, num_iter = args.lr_finder.replace(' ',
                                                                '').split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model,
                                 opt,
                                 loss_func,
                                 device=dev,
                                 input_names=train_input_names,
                                 label_names=train_label_names)
            lr_finder.range_test(train_loader,
                                 start_lr=float(start_lr),
                                 end_lr=float(end_lr),
                                 num_iter=int(num_iter))
            # to inspect the loss-learning rate graph
            lr_finder.plot(output='lr_finder.png')
            return

        if args.use_amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler()
        else:
            scaler = None

        # training loop
        best_valid_metric = np.inf if args.regression_mode else 0
        for epoch in range(args.num_epochs):
            if args.load_epoch is not None:
                if epoch <= args.load_epoch:
                    continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model,
                  loss_func,
                  opt,
                  scheduler,
                  train_loader,
                  dev,
                  epoch,
                  steps_per_epoch=args.steps_per_epoch,
                  grad_scaler=scaler,
                  tb_helper=tb)
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                state_dict = model.module.state_dict() if isinstance(
                    model, torch.nn.DataParallel) else model.state_dict()
                torch.save(state_dict,
                           args.model_prefix + '_epoch-%d_state.pt' % epoch)
                torch.save(
                    opt.state_dict(),
                    args.model_prefix + '_epoch-%d_optimizer.pt' % epoch)

            _logger.info('Epoch #%d validating' % epoch)
            valid_metric = evaluate(model,
                                    val_loader,
                                    dev,
                                    epoch,
                                    loss_func=loss_func,
                                    steps_per_epoch=args.steps_per_epoch_val,
                                    tb_helper=tb)
            is_best_epoch = (valid_metric < best_valid_metric
                             if args.regression_mode else
                             valid_metric > best_valid_metric)
            if is_best_epoch:
                best_valid_metric = valid_metric
                if args.model_prefix:
                    shutil.copy2(
                        args.model_prefix + '_epoch-%d_state.pt' % epoch,
                        args.model_prefix + '_best_epoch_state.pt')
                    torch.save(model,
                               args.model_prefix + '_best_epoch_full.pt')
            _logger.info(
                'Epoch #%d: Current validation metric: %.5f (best: %.5f)' %
                (epoch, valid_metric, best_valid_metric),
                color='bold')

    if args.data_test:
        if training_mode:
            del train_loader, val_loader
            test_loaders, data_config = test_load(args)

        if not args.model_prefix.endswith('.onnx'):
            model = orig_model.to(dev)
            model_path = args.model_prefix if args.model_prefix.endswith(
                '.pt') else args.model_prefix + '_best_epoch_state.pt'
            _logger.info('Loading model %s for eval' % model_path)
            model.load_state_dict(torch.load(model_path, map_location=dev))
            if gpus is not None and len(gpus) > 1:
                model = torch.nn.DataParallel(model, device_ids=gpus)
            model = model.to(dev)

        for name, get_test_loader in test_loaders.items():
            test_loader = get_test_loader()
            # run prediction
            if args.model_prefix.endswith('.onnx'):
                _logger.info('Loading model %s for eval' % args.model_prefix)
                from utils.nn.tools import evaluate_onnx
                test_metric, scores, labels, observers = evaluate_onnx(
                    args.model_prefix, test_loader)
            else:
                test_metric, scores, labels, observers = evaluate(
                    model,
                    test_loader,
                    dev,
                    epoch=None,
                    for_training=False,
                    tb_helper=tb)
            _logger.info('Test metric %.5f' % test_metric, color='bold')
            del test_loader

            if args.predict_output:
                if '/' not in args.predict_output:
                    args.predict_output = os.path.join(
                        os.path.dirname(args.model_prefix), 'predict_output',
                        args.predict_output)
                os.makedirs(os.path.dirname(args.predict_output),
                            exist_ok=True)
                if name == '':
                    output_path = args.predict_output
                else:
                    base, ext = os.path.splitext(args.predict_output)
                    output_path = base + '_' + name + ext
                if output_path.endswith('.root'):
                    save_root(args, output_path, data_config, scores, labels,
                              observers)
                else:
                    save_awk(args, output_path, scores, labels, observers)
                _logger.info('Written output to %s' % output_path,
                             color='bold')