def test_kineto_profiler_api(self):
    called_num = [0]
    use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
    with profile(activities=supported_activities()):
        self.payload(use_cuda=use_cuda)

    def trace_handler(p):
        output = p.key_averages().table(
            sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
            row_limit=-1)
        # print(output)
        # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
        called_num[0] += 1

    with profile(
            activities=supported_activities(),
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
            on_trace_ready=trace_handler) as p:
        for idx in range(8):
            self.payload(use_cuda=use_cuda)
            p.step()

    self.assertEqual(called_num[0], 2)

    # case without schedule
    with profile(activities=supported_activities()) as p:
        self.payload(use_cuda=use_cuda)
        self.payload(use_cuda=use_cuda)
    output = p.key_averages().table(
        sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total",
        row_limit=-1)
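A minimal standalone sketch of why the handler above fires exactly twice: with schedule(wait=1, warmup=1, active=2) the cycle length is four steps and on_trace_ready runs once at the end of each active window, so eight steps produce two calls. Everything except the torch.profiler API itself (the payload and the list of step numbers) is illustrative.

import torch
from torch.profiler import ProfilerActivity, profile, schedule

calls = []  # record the step number at which each trace is delivered
with profile(activities=[ProfilerActivity.CPU],
             schedule=schedule(wait=1, warmup=1, active=2),
             on_trace_ready=lambda p: calls.append(p.step_num)) as p:
    for _ in range(8):
        torch.ones((64, 64)) @ torch.ones((64, 64))  # stand-in payload
        p.step()
assert len(calls) == 2  # one call per completed wait/warmup/active cycle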
def test_kineto_profiler_api(self):
    called_num = [0]

    with _profile(use_cuda=True, use_kineto=True):
        self.payload()

    def trace_handler(p):
        print(p.key_averages().table(
            sort_by="self_cuda_time_total", row_limit=-1))
        # p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
        called_num[0] += 1

    with profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ],
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=2),
            on_trace_ready=trace_handler) as p:
        for idx in range(8):
            self.payload()
            p.step()

    self.assertEqual(called_num[0], 2)

    # case without enable_pred
    with profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ]) as p:
        self.payload()
        self.payload()
    print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
def test_tensorboard_trace_handler(self):
    use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
    with _profile(use_cuda=use_cuda, use_kineto=True):
        self.payload(use_cuda=use_cuda)

    with TemporaryDirectoryName() as dname:
        with profile(
                activities=[torch.profiler.ProfilerActivity.CPU] +
                ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
                schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=3),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(dname)) as p:
            for _ in range(18):
                self.payload(use_cuda=use_cuda)
                p.step()

        self.assertTrue(os.path.exists(dname))
        file_num = 0
        for file_name in os.listdir(dname):
            parts = file_name.split('.')
            self.assertTrue(len(parts) > 4)
            self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                            "Wrong tracing file name pattern")
            self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
            file_num += 1
        self.assertEqual(file_num, 3)

    # test case for gzip file format
    with TemporaryDirectoryName() as dname:
        p = profile(
            activities=[torch.profiler.ProfilerActivity.CPU] +
            ([torch.profiler.ProfilerActivity.CUDA] if use_cuda else []),
            schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=3),
            on_trace_ready=torch.profiler.tensorboard_trace_handler(dname, use_gzip=True))
        p.start()
        for _ in range(18):
            self.payload(use_cuda=use_cuda)
            p.step()
        p.stop()

        self.assertTrue(os.path.exists(dname))
        file_num = 0
        for file_name in os.listdir(dname):
            parts = file_name.split('.')
            self.assertTrue(len(parts) > 4)
            self.assertTrue(parts[-5].isdigit() and int(parts[-5]) > 0,
                            "Wrong tracing file name pattern")
            self.assertEqual(parts[-4:], ['pt', 'trace', 'json', 'gz'])
            file_num += 1
        self.assertEqual(file_num, 3)
def test_extra_fields(self):
    with profile(with_stack=True, profile_memory=True) as p:
        _ = torch.ones((1,))

    def find_ones(nodes):
        for n in nodes:
            if n.name() == "aten::ones":
                return n
            result = find_ones(n.children)
            if result:
                return result

    node = find_ones(p.profiler.kineto_results.experimental_event_tree())
    self.assertIsNotNone(node)

    self.assertIsInstance(node.extra_fields,
                          torch._C._autograd._ExtraFields_TorchOp)
    self.assertIsInstance(node.parent.extra_fields,
                          torch._C._autograd._ExtraFields_PyCCall)

    self.assertEqual(node.children[0].name(), "aten::empty")
    self.assertEqual(node.children[0].children[0].name(), "[memory]")
    self.assertIsInstance(node.children[0].children[0].extra_fields,
                          torch._C._autograd._ExtraFields_Allocation)
def test_kineto_multigpu(self):
    with profile(activities=[ProfilerActivity.CPU,
                             ProfilerActivity.CUDA]) as prof:
        for gpu_id in [0, 1]:
            x = torch.randn(10, 10).cuda(gpu_id)
            y = torch.randn(10, 10).cuda(gpu_id)
            z = x.matmul(y)

    found_gemm_0 = False
    found_gemm_1 = False
    found_cuda = False
    for evt in prof.events():
        if "gemm" in evt.name.lower() and evt.device_type == DeviceType.CUDA:
            if evt.device_index == 0:
                found_gemm_0 = True
            elif evt.device_index == 1:
                found_gemm_1 = True
        if "cuda" in evt.name.lower() and evt.device_type == DeviceType.CPU:
            found_cuda = True

    self.assertTrue(found_gemm_0)
    self.assertTrue(found_gemm_1)
    self.assertTrue(found_cuda)
def test_flops(self):
    model = torch.nn.Sequential(
        nn.Conv2d(16, 33, 18),
        nn.ReLU(),
        nn.Linear(243, 243),
        nn.ReLU(),
    )
    inputs = torch.randn(40, 16, 18, 260)
    with _profile(record_shapes=True, with_flops=True,
                  use_kineto=kineto_available()) as prof:
        model(inputs)
    profiler_output = prof.key_averages(group_by_input_shape=True).table(
        sort_by="cpu_time_total", row_limit=10)
    self.assertIn("FLOPS", profiler_output)

    if not (kineto_available() and torch.cuda.is_available()):
        return

    with profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA
            ],
            record_shapes=True,
            with_flops=True,
    ) as kineto_profiler:
        model(inputs)
    profiler_output = kineto_profiler.key_averages().table(
        sort_by="self_cuda_time_total", row_limit=-1)
    self.assertIn("FLOPS", profiler_output)
def test_tensorboard_trace_handler(self):
    with _profile(use_cuda=True, use_kineto=True):
        self.payload()

    with TemporaryDirectoryName() as dname:
        with profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA
                ],
                schedule=torch.profiler.schedule(wait=1, warmup=1, active=2, repeat=3),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(dname)) as p:
            for _ in range(18):
                self.payload()
                p.step()

        self.assertTrue(os.path.exists(dname))
        file_num = 0
        for file_name in os.listdir(dname):
            parts = file_name.split('.')
            self.assertTrue(len(parts) > 4)
            self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0,
                            "Wrong tracing file name pattern")
            self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
            file_num += 1
        self.assertEqual(file_num, 3)
def inference(self, model, device):
    with torch.no_grad():
        # for data, target in zip(self.x_train, self.y_train):
        if self.params["profile_pytorch"]:
            try:
                from torch.profiler import profile

                with profile(record_shapes=True) as prof:
                    self.inner_loop(model)
                profile_data = prof.key_averages().table(
                    sort_by="cpu_time_total",
                    row_limit=20,
                )
                print(profile_data)
            except Exception:
                from .torchprof import Profile

                # Profile using torchprof (TODO: profile_per_batch for all batches and epochs)
                profile_cuda = self.device.type == "cuda"
                with Profile(model, use_cuda=profile_cuda) as prof:
                    self.inner_loop(model)
                data = prof.display(show_events=False)
                profile_data = json.dumps(data, indent=4, separators=(",", ": "))

            self.params["profile_data"] = profile_data
        else:
            self.inner_loop(model)
            if self.params["nb_gpus"] > 0:
                torch.cuda.synchronize()
def test_utils_compute_queue_depth_when_no_cuda_events(self):
    # For traces with only cpu events, we expect empty queue depth list
    x = torch.ones((1024, 1024))
    with profile() as prof:
        for _ in range(5):
            x = x @ x
    basic_evaluation = _utils.BasicEvaluation(prof.profiler)
    self.assertFalse(basic_evaluation.compute_queue_depth())
def profile_cuda_kernels(fn, args, string_id="Model time"):
    print("################################################")
    print(f"#### Profiling for {string_id} starts #########")
    print("################################################")
    warmup = 50
    old_args = args[:]
    n_repeats = 1
    n_layers = 1
    ref = fn(*old_args)
    gO = torch.rand_like(ref)
    for _ in range(0, warmup // n_layers):
        args = list(old_args[:])
        ref = fn(*args)
        ref.backward(gO)

    torch.cuda.synchronize()

    # Forward profile
    def fwd_run():
        for _ in range(0, n_repeats // n_layers):
            args = list(old_args[:])
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg.grad = None
            ref = fn(*args)

    print(f"###### Forward profile for {string_id} starts #####")
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
        with record_function("baseline"):
            fwd_run()
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
    print(f"###### Forward profile for {string_id} ends #####")

    # Backward profile
    def bwd_run():
        for _ in range(0, n_repeats // n_layers):
            args = list(old_args[:])
            for arg in args:
                if isinstance(arg, torch.Tensor):
                    arg.grad = None
            ref = fn(*args)

            print(f"###### Backward profile for {string_id} starts #####")
            torch.cuda.synchronize()
            with profile(activities=[ProfilerActivity.CUDA],
                         record_shapes=True) as prof:
                with record_function("baseline"):
                    ref.backward(gO)
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
            torch.cuda.synchronize()
            print(f"###### Backward profile for {string_id} ends #####")

    bwd_run()
    print("################################################")
    print(f"#### Profiling for {string_id} ends #########")
    print("################################################\n\n\n\n")
def dump_chrome_trace(f, input, trace_filename, optimize_ctx, activities,
                      num_runs=1, devices=None, kwargs_for_f=None,
                      kwargs_for_profiler=None):
    """
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    [activities] are the activities that the profiler will record,
    e.g. ProfilerActivity.CUDA.
    Return total runtime without the profiler.

    Outputs to trace_filename.
    """
    if devices is None:
        devices = ["cuda"]
    global synchronize
    if devices != ["cpu"] and torch.cuda.is_available():
        synchronize = torch.cuda.synchronize

    if kwargs_for_f is None:
        kwargs_for_f = {}
    if kwargs_for_profiler is None:
        kwargs_for_profiler = {}

    with optimize_ctx:
        torch.manual_seed(1337)
        for _ in range(5):  # warmup runs
            f(input, **kwargs_for_f)
            synchronize()
        torch.manual_seed(1337)
        t0 = time.perf_counter()
        for _ in range(num_runs):
            f(input, **kwargs_for_f)
            synchronize()
        t1 = time.perf_counter()
    timing = t1 - t0

    with profile(activities=activities, **kwargs_for_profiler) as prof:
        with optimize_ctx:
            synchronize()
            torch.manual_seed(1337)
            for _ in range(num_runs):
                f(input, **kwargs_for_f)
                synchronize()
    prof.export_chrome_trace(trace_filename)

    return timing
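A hedged usage sketch for dump_chrome_trace above. It assumes a CUDA device (the function's default devices=["cuda"] path, which assigns the module-level synchronize) and runs in the same module as the function; the model, file name, and lambda are illustrative, not part of the original code.

import contextlib
import torch
from torch.profiler import ProfilerActivity

model = torch.nn.Linear(128, 128).cuda()
x = torch.randn(64, 128, device="cuda")

runtime = dump_chrome_trace(
    lambda inp: model(inp),        # f(input, **kwargs_for_f)
    x,
    "linear_trace.json",           # illustrative output path
    contextlib.nullcontext(),      # no optimization context; torch.no_grad() also works
    [ProfilerActivity.CPU, ProfilerActivity.CUDA],
    num_runs=3,
)
print(f"runtime without profiler: {runtime:.4f} s")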
def test_profiler_type(self):
    profiler_type = torch._C._autograd._profiler_type
    ActiveProfilerType = torch._C._autograd.ActiveProfilerType
    self.assertEqual(profiler_type(), ActiveProfilerType.NONE)

    # Autograd profiler
    with _profile_legacy():
        self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY)

    # Kineto profiler
    with profile():
        self.assertEqual(profiler_type(), ActiveProfilerType.KINETO)
def profile_it(f, inp):
    for _ in range(5):
        f(inp)

    itr = 5
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
        for _ in range(itr):
            f(inp)

    timing = prof.key_averages()
    cuda_time_total = 0
    for e in timing:
        cuda_time_total = cuda_time_total + e.cuda_time_total
    return cuda_time_total / itr
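A small usage sketch for profile_it above (a CUDA device is assumed): it warms the function up, profiles five iterations, and returns the summed CUDA time divided by the iteration count, i.e. an average total CUDA time per iteration in the profiler's reporting units (microseconds). The square function and tensor size are illustrative.

import torch

def square(t):
    # stand-in workload: a single matmul
    return t @ t

inp = torch.randn(1024, 1024, device="cuda")
avg_cuda_time = profile_it(square, inp)
print(f"average CUDA time per iteration: {avg_cuda_time:.1f} us")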
def test_profiler_metadata(self):
    t1, t2 = torch.ones(1), torch.ones(1)
    with profile() as prof:
        torch.add(t1, t2)
        prof.add_metadata("test_key1", "test_value1")
        prof.add_metadata_json("test_key2", "[1,2,3]")

    with TemporaryFileName(mode="w+") as fname:
        prof.export_chrome_trace(fname)
        with io.open(fname, 'r') as f:
            trace = json.load(f)
            assert "test_key1" in trace
            assert trace["test_key1"] == "test_value1"
            assert "test_key2" in trace
            assert trace["test_key2"] == [1, 2, 3]
def __init__(self,
             record_func_name='inference',
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=False,
             profile_memory=True,
             scheduler=schedule(wait=1, warmup=1, active=2),
             trace_handler=tensorboard_trace_handler('./log')):
    self.activities = activities
    self.profile = profile(activities=activities,
                           record_shapes=record_shapes,
                           profile_memory=profile_memory,
                           with_flops=True,
                           schedule=scheduler,
                           on_trace_ready=trace_handler)
    self.record_function = record_function(record_func_name)
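A hedged sketch of how a wrapper initialized as above is typically driven. The class name TorchProfilerWrapper and the model/batch objects are assumptions; only the profile / record_function pairing mirrors the __init__ shown here.

wrapper = TorchProfilerWrapper(record_func_name='inference')  # hypothetical class name
wrapper.profile.start()
for _ in range(4):                    # wait=1, warmup=1, active=2 -> one full cycle
    with wrapper.record_function:     # marks each iteration with the 'inference' scope
        model(batch)
    wrapper.profile.step()
wrapper.profile.stop()
# Traces are written to ./log by tensorboard_trace_handler for viewing in TensorBoard.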
def test_execution_graph_with_kineto(self):
    trace_called_num = 0

    def trace_handler(p):
        nonlocal trace_called_num
        trace_called_num += 1

    use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
    # Create a temp file to save execution graph data.
    fp = tempfile.NamedTemporaryFile('w+t', suffix='.json', delete=False)
    fp.close()
    expected_loop_events = 0
    eg = ExecutionGraphObserver()
    eg.register_callback(fp.name)
    with profile(
            activities=supported_activities(),
            schedule=torch.profiler.schedule(
                skip_first=3, wait=1, warmup=1, active=2),
            on_trace_ready=trace_handler,
    ) as p:
        eg.start()
        for idx in range(10):
            expected_loop_events += 1
            with record_function(f"## LOOP {idx} ##"):
                self.payload(use_cuda=use_cuda)
            p.step()
        eg.stop()
    eg.unregister_callback()

    assert trace_called_num == 2
    assert fp.name == eg.get_output_file_path()
    nodes = self.get_execution_graph_root(fp.name)
    loop_count = 0
    found_root_node = False
    for n in nodes:
        assert "name" in n
        if "[pytorch|profiler|execution_graph|process]" in n["name"]:
            found_root_node = True
        if n["name"].startswith("## LOOP "):
            loop_count += 1
    assert found_root_node
    assert loop_count == expected_loop_events
def bwd_run():
    for _ in range(0, n_repeats // n_layers):
        args = list(old_args[:])
        for arg in args:
            if isinstance(arg, torch.Tensor):
                arg.grad = None
        ref = fn(*args)

        print(f"###### Backward profile for {string_id} starts #####")
        torch.cuda.synchronize()
        with profile(activities=[ProfilerActivity.CUDA],
                     record_shapes=True) as prof:
            with record_function("baseline"):
                ref.backward(gO)
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))
        torch.cuda.synchronize()
        print(f"###### Backward profile for {string_id} ends #####")
def test_utils_compute_self_time(self):
    with profile() as prof:
        t1, t2 = torch.ones(1, requires_grad=True), torch.ones(
            1, requires_grad=True)
        z = torch.add(t1, t2)
        y = torch.ones(1)
        loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
        loss.backward()
    basic_eval = _utils.BasicEvaluation(prof.profiler)
    metrics = basic_eval.metrics
    self.assertTrue(len(metrics) > 0)
    for event_key, event_metrics in metrics.items():
        self.assertEqual(
            event_metrics.self_time_ns,
            event_key.event.duration_time_ns - sum([
                child.duration_time_ns
                for child in event_key.event.children
            ]))
def profile_conv_runtimes(model, filename):
    model = model.cuda()
    inputs = torch.randn(32, 3, 224, 224).cuda()
    with profile(activities=[ProfilerActivity.CUDA],
                 profile_memory=True,
                 record_shapes=True) as prof:
        with record_function("model_inference"):
            model(inputs)
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cpu_time_total", row_limit=10))
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cuda_time_total", row_limit=10))
    print(
        prof.key_averages(group_by_input_shape=True).table(
            sort_by="cuda_memory_usage", row_limit=10))
    prof.export_chrome_trace(filename + '.json')
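A hypothetical invocation of profile_conv_runtimes above; torchvision and a CUDA device are assumed, and the 32x3x224x224 input baked into the function matches a standard ImageNet-style model. The exported resnet18_profile.json trace can be opened in chrome://tracing or Perfetto.

import torchvision

profile_conv_runtimes(torchvision.models.resnet18(), "resnet18_profile")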
def train_func():
    from ray.train.torch import TorchWorkerProfiler
    from torch.profiler import profile, record_function, schedule

    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:
        for epoch in range(num_epochs):
            with record_function("test_function"):
                pass

            p.step()

            profile_results = twp.get_and_clear_profile_traces()
            train.report(epoch=epoch, **profile_results)
def train_func():
    twp = TorchWorkerProfiler()
    with profile(
            activities=[],
            schedule=schedule(wait=0, warmup=0, active=1),
            on_trace_ready=twp.trace_handler,
    ) as p:
        # Setup model.
        model = torch.nn.Linear(1, 1)
        model = train.torch.prepare_model(model)
        loss_fn = torch.nn.MSELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

        # Setup data.
        input = torch.randn(1000, 1)
        labels = input * 2
        dataset = torch.utils.data.TensorDataset(input, labels)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)
        dataloader = train.torch.prepare_data_loader(dataloader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for X, y in dataloader:
                    pred = model(X)
                    loss = loss_fn(pred, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            with record_function("train_checkpoint"):
                state_dict = model.state_dict()
                consume_prefix_in_state_dict_if_present(state_dict, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=state_dict)

            p.step()

            with record_function("train_report"):
                profile_results = twp.get_and_clear_profile_traces()
                train.report(epoch=epoch, **profile_results)
def test_utils_compute_queue_depth(self):
    x = torch.ones((8096, 8096), device="cuda")
    with profile() as prof:
        # First half we want to be compute bound
        for _ in range(5):
            y = torch.mm(x, x)
        # Second half we want to be overhead bound,
        # so we synchronize and sleep
        torch.cuda.synchronize()
        for _ in range(3):
            y[0] += 1
            time.sleep(0.1)
    basic_evaluation = _utils.BasicEvaluation(prof.profiler)
    # We can assume golden values because mm is compute intensive,
    # so kernels will queue up.
    # But the later tensor indexing is overhead bound, and the sleep
    # makes sure each kernel finishes before the next dispatch.
    golden_queue_depth_list = [1, 2, 3, 4, 5, 1, 1, 1]
    for entry, golden in zip(basic_evaluation.compute_queue_depth(),
                             golden_queue_depth_list):
        self.assertTrue(entry.queue_depth == golden)
def wrap_forward(*args, **kwargs):
    """The forward pass is decorated and profiled here."""
    # only torch 1.8.1+ is supported
    torch_version = torch.__version__
    if torch_version <= '1.8.1':
        raise NotImplementedError(
            "Profiler requires at least torch 1.8.1")

    # profile the forward pass
    with torch_profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            profile_memory=self.profile_memory) as prof:
        res = _forward(*args, **kwargs)

    event_list = prof.events()

    # each profile call should be contained in its own list
    self.trace_profile_events[path].append(event_list)
    return res
def test_profiler_correlation_id(self):
    '''
    We expect the correlation_id to be unique across multiple invocations
    of the profiler, so we reuse id_uniqueness_set.
    '''
    id_uniqueness_set = set()
    model = torch.nn.Sequential(
        nn.Conv2d(16, 33, 18),
        nn.ReLU(),
        nn.Linear(243, 243),
        nn.ReLU(),
    )
    inputs = torch.randn(40, 16, 18, 260)
    uint32_max = 2**32 - 1
    for i in range(5):
        with profile() as prof:
            model(inputs)
        for event in prof.profiler.kineto_results.events():
            corr_id = event.correlation_id()
            if corr_id:
                self.assertTrue(corr_id not in id_uniqueness_set)
                id_uniqueness_set.add(corr_id)
                self.assertTrue(corr_id < uint32_max)
def profile(args, model, model_info, device):
    """
    Profile.

    :param model:
    :param model_info:
    :return:
    """
    import copy
    from torch.profiler import profile, record_function, ProfilerActivity

    model = copy.deepcopy(model)
    model = model.to(device)
    model.eval()

    inputs = tuple(
        torch.ones((args.batch_size,) + model_info['input_shapes'][k][1:],
                   dtype=torch.float32).to(device)
        for k in model_info['input_names'])
    for x in inputs:
        print(x.shape, x.device)

    def trace_handler(p):
        output = p.key_averages().table(sort_by="self_cuda_time_total",
                                        row_limit=50)
        print(output)
        p.export_chrome_trace("/tmp/trace_" + str(p.step_num) + ".json")

    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 schedule=torch.profiler.schedule(wait=2, warmup=2, active=6,
                                                  repeat=2),
                 on_trace_ready=trace_handler) as p:
        for idx in range(100):
            model(*inputs)
            p.step()
#p.outShape = (1, 64, 1088, 1920)
getMemUsed = lambda i: torch.cuda.memory_stats(i)['reserved_bytes.all.peak']
t = torch.randn(shape, dtype=config.dtype(), device=config.device())  # pylint: disable=E1101
load = shape[-1] * shape[-2] * shape[0]
m = getMemUsed(config.device()) if config.cuda else None
print(config.dtype(), config.device(), m)
if config.cuda:
    p(t)  # doCrop(p, t)
    getMemUsed(config.device())
    start = perf_counter()
    p(t)  # doCrop(p, t).mean().cpu()
    print('time elapsed: {}'.format(perf_counter() - start))
    m = getMemUsed(config.device())
else:
    schedule1 = schedule(wait=1, warmup=1, active=1)
    with profile(
            activities=[ProfilerActivity.CPU],
            schedule=schedule1,
            profile_memory=True) as pro:
        for _ in range(3):
            p(t)
            pro.step()
    avg = pro.key_averages()
    avg.sort(key=lambda o: o.cpu_memory_usage, reverse=True)
    m = avg[0].cpu_memory_usage
print(m, m / load, load)
def test_memory_profiler(self):
    def run_profiler(tensor_creation_fn):
        # collecting allocs / deallocs
        with _profile(profile_memory=True, record_shapes=True,
                      use_kineto=kineto_available()) as prof:
            x = None
            with record_function("test_user_scope_alloc"):
                x = tensor_creation_fn()
            with record_function("test_user_scope_dealloc"):
                del x
        return prof.key_averages(group_by_input_shape=True)

    def check_metrics(stats, metric, allocs=None, deallocs=None):
        stat_metrics = {}
        for stat in stats:
            stat_metrics[stat.key] = getattr(stat, metric)
        if allocs is not None:
            for alloc_fn in allocs:
                self.assertTrue(alloc_fn in stat_metrics)
                self.assertTrue(stat_metrics[alloc_fn] > 0)
        if deallocs is not None:
            for dealloc_fn in deallocs:
                self.assertTrue(dealloc_fn in stat_metrics)
                self.assertTrue(stat_metrics[dealloc_fn] < 0)

    def create_cpu_tensor():
        return torch.rand(10, 10)

    def create_cuda_tensor():
        return torch.rand(10, 10).cuda()

    def create_mkldnn_tensor():
        return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()

    stats = run_profiler(create_cpu_tensor)
    check_metrics(
        stats,
        "cpu_memory_usage",
        allocs=[
            "aten::empty",
            "aten::rand",
            "test_user_scope_alloc",
        ],
        deallocs=[
            "test_user_scope_dealloc",
        ])

    if kineto_available():
        with TemporaryFileName(mode="w+") as fname:
            with profile(profile_memory=True) as prof:
                x = None
                with record_function("test_user_scope_alloc"):
                    x = create_cpu_tensor()
                with record_function("test_user_scope_dealloc"):
                    del x
            prof.export_chrome_trace(fname)
            with io.open(fname, 'r') as f:
                trace = json.load(f)
                assert "traceEvents" in trace
                events = trace["traceEvents"]
                found_memory_events = False
                for evt in events:
                    assert "name" in evt
                    if evt["name"] == "[memory]":
                        found_memory_events = True
                        assert "args" in evt
                        assert "Device Type" in evt["args"]
                        assert "Device Id" in evt["args"]
                        assert "Bytes" in evt["args"]
                assert found_memory_events

    if torch.cuda.is_available():
        create_cuda_tensor()
        stats = run_profiler(create_cuda_tensor)
        check_metrics(
            stats,
            "cuda_memory_usage",
            allocs=[
                "test_user_scope_alloc",
                "aten::to",
                "aten::empty_strided",
            ],
            deallocs=[
                "test_user_scope_dealloc",
            ])
        check_metrics(
            stats,
            "cpu_memory_usage",
            allocs=[
                "aten::rand",
                "aten::empty",
            ])

    if torch._C.has_mkldnn:
        create_mkldnn_tensor()
        stats = run_profiler(create_mkldnn_tensor)
        check_metrics(
            stats,
            "cpu_memory_usage",
            allocs=[
                "test_user_scope_alloc",
                "aten::rand",
                "aten::empty",
                "aten::to_mkldnn",
            ],
            deallocs=[
                "test_user_scope_dealloc",
            ])

    # check top-level memory events
    with _profile(profile_memory=True, use_kineto=kineto_available()) as prof:
        x = torch.rand(10, 10)
        del x
        if torch.cuda.is_available():
            y = torch.rand(10, 10).cuda()
            del y
        gc.collect()
    stats = prof.key_averages(group_by_input_shape=True)
    check_metrics(stats, "cpu_memory_usage",
                  allocs=["aten::rand", "aten::empty"],
                  deallocs=["[memory]"])
    if torch.cuda.is_available():
        check_metrics(stats, "cuda_memory_usage", deallocs=["[memory]"])
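A minimal, hedged sketch of the same profile_memory=True idea outside the test harness: wrap an allocation in a user scope and print the aggregated table sorted by self CPU memory usage. The scope name and tensor size are illustrative.

import torch
from torch.profiler import profile, record_function

with profile(profile_memory=True, record_shapes=True) as prof:
    with record_function("alloc_block"):   # user-defined scope for the allocation
        x = torch.rand(10, 10)
    del x                                  # deallocation shows up as negative usage
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))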
def inference(model, dataloader, datatype, args):
    batch_time = AverageMeter('Time', ':6.3f')
    batch_size = args.batch_size
    warmup_iters = args.warmup_iterations
    max_iters = args.max_iterations if dataloader is None else len(dataloader)
    model.eval()
    coco = get_coco_api_from_dataset(dataloader.dataset)
    iou_types = ["bbox"]
    iou_types.append("segm")
    coco_evaluator = CocoEvaluator(coco, iou_types)
    if args.ipex:
        import intel_extension_for_pytorch as ipex
        model = model.to(memory_format=torch.channels_last)
        model = ipex.optimize(model, dtype=datatype, level="O1",
                              conv_bn_folding=False,
                              replace_dropout_with_identity=False)
        model.backbone = ipex.optimize(model.backbone, dtype=datatype, level="O1")
    else:
        if args.jit:
            model = model.to(memory_format=torch.channels_last)
        else:
            from torch.utils import mkldnn as mkldnn_utils
            model = mkldnn_utils.to_mkldnn(model, dtype=datatype)
    if args.jit:
        x = torch.randn(batch_size, 3, 1200, 1200).to(memory_format=torch.channels_last)
        if args.precision == "bf16":
            with torch.cpu.amp.autocast(), torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone, x, strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
        else:
            with torch.no_grad():
                model.backbone = torch.jit.trace(model.backbone, x, strict=False)
            model.backbone = torch.jit.freeze(model.backbone)
    with torch.no_grad():
        if dataloader is None:
            print("Models for detection tasks need to use real dataset. "
                  "You need to specify coco dataset.")
            exit(1)
        else:
            for i, batch in enumerate(dataloader):
                images = batch[0]
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        if i == warmup_iters:
                            with profile(
                                    activities=[ProfilerActivity.CPU],
                                    record_shapes=True
                            ) as prof, record_function("model_inference"):
                                output = model(images)
                        else:
                            output = model(images)
                else:
                    if i == warmup_iters:
                        with profile(
                                activities=[ProfilerActivity.CPU],
                                record_shapes=True) as prof, record_function(
                                    "model_inference"):
                            output = model(images)
                    else:
                        output = model(images)
                if i > warmup_iters:
                    break
            for i, batch in enumerate(dataloader):
                images = batch[0]
                end = time.time()
                if not args.ipex and not args.jit:
                    images = list(img.to(datatype) for img in images)
                if args.ipex and args.precision == "bf16":
                    with torch.cpu.amp.autocast():
                        output = model(images)
                else:
                    output = model(images)
                batch_time.update(time.time() - end)
                output = [{k: v.to(torch.float32) for k, v in t.items()}
                          for t in output]
                res = {
                    target["image_id"].item(): output
                    for target, output in zip(batch[1], output)
                }
                coco_evaluator.update(res)
                if max_iters != -1 and i >= max_iters:
                    break
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=-1))
    latency = batch_time.avg / batch_size * 1000
    perf = batch_size / batch_time.avg
    coco_evaluator.synchronize_between_processes()
    coco_evaluator.accumulate()
    coco_evaluator.summarize()
    print("Bbox AP: {:.5f} ".format(coco_evaluator.coco_eval['bbox'].stats[0]))
    print("Segm AP: {:.5f} ".format(coco_evaluator.coco_eval['segm'].stats[0]))
    print('Latency: %.3f ms' % latency)
    print("Throughput: {:.3f} fps".format(perf))
def train(args):
    local_rank = int(os.environ['LOCAL_RANK'])

    verbose = local_rank == 0
    if verbose:
        print('Using PyTorch version:', torch.__version__)
        print(torch.__config__.show())

    dist.init_process_group(backend='nccl')
    world_size = dist.get_world_size()

    torch.manual_seed(0)
    torch.cuda.set_device(local_rank)

    # Set up standard model.
    if verbose:
        print(f'Using {args.model} model')
    model = getattr(models, args.model)()
    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    model = DistributedDataParallel(model, device_ids=[local_rank])

    train_dataset = dataset_from_datadir(args.datadir, verbose=verbose)
    train_sampler = DistributedSampler(train_dataset)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.batchsize,
                              shuffle=False,
                              num_workers=args.workers,
                              pin_memory=True,
                              sampler=train_sampler)

    if args.profiler:
        th = None
        if args.profiler_format == 'tb':
            th = torch.profiler.tensorboard_trace_handler('./logs/profiler')
        prof = profile(
            schedule=torch.profiler.schedule(
                wait=1,      # number of steps not active
                warmup=1,    # warmup steps (tracing, but results discarded)
                active=10,   # tracing steps
                repeat=1),   # repeat procedure this many times
            on_trace_ready=th,
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            with_stack=True)
        prof.start()

    total_step = args.steps if args.steps is not None else len(train_loader)

    # For each block of printed steps
    last_start = datetime.now()
    last_images = 0

    # For final average
    avg_images = 0
    avg_start = None

    tot_steps = 0
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)

            outputs = model(images)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if args.profiler:
                prof.step()

            li = len(images)
            last_images += li

            tot_steps += 1
            if tot_steps == args.warmup_steps:
                avg_start = datetime.now()
            elif tot_steps > args.warmup_steps:
                avg_images += li

            if (i + 1) % args.print_steps == 0 and verbose:
                now = datetime.now()
                last_secs = (now - last_start).total_seconds()

                print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], '
                      f'Loss: {loss.item():.4f}, '
                      f'Images/sec: {last_images*world_size/last_secs:.2f} '
                      f'(last {args.print_steps} steps)')

                last_start = now
                last_images = 0

            if args.steps is not None and tot_steps >= args.steps:
                break

    if args.profiler:
        if args.profiler_format == 'json' and verbose:
            trace_datetime = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
            trace_fname = f"profiler-trace-{trace_datetime}.json"
            print(f'Writing profiler trace to {trace_fname}')
            prof.export_chrome_trace(trace_fname)
        prof.stop()

    if verbose:
        if avg_start is None:
            print("WARNING: stopped before warmup steps done, not printing stats.")
        else:
            dur = datetime.now() - avg_start
            print(f"Training completed in: {dur}")
            print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} "
                  f"(average, skipping {args.warmup_steps} warmup steps)")
def main(args):
    _logger.info('args:\n - %s',
                 '\n - '.join(str(it) for it in args.__dict__.items()))

    if args.file_fraction < 1:
        _logger.warning(
            'Use of `file-fraction` is not recommended in general -- prefer using `data-fraction` instead.')

    # classification/regression mode
    if args.regression_mode:
        _logger.info('Running in regression mode')
        from utils.nn.tools import train_regression as train
        from utils.nn.tools import evaluate_regression as evaluate
    else:
        _logger.info('Running in classification mode')
        from utils.nn.tools import train_classification as train
        from utils.nn.tools import evaluate_classification as evaluate

    # training/testing mode
    training_mode = not args.predict

    # device
    if args.gpus:
        gpus = [int(i) for i in args.gpus.split(',')]
        dev = torch.device(gpus[0])
    else:
        gpus = None
        dev = torch.device('cpu')

    # load data
    if training_mode:
        train_loader, val_loader, data_config, train_input_names, train_label_names = train_load(args)
    else:
        test_loaders, data_config = test_load(args)

    if args.io_test:
        data_loader = train_loader if training_mode else list(test_loaders.values())[0]()
        iotest(args, data_loader)
        return

    model, model_info, network_module, network_options = model_setup(args, data_config)

    if args.print:
        return

    if args.profile:
        profile(args, model, model_info, device=dev)
        return

    # export to ONNX
    if args.export_onnx:
        onnx(args, model, data_config, model_info)
        return

    if args.tensorboard:
        from utils.nn.tools import TensorboardHelper
        tb = TensorboardHelper(tb_comment=args.tensorboard,
                               tb_custom_fn=args.tensorboard_custom_fn)
    else:
        tb = None

    # note: we should always save/load the state_dict of the original model,
    # not the one wrapped by nn.DataParallel
    # so we do not convert it to nn.DataParallel now
    orig_model = model

    if training_mode:
        model = orig_model.to(dev)

        # loss function
        try:
            loss_func = network_module.get_loss(data_config, **network_options)
            _logger.info('Using loss function %s with options %s' %
                         (loss_func, network_options))
        except AttributeError:
            loss_func = torch.nn.CrossEntropyLoss()
            _logger.warning(
                'Loss function not defined in %s. Will use `torch.nn.CrossEntropyLoss()` by default.',
                args.network_config)

        # optimizer & learning rate
        opt, scheduler = optim(args, model, dev)

        # multi-gpu
        if gpus is not None and len(gpus) > 1:
            # model becomes `torch.nn.DataParallel` w/ model.module being the original `torch.nn.Module`
            model = torch.nn.DataParallel(model, device_ids=gpus)
        model = model.to(dev)

        # lr finder: keep it after all other setups
        if args.lr_finder is not None:
            start_lr, end_lr, num_iter = args.lr_finder.replace(' ', '').split(',')
            from utils.lr_finder import LRFinder
            lr_finder = LRFinder(model, opt, loss_func, device=dev,
                                 input_names=train_input_names,
                                 label_names=train_label_names)
            lr_finder.range_test(train_loader, start_lr=float(start_lr),
                                 end_lr=float(end_lr), num_iter=int(num_iter))
            lr_finder.plot(output='lr_finder.png')  # to inspect the loss-learning rate graph
            return

        if args.use_amp:
            from torch.cuda.amp import GradScaler
            scaler = GradScaler()
        else:
            scaler = None

        # training loop
        best_valid_metric = np.inf if args.regression_mode else 0
        for epoch in range(args.num_epochs):
            if args.load_epoch is not None:
                if epoch <= args.load_epoch:
                    continue
            print('-' * 50)
            _logger.info('Epoch #%d training' % epoch)
            train(model, loss_func, opt, scheduler, train_loader, dev, epoch,
                  steps_per_epoch=args.steps_per_epoch, grad_scaler=scaler,
                  tb_helper=tb)
            if args.model_prefix:
                dirname = os.path.dirname(args.model_prefix)
                if dirname and not os.path.exists(dirname):
                    os.makedirs(dirname)
                state_dict = model.module.state_dict() if isinstance(
                    model, torch.nn.DataParallel) else model.state_dict()
                torch.save(state_dict,
                           args.model_prefix + '_epoch-%d_state.pt' % epoch)
                torch.save(opt.state_dict(),
                           args.model_prefix + '_epoch-%d_optimizer.pt' % epoch)

            _logger.info('Epoch #%d validating' % epoch)
            valid_metric = evaluate(model, val_loader, dev, epoch,
                                    loss_func=loss_func,
                                    steps_per_epoch=args.steps_per_epoch_val,
                                    tb_helper=tb)
            is_best_epoch = (valid_metric < best_valid_metric
                             ) if args.regression_mode else (
                                 valid_metric > best_valid_metric)
            if is_best_epoch:
                best_valid_metric = valid_metric
                if args.model_prefix:
                    shutil.copy2(args.model_prefix + '_epoch-%d_state.pt' % epoch,
                                 args.model_prefix + '_best_epoch_state.pt')
                    torch.save(model, args.model_prefix + '_best_epoch_full.pt')
            _logger.info(
                'Epoch #%d: Current validation metric: %.5f (best: %.5f)' %
                (epoch, valid_metric, best_valid_metric), color='bold')

    if args.data_test:
        if training_mode:
            del train_loader, val_loader
            test_loaders, data_config = test_load(args)

        if not args.model_prefix.endswith('.onnx'):
            model = orig_model.to(dev)
            model_path = args.model_prefix if args.model_prefix.endswith(
                '.pt') else args.model_prefix + '_best_epoch_state.pt'
            _logger.info('Loading model %s for eval' % model_path)
            model.load_state_dict(torch.load(model_path, map_location=dev))
            if gpus is not None and len(gpus) > 1:
                model = torch.nn.DataParallel(model, device_ids=gpus)
            model = model.to(dev)

        for name, get_test_loader in test_loaders.items():
            test_loader = get_test_loader()
            # run prediction
            if args.model_prefix.endswith('.onnx'):
                _logger.info('Loading model %s for eval' % args.model_prefix)
                from utils.nn.tools import evaluate_onnx
                test_metric, scores, labels, observers = evaluate_onnx(
                    args.model_prefix, test_loader)
            else:
                test_metric, scores, labels, observers = evaluate(
                    model, test_loader, dev, epoch=None, for_training=False,
                    tb_helper=tb)
            _logger.info('Test metric %.5f' % test_metric, color='bold')
            del test_loader

            if args.predict_output:
                if '/' not in args.predict_output:
                    args.predict_output = os.path.join(
                        os.path.dirname(args.model_prefix),
                        'predict_output', args.predict_output)
                os.makedirs(os.path.dirname(args.predict_output), exist_ok=True)
                if name == '':
                    output_path = args.predict_output
                else:
                    base, ext = os.path.splitext(args.predict_output)
                    output_path = base + '_' + name + ext
                if output_path.endswith('.root'):
                    save_root(args, output_path, data_config, scores, labels, observers)
                else:
                    save_awk(args, output_path, scores, labels, observers)
                _logger.info('Written output to %s' % output_path, color='bold')