def __init__(self,
             by_epoch: bool = True,
             profile_iters: int = 1,
             activities: List[str] = ['cpu', 'cuda'],
             schedule: Optional[dict] = None,
             on_trace_ready: Optional[Union[Callable, dict]] = None,
             record_shapes: bool = False,
             profile_memory: bool = False,
             with_stack: bool = False,
             with_flops: bool = False,
             json_trace_path: Optional[str] = None) -> None:
    """Validate the profiling options and store them on the instance.

    Args:
        by_epoch: Stored as ``self.by_epoch``; must be a ``bool``.
        profile_iters: Stored as ``self.profile_iters``; must be >= 1.
        activities: Case-insensitive activity names, each ``'cpu'`` or
            ``'cuda'``. They are converted to the matching
            ``torch.profiler.ProfilerActivity`` members.
        schedule: Keyword arguments forwarded to
            ``torch.profiler.schedule``; ``None`` leaves ``self.schedule``
            as ``None``.
        on_trace_ready: Stored unchanged as ``self.on_trace_ready``.
        record_shapes: Stored unchanged as ``self.record_shapes``.
        profile_memory: Stored unchanged as ``self.profile_memory``.
        with_stack: Stored unchanged as ``self.with_stack``.
        with_flops: Stored unchanged as ``self.with_flops``.
        json_trace_path: Stored unchanged as ``self.json_trace_path``.

    Raises:
        ImportError: If ``torch.profiler`` cannot be imported.
        AssertionError: If ``by_epoch`` is not a ``bool``.
        ValueError: If ``profile_iters`` is smaller than 1, ``activities``
            is not a list, or an activity name is not supported.
    """
    try:
        from torch import profiler  # torch version >= 1.8.1
    except ImportError as err:
        raise ImportError('profiler is the new feature of torch1.8.1, '
                          f'but your version is {torch.__version__}') from err

    # Raised explicitly (instead of using ``assert``) so the check is not
    # stripped when Python runs with ``-O``; the exception type is unchanged.
    if not isinstance(by_epoch, bool):
        raise AssertionError('``by_epoch`` should be a boolean.')
    self.by_epoch = by_epoch

    if profile_iters < 1:
        raise ValueError('profile_iters should be greater than 0, but got '
                         f'{profile_iters}')
    self.profile_iters = profile_iters

    # NOTE: the default list is only read, never mutated, so sharing it
    # between calls is harmless here.
    if not isinstance(activities, list):
        raise ValueError(
            f'activities should be list, but got {type(activities)}')

    supported = {
        'cpu': profiler.ProfilerActivity.CPU,
        'cuda': profiler.ProfilerActivity.CUDA,
    }
    self.activities = []
    for activity in activities:
        activity = activity.lower()
        if activity not in supported:
            raise ValueError(
                f'activity should be "cpu" or "cuda", but got {activity}')
        self.activities.append(supported[activity])

    if schedule is not None:
        self.schedule = profiler.schedule(**schedule)
    else:
        self.schedule = None

    self.on_trace_ready = on_trace_ready
    self.record_shapes = record_shapes
    self.profile_memory = profile_memory
    self.with_stack = with_stack
    self.with_flops = with_flops
    self.json_trace_path = json_trace_path
def __init__(self,
             record_func_name='inference',
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=False,
             profile_memory=True,
             scheduler=schedule(wait=1, warmup=1, active=2),
             trace_handler=tensorboard_trace_handler('./log')):
    """Build a ``profile`` object and a ``record_function`` scope.

    Args:
        record_func_name: Name given to the ``record_function`` scope.
        activities: Profiler activities passed to ``profile``. A copy is
            stored on the instance as ``self.activities``.
        record_shapes: Forwarded to ``profile``.
        profile_memory: Forwarded to ``profile``.
        scheduler: Passed to ``profile`` as its ``schedule`` argument.
        trace_handler: Passed to ``profile`` as ``on_trace_ready``.

    ``with_flops`` is always enabled on the created ``profile``.
    """
    # Store a copy: the default list is created once at definition time, so
    # keeping a reference to it would let a mutation of one instance's
    # ``activities`` leak into every later instance (and into the caller's
    # own list when one is passed in). ``None`` is passed through unchanged.
    self.activities = None if activities is None else list(activities)
    self.profile = profile(activities=self.activities,
                           record_shapes=record_shapes,
                           profile_memory=profile_memory,
                           with_flops=True,
                           schedule=scheduler,
                           on_trace_ready=trace_handler)
    self.record_function = record_function(record_func_name)
def train_func():
    """Profile ``num_epochs`` empty epochs and report the traces each epoch.

    ``num_epochs`` and ``train`` are taken from the enclosing scope. Every
    epoch runs an empty ``"test_function"`` region, advances the profiler by
    one step, then forwards the traces collected by the worker profiler to
    ``train.report`` as keyword arguments.
    """
    from ray.train.torch import TorchWorkerProfiler
    from torch.profiler import profile, record_function, schedule

    worker_profiler = TorchWorkerProfiler()
    # No wait and no warm-up: every step is an active (recorded) step.
    every_step = schedule(wait=0, warmup=0, active=1)

    with profile(
        activities=[],
        schedule=every_step,
        on_trace_ready=worker_profiler.trace_handler,
    ) as prof:
        for epoch in range(num_epochs):
            with record_function("test_function"):
                pass

            prof.step()

            traces = worker_profiler.get_and_clear_profile_traces()
            train.report(epoch=epoch, **traces)
def train_func():
    """Train a 1x1 linear model for five epochs under the torch profiler.

    Each epoch has three named profiler regions: ``"train_epoch"`` (one pass
    over the data loader), ``"train_checkpoint"`` (saving the model weights
    with any ``"module."`` prefix removed) and ``"train_report"`` (sending
    the collected profiler traces to ``train.report``). The profiler is
    stepped once per epoch, between the checkpoint and the report.
    """
    worker_profiler = TorchWorkerProfiler()
    # No wait and no warm-up: every step is an active (recorded) step.
    every_step = schedule(wait=0, warmup=0, active=1)

    with profile(
        activities=[],
        schedule=every_step,
        on_trace_ready=worker_profiler.trace_handler,
    ) as prof:
        # Setup model.
        net = torch.nn.Linear(1, 1)
        net = train.torch.prepare_model(net)
        criterion = torch.nn.MSELoss()
        sgd = torch.optim.SGD(net.parameters(), lr=1e-2)

        # Setup data: the target is simply twice the feature value.
        features = torch.randn(1000, 1)
        targets = features * 2
        pairs = torch.utils.data.TensorDataset(features, targets)
        loader = torch.utils.data.DataLoader(pairs, batch_size=32)
        loader = train.torch.prepare_data_loader(loader)

        # Train.
        for epoch in range(5):
            with record_function("train_epoch"):
                for batch_x, batch_y in loader:
                    output = net(batch_x)
                    batch_loss = criterion(output, batch_y)

                    sgd.zero_grad()
                    batch_loss.backward()
                    sgd.step()

            with record_function("train_checkpoint"):
                weights = net.state_dict()
                consume_prefix_in_state_dict_if_present(weights, "module.")
                train.save_checkpoint(epoch=epoch, model_weights=weights)

            prof.step()

            with record_function("train_report"):
                traces = worker_profiler.get_and_clear_profile_traces()
                train.report(epoch=epoch, **traces)
#p.outShape = (1, 64, 1088, 1920) getMemUsed = lambda i: torch.cuda.memory_stats(i)['reserved_bytes.all.peak'] t = torch.randn(shape, dtype=config.dtype(), device=config.device()) # pylint: disable=E1101 load = shape[-1] * shape[-2] * shape[0] m = getMemUsed(config.device()) if config.cuda else None print(config.dtype(), config.device(), m) if config.cuda: p(t) #doCrop(p, t) getMemUsed(config.device()) start = perf_counter() p(t) #doCrop(p, t).mean().cpu() print('time elpased: {}'.format(perf_counter() - start)) m = getMemUsed(config.device()) else: schedule1 = schedule( wait=1, warmup=1, active=1) with profile( activities=[ProfilerActivity.CPU], schedule=schedule1, profile_memory=True) as pro: for _ in range(3): p(t) pro.step() avg = pro.key_averages() avg.sort(key=lambda o: o.cpu_memory_usage, reverse=True) m = avg[0].cpu_memory_usage print(m, m / load, load)
# (such as training loops). Tracing all of the execution can be
# slow and result in very large trace files. To avoid this, use optional
# arguments:
#
# - ``schedule`` - specifies a function that takes an integer argument (step number)
#   as an input and returns an action for the profiler. The best way to use this parameter
#   is the ``torch.profiler.schedule`` helper function, which can generate a schedule for you;
# - ``on_trace_ready`` - specifies a function that takes a reference to the profiler as
#   an input and is called by the profiler each time a new trace is ready.
#
# To illustrate how the API works, let's first consider the following example with
# the ``torch.profiler.schedule`` helper function:

from torch.profiler import schedule

my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)

######################################################################
# The profiler assumes that the long-running job is composed of steps, numbered
# starting from zero. The example above defines the following sequence of actions
# for the profiler:
#
# 1. The ``skip_first`` parameter tells the profiler that it should ignore the first 10 steps
#    (the default value of ``skip_first`` is zero);
# 2. After the first ``skip_first`` steps, the profiler starts executing profiler cycles;
# 3. Each cycle consists of three phases:
#
#    - idling (``wait=5`` steps), during this phase the profiler is not active;
#    - warming up (``warmup=1`` steps), during this phase the profiler starts tracing, but
#      the results are discarded; this phase is used to discard the samples obtained by
#      the profiler at the beginning of the trace since they are usually skewed by an extra