def test_profiling() -> None:
    """
    Test profiling.
    """
    # get the current profile values
    (gpu_names, total_memory_per, used_memory_per, load_per,
     ram_total, ram_used, ram_avail) = profile_gpu_and_ram()

    # average / sum over all GPUs
    gpu_mem_used: float = sum(used_memory_per)
    gpu_mem_total: float = sum(total_memory_per)
    gpu_mem_percent: float = gpu_mem_used / max(1, gpu_mem_total)
    load_avg: float = sum(load_per) / max(1, len(load_per))

    print("Metrics.PROFILE_GPU_MEM_USED", gpu_mem_used)
    print("Metrics.PROFILE_GPU_MEM_TOTAL", gpu_mem_total)
    print("Metrics.PROFILE_GPU_LOAD", load_avg)
    print("Metrics.PROFILE_RAM_USED", ram_used)
    print("Metrics.PROFILE_RAM_TOTAL", ram_total)
    print("Metrics.PROFILE_GPU_MEM_PERCENT", gpu_mem_percent)
    print("Metrics.PROFILE_RAM_AVAILABLE", ram_avail)

    # log the values
    gpu_names_str = " ".join(set(gpu_names))
    multi_load, multi_mem = "", ""
    if len(load_per) > 1:
        multi_load = " [" + ", ".join(f"{load:.0%}" for load in load_per) + "]"
        multi_mem = " [" + ", ".join(f"{mem:.1f}GB" for mem in used_memory_per) + "]"
    print(f"RAM GB used/avail/total: {ram_used:.1f}/{ram_avail:.1f}/{ram_total:.1f} - "
          f"GPU {gpu_names_str} Load: {load_avg:.1%}{multi_load} "
          f"Mem: {gpu_mem_used:.1f}GB/{gpu_mem_total:.1f}GB{multi_mem}")
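
# The aggregation in test_profiling() sums memory across GPUs and averages the load.
# A minimal self-contained sketch of that arithmetic on hypothetical two-GPU values
# (the numbers and the helper name are made up for illustration; they do not come
# from profile_gpu_and_ram()):
def _example_gpu_aggregation() -> None:
    used_memory_per = [3.2, 1.8]      # hypothetical per-GPU memory used, in GB
    total_memory_per = [11.0, 11.0]   # hypothetical per-GPU memory total, in GB
    load_per = [0.90, 0.10]           # hypothetical per-GPU utilization in [0, 1]
    gpu_mem_used = sum(used_memory_per)                      # 5.0 GB overall
    gpu_mem_total = sum(total_memory_per)                    # 22.0 GB overall
    # the max(1, ...) guards keep the divisions safe when the per-GPU lists are empty
    gpu_mem_percent = gpu_mem_used / max(1, gpu_mem_total)   # ~0.23
    load_avg = sum(load_per) / max(1, len(load_per))         # 0.50
    print(gpu_mem_used, gpu_mem_total, gpu_mem_percent, load_avg)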
def hook_post_step(
        self, epoch_step: int, loss: th.Tensor, lr: float, additional_log: Optional[str] = None,
        disable_grad_clip: bool = False) -> bool:
    """
    Hook called after one optimization step.

    Profile GPU and update step-based meters. Feed everything to tensorboard.
    Needs some information to be passed down from the trainer for proper logging.

    Args:
        epoch_step: Current step in the epoch.
        loss: Training loss.
        lr: Training learning rate.
        additional_log: Additional string to print in the train step log.
        disable_grad_clip: Disable gradient clipping if it is already done somewhere else.

    Returns:
        Whether log output should be printed in this step or not.
    """
    # compute total time for this step and restart the timer
    total_step_time = timer() - self.timer_step
    self.timer_step = timer()

    # clip gradients
    total_norm = 0
    if self.cfg.train.clip_gradient > -1 and not disable_grad_clip:
        # get all parameters to clip
        _params, _param_names, params_flat = self.model_mgr.get_all_params()
        # clip using pytorch
        total_norm = clip_grad_norm_(params_flat, self.cfg.train.clip_gradient)
        if total_norm > self.cfg.train.clip_gradient:
            # print log message if gradients were clipped
            grad_clip_coef = self.cfg.train.clip_gradient / (total_norm + 1e-6)
            self.logger.info(f"Clipping gradient: {total_norm} with coef {grad_clip_coef}")
        total_norm = total_norm.item()
    self.state.last_grad_norm = total_norm

    # print infos
    if epoch_step % self.cfg.logging.step_train == 0:
        total_train_time = (timer() - self.timer_train_epoch) / 60
        str_step = ("{:" + str(len(str(self.steps_per_epoch))) + "d}").format(epoch_step)
        print_string = "".join([
            f"E{self.state.current_epoch}[{str_step}/{self.steps_per_epoch}] T {total_train_time:.3f}m ",
            f"LR {lr:.1e} L {loss:.4f} ",
            f"Grad {self.state.last_grad_norm:.3e} " if self.state.last_grad_norm != 0 else "",
            f"{additional_log}" if additional_log is not None else ""])
        self.logger.info(print_string)

    # check GPU / RAM profiling
    if ((self.state.epoch_step % self.cfg.logging.step_gpu == 0 and self.cfg.logging.step_gpu > 0)
            or (self.state.epoch_step == self.cfg.logging.step_gpu_once and self.cfg.logging.step_gpu_once > 0)):
        # get the current profile values
        (gpu_names, total_memory_per, used_memory_per, load_per,
         ram_total, ram_used, ram_avail) = utils_torch.profile_gpu_and_ram()

        # average / sum over all GPUs
        gpu_mem_used: float = sum(used_memory_per)
        gpu_mem_total: float = sum(total_memory_per)
        # gpu_mem_percent: float = gpu_mem_used / gpu_mem_total
        load_avg: float = sum(load_per) / max(1, len(load_per))

        self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_USED, gpu_mem_used)
        self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_TOTAL, gpu_mem_total)
        self.metrics.update_meter(Metrics.PROFILE_GPU_LOAD, load_avg)
        self.metrics.update_meter(Metrics.PROFILE_RAM_USED, ram_used)
        self.metrics.update_meter(Metrics.PROFILE_RAM_TOTAL, ram_total)
        # these 2 are not logged as they are redundant with the others.
        # self.metrics.update_meter(Metrics.PROFILE_GPU_MEM_PERCENT, gpu_mem_percent)
        # self.metrics.update_meter(Metrics.PROFILE_RAM_AVAILABLE, ram_avail)

        # log the values
        gpu_names_str = " ".join(set(gpu_names))
        multi_load, multi_mem = "", ""
        if len(load_per) > 1:
            multi_load = " [" + ", ".join(f"{load:.0%}" for load in load_per) + "]"
            multi_mem = " [" + ", ".join(f"{mem:.1f}GB" for mem in used_memory_per) + "]"
        self.logger.info(f"RAM GB used/avail/total: {ram_used:.1f}/{ram_avail:.1f}/{ram_total:.1f} - "
                         f"GPU {gpu_names_str} Load: {load_avg:.1%}{multi_load} "
                         f"Mem: {gpu_mem_used:.1f}GB/{gpu_mem_total:.1f}GB{multi_mem}")

    # update timings
    other_t = total_step_time - self.timedelta_step_forward - self.timedelta_step_backward
    self.metrics.update_meter(Metrics.TIME_STEP_FORWARD, self.timedelta_step_forward)
    self.metrics.update_meter(Metrics.TIME_STEP_BACKWARD, self.timedelta_step_backward)
    self.metrics.update_meter(Metrics.TIME_STEP_TOTAL, total_step_time)
    self.metrics.update_meter(Metrics.TIME_STEP_OTHER, other_t)

    # update clipped gradient
    self.metrics.update_meter(Metrics.TRAIN_GRAD_CLIP, self.state.last_grad_norm)

    # update LR
    self.metrics.update_meter(Metrics.TRAIN_LR, lr)

    # MODIFIED: update the loss meter every step (the original per-step_train update
    # is kept below, commented out, for reference)
    self.metrics.update_meter(Metrics.TRAIN_LOSS, loss.item())
    # if self.state.epoch_step % self.cfg.logging.step_train == 0 and self.cfg.logging.step_train > 0:
    #     # loss update necessary
    #     self.metrics.update_meter(Metrics.TRAIN_LOSS, loss.item())

    # Save epoch step and increase total step counter
    self.state.epoch_step = epoch_step
    self.state.total_step += 1

    # feed step-based metrics to tensorboard and collector
    self.metrics.feed_metrics(True, self.state.total_step, self.state.current_epoch)

    # End of batch, step lr scheduler depending on flag
    if self.lr_scheduler is not None:
        self.lr_scheduler.step()
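
# The clipping block in hook_post_step() relies on torch.nn.utils.clip_grad_norm_
# returning the total gradient norm *before* clipping, so a value above the threshold
# means the gradients really were rescaled. A minimal standalone sketch of that
# pattern (the tiny model, dummy data, and threshold below are made up for illustration):
import torch
from torch.nn.utils import clip_grad_norm_

def _clip_grad_sketch(max_norm: float = 1.0) -> None:
    model = torch.nn.Linear(4, 2)
    loss = model(torch.randn(8, 4)).pow(2).sum()   # dummy forward pass and loss
    loss.backward()                                # populate parameter gradients
    # clip in place and get the pre-clip total norm back as a float
    total_norm = float(clip_grad_norm_(model.parameters(), max_norm))
    if total_norm > max_norm:
        # same coefficient formula as in hook_post_step()
        grad_clip_coef = max_norm / (total_norm + 1e-6)
        print(f"Clipping gradient: {total_norm:.3f} with coef {grad_clip_coef:.3f}")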
def test_profile_gpu_and_ram():
    pprint(profile_gpu_and_ram())
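
# Both tests above can also be run directly without a test runner. A minimal sketch,
# assuming profile_gpu_and_ram is importable from the project's GPU/RAM profiling
# utilities (the exact import is not shown in this excerpt) and pprint comes from
# the standard library:
#
#     from pprint import pprint
#
#     if __name__ == "__main__":
#         test_profiling()
#         test_profile_gpu_and_ram()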