def test_tensorboard_log_metrics(tmpdir, step_idx):
    """Smoke test: TensorBoardLogger.log_metrics accepts plain floats, ints,
    and zero-dim float/int tensors without raising."""
    tb_logger = TensorBoardLogger(tmpdir)
    sample_metrics = dict(
        float=0.3,
        int=1,
        FloatTensor=torch.tensor(0.1),
        IntTensor=torch.tensor(1),
    )
    tb_logger.log_metrics(sample_metrics, step_idx)
class CustomLogger(LightningLoggerBase):
    """Lightning logger that writes human-readable logs through loguru
    (console via tqdm-safe writes, plus an optional file sink) and optionally
    mirrors scalar metrics to TensorBoard.

    Layout: ``<save_dir>/<name>/version_<n>``; ``save_dir`` gets a ``test_``
    prefix when ``config.test`` is truthy.
    """

    def __init__(self, config):
        super().__init__()
        # `logger` is the module-level loguru logger (see file imports).
        self._logger = logger
        self.config = config
        self._name = self.config.model_name
        self._save_dir = config.log_path
        self._version = None
        if config.test:
            self._save_dir = "test_" + config.log_path
        self._experiment = None
        # FIX: initialise `tb_log` eagerly. It used to be set only inside
        # `_create_logger()`, so calling `log_metrics()` before the first
        # access to `experiment` raised AttributeError on `self.tb_log`.
        self.tb_log = config.tb_log

    # @rank_zero_only
    def _create_logger(self):
        """Configure loguru sinks (console + optional file) and, if enabled,
        the TensorBoard logger. Called lazily from `experiment`."""
        # CLI logger: route console output through tqdm.write so log lines
        # don't corrupt an active progress bar.
        self._logger.remove()
        self._logger.configure(handlers=[
            dict(
                sink=lambda msg: tqdm.write(msg, end=''),
                level='DEBUG',
                colorize=True,
                format=
                "<green>{time: MM-DD at HH:mm}</green> <level>{message}</level>",
                enqueue=True),
        ])
        # add file handler for training mode
        # NOTE(review): compared against the *string* 'false' — presumably the
        # flag arrives as text from CLI/config parsing; confirm upstream.
        if self.config.disable_logfile == 'false':
            os.makedirs(self.log_dir, exist_ok=True)
            logfile = os.path.join(self.log_dir, "log.txt")
            self._logger.info(f"Log to file {logfile}")
            self._logger.add(sink=logfile,
                             mode='w',
                             format="{time: MM-DD at HH:mm} | {message}",
                             level="DEBUG",
                             enqueue=True)
        # enable tensorboard logger (re-assignment kept for compatibility;
        # `tb_log` is now already set in __init__)
        self.tb_log = self.config.tb_log
        if self.config.tb_log:
            self.tb_logger = TensorBoardLogger(os.path.join(
                self.log_dir, "tb_logs"), name=self.name)

    @property
    def log_dir(self):
        """Directory for this run: ``<root_dir>/version_<n>`` (or the raw
        version string when `version` is a str)."""
        version = self.version if isinstance(
            self.version, str) else f"version_{self.version}"
        log_dir = os.path.join(self.root_dir, version)
        return log_dir

    @property
    @rank_zero_experiment
    def experiment(self):
        """Lazily create the loguru-backed experiment on first access."""
        if self._experiment:
            return self._experiment
        self._create_logger()
        self._experiment = self._logger
        return self._experiment

    @staticmethod
    def _handle_value(value):
        """Convert a tensor metric to a Python scalar; non-scalar tensors are
        reduced by mean (``.item()`` raises ValueError for multi-element)."""
        if isinstance(value, torch.Tensor):
            try:
                return value.item()
            except ValueError:
                return value.mean().item()
        return value

    @rank_zero_only
    def log_metrics(self, metrics, step=None):
        """Log a metrics dict to TensorBoard (if enabled) and as one
        formatted line to the loguru experiment.

        :param metrics: mapping of metric name -> float/int/tensor; the
            'epoch' key is rendered separately as a prefix.
        :param step: optional global step, prepended to the line.
        """
        if len(metrics) == 0:
            return
        if self.tb_log:
            self.tb_logger.log_metrics(metrics, step)
        metrics_str = " ".join([
            f"{k}: {self._handle_value(v):<4.4f}"
            for k, v in metrics.items() if k != 'epoch'
        ])
        # nothing besides 'epoch' to report
        if metrics_str.strip() == '':
            return
        if step is not None:
            metrics_str = f"step: {step:<6d} :: " + metrics_str
        if 'epoch' in metrics:
            metrics_str = f"epoch: {int(metrics['epoch']):<4d} " + metrics_str
        self.experiment.info(metrics_str)

    @rank_zero_only
    def info_metrics(self, metrics, epoch=None, step=None, level='INFO'):
        """Log either a plain string or a metrics dict at the given level,
        prefixed with epoch/step."""
        if isinstance(metrics, str):
            self.experiment.info(metrics)
            return
        _str = ""
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            _str += f"{k}= {v:<4.4f} "
        self.experiment.log(level, f"epoch {epoch: <4d}: step {step:<6d}:: {_str}")

    @rank_zero_only
    def log(self, msg, level='DEBUG'):
        """Forward an arbitrary message to the experiment at `level`."""
        self.experiment.log(level, msg)

    @property
    def name(self):
        return self._name

    @property
    def version(self) -> int:
        # resolved once, then cached for the lifetime of the logger
        if self._version is None:
            self._version = self._get_next_version()
        return self._version

    def _get_next_version(self):
        """Scan ``<save_dir>/<name>`` for ``version_<n>`` dirs and return
        max(n) + 1, or 0 if none exist yet."""
        root_dir = os.path.join(self._save_dir, self.name)
        if not os.path.isdir(root_dir):
            logger.warning('Missing logger folder: %s', root_dir)
            return 0
        existing_versions = []
        for d in os.listdir(root_dir):
            if os.path.isdir(os.path.join(root_dir, d)) and d.startswith("version_"):
                existing_versions.append(int(d.split("_")[1]))
        if len(existing_versions) == 0:
            return 0
        return max(existing_versions) + 1

    @rank_zero_only
    def log_hyperparams(self, params):
        """Log hyper-parameters as one colorized ``key=value, ...`` line."""
        _str = ""
        for k in sorted(params):
            v = params[k]
            _str += Fore.LIGHTCYAN_EX + str(k) + "="
            _str += Fore.WHITE + str(v) + ", "
        self.experiment.info("\nhyper-parameters:\n" + _str)
        return

    @property
    def root_dir(self) -> str:
        # fall back to save_dir when the logger has no name
        if not self.name:
            return self.save_dir
        return os.path.join(self.save_dir, self.name)

    @property
    def save_dir(self):
        return self._save_dir
class TrainerLoggingMixin(ABC):
    """Trainer mixin that routes metrics to the configured logger and
    normalizes training-step outputs (loss / progress-bar / log metrics)."""

    def __init__(self):
        # this is just a summary on variables used in this abstract class,
        # the proper values/initialisation should be done in child class
        self.current_epoch = None
        self.on_gpu = None
        self.log_gpu_memory = None
        self.logger = None
        self.tqdm_metrics = None
        self.global_step = None
        self.proc_rank = None
        self.use_dp = None
        self.use_ddp2 = None
        self.num_gpus = None

    def configure_logger(self, logger):
        """Resolve the `logger` flag: True -> default TensorBoardLogger,
        False -> no logging, otherwise use the given logger instance."""
        if logger is True:
            # default logger
            self.logger = TensorBoardLogger(
                save_dir=self.default_save_path,
                version=self.slurm_job_id,
                name='lightning_logs'
            )
            self.logger.rank = 0
        elif logger is False:
            self.logger = None
        else:
            self.logger = logger
            self.logger.rank = 0

    def log_metrics(self, metrics, grad_norm_dic, step=None):
        """Logs the metric dict passed in. If `step` parameter is None and
        `step` key is present in metrics, uses metrics["step"] as a step.

        :param metrics (dict): Metric values
        :param grad_norm_dic (dict): Gradient norms
        :param step (int): Step for which metrics should be logged.
            Default value corresponds to `self.global_step`
        """
        # add gpu memory
        if self.on_gpu and self.log_gpu_memory:
            mem_map = memory.get_memory_profile(self.log_gpu_memory)
            metrics.update(mem_map)

        # add norms
        metrics.update(grad_norm_dic)

        # turn all tensors to scalars
        scalar_metrics = self.metrics_to_scalars(metrics)

        if "step" in scalar_metrics and step is None:
            step = scalar_metrics.pop("step")
        else:
            # added metrics by Lightning for convenience
            # FIX: write 'epoch' into `scalar_metrics` (the dict that is
            # actually logged). Previously it was written into `metrics`
            # AFTER the scalar conversion, so the epoch never reached the
            # logger.
            scalar_metrics['epoch'] = self.current_epoch
            step = step if step is not None else self.global_step

        # log actual metrics (rank 0 only)
        if self.proc_rank == 0 and self.logger is not None:
            self.logger.log_metrics(scalar_metrics, step=step)
            self.logger.save()

    def add_tqdm_metrics(self, metrics):
        """Copy metrics into the progress-bar dict, unwrapping tensors."""
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            self.tqdm_metrics[k] = v

    def metrics_to_scalars(self, metrics):
        """Return a new dict with every tensor replaced by its Python scalar;
        nested dicts are converted recursively."""
        new_metrics = {}
        for k, v in metrics.items():
            if isinstance(v, torch.Tensor):
                v = v.item()
            if isinstance(v, dict):
                v = self.metrics_to_scalars(v)
            new_metrics[k] = v
        return new_metrics

    def process_output(self, output, train=False):
        """Reduces output according to the training mode.

        Separates loss from logging and tqdm metrics

        :param output: dict (or bare loss tensor) returned by a *_step
        :return: (loss, progress_bar_metrics, log_metrics, callback_metrics,
            hiddens)
        """
        # ---------------
        # EXTRACT CALLBACK KEYS
        # ---------------
        # all keys not progress_bar or log are candidates for callbacks
        callback_metrics = {}
        for k, v in output.items():
            if k not in ['progress_bar', 'log', 'hiddens']:
                callback_metrics[k] = v

        if train and (self.use_dp or self.use_ddp2):
            num_gpus = self.num_gpus
            callback_metrics = self.reduce_distributed_output(callback_metrics, num_gpus)

        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        # ---------------
        # EXTRACT PROGRESS BAR KEYS
        # ---------------
        # broad except is deliberate: `output` may be a bare tensor or lack
        # the key; either way we fall back to an empty dict
        try:
            progress_output = output['progress_bar']

            # reduce progress metrics for tqdm when using dp
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                progress_output = self.reduce_distributed_output(progress_output, num_gpus)

            progress_bar_metrics = progress_output
        except Exception:
            progress_bar_metrics = {}

        # ---------------
        # EXTRACT LOGGING KEYS
        # ---------------
        # extract metrics to log to experiment
        try:
            log_output = output['log']

            # reduce progress metrics for tqdm when using dp
            if train and (self.use_dp or self.use_ddp2):
                num_gpus = self.num_gpus
                log_output = self.reduce_distributed_output(log_output, num_gpus)

            log_metrics = log_output
        except Exception:
            log_metrics = {}

        # ---------------
        # EXTRACT LOSS
        # ---------------
        # if output dict doesn't have the keyword loss
        # then assume the output=loss if scalar
        loss = None
        if train:
            try:
                loss = output['loss']
            except Exception:
                if isinstance(output, torch.Tensor):
                    loss = output
                else:
                    raise RuntimeError(
                        'No `loss` value in the dictionary returned from `model.training_step()`.'
                    )

            # when using dp need to reduce the loss
            if self.use_dp or self.use_ddp2:
                loss = self.reduce_distributed_output(loss, self.num_gpus)

        # ---------------
        # EXTRACT HIDDEN
        # ---------------
        hiddens = output.get('hiddens')

        # use every metric passed in as a candidate for callback
        callback_metrics.update(progress_bar_metrics)
        callback_metrics.update(log_metrics)

        # convert tensors to numpy
        for k, v in callback_metrics.items():
            if isinstance(v, torch.Tensor):
                callback_metrics[k] = v.item()

        return loss, progress_bar_metrics, log_metrics, callback_metrics, hiddens

    def reduce_distributed_output(self, output, num_gpus):
        """Average per-GPU outputs (DP/DDP2) down to a single value.

        Scalars and 0-dim tensors pass through; nested dicts are reduced
        recursively; tensors whose first dim equals `num_gpus` are mean-ed.
        """
        if num_gpus <= 1:
            return output

        # when using DP, we get one output per gpu
        # average outputs and return
        if isinstance(output, torch.Tensor):
            return output.mean()

        for k, v in output.items():
            # recurse on nested dics
            if isinstance(output[k], dict):
                output[k] = self.reduce_distributed_output(output[k], num_gpus)

            # do nothing when there's a scalar
            elif isinstance(output[k], torch.Tensor) and output[k].dim() == 0:
                pass

            # reduce only metrics that have the same number of gpus
            # NOTE(review): non-tensor values reaching this branch would raise
            # AttributeError on .size() — presumably values are tensors here;
            # confirm with callers.
            elif output[k].size(0) == num_gpus:
                reduced = torch.mean(output[k])
                output[k] = reduced

        return output
val_class_recall_scores = literal_eval(val_class_recall_scores_list[epoch]) val_class_f1_scores_dict = { label_id_to_label_text[i]: val_class_f1_scores[i] for i in range(len(label_id_to_label_text)) } val_class_recall_scores_dict = { label_id_to_label_text[i]: val_class_recall_scores[i] for i in range(len(label_id_to_label_text)) } val_class_precision_scores_dict = { label_id_to_label_text[i]: val_class_precision_scores[i] for i in range(len(label_id_to_label_text)) } logger.log_metrics(metrics={'avg_train_loss': avg_train_loss[epoch]}, step=epoch) logger.log_metrics(metrics={'avg_val_loss': avg_val_loss[epoch]}, step=epoch) logger.log_metrics(metrics={'avg_val_f1_score': avg_val_f1_score[epoch]}, step=epoch) logger.log_metrics( metrics={'avg_val_precision_score': avg_val_precision_score[epoch]}, step=epoch) logger.log_metrics( metrics={'avg_val_recall_score': avg_val_recall_score[epoch]}, step=epoch) logger.log_metrics( metrics={'avg_val_accuracy_score': avg_val_accuracy_score[epoch]}, step=epoch) logger.experiment.add_scalars('val_class_f1_scores', val_class_f1_scores_dict,