def _define_logger(self):
    # Use double-checked locking to avoid taking lock unnecessarily.
    if self._logger is not None:
        return self._logger

    with self._logger_lock:
        try:
            self._logger = _logging.getLogger("nemo_logger")
            # By default, silence all loggers except the logger for rank 0
            self.remove_stream_handlers()
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                old_factory = _logging.getLogRecordFactory()

                def record_factory(*args, **kwargs):
                    record = old_factory(*args, **kwargs)
                    record.rank = get_envint("RANK", 0)
                    return record

                _logging.setLogRecordFactory(record_factory)
                self.add_stream_handlers(formatter=DebugNeMoFormatter)
            elif get_envint("RANK", 0) == 0:
                self.add_stream_handlers()
        finally:
            level = Logger.INFO
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                level = Logger.DEBUG
            self.set_verbosity(verbosity_level=level)

    self._logger.propagate = False
def _define_logger(self, capture_warnings=True):
    """Creates the logger if it has not already been created. Called in __init__."""
    # Use double-checked locking to avoid taking lock unnecessarily.
    if self._logger is not None:
        return self._logger

    with self._logger_lock:
        try:
            self._logger = _logging.getLogger("nemo_logger")
            # By default, silence all loggers except the logger for rank 0
            self.remove_stream_handlers()
            # If NEMO_TESTING is set, add a stream handler to all ranks
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                old_factory = _logging.getLogRecordFactory()

                def record_factory(*args, **kwargs):
                    record = old_factory(*args, **kwargs)
                    record.rank = self.rank
                    return record

                _logging.setLogRecordFactory(record_factory)
                self.add_stream_handlers(formatter=DebugNeMoFormatter)
            elif is_global_rank_zero():
                self.add_stream_handlers()

            # Add memory handlers, essentially buffers. They are used to save messages that we will flush to file
            # once the appropriate file handlers are added.
            if is_global_rank_zero():
                # Add a memory handler for error messages. Only logged on rank 0.
                self._handlers["memory_err"] = MemoryHandler(-1)
                self._handlers["memory_err"].addFilter(lambda record: record.levelno > _logging.INFO)
                formatter = BaseNeMoFormatter
                self._handlers["memory_err"].setFormatter(formatter())
                self._logger.addHandler(self._handlers["memory_err"])

            # Add a memory handler for all messages on all ranks.
            self._handlers["memory_all"] = MemoryHandler(-1)
            formatter = BaseNeMoFormatter
            self._handlers["memory_all"].setFormatter(formatter())
            self._logger.addHandler(self._handlers["memory_all"])
        finally:
            level = Logger.INFO
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                level = Logger.DEBUG
            self.set_verbosity(verbosity_level=level)
            self.captureWarnings(capture_warnings)

    self._logger.propagate = False
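
# Illustrative sketch, not part of the NeMo source above: _define_logger swaps in a LogRecord
# factory so every record carries a ``rank`` attribute that a formatter (e.g. DebugNeMoFormatter)
# can reference via ``%(rank)s``. The stand-alone demo below uses only the stdlib ``logging``
# module; the logger name and format string are invented for illustration.
def _demo_rank_aware_logging():
    import logging
    import os

    old_factory = logging.getLogRecordFactory()

    def record_factory(*args, **kwargs):
        # Delegate record creation, then tag the record with the process rank from the environment.
        record = old_factory(*args, **kwargs)
        record.rank = int(os.environ.get("RANK", 0))
        return record

    logging.setLogRecordFactory(record_factory)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("[rank %(rank)s] %(levelname)s: %(message)s"))
    logger = logging.getLogger("rank_demo")
    logger.addHandler(handler)
    logger.warning("every record now carries a rank attribute")
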
def add_stream_handlers(self):
    if self._logger is None:
        raise RuntimeError("Impossible to set handlers if the Logger is not predefined")

    # Add the output handler.
    if get_envbool(NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR, False):
        self._handlers["stream_stdout"] = _logging.StreamHandler(sys.stderr)
    else:
        self._handlers["stream_stdout"] = _logging.StreamHandler(sys.stdout)
        self._handlers["stream_stdout"].addFilter(lambda record: record.levelno <= _logging.INFO)
        self._handlers["stream_stderr"] = _logging.StreamHandler(sys.stderr)
        self._handlers["stream_stderr"].addFilter(lambda record: record.levelno > _logging.INFO)

    formatter = BaseNeMoFormatter
    self._handlers["stream_stdout"].setFormatter(formatter())
    self._logger.addHandler(self._handlers["stream_stdout"])

    try:
        self._handlers["stream_stderr"].setFormatter(formatter())
        self._logger.addHandler(self._handlers["stream_stderr"])
    except KeyError:
        pass
def add_stream_handlers(self, formatter=BaseNeMoFormatter):
    """Add StreamHandlers that log to stdout and stderr to the logger. INFO and lower logs are
    streamed to stdout while WARNING and higher are streamed to stderr. If the
    NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR environment variable is set, all logs are sent to
    stderr instead.
    """
    if self._logger is None:
        raise RuntimeError("Impossible to set handlers if the Logger is not predefined")

    # Add the output handler.
    if get_envbool(NEMO_ENV_VARNAME_REDIRECT_LOGS_TO_STDERR, False):
        self._handlers["stream_stdout"] = _logging.StreamHandler(sys.stderr)
    else:
        self._handlers["stream_stdout"] = _logging.StreamHandler(sys.stdout)
        self._handlers["stream_stdout"].addFilter(lambda record: record.levelno <= _logging.INFO)
        self._handlers["stream_stderr"] = _logging.StreamHandler(sys.stderr)
        self._handlers["stream_stderr"].addFilter(lambda record: record.levelno > _logging.INFO)

    self._handlers["stream_stdout"].setFormatter(formatter())
    self._logger.addHandler(self._handlers["stream_stdout"])

    try:
        self._handlers["stream_stderr"].setFormatter(formatter())
        self._logger.addHandler(self._handlers["stream_stderr"])
    except KeyError:
        pass
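
# Illustrative sketch, not part of the NeMo source above: the stdout/stderr split in
# add_stream_handlers is achieved purely with level filters on two StreamHandlers. The
# stand-alone demo below reproduces that routing with the stdlib ``logging`` module only;
# the logger name "split_demo" is invented for illustration.
def _demo_split_stream_handlers():
    import logging
    import sys

    logger = logging.getLogger("split_demo")
    logger.setLevel(logging.DEBUG)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.addFilter(lambda record: record.levelno <= logging.INFO)  # DEBUG/INFO -> stdout

    stderr_handler = logging.StreamHandler(sys.stderr)
    stderr_handler.addFilter(lambda record: record.levelno > logging.INFO)  # WARNING and above -> stderr

    logger.addHandler(stdout_handler)
    logger.addHandler(stderr_handler)

    logger.info("goes to stdout")
    logger.warning("goes to stderr")
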
def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning
    paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will
    get exp_dir, name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create
    the logging directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. Datetime versions can be disabled by setting
    use_datetime_version to False. It optionally creates TensorBoardLogger, WandBLogger, and ModelCheckpoint
    objects from pytorch lightning. It copies sys.argv, and git information if available, to the logging directory.
    It creates a log file for each process to log their output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continue training from
    the constructed log_dir. When you need to continue the training repeatedly (like on a cluster where you need
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore, from v1.0.0, when
    resume_if_exists is set to True, creating the version folders is skipped.

    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:

            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to
                None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs to
                ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default" via
                name = name or "default".
            - version (str): The version of the experiment. Defaults to None which uses either a datetime string or
                lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.checkpoint_connector.resume_from_checkpoint_fit_path so that the trainer should auto-resume.
                exp_manager will move files under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0, when
                resume_if_exists is True, we do not create version folders to make it easier to find the log folder
                for next runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt indicates that a previous training run fully completed. This behaviour can be disabled, in
                which case the *end.ckpt will be loaded, by setting resume_past_end to True. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint
                could be found. This behaviour can be disabled, in which case exp_manager will print a message and
                continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger
                class. Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Biases logger and attach it to the pytorch
                lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger class.
                Note that name and project are required parameters if create_wandb_logger is True. Defaults to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the
                most recent checkpoint under *last.ckpt, and the final checkpoint after training completes under
                *end.ckpt. Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None,
                which copies no files.

    Returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_gpus + local_rank
    logging.rank = global_rank
    world_size = trainer.world_size

    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return

    if trainer.fast_dev_run:
        logging.info("Trainer was called with fast_dev_run. exp_manager will return without any functionality.")
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # Update app_state with log_dir, exp_dir, etc.
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle logging to file
    if get_envbool(NEMO_ENV_VARNAME_TESTING, False) or world_size <= 32:
        # If NEMO_TESTING is set (debug mode) or if fewer than 32 ranks, save all log files
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif world_size <= 256 and local_rank == 0:
        # If fewer than 256 ranks, try to save 1 log file per "machine"
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif global_rank == 0:
        # If running more than 256 ranks, only save 1 log file
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks,
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    # Add step-timing callback to the loggers
    if cfg.log_step_timing:
        timing_callback = TimingCallback(timer_kwargs=cfg.step_timing_kwargs or {})
        trainer.callbacks.insert(0, timing_callback)

    if cfg.create_checkpoint_callback:
        configure_checkpointing(
            trainer, log_dir, checkpoint_name, cfg.resume_if_exists, cfg.checkpoint_callback_params
        )

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / 'git-info.log', 'w', encoding='utf-8') as _file:
                _file.write(f'commit hash: {git_hash}')
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt')

    return log_dir
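
# Illustrative usage sketch, not part of the NeMo source above: exp_manager is typically called
# right after constructing the trainer and before trainer.fit. The cfg keys used here are the ones
# documented in the docstring; the experiment name, directory, and Trainer arguments are invented
# for illustration.
if __name__ == "__main__":
    import pytorch_lightning as pl

    trainer = pl.Trainer(max_epochs=1)
    log_dir = exp_manager(
        trainer,
        {
            "exp_dir": "./nemo_experiments",   # base directory for logs (hypothetical path)
            "name": "my_experiment",           # hypothetical experiment name
            "create_tensorboard_logger": True,
            "create_checkpoint_callback": True,
            "resume_if_exists": False,
        },
    )
    print(f"Logging to {log_dir}")
    # trainer.fit(model) would follow, with `model` defined elsewhere.
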
def check_color_support():
    # Colors can be forced with an env variable on non-Windows platforms
    if not sys.platform.lower().startswith("win") and get_envbool(NEMO_ENV_VARNAME_ENABLE_COLORING, False):
        return True
    return False