def _default_save_to(self, save_path: str):
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        # each model parallel rank creates a .nemo file
        # after all .nemo files are created, each rank
        # will add their checkpoint to global rank 0

        base_dir = os.path.dirname(save_path)  # use the directory to merge mp_rank .nemo files into one

        # update save_path based on model parallel_rank
        base_path = os.path.splitext(save_path)[0]  # everything except the extension
        mp_save_path = f'{base_path}_mp_rank_{app_state.model_parallel_rank:02d}.nemo'

        if app_state.data_parallel_rank == 0:
            super()._default_save_to(mp_save_path)

        # barrier so that all processes have finished writing their weights before creating .nemo file
        torch.distributed.barrier()

        if is_global_rank_zero():
            # extract all tar files
            for mp_rank in range(app_state.model_parallel_size):
                mp_tar_path = f'{base_path}_mp_rank_{mp_rank:02d}.nemo'
                mp_tar = tarfile.open(mp_tar_path, 'r:gz')
                mp_tar.extractall(path=os.path.join(base_dir, f'mp_rank_{mp_rank:02d}'))
                mp_tar.close()
                os.remove(mp_tar_path)

            # move rank 0 .nemo extract to base_path
            shutil.move(os.path.join(base_dir, 'mp_rank_00'), base_path)

            # move mp_rank_00 checkpoint to mp_rank_00 directory inside base_path
            os.mkdir(os.path.join(base_path, 'mp_rank_00'))
            shutil.move(os.path.join(base_path, 'model_weights.ckpt'), os.path.join(base_path, 'mp_rank_00'))

            # move other mp_rank checkpoints from base_dir to base_path
            for mp_rank in range(1, app_state.model_parallel_size):
                os.mkdir(os.path.join(base_path, f'mp_rank_{mp_rank:02d}'))
                shutil.move(
                    os.path.join(base_dir, f'mp_rank_{mp_rank:02d}', 'model_weights.ckpt'),
                    os.path.join(base_path, f'mp_rank_{mp_rank:02d}'),
                )
                # clean up leftover directory
                shutil.rmtree(os.path.join(base_dir, f'mp_rank_{mp_rank:02d}'))

            # create tar file from base_path
            self._make_nemo_file_from_folder(save_path, base_path)

            # clean up base_path
            shutil.rmtree(base_path)

    elif is_global_rank_zero():
        return super()._default_save_to(save_path)
    else:
        return
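# Illustrative sketch (not part of the function above): how the path arithmetic
# splits a save_path into the per-rank artifact names. The paths and the
# model-parallel size of 2 are made up for the example.
import os

save_path = '/exp/checkpoints/megatron.nemo'
base_dir = os.path.dirname(save_path)        # '/exp/checkpoints'
base_path = os.path.splitext(save_path)[0]   # '/exp/checkpoints/megatron'

for mp_rank in range(2):                     # assume model_parallel_size == 2
    print(f'{base_path}_mp_rank_{mp_rank:02d}.nemo')
# /exp/checkpoints/megatron_mp_rank_00.nemo
# /exp/checkpoints/megatron_mp_rank_01.nemo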
def _parse_as_cmu_dict(phoneme_dict_path=None, encoding='latin-1'):
    if phoneme_dict_path is None:
        # this part of code downloads a file, but it is not rank zero guarded
        # Try to check if torch distributed is available, if not get global rank zero to download corpora and make
        # all other ranks sleep for a minute
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            group = torch.distributed.group.WORLD
            if is_global_rank_zero():
                try:
                    nltk.data.find('corpora/cmudict.zip')
                except LookupError:
                    nltk.download('cmudict', quiet=True)
            torch.distributed.barrier(group=group)
        elif is_global_rank_zero():
            logging.error(
                "Torch distributed needs to be initialized before you initialize EnglishG2p. This class is prone to "
                "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
                "before rank 0, errors might result."
            )
            try:
                nltk.data.find('corpora/cmudict.zip')
            except LookupError:
                nltk.download('cmudict', quiet=True)
        else:
            logging.error(
                "Torch distributed needs to be initialized before you initialize EnglishG2p. This class is prone to "
                "data access race conditions. This process is not rank 0, and is now going to sleep for 1 min. If this "
                "rank wakes from sleep prior to rank 0 finishing downloading, errors might result."
            )
            time.sleep(60)

        logging.warning(
            "English g2p_dict will be used from nltk.corpus.cmudict.dict(), because phoneme_dict_path=None. "
            "Note that nltk.corpus.cmudict.dict() has an old version (0.6) of CMUDict. "
            "You can use the latest official version of CMUDict (0.7b) with additional changes from NVIDIA directly from NeMo "
            "using the path scripts/tts_dataset_files/cmudict-0.7b_nv22.01."
        )
        return nltk.corpus.cmudict.dict()

    _alt_re = re.compile(r'\([0-9]+\)')
    g2p_dict = {}
    with open(phoneme_dict_path, encoding=encoding) as file:
        for line in file:
            if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
                parts = line.split('  ')
                word = re.sub(_alt_re, '', parts[0])
                word = word.lower()
                pronunciation = parts[1].strip().split(" ")
                if word in g2p_dict:
                    g2p_dict[word].append(pronunciation)
                else:
                    g2p_dict[word] = [pronunciation]
    return g2p_dict
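# Illustrative sketch of the CMUdict parsing above, using two made-up entries.
# CMUdict separates a word from its phonemes with two spaces, and marks alternate
# pronunciations with a parenthesized index that the regex strips off.
import re

_alt_re = re.compile(r'\([0-9]+\)')
sample_lines = ["READ  R IY1 D", "READ(1)  R EH1 D"]

g2p_dict = {}
for line in sample_lines:
    if len(line) and ('A' <= line[0] <= 'Z' or line[0] == "'"):
        parts = line.split('  ')
        word = re.sub(_alt_re, '', parts[0]).lower()
        pronunciation = parts[1].strip().split(" ")
        g2p_dict.setdefault(word, []).append(pronunciation)

print(g2p_dict)  # {'read': [['R', 'IY1', 'D'], ['R', 'EH1', 'D']]}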
def _define_logger(self, capture_warnings=True):
    """ Creates the logger if not already created. Called in init"""

    # Use double-checked locking to avoid taking lock unnecessarily.
    if self._logger is not None:
        return self._logger

    with self._logger_lock:
        try:
            self._logger = _logging.getLogger("nemo_logger")
            # By default, silence all loggers except the logger for rank 0
            self.remove_stream_handlers()
            # If NEMO_TESTING is set, add a streamhandler to all ranks
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                old_factory = _logging.getLogRecordFactory()

                def record_factory(*args, **kwargs):
                    record = old_factory(*args, **kwargs)
                    record.rank = self.rank
                    return record

                _logging.setLogRecordFactory(record_factory)
                self.add_stream_handlers(formatter=DebugNeMoFormatter)
            elif is_global_rank_zero():
                self.add_stream_handlers()

            # Add memoryhandlers, essentially buffers. They are used to save messages that we will flush to file
            # once the appropriate file handlers are added.
            if is_global_rank_zero():
                # Add a memoryhandler for error messages. Only logged on rank 0
                self._handlers["memory_err"] = MemoryHandler(-1)
                self._handlers["memory_err"].addFilter(lambda record: record.levelno > _logging.INFO)
                formatter = BaseNeMoFormatter
                self._handlers["memory_err"].setFormatter(formatter())
                self._logger.addHandler(self._handlers["memory_err"])

            # Add a memoryhandler for all messages on all ranks
            self._handlers["memory_all"] = MemoryHandler(-1)
            formatter = BaseNeMoFormatter
            self._handlers["memory_all"].setFormatter(formatter())
            self._logger.addHandler(self._handlers["memory_all"])

        finally:
            level = Logger.INFO
            if get_envbool(NEMO_ENV_VARNAME_TESTING, False):
                level = Logger.DEBUG
            self.set_verbosity(verbosity_level=level)
            self.captureWarnings(capture_warnings)

    self._logger.propagate = False
def check_explicit_log_dir(
    trainer: 'pytorch_lightning.Trainer', explicit_log_dir: [Path, str], exp_dir: str, name: str, version: str
) -> (Path, str, str, str):
    """ Checks that the passed arguments are compatible with explicit_log_dir.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raise:
        LoggerMisconfigurationError
    """
    if trainer.logger is not None:
        raise LoggerMisconfigurationError(
            "The pytorch lightning trainer that was passed to exp_manager contained a logger, and explicit_log_dir: "
            f"{explicit_log_dir} was passed to exp_manager. Please remove the logger from the lightning trainer."
        )
    # Checking only (explicit_log_dir) vs (exp_dir and version).
    # The `name` will be used as the actual name of checkpoint/archive.
    if exp_dir or version:
        logging.error(
            f"exp_manager received explicit_log_dir: {explicit_log_dir} and at least one of exp_dir: {exp_dir}, "
            f"or version: {version}. Please note that exp_dir, name, and version will be ignored."
        )
    if is_global_rank_zero() and Path(explicit_log_dir).exists():
        logging.warning(f"Exp_manager is logging to {explicit_log_dir}, but it already exists.")
    return Path(explicit_log_dir), str(explicit_log_dir), "", ""
def prepare_data(self):
    """ Preprocesses schema and dialogues, and caches the result. """
    if self.data_prepared:
        return

    schema_config = {
        "MAX_NUM_CAT_SLOT": self._cfg.dataset.max_num_cat_slot,
        "MAX_NUM_NONCAT_SLOT": self._cfg.dataset.max_num_noncat_slot,
        "MAX_NUM_VALUE_PER_CAT_SLOT": self._cfg.dataset.max_value_per_cat_slot,
        "MAX_NUM_INTENT": self._cfg.dataset.max_num_intent,
        "NUM_TASKS": NUM_TASKS,
        "MAX_SEQ_LENGTH": self._cfg.dataset.max_seq_length,
    }
    all_schema_json_paths = []
    for dataset_split in ['train', 'test', 'dev']:
        all_schema_json_paths.append(os.path.join(self._cfg.dataset.data_dir, dataset_split, "schema.json"))
    schemas = Schema(all_schema_json_paths)

    self.dialogues_processor = SGDDataProcessor(
        task_name=self._cfg.dataset.task_name,
        data_dir=self._cfg.dataset.data_dir,
        dialogues_example_dir=self._cfg.dataset.dialogues_example_dir,
        tokenizer=self.tokenizer,
        schemas=schemas,
        schema_config=schema_config,
        subsample=self._cfg.dataset.subsample,
    )

    if is_global_rank_zero():
        overwrite_dial_files = not self._cfg.dataset.use_cache
        self.dialogues_processor.save_dialog_examples(overwrite_dial_files=overwrite_dial_files)

    self.data_prepared = True
def setup(self, stage: str) -> None:
    """ PTL hook that is called on all DDP processes. """
    if stage == 'fit':
        # adds self.bert_model config to .nemo file
        if hasattr(self, 'bert_model') and self.bert_model is not None:
            self.register_bert_model()

        app_state = AppState()

        if app_state.model_parallel_size is not None:
            self._trainer.checkpoint_connector = NLPCheckpointConnector(self._trainer)

            # Configure checkpointing for model parallel
            if app_state.create_checkpoint_callback:
                # global rank 0 is configured by exp_manager
                if not is_global_rank_zero() and app_state.data_parallel_rank == 0:
                    configure_checkpointing(
                        self._trainer,
                        app_state.log_dir,
                        app_state.checkpoint_name,
                        app_state.checkpoint_callback_params,
                    )
def save_to(self, model, save_path: str):
    """
    Saves model instance (weights and configuration) into a .nemo file.
    You can use the "restore_from" method to fully restore the instance from a .nemo file.

    A .nemo file is an archive (tar.gz) with the following:
        model_config.yaml - model configuration in .yaml format. You can deserialize this into the cfg argument
            for the model's constructor
        model_weights.ckpt - model checkpoint

    Args:
        model: ModelPT object to be saved.
        save_path: Path to .nemo file where model instance should be saved
    """
    if is_global_rank_zero():
        with tempfile.TemporaryDirectory() as tmpdir:
            config_yaml = os.path.join(tmpdir, self.model_config_yaml)
            model_weights = os.path.join(tmpdir, self.model_weights_ckpt)
            model.to_config_file(path2yaml_file=config_yaml)
            if hasattr(model, 'artifacts') and model.artifacts is not None:
                self._handle_artifacts(model, nemo_file_folder=tmpdir)
                # We should not update self._cfg here - the model can still be in use
                self._update_artifact_paths(model, path2yaml_file=config_yaml)
            self._save_state_dict_to_disk(model.state_dict(), model_weights)
            self._make_nemo_file_from_folder(filename=save_path, source_dir=tmpdir)
    else:
        return
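# Hedged usage sketch for the save connector above: NeMo ModelPT subclasses
# expose save_to/restore_from, which route through this connector on rank 0.
# The pretrained model name and output path are illustrative.
from nemo.collections.asr.models import EncDecCTCModel

model = EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
model.save_to('/tmp/quartznet.nemo')                           # rank 0 writes the tar.gz archive
restored = EncDecCTCModel.restore_from('/tmp/quartznet.nemo')  # full round trip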
def main(cfg: ParallelAlignmentConfig):
    if cfg.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu")
    elif cfg.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    cfg.predict_ds.return_sample_id = True
    cfg.return_predictions = False
    cfg.use_cer = False
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model._cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)
    # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
    global_rank = trainer.node_rank * trainer.num_devices + int(os.environ.get("LOCAL_RANK", 0))
    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    output_ctm_dir = os.path.join(cfg.output_path, "ctm")
    predictor_writer = ASRCTMPredictionWriter(
        dataset=data_loader.dataset,
        output_file=output_file,
        output_ctm_dir=output_ctm_dir,
        time_per_frame=cfg.model_stride * model._cfg.preprocessor['window_stride'],
    )
    trainer.callbacks.extend([predictor_writer])

    aligner_wrapper = AlignerWrapperModel(model=model, cfg=cfg.aligner_args)
    trainer.predict(model=aligner_wrapper, dataloaders=data_loader, return_predictions=cfg.return_predictions)
    samples_num = predictor_writer.close_output_file()

    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, "predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'tw', encoding="utf-8") as outf:
            for rank in range(trainer.world_size):
                input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(input_file, 'r', encoding="utf-8") as inpf:
                    lines = inpf.readlines()
                    samples_num += len(lines)
                    outf.writelines(lines)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
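# Worked example of the global-rank hack above, for a hypothetical 2-node,
# 4-GPU-per-node job: the rank is derived from node_rank and the LOCAL_RANK
# env var because trainer.global_rank is not populated until predict() runs.
import os

node_rank, num_devices = 1, 4
local_rank = int(os.environ.get("LOCAL_RANK", "2"))  # e.g. third GPU on the node
global_rank = node_rank * num_devices + local_rank   # 1 * 4 + 2 == 6
output_file = f"predictions_{global_rank}.json"      # per-rank shard, merged on rank 0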
def __init__(self, capture_warnings=True):
    self._logger = None
    # Multi-GPU runs run in separate processes, thread locks shouldn't be needed
    self._logger_lock = threading.Lock()
    self._handlers = dict()
    self.old_warnings_showwarning = None
    self._define_logger(capture_warnings)
    self.once_logged = set()
    self.rank = 0 if is_global_rank_zero() else "UNK"
def _del_model_without_trainer(self, filepath: str) -> None:
    app_state = AppState()
    if app_state.model_parallel_size is not None and app_state.model_parallel_size > 1:
        # filepath needs to be updated to include mp_rank
        filepath = inject_model_parallel_rank(filepath)

    # each model parallel rank needs to remove its model
    if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
        try:
            self._fs.rm(filepath)
            logging.info(f"Removed checkpoint: {filepath}")
        except Exception:
            logging.info(f"Tried to remove checkpoint: {filepath} but failed.")
def _del_model_without_trainer(self, filepath: str) -> None:
    app_state = AppState()
    if app_state.model_parallel_size is not None:
        # filepath needs to be updated to include mp_rank
        dirname = os.path.dirname(filepath)
        basename = os.path.basename(filepath)
        filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'

    # each model parallel rank needs to remove its model
    if is_global_rank_zero() or (app_state.model_parallel_size is not None and app_state.data_parallel_rank == 0):
        try:
            self._fs.rm(filepath)
            logging.info(f"Removed checkpoint: {filepath}")
        except Exception:
            logging.info(f"Tried to remove checkpoint: {filepath} but failed.")
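# Illustrative sketch of the mp_rank path rewrite above, with a hypothetical
# checkpoint path and model_parallel_rank.
import os

filepath = '/exp/checkpoints/megatron--step=100-last.ckpt'
model_parallel_rank = 0

dirname = os.path.dirname(filepath)
basename = os.path.basename(filepath)
filepath = f'{dirname}/mp_rank_{model_parallel_rank:02d}/{basename}'
print(filepath)  # /exp/checkpoints/mp_rank_00/megatron--step=100-last.ckpt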
def __init__(self, datasets, weights):
    self.datasets = datasets
    num_datasets = len(datasets)
    assert num_datasets == len(weights)

    self.size = 0
    for dataset in self.datasets:
        self.size += len(dataset)

    # Normalize weights.
    weights = np.array(weights, dtype=np.float64)
    sum_weights = np.sum(weights)
    assert sum_weights > 0.0
    weights /= sum_weights

    # Build indices.
    start_time = time.time()
    assert num_datasets < 255
    self.dataset_index = np.zeros(self.size, dtype=np.uint8)
    self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

    try:
        if is_global_rank_zero():
            from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

            compile_helper()
        from nemo.collections.nlp.data.language_modeling.megatron import helpers
    except Exception:
        raise Exception('Could not compile helpers.')

    helpers.build_blending_indices(
        self.dataset_index,
        self.dataset_sample_index,
        weights,
        num_datasets,
        self.size,
        torch.distributed.get_rank() == 0,
    )

    logging.info(
        '> elapsed time for building blendable dataset indices: {:.2f} (sec)'.format(time.time() - start_time)
    )
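# Illustrative pure-Python version of the greedy blending that the compiled
# helpers.build_blending_indices performs (a sketch for understanding, not the
# actual C++ implementation): at each step, pick the dataset that is furthest
# behind its target proportion.
import numpy as np

def build_blending_indices_py(weights, size):
    num_datasets = len(weights)
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current_samples = np.zeros(num_datasets, dtype=np.int64)
    for i in range(size):
        # error = target count after i+1 draws minus samples drawn so far
        errors = weights * (i + 1) - current_samples
        chosen = int(np.argmax(errors))
        dataset_index[i] = chosen
        dataset_sample_index[i] = current_samples[chosen]
        current_samples[chosen] += 1
    return dataset_index, dataset_sample_index

idx, sample_idx = build_blending_indices_py(np.array([0.7, 0.3]), size=10)
print(idx)  # roughly 7 draws from dataset 0 and 3 from dataset 1, interleaved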
def save_to(self, save_path: str):
    """
    Saves model instance (weights and configuration) into an EFF archive or a .nemo file.
    You can use the "restore_from" method to fully restore the instance from a .nemo file.

    A .nemo file is an archive (tar.gz) with the following:
        model_config.yaml - model configuration in .yaml format. You can deserialize this into the cfg argument
            for the model's constructor
        model_weights.ckpt - model checkpoint

    Args:
        save_path: Path to .nemo file where model instance should be saved
    """
    # Add nemo rank check as well
    if not is_global_rank_zero():
        return

    if _EFF_PRESENT_ and self.use_eff_save():
        # Save EFF archive.
        self._eff_save_to(save_path)
    else:
        # Save .nemo tar archive.
        self._default_save_to(save_path)
def __init__(self, cfg: DictConfig, trainer: Trainer):
    app_state = AppState()

    if not app_state._is_megatron_initialized:
        logging.info(
            "Initializing megatron since it hasn't been initialized by the model. "
            "This is normal if you are using a NeMo model with Megatron dataloaders."
        )
        app_state.global_rank = trainer.global_rank
        app_state.world_size = trainer.world_size
        app_state.tensor_model_parallel_size = 1
        app_state.tensor_model_parallel_rank = trainer.global_rank

        initialize_model_parallel_for_nemo(
            world_size=trainer.world_size,
            global_rank=trainer.global_rank,
            local_rank=trainer.local_rank,
            tensor_model_parallel_size=cfg.get('tensor_model_parallel_size', 1),
            seed=self.cfg.get('seed', 1234),
        )

    try:
        from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

        if is_global_rank_zero():
            compile_helper()
        if torch.distributed.is_available() and torch.distributed.is_initialized():
            torch.distributed.barrier()
        from nemo.collections.nlp.data.language_modeling.megatron import helpers

        logging.info('Megatron dataset helper compiled successfully.')
    except ImportError:
        raise ImportError(
            'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
        )
def check_resume(
    trainer: 'pytorch_lightning.Trainer',
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """Checks that resume=True was used correctly with the arguments passed to exp_manager.
    Sets trainer.checkpoint_connector.resume_from_checkpoint_fit_path as necessary.

    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and more than one checkpoint was found.
    """
    if not log_dir:
        raise ValueError(f"Resuming requires the log_dir {log_dir} to be passed to exp_manager")

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")

    checkpoint = None
    end_checkpoints = list(checkpoint_dir.rglob("*end.ckpt"))
    last_checkpoints = list(checkpoint_dir.rglob("*last.ckpt"))
    if not checkpoint_dir.exists():
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume.")
    elif len(end_checkpoints) > 0:
        if resume_past_end:
            if len(end_checkpoints) > 1:
                if 'mp_rank' in str(end_checkpoints[0]):
                    checkpoint = end_checkpoints[0]
                else:
                    raise ValueError(f"Multiple checkpoints {end_checkpoints} match *end.ckpt.")
            else:
                checkpoint = end_checkpoints[0]
            logging.info(f"Resuming from {end_checkpoints[0]}")
        else:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
    elif not len(last_checkpoints) > 0:
        if resume_ignore_no_checkpoint:
            logging.warning(f"There were no checkpoints found in {checkpoint_dir}. Training from scratch.")
            return
        else:
            raise NotFoundError(f"There were no checkpoints found in {checkpoint_dir}. Cannot resume.")
    elif len(last_checkpoints) > 1:
        if 'mp_rank' in str(last_checkpoints[0]):
            checkpoint = last_checkpoints[0]
        else:
            raise ValueError(f"Multiple checkpoints {last_checkpoints} match *last.ckpt.")
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer.checkpoint_connector.resume_from_checkpoint_fit_path = str(checkpoint)

    if is_global_rank_zero():
        # Check to see if any files exist that need to be moved
        files_to_move = []
        for child in Path(log_dir).iterdir():
            if child.is_file():
                files_to_move.append(child)

        if len(files_to_move) > 0:
            # Move old files to a new folder
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = 0
            for fold in other_run_dirs:
                if fold.is_dir():
                    run_count += 1
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))
def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning
    paradigm of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will
    get exp_dir, name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create
    the logging directory. exp_manager also allows for explicit folder creation via explicit_log_dir.

    The version can be a datetime string or an integer. Datetime versioning can be disabled if use_datetime_version
    is set to False. It optionally creates TensorBoardLogger, WandBLogger, ModelCheckpoint objects from pytorch
    lightning. It copies sys.argv, and git information if available to the logging directory. It creates a log file
    for each process to log their output into.

    exp_manager additionally has a resume feature (resume_if_exists) which can be used to continue training from
    the constructed log_dir. When you need to continue the training repeatedly (such as on a cluster where you need
    multiple consecutive jobs), you need to avoid creating the version folders. Therefore from v1.0.0, when
    resume_if_exists is set to True, creating the version folders is ignored.

    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:
            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to
                None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs to
                ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default" via
                name = name or "default".
            - version (str): The version of the experiment. Defaults to None which uses either a datetime string or
                lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.checkpoint_connector.resume_from_checkpoint_fit_path so that the trainer should auto-resume.
                exp_manager will move files under log_dir to log_dir/run_{int}. Defaults to False. From v1.0.0, when
                resume_if_exists is True, we would not create version folders to make it easier to find the log
                folder for next runs.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt exists, indicating a previous training run fully completed. This behaviour can be disabled,
                in which case the *end.ckpt will be loaded, by setting resume_past_end to True. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint
                could be found. This behaviour can be disabled, in which case exp_manager will print a message and
                continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger
                class. Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Biases logger and attach it to the pytorch
                lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger class.
                Note that name and project are required parameters if create_wandb_logger is True. Defaults to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the
                most recent checkpoint under *last.ckpt, and the final checkpoint after training completes under
                *end.ckpt. Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None
                which copies no files.

    returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    # Add rank information to logger
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_gpus + local_rank
    logging.rank = global_rank
    world_size = trainer.world_size

    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return
    if trainer.fast_dev_run:
        logging.info("Trainer was called with fast_dev_run. exp_manager will return without any functionality.")
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
        resume_if_exists=cfg.resume_if_exists,
    )

    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # update app_state with log_dir, exp_dir, etc
    app_state = AppState()
    app_state.log_dir = log_dir
    app_state.exp_dir = exp_dir
    app_state.name = name
    app_state.version = version
    app_state.checkpoint_name = checkpoint_name
    app_state.create_checkpoint_callback = cfg.create_checkpoint_callback
    app_state.checkpoint_callback_params = cfg.checkpoint_callback_params

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle logging to file
    if get_envbool(NEMO_ENV_VARNAME_TESTING, False) or world_size <= 32:
        # If NEMO_TESTING is set (debug mode) or if less than 32 ranks save all log files
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif world_size <= 256 and local_rank == 0:
        # If less than 256 ranks, try to save 1 log file per "machine"
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)
    elif global_rank == 0:
        # If running more than 256 ranks, only save 1 log file
        log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{local_rank}.txt'
        logging.add_file_handler(log_file)

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    # add loggers timing callbacks
    if cfg.log_step_timing:
        timing_callback = TimingCallback(timer_kwargs=cfg.step_timing_kwargs or {})
        trainer.callbacks.insert(0, timing_callback)

    if cfg.create_checkpoint_callback:
        configure_checkpointing(trainer, log_dir, checkpoint_name, cfg.resume_if_exists, cfg.checkpoint_callback_params)

    if is_global_rank_zero():
        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w', encoding='utf-8') as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / 'git-info.log', 'w', encoding='utf-8') as _file:
                _file.write(f'commit hash: {git_hash}')
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt')

    return log_dir
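# Minimal usage sketch for exp_manager with hypothetical settings: it returns
# the resolved logging directory, e.g. ./my_experiments/my_model/<datetime>.
import pytorch_lightning as ptl
from nemo.utils.exp_manager import exp_manager

trainer = ptl.Trainer(max_epochs=1)
log_dir = exp_manager(
    trainer,
    {"exp_dir": "./my_experiments", "name": "my_model", "create_checkpoint_callback": True},
)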
def setup(self, stage: str) -> None:
    """ PTL hook that is called after DDP is initialized. Called at the beginning of fit and test.

    Args:
        stage (str): either 'fit' or 'test'
    """
    # TODO: implement model parallel for test stage
    if stage == 'fit':
        # set find_unused_parameters to True by default for NLP models
        if isinstance(self.trainer.accelerator.training_type_plugin, DDPPlugin):
            self.trainer.accelerator.training_type_plugin._ddp_kwargs['find_unused_parameters'] = True

        # adds self.bert_model config to .nemo file
        if hasattr(self, 'bert_model') and self.bert_model is not None:
            self.register_bert_model()

        app_state = AppState()

        if app_state.model_parallel_size is not None:
            if app_state.model_parallel_group is None:
                self.init_model_parallel(app_state.global_rank, app_state.world_size)

            # mpu grad clipping needs parameters to have the attribute model_parallel
            parameters = self._trainer.get_model().parameters()
            for p in parameters:
                if not hasattr(p, 'model_parallel'):
                    p.model_parallel = False

            # Update PTL trainer to use our configure_ddp
            self._trainer.accelerator_backend.ddp_plugin.configure_ddp = self.configure_ddp
            # Update PTL trainer to use our _clip_gradients
            self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
            self._trainer.checkpoint_connector = NLPCheckpointConnector(self._trainer)

            # Configure checkpointing for model parallel
            if app_state.create_checkpoint_callback:
                # global rank 0 is configured by exp_manager
                if not is_global_rank_zero() and app_state.data_parallel_rank == 0:
                    configure_checkpointing(
                        self._trainer,
                        app_state.log_dir,
                        app_state.checkpoint_name,
                        app_state.checkpoint_callback_params,
                    )

            if isinstance(self.bert_model, MegatronBertEncoder):
                self.bert_model.complete_lazy_init()

                # model parallel checkpoints need to be restored after torch.distributed is initialized
                if self._trainer.resume_from_checkpoint is not None:
                    # update path based on model parallel rank
                    filepath = self._trainer.resume_from_checkpoint
                    dirname = os.path.dirname(os.path.dirname(filepath))
                    basename = os.path.basename(filepath)
                    filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
                    self._trainer.resume_from_checkpoint = filepath
                    logging.info(f'Resuming training from checkpoint {self._trainer.resume_from_checkpoint}')
                    # need to set checkpoint version for megatron-lm
                    checkpoint_version = torch.load(self._trainer.resume_from_checkpoint).get(
                        'checkpoint_version', None
                    )
                    if checkpoint_version is not None:
                        set_checkpoint_version(checkpoint_version)
                    else:
                        logging.warning('Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.')
                        set_checkpoint_version(0)
                else:
                    logging.info(
                        f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}"
                    )
                    self.bert_model.restore_weights(self.bert_model._restore_path)

                logging.info("Replacing sampler with model parallel sampler")
                mp_sampler = torch.utils.data.distributed.DistributedSampler(
                    self._train_dl.dataset,
                    num_replicas=app_state.data_parallel_size,
                    rank=app_state.data_parallel_rank,
                )
                mp_dl = self._trainer.replace_sampler(self._train_dl, mp_sampler)
                self._train_dl = mp_dl
            else:
                raise NotImplementedError(
                    f'The BERT encoder: {self.bert_model} does not support model parallelism yet.'
                )
def __init__(
    self,
    punct=True,
    stresses=False,
    spaces=True,
    chars=False,
    *,
    space=' ',
    silence=None,
    apostrophe=True,
    oov=Base.OOV,
    sep='|',  # To be able to distinguish between two- and three-letter codes.
    add_blank_at="last_but_one",
    pad_with_space=False,
    improved_version_g2p=False,
    phoneme_dict_path=None,
):
    labels = []
    self.space, labels = len(labels), labels + [space]  # Space

    if silence is not None:
        self.silence, labels = len(labels), labels + [silence]  # Silence

    labels.extend(self.CONSONANTS)
    vowels = list(self.VOWELS)

    if stresses:
        vowels = [f'{p}{s}' for p, s in itertools.product(vowels, (0, 1, 2))]
    labels.extend(vowels)

    if chars:
        labels.extend(string.ascii_lowercase)

    if apostrophe:
        labels.append("'")  # Apostrophe

    if punct:
        labels.extend(self.PUNCT)

    super().__init__(labels, oov=oov, sep=sep, add_blank_at=add_blank_at)

    self.punct = punct
    self.stresses = stresses
    self.spaces = spaces
    self.pad_with_space = pad_with_space

    # g2p_en tries to run download_corpora() on import but it is not rank zero guarded
    # Try to check if torch distributed is available, if not get global rank zero to download corpora and make
    # all other ranks sleep for a minute
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        group = torch.distributed.group.WORLD
        if is_global_rank_zero():
            download_corpora()
        torch.distributed.barrier(group=group)
    elif is_global_rank_zero():
        logging.error(
            f"Torch distributed needs to be initialized before you initialize {self}. This class is prone to "
            "data access race conditions. Now downloading corpora from global rank 0. If other ranks pass this "
            "before rank 0, errors might result."
        )
        download_corpora()
    else:
        logging.error(
            f"Torch distributed needs to be initialized before you initialize {self}. This class is prone to "
            "data access race conditions. This process is not rank 0, and is now going to sleep for 1 min. If this "
            "rank wakes from sleep prior to rank 0 finishing downloading, errors might result."
        )
        time.sleep(60)

    import g2p_en  # noqa pylint: disable=import-outside-toplevel

    _g2p = g2p_en.G2p()
    _g2p.variables = None

    if improved_version_g2p:
        self.g2p = G2p(_g2p, phoneme_dict_path)
    else:
        self.g2p = _g2p
def main(cfg: ParallelTranscriptionConfig):
    if cfg.model.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=cfg.model, map_location="cpu")
    elif cfg.model.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=cfg.model, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=cfg.model, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    cfg.predict_ds.return_sample_id = True
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)
    # trainer.global_rank is not valid before predict() is called. Need this hack to find the correct global_rank.
    global_rank = trainer.node_rank * trainer.num_gpus + int(os.environ.get("LOCAL_RANK", 0))
    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file)
    trainer.callbacks.extend([predictor_writer])

    predictions = trainer.predict(model=model, dataloaders=data_loader, return_predictions=cfg.return_predictions)
    if predictions is not None:
        predictions = list(itertools.chain.from_iterable(predictions))
    samples_num = predictor_writer.close_output_file()

    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    pred_text_list = []
    text_list = []
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, "predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'w') as outf:
            for rank in range(trainer.world_size):
                input_file = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(input_file, 'r') as inpf:
                    lines = inpf.readlines()
                for line in lines:
                    item = json.loads(line)
                    pred_text_list.append(item["pred_text"])
                    text_list.append(item["text"])
                    outf.write(json.dumps(item) + "\n")
                    samples_num += 1
        wer_cer = word_error_rate(hypotheses=pred_text_list, references=text_list, use_cer=cfg.use_cer)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
        logging.info("{} for all predictions is {:.4f}.".format("CER" if cfg.use_cer else "WER", wer_cer))
def get_samples_mapping(
    indexed_dataset, data_prefix, num_epochs, max_num_samples, max_seq_length, short_seq_prob, seed, name, binary_head
):
    """Get a list that maps a sample index to a starting sentence index, end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if it does not exist.
    if torch.distributed.get_rank() == 0 and not os.path.isfile(indexmap_filename):
        print(
            ' > WARNING: could not find index map file {}, building '
            'the indices on rank 0 ...'.format(indexmap_filename)
        )

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        logging.info(' > building samples index mapping for {} ...'.format(name))

        # First compile and then import.
        try:
            if is_global_rank_zero():
                compile_helper()
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except ImportError:
            raise ImportError(
                'Could not compile megatron dataset C++ helper functions and therefore cannot import helpers python file.'
            )

        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1,
        )
        logging.info(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        logging.info(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        logging.info(
            ' > elapsed time to build and save samples mapping '
            '(seconds): {:4f}'.format(time.time() - start_time)
        )

    torch.distributed.barrier()
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size()
        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group())
    )

    # Load indexed dataset.
    logging.info(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    logging.info('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    logging.info('    total number of samples: {}'.format(samples_mapping.shape[0]))

    return samples_mapping
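# Worked example of the cache filename composed above, with made-up arguments.
import numpy as np

data_prefix, name = '/data/my_corpus', 'train'
num_epochs = 3                                # explicit, so the '_3ep' part is included
max_num_samples = np.iinfo(np.int64).max - 1  # sentinel value, so the '_mns' part is skipped

indexmap_filename = data_prefix + '_{}_indexmap'.format(name)
if num_epochs != (np.iinfo(np.int32).max - 1):
    indexmap_filename += '_{}ep'.format(num_epochs)
if max_num_samples != (np.iinfo(np.int64).max - 1):
    indexmap_filename += '_{}mns'.format(max_num_samples)
indexmap_filename += '_{}msl'.format(512) + '_{:0.2f}ssp'.format(0.1) + '_{}s'.format(1234) + '.npy'
print(indexmap_filename)  # /data/my_corpus_train_indexmap_3ep_512msl_0.10ssp_1234s.npy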
def __init__(
    self,
    data_dir: str,
    dialogues_example_dir: str,
    tokenizer: object,
    cfg=None,
):
    """
    Constructs DialogueSGDDataProcessor

    Args:
        data_dir: path to data directory
        dialogues_example_dir: path to store processed dialogue examples
        tokenizer: tokenizer object
        cfg: cfg container for dataset
    """
    self.data_dir = data_dir
    self.cfg = cfg

    self._task_name = self.cfg.task_name  # e.g. "sgd_single_domain"
    self._subsample = self.cfg.subsample

    all_schema_json_paths = []
    for dataset_split in ['train', 'test', 'dev']:
        all_schema_json_paths.append(os.path.join(self.cfg.data_dir, dataset_split, "schema.json"))
    self.schemas = Schema(all_schema_json_paths)

    self.schema_config = {
        "MAX_NUM_CAT_SLOT": self.cfg.max_num_cat_slot,
        "MAX_NUM_NONCAT_SLOT": self.cfg.max_num_noncat_slot,
        "MAX_NUM_VALUE_PER_CAT_SLOT": self.cfg.max_value_per_cat_slot,
        "MAX_NUM_INTENT": self.cfg.max_num_intent,
        "NUM_TASKS": self.cfg.num_tasks,
        "MAX_SEQ_LENGTH": self.cfg.max_seq_length,
    }

    train_file_range = FILE_RANGES[self._task_name]["train"]
    dev_file_range = FILE_RANGES[self._task_name]["dev"]
    test_file_range = FILE_RANGES[self._task_name]["test"]

    self._file_ranges = {
        "train": train_file_range,
        "dev": dev_file_range,
        "test": test_file_range,
    }

    self._seen_services = {
        "train": set(),
        "dev": set(),
        "test": set(),
    }

    self._tokenizer = tokenizer
    self._dialogues_example_dir = dialogues_example_dir

    self.dial_files = {}

    # slots_relation_list.np contains the candidate list of slots for each (service, slot) pair. It is looked up
    # when a switch between two services happens in the dialogue and we cannot find any value for a slot in the
    # current user utterance. This file is generated from the dialogues in the training set.
    self.slots_relation_file = os.path.join(
        dialogues_example_dir, f"{self._task_name}_train_slots_relation_list.np"
    )

    for dataset in ["train", "dev", "test"]:
        # Process dialogue files
        dial_file = f"{self._task_name}_{dataset}_examples.json"
        dial_file = os.path.join(dialogues_example_dir, dial_file)
        self.dial_files[(self._task_name, dataset)] = dial_file

        dialog_paths = DialogueSGDDataProcessor.get_dialogue_files(data_dir, dataset, self._task_name)
        dialogs = DialogueSGDDataProcessor.load_dialogues(dialog_paths)
        for dialog in dialogs:
            self._seen_services[dataset].update(set(dialog['services']))

    if is_global_rank_zero():
        overwrite_dial_files = not self.cfg.use_cache
        self.save_dialog_examples(overwrite_dial_files=overwrite_dial_files)
def _build_index_mappings(name, data_prefix, documents, sizes, num_samples, seq_length, seed):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if it does not exist.
    if torch.distributed.get_rank() == 0:
        if (
            (not os.path.isfile(doc_idx_filename))
            or (not os.path.isfile(sample_idx_filename))
            or (not os.path.isfile(shuffle_idx_filename))
        ):
            logging.info(' > WARNING: could not find index map files, building the indices on rank 0 ...')

            # For the last epoch, decide whether to include the entire epoch
            # in the global shuffle or not.

            # If we need only one epoch, then separating the last epoch does
            # not mean anything.
            if num_epochs == 1:
                separate_last_epoch = False
                print(' > only one epoch required, setting separate_last_epoch to False', flush=True)
            else:
                # Get the number of samples for the last epoch
                num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length
                last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one
                assert last_epoch_num_samples >= 0, 'last epoch number of samples should be non-negative.'
                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
                assert last_epoch_num_samples < (
                    num_samples_per_epoch + 1
                ), 'last epoch number of samples exceeded max value.'
                # If we have less than 80% of the samples for the last epoch,
                # separate out the epoch and treat it differently.
                # Note: the 80% number is just based on common sense and can
                # be adjusted if needed.
                separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)
                if separate_last_epoch:
                    string = (
                        ' > last epoch number of samples ({}) is smaller '
                        'than 80% of number of samples per epoch ({}), '
                        'setting separate_last_epoch to True'
                    )
                else:
                    string = (
                        ' > last epoch number of samples ({}) is larger '
                        'than 80% of number of samples per epoch ({}), '
                        'setting separate_last_epoch to False'
                    )
                print(string.format(last_epoch_num_samples, num_samples_per_epoch), flush=True)

            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            logging.info(
                ' > elapsed time to build and save doc-idx mapping '
                '(seconds): {:4f}'.format(time.time() - start_time)
            )

            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            try:
                if is_global_rank_zero():
                    from nemo.collections.nlp.data.language_modeling.megatron.dataset_utils import compile_helper

                    compile_helper()
                from nemo.collections.nlp.data.language_modeling.megatron import helpers
            except Exception:
                raise Exception('Could not compile helpers.')

            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                                num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            logging.info(
                ' > elapsed time to build and save sample-idx mapping '
                '(seconds): {:4f}'.format(time.time() - start_time)
            )

            # shuffle-idx.
            start_time = time.time()
            # -1 is due to data structure used to retrieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            if separate_last_epoch:
                num_samples_ = num_samples_from_epochs_minus_one
            else:
                num_samples_ = sample_idx.shape[0] - 1
            shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            logging.info(
                ' > elapsed time to build and save shuffle-idx mapping '
                '(seconds): {:4f}'.format(time.time() - start_time)
            )

    torch.distributed.barrier()
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size()
        // torch.distributed.get_world_size(group=parallel_state.get_tensor_model_parallel_group())
    )

    # Load mappings.
    start_time = time.time()
    logging.info(' > loading doc-idx mapping from {}'.format(doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(' > loading sample-idx mapping from {}'.format(sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info(' > loading shuffle-idx mapping from {}'.format(shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
    logging.info('    loaded indexed file in {:3.3f} seconds'.format(time.time() - start_time))
    logging.info('    total number of samples: {}'.format(sample_idx.shape[0]))
    logging.info('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx
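# Worked example of the last-epoch bookkeeping above, with made-up numbers.
tokens_per_epoch, seq_length = 10_000, 512
num_samples, num_epochs = 50, 3

num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length  # 39
last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one                     # 11
num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length                                 # 19
# 11 < int(0.80 * 19) == 15, so the last epoch is shuffled separately
separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch)             # True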
def get_log_dir(
    trainer: 'pytorch_lightning.Trainer',
    exp_dir: str = None,
    name: str = None,
    version: str = None,
    explicit_log_dir: str = None,
    use_datetime_version: bool = True,
    resume_if_exists: bool = False,
) -> (Path, str, str, str):
    """
    Obtains the log_dir used for exp_manager.

    Args:
        explicit_log_dir (str): The explicit path to the log folder. Defaults to None.
        use_datetime_version (bool): Uses date and time as the version of the log folder. Defaults to True.
        resume_if_exists (bool): whether resume_if_exists of the exp_manager's config is enabled. When enabled,
            the version folders would not get created.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raise:
        LoggerMisconfigurationError: If trainer is incompatible with arguments
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and more than one checkpoint was found.
    """
    if explicit_log_dir:  # If explicit log_dir was passed, short circuit
        return check_explicit_log_dir(trainer, explicit_log_dir, exp_dir, name, version)

    # Default exp_dir to ./nemo_experiments if None was passed
    _exp_dir = exp_dir
    if exp_dir is None:
        _exp_dir = str(Path.cwd() / 'nemo_experiments')

    # If the user has already defined a logger for the trainer, use the logger defaults for logging directory
    if trainer.logger is not None:
        if trainer.logger.save_dir:
            if exp_dir:
                raise LoggerMisconfigurationError(
                    "The pytorch lightning trainer that was passed to exp_manager contained a logger, the logger's "
                    f"save_dir was not None, and exp_dir ({exp_dir}) was not None. If trainer.logger.save_dir "
                    "exists, exp_manager will use trainer.logger.save_dir as the logging directory and exp_dir "
                    "must be None."
                )
            _exp_dir = trainer.logger.save_dir
        if name:
            raise LoggerMisconfigurationError(
                "The pytorch lightning trainer that was passed to exp_manager contained a logger, and name: "
                f"{name} was also passed to exp_manager. If the trainer contains a "
                "logger, exp_manager will use trainer.logger.name, and name passed to exp_manager must be None."
            )
        name = trainer.logger.name
        version = f"version_{trainer.logger.version}"
    # Use user-defined exp_dir, project_name, exp_name, and versioning options
    else:
        name = name or "default"
        version = version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None)

        if not version:
            if resume_if_exists:
                logging.warning(
                    "No version folders would be created under the log folder as 'resume_if_exists' is enabled."
                )
                version = None
            elif is_global_rank_zero():
                if use_datetime_version:
                    version = time.strftime('%Y-%m-%d_%H-%M-%S')
                else:
                    tensorboard_logger = TensorBoardLogger(save_dir=Path(_exp_dir), name=name, version=version)
                    version = f"version_{tensorboard_logger.version}"
                os.environ[NEMO_ENV_VARNAME_VERSION] = "" if version is None else version

    log_dir = Path(_exp_dir) / Path(str(name)) / Path("" if version is None else str(version))
    return log_dir, str(_exp_dir), name, version
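# Sketch of the default directory layout get_log_dir produces when neither a
# trainer logger nor explicit_log_dir is given (values are illustrative).
import time
from pathlib import Path

exp_dir = str(Path.cwd() / 'nemo_experiments')  # default base directory
name = 'default'                                # name or "default"
version = time.strftime('%Y-%m-%d_%H-%M-%S')    # datetime versioning on rank 0
log_dir = Path(exp_dir) / name / version
print(log_dir)  # e.g. .../nemo_experiments/default/2021-06-01_12-00-00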
def __init__(
    self,
    text_file: str,
    label_file: str,
    max_seq_length: int,
    tokenizer: TokenizerSpec,
    num_samples: int = -1,
    pad_label: str = 'O',
    label_ids: Dict[str, int] = None,
    ignore_extra_tokens: bool = False,
    ignore_start_end: bool = False,
    use_cache: bool = True,
):
    """ Initializes BertTokenClassificationDataset. """

    data_dir = os.path.dirname(text_file)
    filename = os.path.basename(text_file)

    if not filename.endswith('.txt'):
        raise ValueError(f"{text_file} should have extension .txt")

    vocab_size = getattr(tokenizer, "vocab_size", 0)
    features_pkl = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}_{}".format(
            filename, tokenizer.name, str(max_seq_length), str(vocab_size), str(num_samples)
        ),
    )

    master_device = is_global_rank_zero()
    features = None
    if master_device and (not use_cache or not os.path.exists(features_pkl)):
        if num_samples == 0:
            raise ValueError("num_samples has to be positive", num_samples)

        with open(text_file, 'r') as f:
            text_lines = f.readlines()

        labels_lines = []
        with open(label_file, 'r') as f:
            for line in f:
                line = line.strip().split()
                labels_lines.append(line)

        if len(labels_lines) != len(text_lines):
            raise ValueError("Labels file should contain labels for every word")

        if num_samples > 0:
            dataset = list(zip(text_lines, labels_lines))
            dataset = dataset[:num_samples]
            dataset = list(zip(*dataset))
            text_lines = dataset[0]
            labels_lines = dataset[1]

        features = get_features(
            queries=text_lines,
            max_seq_length=max_seq_length,
            tokenizer=tokenizer,
            pad_label=pad_label,
            raw_labels=labels_lines,
            label_ids=label_ids,
            ignore_extra_tokens=ignore_extra_tokens,
            ignore_start_end=ignore_start_end,
        )

        with open(features_pkl, "wb") as out_f:
            pickle.dump(features, out_f)
        logging.info(f'features saved to {features_pkl}')

    # wait until the master process writes to the processed data files
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if features is None:
        with open(features_pkl, 'rb') as in_f:
            features = pickle.load(in_f)
        logging.info(f'features restored from {features_pkl}')

    self.all_input_ids = features[0]
    self.all_segment_ids = features[1]
    self.all_input_mask = features[2]
    self.all_subtokens_mask = features[3]
    self.all_loss_mask = features[4]
    self.all_labels = features[5]
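# Hypothetical cache-file name produced by the logic above, assuming a
# 'bert-base-uncased' tokenizer with a 30522-token vocabulary and defaults
# max_seq_length=128, num_samples=-1.
import os

data_dir, filename = '/data/ner', 'text_train.txt'
features_pkl = os.path.join(
    data_dir,
    "cached_{}_{}_{}_{}_{}".format(filename, 'bert-base-uncased', 128, 30522, -1),
)
print(features_pkl)  # /data/ner/cached_text_train.txt_bert-base-uncased_128_30522_-1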