def main(cfg: ParallelAlignmentConfig):
    """Run alignment prediction on this rank, then merge all rank outputs on rank zero.

    The model named by ``cfg.model`` is loaded on CPU, wrapped in an
    ``AlignerWrapperModel`` and passed to ``trainer.predict``. An
    ``ASRCTMPredictionWriter`` callback is attached to the trainer with a
    per-rank output file ``predictions_<rank>.json`` and a ``ctm`` directory,
    both under ``cfg.output_path``. After an optional distributed barrier,
    global rank zero concatenates every per-rank file into
    ``predictions_all.json``.

    Args:
        cfg: run configuration. Fields read here: ``model``, ``trainer``,
            ``predict_ds``, ``output_path``, ``model_stride`` and
            ``aligner_args``. ``return_predictions`` and ``use_cer`` are
            overwritten with ``False`` and ``predict_ds`` is replaced by the
            result of ``match_train_config``.
    """
    model_source = cfg.model
    if model_source.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=model_source, map_location="cpu")
    elif model_source.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=model_source, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=model_source, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    # Force the settings this script relies on, then align the prediction
    # dataset config with the one the model was trained with.
    cfg.predict_ds.return_sample_id = True
    cfg.return_predictions = False
    cfg.use_cer = False
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model._cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)

    # trainer.global_rank is not valid before predict() is called, so the
    # global rank is derived from the node rank and the LOCAL_RANK variable.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_devices + local_rank

    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    output_ctm_dir = os.path.join(cfg.output_path, "ctm")
    frame_duration = cfg.model_stride * model._cfg.preprocessor['window_stride']
    predictor_writer = ASRCTMPredictionWriter(
        dataset=data_loader.dataset,
        output_file=output_file,
        output_ctm_dir=output_ctm_dir,
        time_per_frame=frame_duration,
    )
    trainer.callbacks.extend([predictor_writer])

    aligner_wrapper = AlignerWrapperModel(model=model, cfg=cfg.aligner_args)
    trainer.predict(model=aligner_wrapper, dataloaders=data_loader, return_predictions=cfg.return_predictions)

    samples_num = predictor_writer.close_output_file()
    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    # Wait until every rank has closed its file before merging.
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, "predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'tw', encoding="utf-8") as merged:
            for rank in range(trainer.world_size):
                shard_path = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(shard_path, 'r', encoding="utf-8") as shard:
                    shard_lines = shard.readlines()
                samples_num += len(shard_lines)
                merged.writelines(shard_lines)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
def main(cfg):
    """Evaluate an adapter-capable ASR model on the configured test data.

    Exactly one of ``cfg.model.pretrained_model`` and ``cfg.model.nemo_model``
    must be set. The model config is fetched first, patched with
    ``update_encoder_config_to_support_adapter`` and then used to build the
    model. All adapters are disabled, the one named by
    ``cfg.model.adapter.adapter_name`` (if any) is re-enabled, the model is
    frozen and ``trainer.test`` is run.

    Args:
        cfg: Hydra/OmegaConf config with ``trainer``, ``model`` and an
            optional ``exp_manager`` section.

    Raises:
        ValueError: if neither or both model sources are set.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    has_pretrained = cfg.model.pretrained_model is not None
    has_nemo_file = cfg.model.nemo_model is not None
    if not has_pretrained and not has_nemo_file:
        raise ValueError(
            "Either set `cfg.model.nemo_model` or `cfg.model.pretrained_model`"
        )
    if has_pretrained and has_nemo_file:
        raise ValueError(
            "Cannot set `cfg.model.nemo_model` and `cfg.model.pretrained_model`. Select one only."
        )

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    # Both sources follow the same two-step recipe: fetch the config, patch
    # it for adapter support, then build the model from the patched config.
    if cfg.model.pretrained_model is not None:
        load_model = ASRModel.from_pretrained
        model_source = cfg.model.pretrained_model
    else:
        load_model = ASRModel.restore_from
        model_source = cfg.model.nemo_model

    model_cfg = load_model(model_source, return_config=True)
    update_encoder_config_to_support_adapter(model_cfg)
    model = load_model(model_source, override_config_path=model_cfg, trainer=trainer)

    # Merge the user's test dataset config into the model's and build the test dataloader(s).
    cfg.model.test_ds = update_model_cfg(model.cfg.test_ds, cfg.model.test_ds)
    model.setup_multiple_test_data(cfg.model.test_ds)

    # The adapter name is optional for evaluation.
    with open_dict(cfg.model.adapter):
        adapter_name = cfg.model.adapter.pop("adapter_name", None)

    # Disable every adapter, then re-enable only the requested one (if any).
    model.set_enabled_adapters(enabled=False)
    if adapter_name is not None:
        model.set_enabled_adapters(adapter_name, enabled=True)

    # Freeze all the weights of the model (not just encoder, everything).
    model.freeze()

    # Run evaluation.
    trainer.test(model)
def main():
    """Transcribe a manifest with an ASR model and write SCTK ``.trn`` files.

    Command-line arguments select the model (a local ``.nemo`` file or a
    pretrained model name), the evaluation manifest, the batch size and the
    output directory. Hypotheses and references are written to ``hyp.trn``
    and ``ref.trn`` in ``--out_dir``; when ``--sctk_dir`` points to an
    existing path, ``score_with_sctk`` is invoked on those two files.

    The ``.trn`` files are written as UTF-8 so that transcripts containing
    non-ASCII characters do not depend on the platform's default encoding.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5Base-En",
        required=False,
        help="Pass: '******'",
    )
    parser.add_argument("--dataset", type=str, required=True, help="path to evaluation data")
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--out_dir", type=str, required=True, help="Destination dir for output files")
    parser.add_argument("--sctk_dir", type=str, required=False, default="", help="Path to sctk root dir")
    parser.add_argument("--glm", type=str, required=False, default="", help="Path to glm file")
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    # exist_ok=True already makes this a no-op when the directory exists.
    os.makedirs(args.out_dir, exist_ok=True)

    # With the default "" this is False, so scoring is skipped.
    use_sctk = os.path.exists(args.sctk_dir)

    if args.asr_model.endswith('.nemo'):
        logging.info(f"Using local ASR model from {args.asr_model}")
        asr_model = ASRModel.restore_from(restore_path=args.asr_model, map_location='cpu')
    else:
        logging.info(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = ASRModel.from_pretrained(model_name=args.asr_model, map_location='cpu')

    if can_gpu:
        asr_model = asr_model.cuda()
    asr_model.eval()

    manifest_data = read_manifest(args.dataset)
    references = [data['text'] for data in manifest_data]
    audio_filepaths = [data['audio_filepath'] for data in manifest_data]

    with autocast():
        hypotheses = asr_model.transcribe(audio_filepaths, batch_size=args.batch_size)

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(hypotheses, tuple) and len(hypotheses) == 2:
        hypotheses = hypotheses[0]

    info_list = get_utt_info(args.dataset)
    hypfile = os.path.join(args.out_dir, "hyp.trn")
    reffile = os.path.join(args.out_dir, "ref.trn")

    with open(hypfile, "w", encoding="utf-8") as hyp_f, open(reffile, "w", encoding="utf-8") as ref_f:
        for idx, hypothesis in enumerate(hypotheses):
            # The utterance id is the audio file name without its extension.
            utt_id = os.path.splitext(os.path.basename(info_list[idx]['audio_filepath']))[0]
            # rfilter in sctk likes each transcript to have a space at the beginning
            hyp_f.write(f" {hypothesis} ({utt_id})\n")
            ref_f.write(f" {references[idx]} ({utt_id})\n")

    if use_sctk:
        score_with_sctk(args.sctk_dir, reffile, hypfile, args.out_dir, glm=args.glm)
def get_asr_model(asr_model):
    """
    Returns ASR Model

    The lookup is driven by the ``asr_model`` argument itself rather than by
    a module-level ``args`` object, so the function gives the same result
    wherever it is called from.

    Args:
        asr_model: path to a local checkpoint, or one of the names returned
            by ``ASRModel.get_available_model_names()``

    Returns:
        the model restored from the path, or the pretrained model of that name

    Raises:
        ValueError: if ``asr_model`` is neither an existing path nor an
            available pretrained model name
    """
    if os.path.exists(asr_model):
        return ASRModel.restore_from(asr_model)

    # Query the list once; it is needed for both the check and the message.
    available_names = ASRModel.get_available_model_names()
    if asr_model in available_names:
        return ASRModel.from_pretrained(asr_model)

    raise ValueError(
        f'Provide path to the pretrained checkpoint or choose from {available_names}'
    )
def test_constructor_pretrained(self):
    """Build a pretrained Citrinet with an adapter-capable encoder and add an adapter.

    The pretrained config is fetched first; if an adapter class is registered
    for the encoder target, the target is swapped to it before the model is
    instantiated. The test then adds one adapter, freezes the model,
    unfreezes the enabled adapters and asserts ``model.num_weights`` stays
    below 1e5.
    """
    pretrained_name = 'stt_en_citrinet_256'

    cfg = ASRModel.from_pretrained(pretrained_name, map_location='cpu', return_config=True)

    # Swap the encoder class for its registered adapter-capable counterpart.
    adapter_metadata = get_registered_adapter(cfg.encoder._target_)
    if adapter_metadata is not None:
        cfg.encoder._target_ = adapter_metadata.adapter_class_path

    model = ASRModel.from_pretrained(pretrained_name, override_config_path=cfg)

    assert isinstance(model, AdapterModuleMixin)
    assert hasattr(model, 'encoder')
    assert isinstance(model.encoder, AdapterModuleMixin)

    # The adapter input width is taken from the first encoder block's filter count.
    adapter_cfg = get_adapter_cfg(in_features=cfg.encoder.jasper[0].filters, dim=5)
    model.add_adapter('adapter_0', cfg=adapter_cfg)
    assert model.is_adapter_available()

    model.freeze()
    model.unfreeze_enabled_adapters()
    assert model.num_weights < 1e5
def _normalize_line(normalizer: NormalizerWithAudio, line: str, asr_model: ASRModel = None):
    """Normalize the text of one JSON manifest line and pick the best candidate.

    Args:
        normalizer: object providing ``normalize`` and ``select_best_match``.
        line: JSON string with at least ``audio_filepath`` and ``text``; an
            optional ``transcript`` key is used when present.
        asr_model: model used to transcribe the audio when the line has no
            ``transcript`` key; it is dereferenced in that case, so it must
            not be ``None`` then.

    Returns:
        The parsed line as a dict, extended with ``nemo_normalized`` and
        ``CER_nemo_normalized``.

    Note:
        Normalization options are read from the module-level ``args``.
    """
    record = json.loads(line)
    audio_path = record['audio_filepath']

    # Prefer the transcript stored in the manifest; otherwise run ASR on the audio.
    if 'transcript' not in record:
        transcript = asr_model.transcribe([audio_path])[0]
    else:
        transcript = record['transcript']

    candidates = normalizer.normalize(
        text=record['text'],
        verbose=args.verbose,
        n_tagged=args.n_tagged,
        punct_pre_process=not args.no_punct_pre_process,
        punct_post_process=not args.no_punct_post_process,
    )
    best_text, best_cer = normalizer.select_best_match(candidates, transcript, args.verbose, args.remove_punct)

    record['nemo_normalized'] = best_text
    record['CER_nemo_normalized'] = best_cer
    return record
def _asr_on_filepaths(
    audio_filepaths, audio_dir, pretrained_model, asr_batch_size, asr_ckpt_path, device, asr_model, logprobs, **kw
):
    """Transcribe audio files, loading the ASR model first when none is supplied.

    Args:
        audio_filepaths: audio paths; also used as the keys of the result.
        audio_dir: optional directory joined in front of every path.
        pretrained_model: pretrained model name, used when no checkpoint path is given.
        asr_batch_size: batch size forwarded to ``transcribe``.
        asr_ckpt_path: optional ``.ckpt`` path; takes precedence over ``pretrained_model``.
        device: passed as ``map_location``; compared against ``"cpu"`` to pick the GPU count.
        asr_model: an already loaded model, or ``None`` to load one here.
        logprobs: forwarded to ``transcribe``.
        **kw: accepted and ignored.

    Returns:
        Tuple ``(filepath_to_pred, asr_model)``: a dict mapping each entry of
        ``audio_filepaths`` to its prediction, and the model that was used.
    """
    # Load acoustic model
    if asr_model is None:
        if asr_ckpt_path:
            warnings.warn("Models loaded from a .ckpt run on CPU")
            # TODO: Infer this kind of model in GPU
            asr_model = EncDecCTCModel.load_from_checkpoint(checkpoint_path=asr_ckpt_path, map_location=device)
        else:
            asr_model = ASRModel.from_pretrained(model_name=pretrained_model, map_location=device)

    # NOTE(review): the trainer/eval setup is applied to supplied models as
    # well as freshly loaded ones — confirm against the intended nesting.
    uses_gpu = device != "cpu"
    trainer = pl.Trainer(gpus=int(uses_gpu))
    asr_model.set_trainer(trainer)
    asr_model = asr_model.eval()

    # Transcribe
    full_filepaths = audio_filepaths
    if audio_dir:
        full_filepaths = [os.path.join(audio_dir, rel_path) for rel_path in audio_filepaths]

    # The original wrapped this in a local no-op `autocast` context manager,
    # which had no effect; only gradient tracking is disabled here.
    with torch.no_grad():
        preds = asr_model.transcribe(full_filepaths, batch_size=asr_batch_size, logprobs=logprobs)

    # Keys are the caller's original (un-joined) paths.
    filepath_to_pred = dict(zip(audio_filepaths, preds))
    return filepath_to_pred, asr_model
def main():
    """Compare PyTorch and ONNX RNNT transcriptions of the same audio files.

    The NeMo model is restored and frozen, exported if
    ``export_model_if_required`` decides so, and used to transcribe the
    resolved audio files. The same files are then pushed through the model's
    preprocessor on CPU and decoded with ``ONNXGreedyBatchedRNNTInfer``.
    The character error rate between the two transcript lists must be below
    ``args.threshold``, otherwise an ``AssertionError`` is raised.
    """
    args = parse_arguments()

    # Instantiate pytorch model
    nemo_model = ASRModel.restore_from(args.nemo_model, map_location='cpu')
    nemo_model.freeze()
    if torch.cuda.is_available():
        nemo_model = nemo_model.to('cuda')

    export_model_if_required(args, nemo_model)

    # Instantiate RNNT Decoding loop. The ONNX paths are read only after the
    # export step above has run.
    decoding = ONNXGreedyBatchedRNNTInfer(args.onnx_encoder, args.onnx_decoder, args.max_symbold_per_step)

    audio_filepath = resolve_audio_filepaths(args)

    # Evaluate Pytorch Model (CPU/GPU)
    actual_transcripts = nemo_model.transcribe(audio_filepath, batch_size=args.batch_size)[0]

    # Evaluate ONNX model (on CPU)
    all_hypothesis = []
    with tempfile.TemporaryDirectory() as tmpdir:
        # Write a throw-away manifest with placeholder duration and text.
        manifest_path = os.path.join(tmpdir, 'manifest.json')
        with open(manifest_path, 'w') as fp:
            fp.writelines(
                json.dumps({'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}) + '\n'
                for audio_file in audio_filepath
            )

        config = {'paths2audio_files': audio_filepath, 'batch_size': args.batch_size, 'temp_dir': tmpdir}

        # Push nemo model to CPU and zero the featurizer's dither and pad_to settings.
        nemo_model = nemo_model.to('cpu')
        featurizer = nemo_model.preprocessor.featurizer
        featurizer.dither = 0.0
        featurizer.pad_to = 0

        temporary_datalayer = nemo_model._setup_transcribe_dataloader(config)

        for test_batch in tqdm(temporary_datalayer, desc="ONNX Transcribing"):
            input_signal = test_batch[0]
            input_signal_length = test_batch[1]

            # Acoustic features
            processed_audio, processed_audio_len = nemo_model.preprocessor(
                input_signal=input_signal, length=input_signal_length
            )

            # RNNT Decoding loop
            hypotheses = decoding(audio_signal=processed_audio, length=processed_audio_len)

            # Process hypothesis (map char/subword token ids to text)
            hypotheses = nemo_model.decoding.decode_hypothesis(hypotheses)

            # Extract text from the hypothesis
            all_hypothesis.extend([hyp.text for hyp in hypotheses])

            # Drop the batch tensors before the next iteration.
            del processed_audio, processed_audio_len
            del test_batch

    if args.log:
        for pt_transcript, onnx_transcript in zip(actual_transcripts, all_hypothesis):
            print(f"Pytorch Transcripts : {pt_transcript}")
            print(f"ONNX Transcripts : {onnx_transcript}")
            print()

    # Measure error rate between onnx and pytorch transcripts
    pt_onnx_cer = word_error_rate(all_hypothesis, actual_transcripts, use_cer=True)
    assert pt_onnx_cer < args.threshold, "Threshold violation !"

    print("Character error rate between Pytorch and ONNX :", pt_onnx_cer)
def main(cfg: TranscriptionConfig):
    """Transcribe every matching audio file in a directory into a text file.

    The model is restored from ``cfg.model_path`` when given, otherwise
    loaded by ``cfg.pretrained_name``. Files matching
    ``<cfg.audio_dir>/*.<cfg.audio_type>`` are transcribed and one line per
    transcription is written to ``cfg.output_filename``.

    When ``transcribe`` returns a 2-tuple (the RNNT case), only its first
    element — the "best" hypotheses — is written, so the output always holds
    one line per audio file.

    Args:
        cfg: transcription config. Fields read here: ``model_path``,
            ``pretrained_name``, ``cuda``, ``rnnt_decoding``, ``audio_dir``,
            ``audio_type``, ``amp``, ``batch_size`` and ``output_filename``.
            ``cfg.cuda`` is filled in from ``torch.cuda.is_available()``
            when it is ``None``.

    Raises:
        ValueError: if both ``cfg.model_path`` and ``cfg.pretrained_name`` are ``None``.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None !")

    # setup gpu
    if cfg.cuda is None:
        cfg.cuda = torch.cuda.is_available()

    # Only a genuine int is used as a device index; a bool keeps index 0.
    if isinstance(cfg.cuda, int) and not isinstance(cfg.cuda, bool):
        device_id = int(cfg.cuda)
    else:
        device_id = 0

    # NOTE(review): an explicit `cfg.cuda == 0` is falsy and therefore selects the CPU here.
    device = torch.device(f'cuda:{device_id}' if cfg.cuda else 'cpu')

    # setup model
    if cfg.model_path is not None:
        # Read the config first to find the concrete class stored in the
        # .nemo file, then restore with that class.
        model_cfg = ASRModel.restore_from(restore_path=cfg.model_path, return_config=True)
        classpath = model_cfg.target  # original class path
        imported_class = model_utils.import_class_by_path(classpath)
        logging.info(f"Restoring model : {imported_class.__name__}")
        asr_model = imported_class.restore_from(restore_path=cfg.model_path, map_location=device)
    else:
        # restore model by name
        asr_model = ASRModel.from_pretrained(model_name=cfg.pretrained_name, map_location=device)

    trainer = pl.Trainer(gpus=int(cfg.cuda))
    asr_model.set_trainer(trainer)
    asr_model = asr_model.eval()

    # Setup decoding strategy
    if hasattr(asr_model, 'change_decoding_strategy'):
        asr_model.change_decoding_strategy(cfg.rnnt_decoding)

    # load paths to audio
    filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"*.{cfg.audio_type}")))
    logging.info(f"\nTranscribing {len(filepaths)} files...\n")

    # setup AMP (optional)
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # transcribe audio
    with autocast():
        with torch.no_grad():
            transcriptions = asr_model.transcribe(filepaths, batch_size=cfg.batch_size)
    logging.info(f"Finished transcribing {len(filepaths)} files !")

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    logging.info(f"Writing transcriptions into file: {cfg.output_filename}")
    with open(cfg.output_filename, 'w', encoding='utf-8') as f:
        for line in transcriptions:
            f.write(f"{line}\n")

    logging.info("Finished writing predictions !")
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    """Transcribe audio from a directory or a manifest into a JSON-lines file.

    The model is restored from ``cfg.model_path`` when given, otherwise
    loaded by ``cfg.pretrained_name``. Audio paths come from
    ``cfg.audio_dir`` (glob on ``cfg.audio_type``) or, failing that, from the
    ``audio_filepath`` field of each line in ``cfg.dataset_manifest``. Each
    output line carries a ``pred_text`` field; for manifest input the
    original manifest fields are kept.

    The manifest is read as UTF-8, matching the encoding used for the output.

    Args:
        cfg: transcription config. ``cfg.cuda`` is filled in (0 or -1) when
            ``None`` and ``cfg.output_filename`` is filled in with a default
            when ``None``.

    Returns:
        The (possibly updated) ``cfg``. It is returned early, without
        transcribing, when the output file already exists and
        ``cfg.overwrite_transcripts`` is false.

    Raises:
        ValueError: if no model source or no audio source is configured.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    # setup GPU
    if cfg.cuda is None:
        if torch.cuda.is_available():
            cfg.cuda = 0  # use 0th CUDA device
        else:
            cfg.cuda = -1  # use CPU

    device = torch.device(f'cuda:{cfg.cuda}' if cfg.cuda >= 0 else 'cpu')

    # setup model
    if cfg.model_path is not None:
        # Read the config first to find the concrete class stored in the
        # .nemo file, then restore with that class.
        model_cfg = ASRModel.restore_from(restore_path=cfg.model_path, return_config=True)
        classpath = model_cfg.target  # original class path
        imported_class = model_utils.import_class_by_path(classpath)
        logging.info(f"Restoring model : {imported_class.__name__}")
        asr_model = imported_class.restore_from(restore_path=cfg.model_path, map_location=device)
        model_name = os.path.splitext(os.path.basename(cfg.model_path))[0]
    else:
        # restore model by name
        asr_model = ASRModel.from_pretrained(model_name=cfg.pretrained_name, map_location=device)
        model_name = cfg.pretrained_name

    trainer = pl.Trainer(gpus=[cfg.cuda] if cfg.cuda >= 0 else 0)
    asr_model.set_trainer(trainer)
    asr_model = asr_model.eval()

    # Setup decoding strategy
    if hasattr(asr_model, 'change_decoding_strategy'):
        asr_model.change_decoding_strategy(cfg.rnnt_decoding)

    # get audio filenames
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"*.{cfg.audio_type}")))
    else:
        # get filenames from manifest
        with open(cfg.dataset_manifest, 'r', encoding='utf-8') as f:
            filepaths = [json.loads(line)['audio_filepath'] for line in f]
    logging.info(f"\nTranscribing {len(filepaths)} files...\n")

    # setup AMP (optional)
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # Compute output filename
    if cfg.output_filename is None:
        # create default output filename
        if cfg.audio_dir is not None:
            # Joining '.' and taking dirname strips any trailing separator.
            cfg.output_filename = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + '.json'
        else:
            cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{model_name}.json')

    # if transcripts should not be overwritten, and already exists, skip re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts` "
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    # transcribe audio
    with autocast():
        with torch.no_grad():
            transcriptions = asr_model.transcribe(filepaths, batch_size=cfg.batch_size)
    logging.info(f"Finished transcribing {len(filepaths)} files !")

    logging.info(f"Writing transcriptions into file: {cfg.output_filename}")

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    # write audio transcriptions
    with open(cfg.output_filename, 'w', encoding='utf-8') as f:
        if cfg.audio_dir is not None:
            for idx, text in enumerate(transcriptions):
                item = {'audio_filepath': filepaths[idx], 'pred_text': text}
                f.write(json.dumps(item) + "\n")
        else:
            # Re-read the manifest so every original field is carried over.
            with open(cfg.dataset_manifest, 'r', encoding='utf-8') as fr:
                for idx, line in enumerate(fr):
                    item = json.loads(line)
                    item['pred_text'] = transcriptions[idx]
                    f.write(json.dumps(item) + "\n")

    logging.info("Finished writing predictions !")
    return cfg
def main(cfg):
    """Train a single adapter on top of a frozen ASR model and optionally save it.

    Exactly one of ``cfg.model.pretrained_model`` and ``cfg.model.nemo_model``
    must be set. The model config is patched with
    ``update_model_config_to_support_adapter`` before the model is built.
    Training/validation data, the optimizer and spec augmentation are set up
    from ``cfg.model``; a new adapter is added, everything else is frozen and
    ``trainer.fit`` is run. When ``adapter_state_dict_name`` is given, the
    adapters are saved under the experiment log directory (or the current
    working directory), inside its ``checkpoints`` sub-directory when present.

    Args:
        cfg: Hydra/OmegaConf config with ``trainer``, ``model`` and an
            optional ``exp_manager`` section.

    Raises:
        ValueError: if neither or both model sources are set.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    has_pretrained = cfg.model.pretrained_model is not None
    has_nemo_file = cfg.model.nemo_model is not None
    if not has_pretrained and not has_nemo_file:
        raise ValueError("Either set `cfg.model.nemo_model` or `cfg.model.pretrained_model`")
    if has_pretrained and has_nemo_file:
        raise ValueError("Cannot set both `cfg.model.nemo_model` and `cfg.model.pretrained_model`. Select one only.")

    trainer = pl.Trainer(**cfg.trainer)
    exp_log_dir = exp_manager(trainer, cfg.get("exp_manager", None))

    # Both sources follow the same recipe: fetch the config, patch it for
    # adapter support, then build the model from the patched config.
    if cfg.model.pretrained_model is not None:
        load_model = ASRModel.from_pretrained
        model_source = cfg.model.pretrained_model
    else:
        load_model = ASRModel.restore_from
        model_source = cfg.model.nemo_model

    model_cfg = load_model(model_source, return_config=True)
    update_model_config_to_support_adapter(model_cfg, cfg)
    model = load_model(model_source, override_config_path=model_cfg, trainer=trainer)

    # Setup model for finetuning (train and validation only)
    cfg.model.train_ds = update_model_cfg(model.cfg.train_ds, cfg.model.train_ds)
    cfg.model.validation_ds = update_model_cfg(model.cfg.validation_ds, cfg.model.validation_ds)

    # Build the dataloaders
    model.setup_training_data(cfg.model.train_ds)
    model.setup_multiple_validation_data(cfg.model.validation_ds)

    # Setup optimizer
    model.setup_optimization(cfg.model.optim)

    # Setup spec augmentation: use the one from the config, or remove it entirely.
    if 'spec_augment' not in cfg.model:
        model.spec_augmentation = None
        del model.cfg.spec_augment
    else:
        model.spec_augmentation = model.from_config_dict(cfg.model.spec_augment)

    # Setup adapters: pop the bookkeeping keys so that what remains in
    # cfg.model.adapter is the adapter's own config.
    with open_dict(cfg.model.adapter):
        # The adapter name is mandatory for training.
        adapter_name = cfg.model.adapter.pop("adapter_name")
        adapter_module_name = cfg.model.adapter.pop("adapter_module_name", None)
        adapter_state_dict_name = cfg.model.adapter.pop("adapter_state_dict_name", None)

        # augment adapter name with module name, if not provided by user
        if adapter_module_name is not None and ':' not in adapter_name:
            adapter_name = f'{adapter_module_name}:{adapter_name}'

        # Extract the global adapter config, if provided
        adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None)
        if adapter_global_cfg is not None:
            add_global_adapter_cfg(model, adapter_global_cfg)

    model.add_adapter(adapter_name, cfg=cfg.model.adapter)
    assert model.is_adapter_available()

    # Disable all other adapters, enable just the current adapter.
    model.set_enabled_adapters(enabled=False)
    model.set_enabled_adapters(adapter_name, enabled=True)

    # Freeze all the weights of the model (not just encoder, everything)
    model.freeze()
    # Activate dropout() and other modules that depend on train mode.
    model = model.train()
    # Unfreeze just the adapter weights that were enabled above.
    model.unfreeze_enabled_adapters()

    # Update model config prior to training.
    # NOTE(review): self-assignment; presumably relies on the `cfg` setter — confirm.
    model.cfg = model.cfg

    # Finally, train model
    trainer.fit(model)

    # Save the adapter state dict
    if adapter_state_dict_name is not None:
        save_dir = os.getcwd() if exp_log_dir is None else exp_log_dir
        ckpt_dir = os.path.join(save_dir, "checkpoints")
        if os.path.exists(ckpt_dir):
            save_dir = ckpt_dir
        state_path = os.path.join(save_dir, adapter_state_dict_name)

        # Save the adapter modules in a separate file
        model.save_adapters(str(state_path))
def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
    """Transcribe audio from a directory or a manifest into a JSON-lines file.

    The model is restored from ``cfg.model_path`` when given, otherwise
    loaded by ``cfg.pretrained_name``. Audio paths come from
    ``cfg.audio_dir`` or from ``cfg.dataset_manifest``. When every manifest
    line has both ``offset`` and ``duration`` and the model is an
    ``EncDecCTCModel``, ``transcribe_partial_audio`` is used; otherwise the
    model's own ``transcribe`` is called on the full files.

    The manifest is read as UTF-8, matching the encoding used for the output.

    Args:
        cfg: transcription config (a dataclass instance is converted with
            ``OmegaConf.structured``). ``cfg.output_filename`` is filled in
            with a default when ``None``.

    Returns:
        The (possibly updated) ``cfg``; it is returned early, without
        transcribing, when the output file already exists and
        ``cfg.overwrite_transcripts`` is false. ``None`` is returned when the
        manifest file is empty.

    Raises:
        ValueError: if no model source or no audio source is configured.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    if is_dataclass(cfg):
        cfg = OmegaConf.structured(cfg)

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    # setup GPU
    if cfg.cuda is None:
        if torch.cuda.is_available():
            device = [0]  # use 0th CUDA device
            accelerator = 'gpu'
        else:
            device = 1
            accelerator = 'cpu'
    else:
        device = [cfg.cuda]
        accelerator = 'gpu'

    # `device[0]` is only evaluated on the GPU path, where `device` is a list.
    map_location = torch.device('cuda:{}'.format(device[0]) if accelerator == 'gpu' else 'cpu')

    # setup model
    if cfg.model_path is not None:
        # Read the config first to find the concrete class stored in the
        # .nemo file, then restore with that class.
        model_cfg = ASRModel.restore_from(restore_path=cfg.model_path, return_config=True)
        classpath = model_cfg.target  # original class path
        imported_class = model_utils.import_class_by_path(classpath)
        logging.info(f"Restoring model : {imported_class.__name__}")
        asr_model = imported_class.restore_from(restore_path=cfg.model_path, map_location=map_location)
        model_name = os.path.splitext(os.path.basename(cfg.model_path))[0]
    else:
        # restore model by name
        asr_model = ASRModel.from_pretrained(model_name=cfg.pretrained_name, map_location=map_location)
        model_name = cfg.pretrained_name

    trainer = pl.Trainer(devices=device, accelerator=accelerator)
    asr_model.set_trainer(trainer)
    asr_model = asr_model.eval()
    partial_audio = False

    # Setup decoding strategy
    if hasattr(asr_model, 'change_decoding_strategy'):
        asr_model.change_decoding_strategy(cfg.rnnt_decoding)

    # get audio filenames
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"*.{cfg.audio_type}")))
    else:
        # get filenames from manifest
        filepaths = []
        if os.stat(cfg.dataset_manifest).st_size == 0:
            logging.error(f"The input dataset_manifest {cfg.dataset_manifest} is empty. Exiting!")
            return None

        with open(cfg.dataset_manifest, 'r', encoding='utf-8') as f:
            has_two_fields = []
            for line in f:
                item = json.loads(line)
                has_two_fields.append("offset" in item and "duration" in item)
                filepaths.append(item['audio_filepath'])
        # Partial-audio mode requires offset and duration on every line.
        partial_audio = all(has_two_fields)

    logging.info(f"\nTranscribing {len(filepaths)} files...\n")

    # setup AMP (optional)
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # Compute output filename
    if cfg.output_filename is None:
        # create default output filename
        if cfg.audio_dir is not None:
            # Joining '.' and taking dirname strips any trailing separator.
            cfg.output_filename = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + '.json'
        else:
            cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{model_name}.json')

    # if transcripts should not be overwritten, and already exists, skip re-transcription step and return
    if not cfg.overwrite_transcripts and os.path.exists(cfg.output_filename):
        logging.info(
            f"Previous transcripts found at {cfg.output_filename}, and flag `overwrite_transcripts` "
            f"is {cfg.overwrite_transcripts}. Returning without re-transcribing text."
        )
        return cfg

    # Partial-audio transcription is only available for CTC models here.
    use_partial_audio = partial_audio and isinstance(asr_model, EncDecCTCModel)
    if partial_audio and not use_partial_audio:
        logging.warning(
            "RNNT models do not support transcribe partial audio for now. Transcribing full audio."
        )

    # transcribe audio
    with autocast():
        with torch.no_grad():
            if use_partial_audio:
                transcriptions = transcribe_partial_audio(
                    asr_model=asr_model,
                    path2manifest=cfg.dataset_manifest,
                    batch_size=cfg.batch_size,
                    num_workers=cfg.num_workers,
                )
            else:
                transcriptions = asr_model.transcribe(
                    paths2audio_files=filepaths, batch_size=cfg.batch_size, num_workers=cfg.num_workers,
                )

    logging.info(f"Finished transcribing {len(filepaths)} files !")

    logging.info(f"Writing transcriptions into file: {cfg.output_filename}")

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    # write audio transcriptions
    with open(cfg.output_filename, 'w', encoding='utf-8') as f:
        if cfg.audio_dir is not None:
            for idx, text in enumerate(transcriptions):
                item = {'audio_filepath': filepaths[idx], 'pred_text': text}
                f.write(json.dumps(item) + "\n")
        else:
            # Re-read the manifest so every original field is carried over.
            with open(cfg.dataset_manifest, 'r', encoding='utf-8') as fr:
                for idx, line in enumerate(fr):
                    item = json.loads(line)
                    item['pred_text'] = transcriptions[idx]
                    f.write(json.dumps(item) + "\n")

    logging.info("Finished writing predictions !")
    return cfg
def main(cfg):
    """Train a single adapter on top of a frozen ASR model.

    Exactly one of ``cfg.model.pretrained_model`` and ``cfg.model.nemo_model``
    must be set. The model config is patched with
    ``update_encoder_config_to_support_adapter`` before the model is built.
    Training/validation data and the optimizer are set up from ``cfg.model``,
    a new adapter is added and enabled, every other weight is frozen and
    ``trainer.fit`` is run.

    Args:
        cfg: Hydra/OmegaConf config with ``trainer``, ``model`` and an
            optional ``exp_manager`` section.

    Raises:
        ValueError: if neither or both model sources are set.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    has_pretrained = cfg.model.pretrained_model is not None
    has_nemo_file = cfg.model.nemo_model is not None
    if not has_pretrained and not has_nemo_file:
        raise ValueError(
            "Either set `cfg.model.nemo_model` or `cfg.model.pretrained_model`"
        )
    if has_pretrained and has_nemo_file:
        raise ValueError(
            "Cannot set `cfg.model.nemo_model` and `cfg.model.pretrained_model`. Select one only."
        )

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    # Both sources follow the same recipe: fetch the config, patch it for
    # adapter support, then build the model from the patched config.
    if cfg.model.pretrained_model is not None:
        load_model = ASRModel.from_pretrained
        model_source = cfg.model.pretrained_model
    else:
        load_model = ASRModel.restore_from
        model_source = cfg.model.nemo_model

    model_cfg = load_model(model_source, return_config=True)
    update_encoder_config_to_support_adapter(model_cfg)
    model = load_model(model_source, override_config_path=model_cfg, trainer=trainer)

    # Setup model for finetuning (train and validation only)
    cfg.model.train_ds = update_model_cfg(model.cfg.train_ds, cfg.model.train_ds)
    cfg.model.validation_ds = update_model_cfg(model.cfg.validation_ds, cfg.model.validation_ds)

    # Build the dataloaders
    model.setup_training_data(cfg.model.train_ds)
    model.setup_multiple_validation_data(cfg.model.validation_ds)

    # Setup optimizer: merge the user's optimizer config into the model's first.
    cfg.model.optim = update_model_cfg(model.cfg.optim, cfg.model.optim)
    model.setup_optimization(cfg.model.optim)

    # Setup adapters: pop the bookkeeping keys so that what remains in
    # cfg.model.adapter is the adapter's own config.
    with open_dict(cfg.model.adapter):
        # The adapter name is mandatory for training.
        adapter_name = cfg.model.adapter.pop("adapter_name")

        # Extract the global adapter config, if provided
        adapter_global_cfg = cfg.model.adapter.pop(model.adapter_global_cfg_key, None)
        if adapter_global_cfg is not None:
            add_global_adapter_cfg(model, adapter_global_cfg)

    model.add_adapter(adapter_name, cfg=cfg.model.adapter)
    assert model.is_adapter_available()

    # Disable all other adapters, enable just the current adapter.
    model.set_enabled_adapters(enabled=False)
    model.set_enabled_adapters(adapter_name, enabled=True)

    # Freeze all the weights of the model (not just encoder, everything)
    model.freeze()
    # Activate dropout() and other modules that depend on train mode.
    model = model.train()
    # Unfreeze just the adapter weights that were enabled above.
    model.unfreeze_enabled_adapters()

    # Finally, train model
    trainer.fit(model)
def main(cfg: TranscriptionConfig):
    """Transcribe audio from a directory or a manifest into a JSON-lines file.

    The model is restored from ``cfg.model_path`` when given, otherwise
    loaded by ``cfg.pretrained_name``. Audio paths come from
    ``cfg.audio_dir`` (glob on ``cfg.audio_type``) or, failing that, from the
    ``audio_filepath`` field of each line in ``cfg.dataset_manifest``. Each
    output line carries a ``pred_text`` field; for manifest input the
    original manifest fields are kept.

    When ``transcribe`` returns a 2-tuple (the RNNT case), only its first
    element — the "best" hypotheses — is used, so predictions line up
    one-to-one with the audio files. The manifest is read as UTF-8, matching
    the encoding used for the output.

    Args:
        cfg: transcription config. ``cfg.cuda`` is filled in from
            ``torch.cuda.is_available()`` when ``None`` and
            ``cfg.output_filename`` is filled in with a default when ``None``.

    Raises:
        ValueError: if no model source or no audio source is configured.
    """
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    if cfg.model_path is None and cfg.pretrained_name is None:
        raise ValueError("Both cfg.model_path and cfg.pretrained_name cannot be None!")
    if cfg.audio_dir is None and cfg.dataset_manifest is None:
        raise ValueError("Both cfg.audio_dir and cfg.dataset_manifest cannot be None!")

    # setup GPU
    if cfg.cuda is None:
        cfg.cuda = torch.cuda.is_available()

    # Only a genuine int is used as a device index; a bool keeps index 0.
    if isinstance(cfg.cuda, int) and not isinstance(cfg.cuda, bool):
        device_id = int(cfg.cuda)
    else:
        device_id = 0

    # NOTE(review): an explicit `cfg.cuda == 0` is falsy and therefore selects the CPU here.
    device = torch.device(f'cuda:{device_id}' if cfg.cuda else 'cpu')

    # setup model
    if cfg.model_path is not None:
        # Read the config first to find the concrete class stored in the
        # .nemo file, then restore with that class.
        model_cfg = ASRModel.restore_from(restore_path=cfg.model_path, return_config=True)
        classpath = model_cfg.target  # original class path
        imported_class = model_utils.import_class_by_path(classpath)
        logging.info(f"Restoring model : {imported_class.__name__}")
        asr_model = imported_class.restore_from(restore_path=cfg.model_path, map_location=device)
        model_name = os.path.splitext(os.path.basename(cfg.model_path))[0]
    else:
        # restore model by name
        asr_model = ASRModel.from_pretrained(model_name=cfg.pretrained_name, map_location=device)
        model_name = cfg.pretrained_name

    trainer = pl.Trainer(gpus=int(cfg.cuda))
    asr_model.set_trainer(trainer)
    asr_model = asr_model.eval()

    # Setup decoding strategy
    if hasattr(asr_model, 'change_decoding_strategy'):
        asr_model.change_decoding_strategy(cfg.rnnt_decoding)

    # get audio filenames
    if cfg.audio_dir is not None:
        filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"*.{cfg.audio_type}")))
    else:
        # get filenames from manifest
        with open(cfg.dataset_manifest, 'r', encoding='utf-8') as f:
            filepaths = [json.loads(line)['audio_filepath'] for line in f]
    logging.info(f"\nTranscribing {len(filepaths)} files...\n")

    # setup AMP (optional)
    if cfg.amp and torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and hasattr(torch.cuda.amp, 'autocast'):
        logging.info("AMP enabled!\n")
        autocast = torch.cuda.amp.autocast
    else:

        @contextlib.contextmanager
        def autocast():
            yield

    # transcribe audio
    with autocast():
        with torch.no_grad():
            transcriptions = asr_model.transcribe(filepaths, batch_size=cfg.batch_size)
    logging.info(f"Finished transcribing {len(filepaths)} files !")

    # if transcriptions form a tuple (from RNNT), extract just "best" hypothesis
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    if cfg.output_filename is None:
        # create default output filename
        if cfg.audio_dir is not None:
            # Joining '.' and taking dirname strips any trailing separator.
            cfg.output_filename = os.path.dirname(os.path.join(cfg.audio_dir, '.')) + '.json'
        else:
            cfg.output_filename = cfg.dataset_manifest.replace('.json', f'_{model_name}.json')

    logging.info(f"Writing transcriptions into file: {cfg.output_filename}")

    with open(cfg.output_filename, 'w', encoding='utf-8') as f:
        if cfg.audio_dir is not None:
            for idx, text in enumerate(transcriptions):
                item = {'audio_filepath': filepaths[idx], 'pred_text': text}
                f.write(json.dumps(item) + "\n")
        else:
            # Re-read the manifest so every original field is carried over.
            with open(cfg.dataset_manifest, 'r', encoding='utf-8') as fr:
                for idx, line in enumerate(fr):
                    item = json.loads(line)
                    item['pred_text'] = transcriptions[idx]
                    f.write(json.dumps(item) + "\n")

    logging.info("Finished writing predictions !")
def main(cfg: ParallelTranscriptionConfig):
    """Run transcription on this rank, then merge and score all outputs on rank zero.

    The model named by ``cfg.model`` is loaded on CPU and passed to
    ``trainer.predict``. An ``ASRPredictionWriter`` callback is attached to
    the trainer with a per-rank output file ``predictions_<rank>.json``
    under ``cfg.output_path``. After an optional distributed barrier, global
    rank zero re-serializes every per-rank line into ``predictions_all.json``
    and logs the WER (or CER when ``cfg.use_cer`` is set) over the
    ``pred_text``/``text`` pairs.

    Args:
        cfg: run configuration. Fields read here: ``model``, ``trainer``,
            ``predict_ds``, ``output_path``, ``return_predictions`` and
            ``use_cer``. ``predict_ds`` is replaced by the result of
            ``match_train_config``.
    """
    model_source = cfg.model
    if model_source.endswith(".nemo"):
        logging.info("Attempting to initialize from .nemo file")
        model = ASRModel.restore_from(restore_path=model_source, map_location="cpu")
    elif model_source.endswith(".ckpt"):
        logging.info("Attempting to initialize from .ckpt file")
        model = ASRModel.load_from_checkpoint(checkpoint_path=model_source, map_location="cpu")
    else:
        logging.info(
            "Attempting to initialize from a pretrained model as the model name does not have the extension of .nemo or .ckpt"
        )
        model = ASRModel.from_pretrained(model_name=model_source, map_location="cpu")

    trainer = ptl.Trainer(**cfg.trainer)

    # Align the prediction dataset config with the one the model was trained with.
    cfg.predict_ds.return_sample_id = True
    cfg.predict_ds = match_train_config(predict_ds=cfg.predict_ds, train_ds=model.cfg.train_ds)
    data_loader = model._setup_dataloader_from_config(cfg.predict_ds)

    os.makedirs(cfg.output_path, exist_ok=True)

    # trainer.global_rank is not valid before predict() is called, so the
    # global rank is derived from the node rank and the LOCAL_RANK variable.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    global_rank = trainer.node_rank * trainer.num_gpus + local_rank

    output_file = os.path.join(cfg.output_path, f"predictions_{global_rank}.json")
    predictor_writer = ASRPredictionWriter(dataset=data_loader.dataset, output_file=output_file)
    trainer.callbacks.extend([predictor_writer])

    predictions = trainer.predict(model=model, dataloaders=data_loader, return_predictions=cfg.return_predictions)
    if predictions is not None:
        # Flatten the per-batch results into one list.
        predictions = list(itertools.chain.from_iterable(predictions))

    samples_num = predictor_writer.close_output_file()
    logging.info(
        f"Prediction on rank {global_rank} is done for {samples_num} samples and results are stored in {output_file}."
    )

    # Wait until every rank has closed its file before merging.
    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    samples_num = 0
    pred_text_list = []
    text_list = []
    if is_global_rank_zero():
        output_file = os.path.join(cfg.output_path, "predictions_all.json")
        logging.info(f"Prediction files are being aggregated in {output_file}.")
        with open(output_file, 'w') as merged:
            for rank in range(trainer.world_size):
                shard_path = os.path.join(cfg.output_path, f"predictions_{rank}.json")
                with open(shard_path, 'r') as shard:
                    for raw_line in shard.readlines():
                        item = json.loads(raw_line)
                        pred_text_list.append(item["pred_text"])
                        text_list.append(item["text"])
                        merged.write(json.dumps(item) + "\n")
                        samples_num += 1

        wer_cer = word_error_rate(hypotheses=pred_text_list, references=text_list, use_cer=cfg.use_cer)
        logging.info(
            f"Prediction is done for {samples_num} samples in total on all workers and results are aggregated in {output_file}."
        )
        metric_name = "CER" if cfg.use_cer else "WER"
        logging.info("{} for all predictions is {:.4f}.".format(metric_name, wer_cer))
with open(out_file, "w") as f: f.write(sentences) if __name__ == "__main__": args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) text_files = [] if args.in_text: if args.model is None: raise ValueError( f"ASR model must be provided to extract vocabulary for text processing" ) elif os.path.exists(args.model): model_cfg = ASRModel.restore_from(restore_path=args.model, return_config=True) classpath = model_cfg.target # original class path imported_class = model_utils.import_class_by_path( classpath) # type: ASRModel print(f"Restoring model : {imported_class.__name__}") asr_model = imported_class.restore_from( restore_path=args.model) # type: ASRModel model_name = os.path.splitext(os.path.basename(args.model))[0] else: # restore model by name asr_model = ASRModel.from_pretrained( model_name=args.model) # type: ASRModel model_name = args.model vocabulary = asr_model.cfg.decoder.vocabulary