def __init__(
    self,
    model: EncDecCTCModel,
    sample_rate: int,
    batch_size: int = 1,
    device: str = "cuda",
) -> None:
    """Wrap a NeMo CTC model together with an online-audio DataLoader.

    :param model: pretrained ``EncDecCTCModel``; put into eval mode here and
        moved to ``device``.
    :param sample_rate: sample rate the online audio dataset expects.
    :param batch_size: DataLoader batch size (default 1, i.e. per-utterance).
    :param device: torch device string, e.g. ``"cuda"`` or ``"cpu"``.
    """
    # Python 3 zero-argument super() — same behavior as the old
    # super(ASRAudioEncoderDecoder, self) two-argument form.
    super().__init__()
    self.online_audio = ASROnlineAudioData(sample_rate)
    self.data_loader = DataLoader(
        dataset=self.online_audio,
        batch_size=batch_size,
        collate_fn=self.online_audio.collate_fn,
    )
    model.eval()  # inference only: freeze dropout / batch-norm statistics
    self.device = torch.device(device)
    self.model = model.to(self.device)
def generate_ref_hyps(asr_model: EncDecCTCModel, search: str, arpa: str):
    """Yield ``(reference, hypothesis)`` transcript pairs over the model's test set.

    :param asr_model: NeMo CTC model whose ``test_dataloader()`` is iterated.
    :param search: decoding strategy: ``"greedy"``, ``"kenlm"`` or ``"beamsearch"``.
    :param arpa: path to an ARPA language-model file (used by the two
        beam-search strategies; ignored for greedy decoding).
    :raises ValueError: if ``search`` names an unsupported strategy.
    """
    if search not in ("greedy", "kenlm", "beamsearch"):
        # Fail fast: the original code would only hit a confusing NameError
        # on `beamsearcher` deep inside the decoding loop.
        raise ValueError(f"unknown search strategy: {search!r}")

    if can_gpu:
        asr_model = asr_model.cuda()
        print("USING GPU!")
    asr_model.eval()

    vocabulary = asr_model.decoder.vocabulary
    labels_map = dict(enumerate(vocabulary))
    wer = WER(vocabulary=vocabulary)

    beamsearcher = None
    if search in ("kenlm", "beamsearch"):
        arpa_file = prepare_arpa_file(arpa)
        # Only kenlm actually scores with the LM; plain beamsearch runs LM-free.
        lm_path = arpa_file if search == "kenlm" else None
        beamsearcher = nemo_asr.modules.BeamSearchDecoderWithLM(
            vocab=list(vocabulary),
            beam_width=16,
            alpha=2,
            beta=1.5,
            lm_path=lm_path,
            # os.cpu_count() is documented to possibly return None; the
            # original max(os.cpu_count(), 1) would then raise TypeError.
            num_cpus=max(os.cpu_count() or 1, 1),
            input_tensor=True,
        )

    for batch in asr_model.test_dataloader():
        # TODO(tilo): test_loader should return dict or some typed object not tuple of tensors!!
        if can_gpu:
            batch = [x.cuda() for x in batch]
        input_signal, inpsig_len, transcript, transc_len = batch
        with autocast():
            log_probs, encoded_len, greedy_predictions = asr_model(
                input_signal=input_signal, input_signal_length=inpsig_len
            )
        if search == "greedy":
            decoded = wer.ctc_decoder_predictions_tensor(greedy_predictions)
        else:
            decoded = beamsearch_forward(
                beamsearcher, log_probs=log_probs, log_probs_length=encoded_len
            )
        for i, hyp in enumerate(decoded):
            # Trim label padding with transc_len before mapping ids back to chars.
            reference = "".join(
                labels_map[c]
                for c in transcript[i].cpu().detach().numpy()[: transc_len[i]]
            )
            yield reference, hyp