def __init__(
    self,
    train: bool,
    token_type: str = None,
    token_list: Union[Path, str, Iterable[str]] = None,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    speech_name: str = "speech",
    text_name: str = "text",
):
    super().__init__(train)
    self.train = train
    self.speech_name = speech_name
    self.text_name = text_name

    if token_type is not None:
        if token_list is None:
            raise ValueError("token_list is required if token_type is not None")
        self.tokenizer = build_tokenizer(
            token_type=token_type,
            bpemodel=bpemodel,
            delimiter=delimiter,
            space_symbol=space_symbol,
            non_linguistic_symbols=non_linguistic_symbols,
        )
        self.token_id_converter = TokenIDConverter(
            token_list=token_list,
            unk_symbol=unk_symbol,
        )
    else:
        self.tokenizer = None
        self.token_id_converter = None
def test_from_file(tmp_path: Path):
    with (tmp_path / "tokens.txt").open("w") as f:
        f.write("a\n")
        f.write("b\n")
        f.write("c\n")
        f.write("<unk>\n")
    converter = TokenIDConverter(tmp_path / "tokens.txt")
    assert converter.tokens2ids("abc") == [0, 1, 2]
def __init__(
    self,
    asr_model: MaskCTCModel,
    n_iterations: int,
    threshold_probability: float,
):
    """Initialize Mask-CTC inference"""
    super().__init__()
    self.ctc = asr_model.ctc
    self.mlm = asr_model.decoder
    self.mask_token = asr_model.mask_token
    self.n_iterations = n_iterations
    self.threshold_probability = threshold_probability
    self.converter = TokenIDConverter(token_list=asr_model.token_list)
def build_tokenizer(self):
    """Create a tokenizer object that converts integer tokens back to the
    corresponding character vocabulary.

    If the model has a BPE tokenization model, it is used; otherwise only
    the character list from the configuration file is used.
    """
    token_type = self.model_config['token_type']
    if token_type == 'bpe':
        bpemodel = self.model_config['bpemodel']
        self.tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
    else:
        self.tokenizer = build_tokenizer(token_type=token_type)

    self.converter = TokenIDConverter(token_list=self.model.token_list)
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 8,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
    output_beam_size: int = 8,
):
    assert check_argument_types()

    # 1. Build ASR model
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    token_list = asr_model.token_list
    self.decode_graph = k2.arc_sort(
        build_ctc_topo(list(range(len(token_list))))
    ).to(device)

    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")
    logging.info(f"Running on : {device}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.device = device
    self.dtype = dtype
    self.output_beam_size = output_beam_size
class CommonPreprocessor(AbsPreprocessor):
    def __init__(
        self,
        train: bool,
        token_type: str = None,
        token_list: Union[Path, str, Iterable[str]] = None,
        bpemodel: Union[Path, str, Iterable[str]] = None,
        unk_symbol: str = "<unk>",
        space_symbol: str = "<space>",
        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
        delimiter: str = None,
        speech_name: str = "speech",
        text_name: str = "text",
    ):
        super().__init__(train)
        self.train = train
        self.speech_name = speech_name
        self.text_name = text_name

        if token_type is not None:
            if token_list is None:
                raise ValueError("token_list is required if token_type is not None")
            self.tokenizer = build_tokenizer(
                token_type=token_type,
                bpemodel=bpemodel,
                delimiter=delimiter,
                space_symbol=space_symbol,
                non_linguistic_symbols=non_linguistic_symbols,
            )
            self.token_id_converter = TokenIDConverter(
                token_list=token_list,
                unk_symbol=unk_symbol,
            )
        else:
            self.tokenizer = None
            self.token_id_converter = None

    def __call__(
        self, uid: str, data: Dict[str, Union[str, np.ndarray]]
    ) -> Dict[str, np.ndarray]:
        assert check_argument_types()

        if self.speech_name in data:
            # Nothing now: candidates:
            # - STFT
            # - Fbank
            # - CMVN
            # - Data augmentation
            pass

        if self.text_name in data and self.tokenizer is not None:
            text = data[self.text_name]
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        assert check_return_type(data)
        return data
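# Hypothetical usage sketch for CommonPreprocessor above (the token list and
# the utterance id are invented for illustration; this is not from the original
# source). It shows text2tokens/tokens2ids turning a raw transcript into
# int64 token ids.
preprocessor = CommonPreprocessor(
    train=False,
    token_type="char",
    token_list=["a", "b", "c", "<space>", "<unk>"],
)
processed = preprocessor("utt1", {"text": "abc"})
assert (processed["text"] == np.array([0, 1, 2], dtype=np.int64)).all()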
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    batch_size: int = 1,
    dtype: str = "float32",
    maskctc_n_iterations: int = 10,
    maskctc_threshold_probability: float = 0.99,
):
    assert check_argument_types()

    # 1. Build ASR model
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    token_list = asr_model.token_list

    s2t = MaskCTCInference(
        asr_model=asr_model,
        n_iterations=maskctc_n_iterations,
        threshold_probability=maskctc_threshold_probability,
    )
    s2t.to(device=device, dtype=getattr(torch, dtype)).eval()

    # 2. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.s2t = s2t
    self.converter = converter
    self.tokenizer = tokenizer
    self.device = device
    self.dtype = dtype
def test_tokens2ids():
    converter = TokenIDConverter(["a", "b", "c", "<unk>"])
    assert converter.tokens2ids("abc") == [0, 1, 2]
def test_input_2dim_array():
    converter = TokenIDConverter(["a", "b", "c", "<unk>"])
    with pytest.raises(ValueError):
        converter.ids2tokens(np.random.randn(2, 2))
def test_no_unk():
    with pytest.raises(RuntimeError):
        TokenIDConverter(["a", "b", "c"])
def test_duplicated():
    with pytest.raises(RuntimeError):
        TokenIDConverter(["a", "a", "c"])
class ASR(object):
    def __init__(
        self,
        zip_model_file: Union[Path, str],
    ) -> None:
        self.zip_model_file = abspath(zip_model_file)
        self.device = 'cpu'
        self.model = None
        self.beam_search = None
        self.tokenizer = None
        self.converter = None
        self.global_cmvn = None
        self.extract_zip_model_file(self.zip_model_file)

    def extract_zip_model_file(self, zip_model_file: str) -> Dict[str, Any]:
        """Extract the data from a zip containing the model state file and
        the configuration.

        Args:
            zip_model_file (str): ZipFile of the model produced by the
                training scripts

        Raises:
            ValueError: if the file is not a valid zipfile
            FileNotFoundError: if the zip does not contain the required files

        Returns:
            Dict[str, Any]: dictionary of the .yaml file used during training,
            needed to load the model correctly
        """
        print("Unzipping model")
        if not zipfile.is_zipfile(zip_model_file):
            raise ValueError(f"File {zip_model_file} is not a zipfile")
        else:
            zipfile.ZipFile(zip_model_file).extractall(dirname(zip_model_file))

        check = ['exp', 'meta.yaml']
        # The original check `all([x for x in check])` was always true since
        # non-empty strings are truthy; verify the extracted entries exist.
        if not all(exists(join(dirname(zip_model_file), x)) for x in check):
            raise FileNotFoundError

        print("Load yaml file")
        with open('meta.yaml') as f:
            meta = yaml.load(f, Loader=yaml.FullLoader)

        model_stats_file = meta['files']['asr_model_file']
        asr_model_config_file = meta['yaml_files']['asr_train_config']

        self.model_config = {}
        with open(asr_model_config_file) as f:
            self.model_config = yaml.load(f, Loader=yaml.FullLoader)

        try:
            self.global_cmvn = self.model_config['normalize_conf']['stats_file']
        except KeyError:
            self.global_cmvn = None

        print(f'Loading model config from {asr_model_config_file}')
        print(f'Loading model state from {model_stats_file}')

        # Build model
        print('Building model')
        self.model, _ = ASRTask.build_model_from_file(
            asr_model_config_file, model_stats_file, self.device
        )
        self.model.to(dtype=getattr(torch, 'float32')).eval()

        # print("Loading extra modules")
        self.build_beam_search()
        self.build_tokenizer()

    def build_beam_search(self, ctc_weight: float = 0.4, beam_size: int = 1):
        """Build the beam_search decoding object.

        This object decodes the embedding vector output by the encoder part
        of the model, passing it through the network's decoders, which are
        the CTC module and a Transformer or RNN.

        Since Loss = (1-λ)*DecoderLoss + λ*CTCLoss, setting ctc_weight=1
        means only the CTC module is used for decoding.

        Args:
            ctc_weight (float, optional): weight given to the network's CTC
                module. Defaults to 0.4.
            beam_size (int, optional): beam width used during decoding.
                Defaults to 1.
        """
        scorers = {}
        ctc = CTCPrefixScorer(ctc=self.model.ctc, eos=self.model.eos)
        token_list = self.model.token_list
        scorers.update(
            decoder=self.model.decoder,
            ctc=ctc,
            length_bonus=LengthBonus(len(token_list)),
        )

        # Weights for each part of the decoding.
        # The lm entry refers to language models, which are not used here
        # but are required by the object.
        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=1.0,
            length_bonus=0.0,
        )

        # Create the beam_search object
        self.beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=self.model.sos,
            eos=self.model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )
        self.beam_search.to(device=self.device,
                            dtype=getattr(torch, 'float32')).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=self.device,
                          dtype=getattr(torch, 'float32')).eval()

    def build_tokenizer(self):
        """Create a tokenizer object that converts integer tokens back to the
        corresponding character vocabulary.

        If the model has a BPE tokenization model, it is used; otherwise only
        the character list from the configuration file is used.
        """
        token_type = self.model_config['token_type']
        if token_type == 'bpe':
            bpemodel = self.model_config['bpemodel']
            self.tokenizer = build_tokenizer(token_type=token_type,
                                             bpemodel=bpemodel)
        else:
            self.tokenizer = build_tokenizer(token_type=token_type)

        self.converter = TokenIDConverter(token_list=self.model.token_list)

    def get_layers(self) -> Dict[str, Dict[str, torch.Size]]:
        """Return the named layers and their shapes for every module of the
        network.

        The modules are:
            Encoder: RNN, VGGRNN, TransformerEncoder
            Decoder: RNN, TransformerDecoder
            CTC

        Returns:
            Dict[str, Dict[str, torch.Size]]: dictionary of each module with
            its layers and shapes
        """
        r = {}
        r['frontend'] = {
            x: self.model.frontend.state_dict()[x].shape
            for x in self.model.frontend.state_dict().keys()
        }
        r['specaug'] = {
            x: self.model.specaug.state_dict()[x].shape
            for x in self.model.specaug.state_dict().keys()
        }
        r['normalize'] = {
            x: self.model.normalize.state_dict()[x].shape
            for x in self.model.normalize.state_dict().keys()
        }
        r['encoder'] = {
            x: self.model.encoder.state_dict()[x].shape
            for x in self.model.encoder.state_dict().keys()
        }
        r['decoder'] = {
            x: self.model.decoder.state_dict()[x].shape
            for x in self.model.decoder.state_dict().keys()
        }
        r['ctc'] = {
            x: self.model.ctc.state_dict()[x].shape
            for x in self.model.ctc.state_dict().keys()
        }
        return r

    def frontend(self, audiofile: Union[Path, str, bytes],
                 normalize: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the model's frontend, turning the audio samples into log mel
        spectrogram features.

        Args:
            audiofile (Union[Path, str]): audio file

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: features, feature vector length
        """
        if isinstance(audiofile, str):
            audio_samples, rate = librosa.load(audiofile, sr=16000)
        elif isinstance(audiofile, bytes):
            audio_samples, rate = librosa.core.load(io.BytesIO(audiofile),
                                                    sr=16000)
        else:
            raise ValueError("Failed to load audio file")

        if isinstance(audio_samples, np.ndarray):
            audio_samples = torch.tensor(audio_samples)
        audio_samples = audio_samples.unsqueeze(0).to(getattr(torch, 'float32'))
        lengths = audio_samples.new_full([1],
                                         dtype=torch.long,
                                         fill_value=audio_samples.size(1))
        features, features_length = self.model.frontend(audio_samples, lengths)
        if normalize:
            features, features_length = self.model.normalize(
                features, features_length)
        return features, features_length

    def specaug(
        self, features: torch.Tensor, features_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the specaug module, the data-augmentation stage.

        Useful for visualization only; it is not used at inference time,
        only during training.

        Args:
            features (torch.Tensor): features
            features_length (torch.Tensor): feature vector length

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: features with time, frequency
            and distortion masks, and the vector lengths
        """
        return self.model.specaug(features, features_length)

    def __del__(self) -> None:
        """Remove the temporary files."""
        for f in ['exp', 'meta.yaml']:
            print(f"Removing {f}")
            ff = join(dirname(self.zip_model_file), f)
            if exists(ff):
                if isdir(ff):
                    shutil.rmtree(ff)
                elif isfile(ff):
                    os.remove(ff)
                else:
                    raise ValueError("Error removing temporary files")

    @torch.no_grad()
    def recognize(self, audiofile: Union[Path, str, bytes]) -> Result:
        result = Result()

        if isinstance(audiofile, str):
            audio_samples, rate = librosa.load(audiofile, sr=16000)
        elif isinstance(audiofile, bytes):
            audio_samples, rate = librosa.core.load(io.BytesIO(audiofile),
                                                    sr=16000)
        else:
            raise ValueError("Failed to load audio file")

        result.audio_samples = copy.deepcopy(audio_samples)

        # The model input is a torch.Tensor
        if isinstance(audio_samples, np.ndarray):
            audio_samples = torch.tensor(audio_samples)
        audio_samples = audio_samples.unsqueeze(0).to(getattr(torch, 'float32'))
        lengths = audio_samples.new_full([1],
                                         dtype=torch.long,
                                         fill_value=audio_samples.size(1))

        batch = {"speech": audio_samples, "speech_lengths": lengths}
        batch = to_device(batch, device=self.device)

        # Model encoder
        enc, _ = self.model.encode(**batch)

        # Model decoder
        nbest_hyps = self.beam_search(x=enc[0])

        # Keep only the best hypothesis
        best_hyps = nbest_hyps[0]

        # Convert the training token ids to text
        token_int = best_hyps.yseq[1:-1].tolist()
        token_int = list(filter(lambda x: x != 0, token_int))
        token = self.converter.ids2tokens(token_int)
        text = self.tokenizer.tokens2text(token)

        # Fill the result object
        result.text = text
        result.encoded_vector = enc[0]  # [0] removes the batch dimension

        # Compute all attention matrices
        text_tensor = torch.Tensor(token_int).unsqueeze(0).to(
            getattr(torch, 'long'))
        batch["text"] = text_tensor
        batch["text_lengths"] = text_tensor.new_full(
            [1], dtype=torch.long, fill_value=text_tensor.size(1))
        result.attention_weights = calculate_all_attentions(self.model, batch)
        result.tokens_txt = token

        # CTC posteriors
        logp = self.model.ctc.log_softmax(enc.unsqueeze(0))[0]
        result.ctc_posteriors = logp.exp_().numpy()
        result.tokens_int = best_hyps.yseq
        result.mel_features, _ = self.frontend(audiofile, normalize=False)
        return result

    def __call__(self, input: Union[Path, str, bytes]) -> Result:
        return self.recognize(input)
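# Hypothetical usage sketch for the ASR wrapper above (the file names are
# placeholders, not from the original source).
asr = ASR("model.zip")          # unzips the archive and builds the model,
                                # beam search, tokenizer and converter
result = asr("utterance.wav")   # same as asr.recognize("utterance.wav")
print(result.text)              # best-hypothesis transcription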
def test_get_num_vocabulary_size():
    converter = TokenIDConverter(["a", "b", "c", "<unk>"])
    assert converter.get_num_vocabulary_size() == 4
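# Sketch of the <unk> fallback (an assumption based on the tests above, not a
# test from the original source): tokens absent from the list are mapped to
# the id of unk_symbol.
converter = TokenIDConverter(["a", "b", "c", "<unk>"])
assert converter.tokens2ids(["a", "x"]) == [0, 3]  # "x" falls back to <unk> (id 3)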
def __init__(
    self,
    train: bool,
    token_type: str = None,
    token_list: Union[Path, str, Iterable[str]] = None,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    text_cleaner: Collection[str] = None,
    g2p_type: str = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    rir_scp: str = None,
    rir_apply_prob: float = 1.0,
    noise_scp: str = None,
    noise_apply_prob: float = 1.0,
    noise_db_range: str = "3_10",
    speech_volume_normalize: float = None,
    speech_name: str = "speech",
    text_name: str = "text",
):
    super().__init__(train)
    self.train = train
    self.speech_name = speech_name
    self.text_name = text_name
    self.speech_volume_normalize = speech_volume_normalize
    self.rir_apply_prob = rir_apply_prob
    self.noise_apply_prob = noise_apply_prob

    if token_type is not None:
        if token_list is None:
            raise ValueError("token_list is required if token_type is not None")
        self.text_cleaner = TextCleaner(text_cleaner)
        self.tokenizer = build_tokenizer(
            token_type=token_type,
            bpemodel=bpemodel,
            delimiter=delimiter,
            space_symbol=space_symbol,
            non_linguistic_symbols=non_linguistic_symbols,
            g2p_type=g2p_type,
        )
        self.token_id_converter = TokenIDConverter(
            token_list=token_list,
            unk_symbol=unk_symbol,
        )
    else:
        self.text_cleaner = None
        self.tokenizer = None
        self.token_id_converter = None

    if train and rir_scp is not None:
        self.rirs = []
        with open(rir_scp, "r", encoding="utf-8") as f:
            for line in f:
                sps = line.strip().split(None, 1)
                if len(sps) == 1:
                    self.rirs.append(sps[0])
                else:
                    self.rirs.append(sps[1])
    else:
        self.rirs = None

    if train and noise_scp is not None:
        self.noises = []
        with open(noise_scp, "r", encoding="utf-8") as f:
            for line in f:
                sps = line.strip().split(None, 1)
                if len(sps) == 1:
                    self.noises.append(sps[0])
                else:
                    self.noises.append(sps[1])
        sps = noise_db_range.split("_")
        if len(sps) == 1:
            # A single value, e.g. "5", means a fixed noise level of [5dB, 5dB].
            # (The original tuple-unpacking from a single float was a bug.)
            self.noise_db_low = self.noise_db_high = float(sps[0])
        elif len(sps) == 2:
            self.noise_db_low, self.noise_db_high = float(sps[0]), float(sps[1])
        else:
            raise ValueError(
                f"Format error: '{noise_db_range}' e.g. -3_4 -> [-3db,4db]"
            )
    else:
        self.noises = None
class MaskCTCInference(torch.nn.Module):
    """Mask-CTC-based non-autoregressive inference"""

    def __init__(
        self,
        asr_model: MaskCTCModel,
        n_iterations: int,
        threshold_probability: float,
    ):
        """Initialize Mask-CTC inference"""
        super().__init__()
        self.ctc = asr_model.ctc
        self.mlm = asr_model.decoder
        self.mask_token = asr_model.mask_token
        self.n_iterations = n_iterations
        self.threshold_probability = threshold_probability
        self.converter = TokenIDConverter(token_list=asr_model.token_list)

    def ids2text(self, ids: List[int]):
        text = "".join(self.converter.ids2tokens(ids))
        return text.replace("<mask>", "_").replace("<space>", " ")

    def forward(self, enc_out: torch.Tensor) -> List[Hypothesis]:
        """Perform Mask-CTC inference"""
        # greedy ctc outputs
        enc_out = enc_out.unsqueeze(0)
        ctc_probs, ctc_ids = torch.exp(self.ctc.log_softmax(enc_out)).max(dim=-1)
        y_hat = torch.stack([x[0] for x in groupby(ctc_ids[0])])
        y_idx = torch.nonzero(y_hat != 0).squeeze(-1)

        logging.info("ctc:{}".format(self.ids2text(y_hat[y_idx].tolist())))

        # calculate token-level ctc probabilities by taking
        # the maximum probability of consecutive frames with
        # the same ctc symbols
        probs_hat = []
        cnt = 0
        for i, y in enumerate(y_hat.tolist()):
            probs_hat.append(-1)
            while cnt < ctc_ids.shape[1] and y == ctc_ids[0][cnt]:
                if probs_hat[i] < ctc_probs[0][cnt]:
                    probs_hat[i] = ctc_probs[0][cnt].item()
                cnt += 1
        probs_hat = torch.from_numpy(numpy.array(probs_hat))

        # mask ctc outputs based on ctc probabilities
        p_thres = self.threshold_probability
        mask_idx = torch.nonzero(probs_hat[y_idx] < p_thres).squeeze(-1)
        confident_idx = torch.nonzero(probs_hat[y_idx] >= p_thres).squeeze(-1)
        mask_num = len(mask_idx)

        y_in = torch.zeros(1, len(y_idx), dtype=torch.long) + self.mask_token
        y_in[0][confident_idx] = y_hat[y_idx][confident_idx]

        logging.info("msk:{}".format(self.ids2text(y_in[0].tolist())))

        # iterative decoding
        if not mask_num == 0:
            K = self.n_iterations
            num_iter = K if mask_num >= K and K > 0 else mask_num

            for t in range(num_iter - 1):
                pred, _ = self.mlm(enc_out, [enc_out.size(1)], y_in, [y_in.size(1)])
                pred_score, pred_id = pred[0][mask_idx].max(dim=-1)
                cand = torch.topk(pred_score, mask_num // num_iter, -1)[1]
                y_in[0][mask_idx[cand]] = pred_id[cand]
                mask_idx = torch.nonzero(y_in[0] == self.mask_token).squeeze(-1)

                logging.info("msk:{}".format(self.ids2text(y_in[0].tolist())))

            # predict leftover masks (|masks| < mask_num // num_iter)
            pred, _ = self.mlm(enc_out, [enc_out.size(1)], y_in, [y_in.size(1)])
            y_in[0][mask_idx] = pred[0][mask_idx].argmax(dim=-1)

            logging.info("msk:{}".format(self.ids2text(y_in[0].tolist())))

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        yseq = torch.tensor(
            [self.mask_token] + y_in.tolist()[0] + [self.mask_token],
            device=y_in.device,
        )

        return Hypothesis(yseq=yseq)
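# Hypothetical usage sketch for MaskCTCInference above ("asr_model" is a
# trained MaskCTCModel and "enc_out" its encoder output of shape (Time, Dim);
# both are stand-ins, not from the original source).
maskctc = MaskCTCInference(
    asr_model=asr_model, n_iterations=10, threshold_probability=0.99
)
hyp = maskctc(enc_out)               # returns a single Hypothesis
token_ids = hyp.yseq[1:-1].tolist()  # strip the mask tokens padded at both ends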
def __init__(
    self,
    asr_train_config: Union[Path, str] = None,
    asr_model_file: Union[Path, str] = None,
    transducer_conf: dict = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    ngram_scorer: str = "full",
    ngram_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    ngram_weight: float = 0.9,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
    enh_s2t_task: bool = False,
    quantize_asr_model: bool = False,
    quantize_lm: bool = False,
    quantize_modules: List[str] = ["Linear"],
    quantize_dtype: str = "qint8",
):
    assert check_argument_types()

    task = ASRTask if not enh_s2t_task else EnhS2TTask

    if quantize_asr_model or quantize_lm:
        if quantize_dtype == "float16" and torch.__version__ < LooseVersion("1.5.0"):
            raise ValueError(
                "float16 dtype for dynamic quantization is not supported with "
                "torch version < 1.5.0. Switch to qint8 dtype instead."
            )
        quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules])
        quantize_dtype = getattr(torch, quantize_dtype)

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = task.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    if enh_s2t_task:
        asr_model.inherite_attributes(
            inherite_s2t_attrs=[
                "ctc",
                "decoder",
                "eos",
                "joint_network",
                "sos",
                "token_list",
                "use_transducer_decoder",
            ]
        )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    if quantize_asr_model:
        logging.info("Use quantized asr model for decoding.")
        asr_model = torch.quantization.quantize_dynamic(
            asr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype
        )

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        if quantize_lm:
            logging.info("Use quantized lm for decoding.")
            lm = torch.quantization.quantize_dynamic(
                lm, qconfig_spec=quantize_modules, dtype=quantize_dtype
            )
        scorers["lm"] = lm.lm

    # 3. Build ngram model
    if ngram_file is not None:
        if ngram_scorer == "full":
            from espnet.nets.scorers.ngram import NgramFullScorer

            ngram = NgramFullScorer(ngram_file, token_list)
        else:
            from espnet.nets.scorers.ngram import NgramPartScorer

            ngram = NgramPartScorer(ngram_file, token_list)
    else:
        ngram = None
    scorers["ngram"] = ngram

    # 4. Build BeamSearch object
    if asr_model.use_transducer_decoder:
        beam_search_transducer = BeamSearchTransducer(
            decoder=asr_model.decoder,
            joint_network=asr_model.joint_network,
            beam_size=beam_size,
            lm=scorers["lm"] if "lm" in scorers else None,
            lm_weight=lm_weight,
            **transducer_conf,
        )
        beam_search = None
    else:
        beam_search_transducer = None

        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=lm_weight,
            ngram=ngram_weight,
            length_bonus=penalty,
        )
        beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=asr_model.sos,
            eos=asr_model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )

        # TODO(karita): make all scorers batchfied
        if batch_size == 1:
            non_batch = [
                k
                for k, v in beam_search.full_scorers.items()
                if not isinstance(v, BatchScorerInterface)
            ]
            if len(non_batch) == 0:
                if streaming:
                    beam_search.__class__ = BatchBeamSearchOnlineSim
                    beam_search.set_streaming_config(asr_train_config)
                    logging.info(
                        "BatchBeamSearchOnlineSim implementation is selected."
                    )
                else:
                    beam_search.__class__ = BatchBeamSearch
                    logging.info("BatchBeamSearch implementation is selected.")
            else:
                logging.warning(
                    f"As non-batch scorers {non_batch} are found, "
                    f"fall back to non-batch implementation."
                )

        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
        logging.info(f"Beam_search: {beam_search}")
        logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.beam_search_transducer = beam_search_transducer
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
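# Hypothetical usage sketch (assuming the __init__ above belongs to a
# Speech2Text-style wrapper, as in espnet2.bin.asr_inference; the class name
# and paths are placeholders, not from the original source).
speech2text = Speech2Text(
    asr_train_config="exp/asr_train/config.yaml",
    asr_model_file="exp/asr_train/valid.acc.ave.pth",
    beam_size=20,
    ctc_weight=0.5,
    nbest=1,
)
# Calling speech2text(speech) would then yield (text, tokens, token_ids,
# hypothesis) tuples for the n best hypotheses.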
def __init__(
    self,
    train: bool,
    token_type: List[str] = [None],
    token_list: List[Union[Path, str, Iterable[str]]] = [None],
    bpemodel: List[Union[Path, str, Iterable[str]]] = [None],
    text_cleaner: Collection[str] = None,
    g2p_type: str = None,
    unk_symbol: str = "<unk>",
    space_symbol: str = "<space>",
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    delimiter: str = None,
    rir_scp: str = None,
    rir_apply_prob: float = 1.0,
    noise_scp: str = None,
    noise_apply_prob: float = 1.0,
    noise_db_range: str = "3_10",
    speech_volume_normalize: float = None,
    speech_name: str = "speech",
    text_name: List[str] = ["text"],
):
    # TODO(jiatong): sync with Kamo and Jing on interface for preprocessor
    super().__init__(
        train=train,
        token_type=token_type[0],
        token_list=token_list[0],
        bpemodel=bpemodel[0],
        text_cleaner=text_cleaner,
        g2p_type=g2p_type,
        unk_symbol=unk_symbol,
        space_symbol=space_symbol,
        non_linguistic_symbols=non_linguistic_symbols,
        delimiter=delimiter,
        speech_name=speech_name,
        text_name=text_name[0],
        rir_scp=rir_scp,
        rir_apply_prob=rir_apply_prob,
        noise_scp=noise_scp,
        noise_apply_prob=noise_apply_prob,
        noise_db_range=noise_db_range,
        speech_volume_normalize=speech_volume_normalize,
    )

    assert (
        len(token_type) == len(token_list) == len(bpemodel) == len(text_name)
    ), "token_type, token_list, bpemodel, or processing text_name mismatched"
    self.num_tokenizer = len(token_type)
    self.tokenizer = []
    self.token_id_converter = []

    for i in range(self.num_tokenizer):
        if token_type[i] is not None:
            if token_list[i] is None:
                raise ValueError("token_list is required if token_type is not None")
            self.tokenizer.append(
                build_tokenizer(
                    token_type=token_type[i],
                    bpemodel=bpemodel[i],
                    delimiter=delimiter,
                    space_symbol=space_symbol,
                    non_linguistic_symbols=non_linguistic_symbols,
                    g2p_type=g2p_type,
                )
            )
            self.token_id_converter.append(
                TokenIDConverter(
                    token_list=token_list[i],
                    unk_symbol=unk_symbol,
                )
            )
        else:
            self.tokenizer.append(None)
            self.token_id_converter.append(None)

    self.text_cleaner = TextCleaner(text_cleaner)
    self.text_name = text_name  # override the text_name from CommonPreprocessor
def __init__(
    self,
    mt_train_config: Union[Path, str] = None,
    mt_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    ngram_scorer: str = "full",
    ngram_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    lm_weight: float = 1.0,
    ngram_weight: float = 0.9,
    penalty: float = 0.0,
    nbest: int = 1,
):
    assert check_argument_types()

    # 1. Build MT model
    scorers = {}
    mt_model, mt_train_args = MTTask.build_model_from_file(
        mt_train_config, mt_model_file, device
    )
    mt_model.to(dtype=getattr(torch, dtype)).eval()

    decoder = mt_model.decoder
    token_list = mt_model.token_list
    scorers.update(
        decoder=decoder,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 3. Build ngram model
    if ngram_file is not None:
        if ngram_scorer == "full":
            from espnet.nets.scorers.ngram import NgramFullScorer

            ngram = NgramFullScorer(ngram_file, token_list)
        else:
            from espnet.nets.scorers.ngram import NgramPartScorer

            ngram = NgramPartScorer(ngram_file, token_list)
    else:
        ngram = None
    scorers["ngram"] = ngram

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0,
        lm=lm_weight,
        ngram=ngram_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=mt_model.sos,
        eos=mt_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
        pre_beam_score_key="full",
    )

    # TODO(karita): make all scorers batchfied
    if batch_size == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = mt_train_args.token_type
    if bpemodel is None:
        bpemodel = mt_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.mt_model = mt_model
    self.mt_train_args = mt_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 8,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
    search_beam_size: int = 20,
    output_beam_size: int = 20,
    min_active_states: int = 30,
    max_active_states: int = 10000,
    blank_bias: float = 0.0,
    lattice_weight: float = 1.0,
    is_ctc_decoding: bool = True,
    lang_dir: Optional[str] = None,
    use_fgram_rescoring: bool = False,
    use_nbest_rescoring: bool = False,
    am_weight: float = 1.0,
    decoder_weight: float = 0.5,
    nnlm_weight: float = 1.0,
    num_paths: int = 1000,
    nbest_batch_size: int = 500,
    nll_batch_size: int = 100,
):
    assert check_argument_types()

    # 1. Build ASR model
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    token_list = asr_model.token_list

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        self.lm = lm

    self.is_ctc_decoding = is_ctc_decoding
    self.use_fgram_rescoring = use_fgram_rescoring
    self.use_nbest_rescoring = use_nbest_rescoring

    assert self.is_ctc_decoding, "Currently, only ctc_decoding graph is supported."
    if self.is_ctc_decoding:
        self.decode_graph = k2.arc_sort(
            build_ctc_topo(list(range(len(token_list))))
        )
        self.decode_graph = self.decode_graph.to(device)

    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")
    logging.info(f"Running on : {device}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.device = device
    self.dtype = dtype
    self.search_beam_size = search_beam_size
    self.output_beam_size = output_beam_size
    self.min_active_states = min_active_states
    self.max_active_states = max_active_states
    self.blank_bias = blank_bias
    self.lattice_weight = lattice_weight
    self.am_weight = am_weight
    self.decoder_weight = decoder_weight
    self.nnlm_weight = nnlm_weight
    self.num_paths = num_paths
    self.nbest_batch_size = nbest_batch_size
    self.nll_batch_size = nll_batch_size
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
):
    assert check_argument_types()

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    # Initialize lm_train_args so the attribute assignment below does not
    # raise a NameError when no LM config is given (bug in the original).
    lm_train_args = None
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 3. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.lm_train_args = lm_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
):
    assert check_argument_types()

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 3. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
        pre_beam_score_key=None if ctc_weight == 1.0 else "full",
    )

    # TODO(karita): make all scorers batchfied
    if batch_size == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            if streaming:
                beam_search.__class__ = BatchBeamSearchOnlineSim
                beam_search.set_streaming_config(asr_train_config)
                logging.info("BatchBeamSearchOnlineSim implementation is selected.")
            else:
                beam_search.__class__ = BatchBeamSearch
                logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
    disable_repetition_detection=False,
    decoder_text_length_limit=0,
    encoded_feat_length_limit=0,
):
    assert check_argument_types()

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.to(dtype=getattr(torch, dtype)).eval()

    assert isinstance(
        asr_model.encoder, ContextualBlockTransformerEncoder
    ) or isinstance(asr_model.encoder, ContextualBlockConformerEncoder)

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 3. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )

    assert "encoder_conf" in asr_train_args
    assert "look_ahead" in asr_train_args.encoder_conf
    assert "hop_size" in asr_train_args.encoder_conf
    assert "block_size" in asr_train_args.encoder_conf
    # look_ahead = asr_train_args.encoder_conf['look_ahead']
    # hop_size = asr_train_args.encoder_conf['hop_size']
    # block_size = asr_train_args.encoder_conf['block_size']

    assert batch_size == 1

    beam_search = BatchBeamSearchOnline(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
        pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        disable_repetition_detection=disable_repetition_detection,
        decoder_text_length_limit=decoder_text_length_limit,
        encoded_feat_length_limit=encoded_feat_length_limit,
    )

    non_batch = [
        k
        for k, v in beam_search.full_scorers.items()
        if not isinstance(v, BatchScorerInterface)
    ]
    assert len(non_batch) == 0
    # TODO(karita): make all scorers batchfied
    logging.info("BatchBeamSearchOnline implementation is selected.")

    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest

    if "n_fft" in asr_train_args.frontend_conf:
        self.n_fft = asr_train_args.frontend_conf["n_fft"]
    else:
        self.n_fft = 512
    if "hop_length" in asr_train_args.frontend_conf:
        self.hop_length = asr_train_args.frontend_conf["hop_length"]
    else:
        self.hop_length = 128
    if (
        "win_length" in asr_train_args.frontend_conf
        and asr_train_args.frontend_conf["win_length"] is not None
    ):
        self.win_length = asr_train_args.frontend_conf["win_length"]
    else:
        self.win_length = self.n_fft

    self.reset()
class CommonPreprocessor(AbsPreprocessor):
    def __init__(
        self,
        train: bool,
        token_type: str = None,
        token_list: Union[Path, str, Iterable[str]] = None,
        bpemodel: Union[Path, str, Iterable[str]] = None,
        text_cleaner: Collection[str] = None,
        g2p_type: str = None,
        unk_symbol: str = "<unk>",
        space_symbol: str = "<space>",
        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
        delimiter: str = None,
        rir_scp: str = None,
        rir_apply_prob: float = 1.0,
        noise_scp: str = None,
        noise_apply_prob: float = 1.0,
        noise_db_range: str = "3_10",
        speech_volume_normalize: float = None,
        speech_name: str = "speech",
        text_name: str = "text",
    ):
        super().__init__(train)
        self.train = train
        self.speech_name = speech_name
        self.text_name = text_name
        self.speech_volume_normalize = speech_volume_normalize
        self.rir_apply_prob = rir_apply_prob
        self.noise_apply_prob = noise_apply_prob

        if token_type is not None:
            if token_list is None:
                raise ValueError("token_list is required if token_type is not None")
            self.text_cleaner = TextCleaner(text_cleaner)
            self.tokenizer = build_tokenizer(
                token_type=token_type,
                bpemodel=bpemodel,
                delimiter=delimiter,
                space_symbol=space_symbol,
                non_linguistic_symbols=non_linguistic_symbols,
                g2p_type=g2p_type,
            )
            self.token_id_converter = TokenIDConverter(
                token_list=token_list,
                unk_symbol=unk_symbol,
            )
        else:
            self.text_cleaner = None
            self.tokenizer = None
            self.token_id_converter = None

        if train and rir_scp is not None:
            self.rirs = []
            with open(rir_scp, "r", encoding="utf-8") as f:
                for line in f:
                    sps = line.strip().split(None, 1)
                    if len(sps) == 1:
                        self.rirs.append(sps[0])
                    else:
                        self.rirs.append(sps[1])
        else:
            self.rirs = None

        if train and noise_scp is not None:
            self.noises = []
            with open(noise_scp, "r", encoding="utf-8") as f:
                for line in f:
                    sps = line.strip().split(None, 1)
                    if len(sps) == 1:
                        self.noises.append(sps[0])
                    else:
                        self.noises.append(sps[1])
            sps = noise_db_range.split("_")
            if len(sps) == 1:
                # A single value, e.g. "5", means a fixed noise level of
                # [5dB, 5dB]. (The original tuple-unpacking from a single
                # float was a bug.)
                self.noise_db_low = self.noise_db_high = float(sps[0])
            elif len(sps) == 2:
                self.noise_db_low, self.noise_db_high = float(sps[0]), float(sps[1])
            else:
                raise ValueError(
                    f"Format error: '{noise_db_range}' e.g. -3_4 -> [-3db,4db]"
                )
        else:
            self.noises = None

    def __call__(
        self, uid: str, data: Dict[str, Union[str, np.ndarray]]
    ) -> Dict[str, np.ndarray]:
        assert check_argument_types()

        if self.speech_name in data:
            if self.train and self.rirs is not None and self.noises is not None:
                speech = data[self.speech_name]
                nsamples = len(speech)

                # speech: (Nmic, Time)
                if speech.ndim == 1:
                    speech = speech[None, :]
                else:
                    speech = speech.T
                # Calc power on the non-silence region
                power = (speech[detect_non_silence(speech)] ** 2).mean()

                # 1. Convolve RIR
                if (
                    self.rirs is not None
                    and self.rir_apply_prob >= np.random.random()
                ):
                    rir_path = np.random.choice(self.rirs)
                    if rir_path is not None:
                        rir, _ = soundfile.read(
                            rir_path, dtype=np.float64, always_2d=True
                        )

                        # rir: (Nmic, Time)
                        rir = rir.T

                        # speech: (Nmic, Time)
                        # Note that this operation doesn't change the signal length
                        speech = scipy.signal.convolve(speech, rir, mode="full")[
                            :, : speech.shape[1]
                        ]
                        # Revert the mean power to the original power
                        power2 = (speech[detect_non_silence(speech)] ** 2).mean()
                        speech = np.sqrt(power / max(power2, 1e-10)) * speech

                # 2. Add noise
                if (
                    self.noises is not None
                    and self.noise_apply_prob >= np.random.random()
                ):
                    noise_path = np.random.choice(self.noises)
                    if noise_path is not None:
                        noise_db = np.random.uniform(
                            self.noise_db_low, self.noise_db_high
                        )
                        with soundfile.SoundFile(noise_path) as f:
                            if f.frames == nsamples:
                                noise = f.read(dtype=np.float64, always_2d=True)
                            elif f.frames < nsamples:
                                offset = np.random.randint(0, nsamples - f.frames)
                                # noise: (Time, Nmic)
                                noise = f.read(dtype=np.float64, always_2d=True)
                                # Repeat noise
                                noise = np.pad(
                                    noise,
                                    [(offset, nsamples - f.frames - offset), (0, 0)],
                                    mode="wrap",
                                )
                            else:
                                offset = np.random.randint(0, f.frames - nsamples)
                                f.seek(offset)
                                # noise: (Time, Nmic)
                                noise = f.read(
                                    nsamples, dtype=np.float64, always_2d=True
                                )
                                if len(noise) != nsamples:
                                    raise RuntimeError(
                                        f"Something wrong: {noise_path}"
                                    )
                        # noise: (Nmic, Time)
                        noise = noise.T

                        noise_power = (noise ** 2).mean()
                        scale = (
                            10 ** (-noise_db / 20)
                            * np.sqrt(power)
                            / np.sqrt(max(noise_power, 1e-10))
                        )
                        speech = speech + scale * noise

                speech = speech.T
                ma = np.max(np.abs(speech))
                if ma > 1.0:
                    speech /= ma
                data[self.speech_name] = speech

            if self.speech_volume_normalize is not None:
                speech = data[self.speech_name]
                ma = np.max(np.abs(speech))
                data[self.speech_name] = speech * self.speech_volume_normalize / ma

        if self.text_name in data and self.tokenizer is not None:
            text = data[self.text_name]
            # from transformers import pipeline
            # generator = pipeline('text-generation', model='gpt2')
            # data[self.text_name] = []
            # words = text.split(" ")
            # chunk_len = 3
            # pseudo_lookahead = 3
            # for i in range(0, len(words), chunk_len):
            #     chunk = " ".join(words[i:i+chunk_len])
            #     pseudo = generator(chunk, max_new_tokens=pseudo_lookahead,
            #                        num_return_sequences=1)[0]["generated_text"]
            #     pseudo = self.text_cleaner(pseudo)
            #     tokens = self.tokenizer.text2tokens(pseudo)
            #     text_ints = self.token_id_converter.tokens2ids(tokens)
            #     data[self.text_name].append(np.array(text_ints, dtype=np.int64))
            text = self.text_cleaner(text)
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        assert check_return_type(data)
        return data
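# Illustrative sketch (not from the source; the numbers are invented) of how
# the SNR scale in the noise-mixing step above behaves: a target SNR in dB is
# converted to an amplitude factor relative to the speech/noise powers.
import numpy as np

power = 1.0        # mean power of the speech's non-silent region
noise_power = 4.0  # mean power of the raw noise clip
noise_db = 20.0    # sampled target SNR in dB
scale = 10 ** (-noise_db / 20) * np.sqrt(power) / np.sqrt(max(noise_power, 1e-10))
print(scale)  # 0.05: the noise is attenuated to sit 20 dB under the speech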
def test_ids2tokens():
    converter = TokenIDConverter(["a", "b", "c", "<unk>"])
    assert converter.ids2tokens([0, 1, 2]) == ["a", "b", "c"]
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed about
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward Encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio
                )
                nbest_hyps = nbest_hyps[:nbest]

            # Only supporting batch_size==1
            key = keys[0]
            for n in range(1, nbest + 1):
                hyp = nbest_hyps[n - 1]
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # Change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
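# Hypothetical invocation sketch for the inference() entry point above (paths
# and data files are placeholders, not from the original source).
inference(
    output_dir="decode_out",
    maxlenratio=0.0, minlenratio=0.0, batch_size=1, dtype="float32",
    beam_size=20, ngpu=0, seed=0, ctc_weight=0.5, lm_weight=1.0,
    penalty=0.0, nbest=1, num_workers=1, log_level="INFO",
    data_path_and_name_and_type=[("dump/test/wav.scp", "speech", "sound")],
    key_file=None,
    asr_train_config="exp/asr_train/config.yaml",
    asr_model_file="exp/asr_train/valid.acc.best.pth",
    lm_train_config=None, lm_file=None,
    word_lm_train_config=None, word_lm_file=None,
    blank_symbol="<blank>",
    token_type=None, bpemodel=None,
    allow_variable_data_keys=False,
)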
def __init__(
    self,
    asr_train_config: Union[Path, str] = None,
    asr_model_file: Union[Path, str] = None,
    beam_search_config: Dict[str, Any] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    beam_size: int = 5,
    dtype: str = "float32",
    lm_weight: float = 1.0,
    quantize_asr_model: bool = False,
    quantize_modules: List[str] = None,
    quantize_dtype: str = "qint8",
    nbest: int = 1,
    streaming: bool = False,
    chunk_size: int = 16,
    left_context: int = 32,
    right_context: int = 0,
    display_partial_hypotheses: bool = False,
) -> None:
    assert check_argument_types()

    asr_model, asr_train_args = ASRTransducerTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )

    if quantize_asr_model:
        if quantize_modules is not None:
            if not all([q in ["LSTM", "Linear"] for q in quantize_modules]):
                raise ValueError(
                    "Only 'Linear' and 'LSTM' modules are currently supported"
                    " by PyTorch and in --quantize_modules"
                )
            q_config = set([getattr(torch.nn, q) for q in quantize_modules])
        else:
            q_config = {torch.nn.Linear}

        if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")):
            raise ValueError(
                "float16 dtype for dynamic quantization is not supported with torch"
                " version < 1.5.0. Switching to qint8 dtype instead."
            )
        q_dtype = getattr(torch, quantize_dtype)

        asr_model = torch.quantization.quantize_dynamic(
            asr_model, q_config, dtype=q_dtype
        ).eval()
    else:
        asr_model.to(dtype=getattr(torch, dtype)).eval()

    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        lm_scorer = lm.lm
    else:
        lm_scorer = None

    # 4. Build BeamSearch object
    if beam_search_config is None:
        beam_search_config = {}

    beam_search = BeamSearchTransducer(
        asr_model.decoder,
        asr_model.joint_network,
        beam_size,
        lm=lm_scorer,
        lm_weight=lm_weight,
        nbest=nbest,
        **beam_search_config,
    )

    token_list = asr_model.token_list

    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.device = device
    self.dtype = dtype
    self.nbest = nbest

    self.converter = converter
    self.tokenizer = tokenizer

    self.beam_search = beam_search
    self.streaming = streaming
    self.chunk_size = max(chunk_size, 0)
    self.left_context = max(left_context, 0)
    self.right_context = max(right_context, 0)

    if not streaming or chunk_size == 0:
        self.streaming = False
        self.asr_model.encoder.dynamic_chunk_training = False

    self.n_fft = asr_train_args.frontend_conf.get("n_fft", 512)
    self.hop_length = asr_train_args.frontend_conf.get("hop_length", 128)

    if asr_train_args.frontend_conf.get("win_length", None) is not None:
        self.frontend_window_size = asr_train_args.frontend_conf["win_length"]
    else:
        self.frontend_window_size = self.n_fft

    self.window_size = self.chunk_size + self.right_context
    self._raw_ctx = self.asr_model.encoder.get_encoder_input_raw_size(
        self.window_size, self.hop_length
    )
    self.last_chunk_length = (
        self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
    ) * self.hop_length

    self.reset_inference_cache()