import numpy as np

from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader


def _asr(model_name):
    # Download and unpack the pretrained model, then build the recognizer.
    d = ModelDownloader("downloads")
    speech2text = Speech2Text(**d.download_and_unpack(model_name, quiet=True))
    # Decode a short silent waveform and check that a text hypothesis comes back.
    speech = np.zeros((10000,), dtype=np.float32)
    nbests = speech2text(speech)
    text, *_ = nbests[0]
    assert isinstance(text, str)
def __init__(self, model_name, trans_df):
    from espnet2.bin.asr_inference import Speech2Text
    from espnet_model_zoo.downloader import ModelDownloader
    import jiwer
    from collections import Counter

    self.model_name = model_name
    # Download the pretrained ESPnet model and build the recognizer.
    d = ModelDownloader()
    self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
    self.input_txt_list = []
    self.clean_txt_list = []
    self.output_txt_list = []
    self.transcriptions = []
    self.true_txt_list = []
    # Sampling rate the model was trained with, taken from the model zoo table.
    self.sample_rate = int(
        d.data_frame[d.data_frame["name"] == model_name]["fs"]
    )
    self.trans_df = trans_df
    self.trans_dic = self._df_to_dict(trans_df)
    self.mix_counter = Counter()
    self.clean_counter = Counter()
    self.est_counter = Counter()
    # Text normalization pipeline applied before WER scoring with jiwer.
    self.transformation = jiwer.Compose([
        jiwer.ToLowerCase(),
        jiwer.RemovePunctuation(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.SentencesToListOfWords(),
        jiwer.RemoveEmptyStrings(),
    ])
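A hedged sketch of how a pipeline like self.transformation above is typically used for scoring: jiwer.wer accepts the same Compose pipeline for both the reference and the hypothesis. The example strings and the standalone transformation variable are illustrative assumptions, not part of the original class.

import jiwer

# Illustrative normalization pipeline mirroring the one built in __init__ above.
transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])

# Hypothetical reference/hypothesis pair; in the class these would come from
# trans_df and the collected ASR output lists.
reference = "The quick brown fox."
hypothesis = "the quick brown fox"
wer = jiwer.wer(
    reference,
    hypothesis,
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(wer)  # 0.0, since both strings are identical after normalization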
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader


def get_speech2text():
    d = ModelDownloader()
    speech2text = Speech2Text(
        # Specify task and corpus instead of a model name:
        # **d.download_and_unpack(task="asr", corpus="librispeech")
        **d.download_and_unpack(
            "Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave"
        ),
        device='cuda',
    )
    return speech2text
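A minimal usage sketch for get_speech2text(). The file path "sample.wav" and the soundfile-based loading are assumptions (the input is assumed to be 16 kHz mono, matching the model's training data); the n-best tuple layout follows the decoding calls shown elsewhere in this section.

import soundfile as sf

speech2text = get_speech2text()
speech, rate = sf.read("sample.wav")  # assumed 16 kHz mono waveform
nbests = speech2text(speech)
# Each n-best entry is (text, tokens, token_ids, hypothesis); take the best one.
text, *_ = nbests[0]
print(text)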
import numpy as np

from espnet2.bin.asr_inference import Speech2Text
from espnet.nets.beam_search import Hypothesis


def test_Speech2Text(asr_config_file, lm_config_file):
    speech2text = Speech2Text(
        asr_train_config=asr_config_file,
        lm_train_config=lm_config_file,
        beam_size=1,
    )
    speech = np.random.randn(100000)
    results = speech2text(speech)
    # Each n-best entry is a (text, tokens, token_ids, hypothesis) tuple.
    for text, token, token_int, hyp in results:
        assert isinstance(text, str)
        assert isinstance(token[0], str)
        assert isinstance(token_int[0], int)
        assert isinstance(hyp, Hypothesis)
def test_EnhS2T_Speech2Text(enh_asr_config_file, lm_config_file):
    speech2text = Speech2Text(
        asr_train_config=enh_asr_config_file,
        lm_train_config=lm_config_file,
        beam_size=1,
        enh_s2t_task=True,
    )
    speech = np.random.randn(48000)
    results = speech2text(speech)
    for text, token, token_int, hyp in results:
        assert isinstance(text, str)
        assert isinstance(token[0], str)
        assert isinstance(token_int[0], int)
        assert isinstance(hyp, Hypothesis)
def __init__(self, model_name, trans_df):
    from espnet2.bin.asr_inference import Speech2Text
    from espnet_model_zoo.downloader import ModelDownloader
    from collections import Counter

    self.model_name = model_name
    # Download the pretrained ESPnet model and build the recognizer.
    d = ModelDownloader()
    self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
    self.input_txt_list = []
    self.clean_txt_list = []
    self.output_txt_list = []
    # Sampling rate of the pretrained model, taken from the model zoo table.
    self.sample_rate = int(
        d.data_frame[d.data_frame["name"] == model_name]["fs"]
    )
    self.trans_df = trans_df
    self.trans_dic = self._df_to_dict(trans_df)
    self.mix_counter = Counter()
    self.clean_counter = Counter()
    self.est_counter = Counter()
import json
import math
import pickle
import warnings
from pathlib import Path

import Levenshtein
import hydra.utils as utils
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
from utils import normalize

sns.set()
warnings.simplefilter('ignore')

# English recognizer used for evaluation.
d = ModelDownloader()
speech2text_en = Speech2Text(
    **d.download_and_unpack(task="asr", corpus="librispeech")
)


def log_spec_dB_dist(x, y):
    # Log-spectral distance between two cepstral frames, in dB.
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))


def average_mcd(mc_ref, mc_cv):
    # Align converted and reference mel-cepstra with DTW before scoring.
    _, path = fastdtw(mc_cv, mc_ref, dist=euclidean)
    twf = np.array(path).T
    cvt_mcc_dtw = mc_cv[twf[0]]
    trg_mcc_dtw = mc_ref[twf[1]]
    # Mel-cepstral distortion (MCD) averaged over the aligned frames.
    diff2sum = np.sum((cvt_mcc_dtw - trg_mcc_dtw) ** 2, 1)
    mcd_value = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)
    return mcd_value
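A small sanity-check sketch for average_mcd() above. The frame counts and cepstral order below are arbitrary assumptions, chosen only to show the expected (frames x order) array layout and that DTW handles sequences of different lengths.

rng = np.random.default_rng(0)
mc_ref = rng.standard_normal((120, 25))  # reference mel-cepstra, frames x order
mc_cv = rng.standard_normal((100, 25))   # converted mel-cepstra, different length
print(average_mcd(mc_ref, mc_cv))        # DTW aligns the two sequences before scoring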
def __init__(self, model_id: str):
    # Build the recognizer from a pretrained model tag and decode on CPU.
    self.model = Speech2Text.from_pretrained(model_id, device="cpu", beam_size=1)
    self.sampling_rate = 16000