Example 1
import numpy as np
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader


def _asr(model_name):
    # Download and unpack the pretrained model, then build the inference wrapper
    d = ModelDownloader("downloads")
    speech2text = Speech2Text(**d.download_and_unpack(model_name, quiet=True))
    # Decode a dummy all-zero waveform; the best hypothesis text must be a string
    speech = np.zeros((10000,), dtype=np.float32)
    nbests = speech2text(speech)
    text, *_ = nbests[0]
    assert isinstance(text, str)
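For reference, a minimal sketch of the same pipeline on a real recording instead of a zero buffer; the model tag and WAV path below are placeholders, and soundfile is assumed to be available:

import soundfile
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader

d = ModelDownloader("downloads")
# Placeholder tag: use any model name listed in the espnet_model_zoo table
speech2text = Speech2Text(**d.download_and_unpack("your/model_tag_here"))

# The input is a 1-D waveform at the sampling rate the model was trained on
speech, rate = soundfile.read("example.wav")
nbests = speech2text(speech)
text, tokens, token_int, hyp = nbests[0]
print(text)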
Example 2
    def __init__(self, model_name, trans_df):

        from collections import Counter

        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader
        import jiwer

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        self.transcriptions = []
        self.true_txt_list = []
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
        self.transformation = jiwer.Compose([
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip(),
            jiwer.SentencesToListOfWords(),
            jiwer.RemoveEmptyStrings(),
        ])
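A hedged sketch of how a normalization chain like the one above can be used to score a hypothesis with jiwer; the reference and hypothesis strings are invented, and the keyword arguments follow the older jiwer API that still provides SentencesToListOfWords:

import jiwer

transformation = jiwer.Compose([
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.SentencesToListOfWords(),
    jiwer.RemoveEmptyStrings(),
])

# Word error rate between a reference transcript and an ASR hypothesis,
# normalizing both sides with the same transformation chain
error = jiwer.wer(
    "The quick brown fox.",
    "the quick brown fox",
    truth_transform=transformation,
    hypothesis_transform=transformation,
)
print(error)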
Example 3
from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader


def get_speech2text():
    d = ModelDownloader()
    speech2text = Speech2Text(
        # Specify task and corpus
        # **d.download_and_unpack(task="asr", corpus="librispeech")
        **d.download_and_unpack(
            "Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave"
        ),
        device='cuda')
    return speech2text
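Usage could look like the following; the waveform is a silent dummy buffer (roughly one second, assuming a 16 kHz model), and a CUDA-capable GPU is required because the helper hard-codes device='cuda':

import numpy as np

speech2text = get_speech2text()
speech = np.zeros(16000, dtype=np.float32)  # dummy silent input
nbests = speech2text(speech)
text, tokens, token_int, hyp = nbests[0]
print(text)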
Example 4
import numpy as np
from espnet.nets.beam_search import Hypothesis
from espnet2.bin.asr_inference import Speech2Text


def test_Speech2Text(asr_config_file, lm_config_file):
    speech2text = Speech2Text(asr_train_config=asr_config_file,
                              lm_train_config=lm_config_file,
                              beam_size=1)
    speech = np.random.randn(100000)
    results = speech2text(speech)
    for text, token, token_int, hyp in results:
        assert isinstance(text, str)
        assert isinstance(token[0], str)
        assert isinstance(token_int[0], int)
        assert isinstance(hyp, Hypothesis)
Example 5
import numpy as np
from espnet.nets.beam_search import Hypothesis
from espnet2.bin.asr_inference import Speech2Text


def test_EnhS2T_Speech2Text(enh_asr_config_file, lm_config_file):
    speech2text = Speech2Text(
        asr_train_config=enh_asr_config_file,
        lm_train_config=lm_config_file,
        beam_size=1,
        enh_s2t_task=True,
    )
    speech = np.random.randn(48000)
    results = speech2text(speech)
    for text, token, token_int, hyp in results:
        assert isinstance(text, str)
        assert isinstance(token[0], str)
        assert isinstance(token_int[0], int)
        assert isinstance(hyp, Hypothesis)
Example 6
    def __init__(self, model_name, trans_df):

        from collections import Counter

        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
Example 7
import json
import math
import pickle
import warnings
from pathlib import Path

import Levenshtein
import hydra.utils as utils
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
from utils import normalize

sns.set()

warnings.simplefilter('ignore')

d = ModelDownloader()
speech2text_en = Speech2Text(
    **d.download_and_unpack(task="asr", corpus="librispeech")
)

def log_spec_dB_dist(x, y):
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))

def average_mcd(mc_ref, mc_cv):
    # Align converted and reference mel-cepstra with DTW, then average the
    # frame-wise mel-cepstral distortion (in dB) over the warping path
    _, path = fastdtw(mc_cv, mc_ref, dist=euclidean)
    twf = np.array(path).T
    cvt_mcc_dtw = mc_cv[twf[0]]
    trg_mcc_dtw = mc_ref[twf[1]]
    # MCD
    diff2sum = np.sum((cvt_mcc_dtw - trg_mcc_dtw) ** 2, 1)
    mcd_value = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)
    return mcd_value
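A quick sanity check of average_mcd with random stand-ins for mel-cepstral sequences; in practice these matrices would come from a feature extractor such as WORLD or pysptk, which is not shown here:

mc_ref = np.random.randn(200, 25)  # reference mel-cepstra, shape (frames, order)
mc_cv = np.random.randn(180, 25)   # converted mel-cepstra with a different length
print(average_mcd(mc_ref, mc_cv))  # DTW-aligned mel-cepstral distortion in dB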
    def __init__(self, model_id: str):
        self.model = Speech2Text.from_pretrained(model_id, device="cpu", beam_size=1)
        self.sampling_rate = 16000