Example #1
    @staticmethod
    def from_pretrained(
        model_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ):
        """Build SeparateSpeech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.

        Returns:
            SeparateSpeech: SeparateSpeech instance.

        """
        if model_tag is not None:
            try:
                from espnet_model_zoo.downloader import ModelDownloader

            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`.")
                raise
            d = ModelDownloader()
            kwargs.update(**d.download_and_unpack(model_tag))

        return SeparateSpeech(**kwargs)
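For reference, a minimal usage sketch of the factory above; the model tag and the call signature of the resulting SeparateSpeech instance are assumptions, not shown on this page:

import numpy as np

separate_speech = SeparateSpeech.from_pretrained(model_tag="espnet/some_enh_model")  # hypothetical tag
mixture = np.zeros((1, 16000), dtype=np.float32)  # batch of one 1-second mixture at 16 kHz
separated = separate_speech(mixture, fs=16000)    # assumed to return a list of separated waveforms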
Example #2
    @staticmethod
    def from_pretrained(
        model_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ) -> Speech2Text:
        """Build Speech2Text instance from the pretrained model.

        Args:
            model_tag: Model tag of the pretrained models.

        Returns:
            Speech2Text: Speech2Text instance.

        """
        if model_tag is not None:
            try:
                from espnet_model_zoo.downloader import ModelDownloader

            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`.")
                raise
            d = ModelDownloader()
            kwargs.update(**d.download_and_unpack(model_tag))

        return Speech2Text(**kwargs)
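And the Speech2Text counterpart, following the decoding pattern of Example #4 below; the model tag is hypothetical:

import numpy as np

speech2text = Speech2Text.from_pretrained(model_tag="espnet/some_asr_model")
speech = np.zeros((16000,), dtype=np.float32)
nbests = speech2text(speech)
text, *_ = nbests[0]
print(text)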
Example #3
    def __init__(self, model_name, trans_df):

        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader
        import jiwer

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        self.transcriptions = []
        self.true_txt_list = []
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
        self.transformation = jiwer.Compose([
            jiwer.ToLowerCase(),
            jiwer.RemovePunctuation(),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip(),
            jiwer.SentencesToListOfWords(),
            jiwer.RemoveEmptyStrings(),
        ])
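The composed transformation above is typically passed to jiwer's WER computation; a hypothetical helper method (not part of the original class), using the keyword arguments of the older jiwer API that still ships SentencesToListOfWords:

    def wer(self, truth, hypothesis):
        import jiwer

        # Apply the same normalization to both sides before scoring
        return jiwer.wer(
            truth,
            hypothesis,
            truth_transform=self.transformation,
            hypothesis_transform=self.transformation,
        )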
Example #4
def _asr(model_name):
    d = ModelDownloader("downloads")
    speech2text = Speech2Text(**d.download_and_unpack(model_name, quiet=True))
    speech = np.zeros((10000, ), dtype=np.float32)
    nbests = speech2text(speech)
    text, *_ = nbests[0]
    assert isinstance(text, str)
def get_text2speech(
    model_name='kan-bayashi/jsut_tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_train.loss.best'
):
    d = ModelDownloader()
    text2speech = Text2Speech(**d.download_and_unpack(model_name),
                              device='cuda')
    return text2speech
def _tts(model_name):
    d = ModelDownloader()
    text2speech = Text2Speech(**d.download_and_unpack(model_name))
    speech = np.zeros((10000, ), dtype=np.float32)
    if text2speech.use_speech:
        text2speech("foo", speech=speech)
    else:
        text2speech("foo")
Example #7
    @staticmethod
    def from_pretrained(
        model_tag: Optional[str] = None,
        vocoder_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ):
        """Build Text2Speech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.
            vocoder_tag (Optional[str]): Vocoder tag of the pretrained vocoders.
                Currently, the tags of parallel_wavegan are supported, which should
                start with the prefix "parallel_wavegan/".

        Returns:
            Text2Speech: Text2Speech instance.

        """
        if model_tag is not None:
            try:
                from espnet_model_zoo.downloader import ModelDownloader

            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`.")
                raise
            d = ModelDownloader()
            kwargs.update(**d.download_and_unpack(model_tag))

        if vocoder_tag is not None:
            if vocoder_tag.startswith("parallel_wavegan/"):
                try:
                    from parallel_wavegan.utils import download_pretrained_model

                except ImportError:
                    logging.error(
                        "`parallel_wavegan` is not installed. "
                        "Please install via `pip install -U parallel_wavegan`."
                    )
                    raise

                from parallel_wavegan import __version__

                # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
                assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
                    "Please install the latest parallel_wavegan "
                    "via `pip install -U parallel_wavegan`.")
                vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "")
                vocoder_file = download_pretrained_model(vocoder_tag)
                vocoder_config = Path(vocoder_file).parent / "config.yml"
                kwargs.update(vocoder_config=vocoder_config,
                              vocoder_file=vocoder_file)

            else:
                raise ValueError(f"{vocoder_tag} is unsupported format.")

        return Text2Speech(**kwargs)
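Combining the two tags from Example #13 with the prefix convention enforced above gives a plausible call; a sketch, not taken from this page:

text2speech = Text2Speech.from_pretrained(
    model_tag="kan-bayashi/ljspeech_tacotron2",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v1",  # the "parallel_wavegan/" prefix is required
)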
    def setup_model(self):
        try:
            self.model_reload_needed = False
            self.output_status("Loading nltk...")

            # setup nltk
            import nltk
            nltk.data.path.append(MODEL_DIR + '/nltk_models')
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")

            self.output_status("Loading torch...", end=" ")

            # setup model
            import torch
            from espnet_model_zoo.downloader import ModelDownloader
            from espnet2.bin.tts_inference import Text2Speech
            from parallel_wavegan.utils import download_pretrained_model
            from parallel_wavegan.utils import load_model

            self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
            self.output_status("Running on " + self.mlDevice)

            self.output_status("Loading espnet...")

            d = ModelDownloader(MODEL_DIR + "/espnet_models")
            self.text2speech = Text2Speech(
                **d.download_and_unpack(self.tag),
                device=self.mlDevice,
                # Only for Tacotron 2
                threshold=0.5,
                minlenratio=0.0,
                maxlenratio=10.0,
                use_att_constraint=False,
                backward_window=1,
                forward_window=3,
                # Only for FastSpeech & FastSpeech2
                speed_control_alpha=1.0,
            )
            self.text2speech.spc2wav = None  # Disable griffin-lim
            # NOTE: Sometimes the download fails due to "Permission denied". That is
            #   a limitation of Google Drive. Please retry after several hours.

            self.output_status("Loading vocoder models...")

            self.vocoder = load_model(
                download_pretrained_model(self.vocoder_tag,
                                          download_dir=MODEL_DIR +
                                          "/vocoder_models")).to(
                                              self.mlDevice).eval()
            self.vocoder.remove_weight_norm()
            self.output_status("Model setup completed.")
        except Exception as e:
            self.output_err("Model error", e)
            raise HandledException()
def _tts(model_name):
    d = ModelDownloader("downloads")
    text2speech = Text2Speech(**d.download_and_unpack(model_name, quiet=True))
    inputs = {"text": "foo"}
    if text2speech.use_speech:
        inputs["speech"] = np.zeros((10000,), dtype=np.float32)
    if text2speech.tts.spk_embed_dim is not None:
        inputs["spembs"] = np.zeros((text2speech.tts.spk_embed_dim,), dtype=np.float32)
    text2speech(**inputs)
def get_speech2text():
    d = ModelDownloader()
    speech2text = Speech2Text(
        # Specify task and corpus
        # **d.download_and_unpack(task="asr", corpus="librispeech")
        **d.download_and_unpack(
            "Shinji Watanabe/laborotv_asr_train_asr_conformer2_latest33_raw_char_sp_valid.acc.ave"
        ),
        device='cuda')
    return speech2text
Example #11
def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)

    d = ModelDownloader(".cache/espnet")
    o = d.download_and_unpack(args.mdl_file)
    # vars() returns args.__dict__, so the attribute deletions below
    # also remove the corresponding keys from kwargs
    kwargs = vars(args)
    kwargs.update(o)
    kwargs.update(
        {'data_path_and_name_and_type': [(args.wav_scp, 'speech', 'sound')]})
    del args.mdl_file
    del args.wav_scp

    kwargs.pop("config", None)
    inference(**kwargs)
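A hypothetical invocation of this entry point; the actual flag names depend on get_parser(), which is not shown:

main([
    "--mdl_file", "espnet/some_asr_model",  # hypothetical flag and model tag
    "--wav_scp", "data/wav.scp",            # hypothetical flag
])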
Example #12
    def __init__(self, model_name, trans_df):

        from espnet2.bin.asr_inference import Speech2Text
        from espnet_model_zoo.downloader import ModelDownloader

        self.model_name = model_name
        d = ModelDownloader()
        self.asr_model = Speech2Text(**d.download_and_unpack(model_name))
        self.input_txt_list = []
        self.clean_txt_list = []
        self.output_txt_list = []
        self.sample_rate = int(
            d.data_frame[d.data_frame["name"] == model_name]["fs"])
        self.trans_df = trans_df
        self.trans_dic = self._df_to_dict(trans_df)
        self.mix_counter = Counter()
        self.clean_counter = Counter()
        self.est_counter = Counter()
Example #13
class TTS_Worker:
    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        self.id = worker_id
        #Model selection
        self.fs = 22050
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        #Model setup
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()

        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()

        self.audio_d = audio_dest
        self.audio_f = audio_format

    def process_text(self, text, dest):
        print(f'Worker {self.id} attempting : {text}')
        with torch.no_grad():
            #start = time.time()
            wav, c, *_ = self.text2speech(text)
            wav = self.vocoder.inference(c)
        #rtf = (time.time() - start) / (len(wav) / self.fs)
        #print(f"RTF = {rtf:5f}")

        #Output generation
        wav = wav.view(-1).cpu().numpy()
        sfwrite(self.audio_d + dest + self.audio_f, wav, self.fs)
        print(f'Worker {self.id} finished : {text}')
def test_download_and_clean_cache():
    d = ModelDownloader()
    d.download_and_unpack("test")
    p = d.download("test")
    d.clean_cache("test")
    assert not Path(p).exists()
def test_download_and_unpack_local_file():
    d = ModelDownloader()
    path = d.download("test")
    d.download_and_unpack(path)
def test_download_and_unpack_non_matching():
    d = ModelDownloader()
    with pytest.raises(RuntimeError):
        d.download_and_unpack(task="dummy")
def test_download_and_unpack_no_inputting():
    d = ModelDownloader()
    with pytest.raises(TypeError):
        d.download_and_unpack()
def test_download_and_unpack_with_name():
    d = ModelDownloader()
    d.download_and_unpack("test")
def test_download_and_unpack_with_url():
    d = ModelDownloader()
    d.download_and_unpack("https://zenodo.org/record/3951842/files/test.zip?download=1")
Example #20
fs, lang = 22050, "English"
tag = "kan-bayashi/ljspeech_conformer_fastspeech2"
vocoder_tag = "ljspeech_full_band_melgan.v2"

import time
import os
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

# Socket setup is not shown in the original snippet; a minimal assumed version:
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind(("0.0.0.0", 5000))  # hypothetical port
s.listen(1)

while True:
    conn, addr = s.accept()
    data = str(conn.recv(1024), 'utf-8')
    conn.close()
    # synthesis (body completed following the pattern in Example #13)
    with torch.no_grad():
        wav, c, *_ = text2speech(data)
        wav = vocoder.inference(c)
    sf.write("out.wav", wav.view(-1).cpu().numpy(), fs)  # hypothetical output path

import json
import math
import pickle
import warnings
from pathlib import Path

import Levenshtein
import hydra.utils as utils
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

from espnet2.bin.asr_inference import Speech2Text
from espnet_model_zoo.downloader import ModelDownloader
from utils import normalize

sns.set()

warnings.simplefilter('ignore')

d = ModelDownloader()
speech2text_en = Speech2Text(
    **d.download_and_unpack(task="asr", corpus="librispeech")
)

def log_spec_dB_dist(x, y):
    log_spec_dB_const = 10.0 / math.log(10.0) * math.sqrt(2.0)
    diff = x - y
    return log_spec_dB_const * math.sqrt(np.inner(diff, diff))

def average_mcd(mc_ref, mc_cv):
    _, path = fastdtw(mc_cv, mc_ref, dist=euclidean)
    twf = np.array(path).T
    cvt_mcc_dtw = mc_cv[twf[0]]
    trg_mcc_dtw = mc_ref[twf[1]]
    # MCD
    diff2sum = np.sum((cvt_mcc_dtw - trg_mcc_dtw)**2, 1)
    mcd_value = np.mean(10.0 / np.log(10.0) * np.sqrt(2 * diff2sum), 0)
    return mcd_value
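A toy invocation of the MCD helpers above, assuming mel-cepstrum sequences shaped [frames, dims]:

import numpy as np

mc_ref = np.random.randn(100, 25)  # reference mel-cepstrum sequence
mc_cv = np.random.randn(110, 25)   # converted mel-cepstrum sequence
print("MCD [dB]:", average_mcd(mc_ref, mc_cv))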