Beispiel #1
0
    def __init__(self, torch_device=None):
        """Load the acoustic model and the vocoder onto a single device.

        Args:
            torch_device: Device string passed to ``Text2Speech`` and to the
                vocoder's ``.to()``. When ``None``, ``'cuda'`` is chosen if
                ``torch.cuda.is_available()`` is true, otherwise ``'cpu'``.
        """
        if torch_device is None:
            torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Locations of the acoustic-model config and weights.
        self.tacotron_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
        self.tacotron_file_checkpoints = path.join(TTS_WORK_DIR, TTS_MODEL_WEIGHTS)

        # NOTE(review): this path is built from the TTS directory and the TTS
        # config name rather than vocoder ones, and it is not read anywhere in
        # this method -- confirm whether that is intended.
        self.vocoder_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
        self.vocoder_file_checkpoints = path.join(VOCODER_WORK_DIR, VOCODER_MODEL_WEIGHTS)

        # Tacotron2 loading
        decoding_options = dict(
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.tacotron_instance = Text2Speech(
            self.tacotron_file_config,
            self.tacotron_file_checkpoints,
            device=torch_device,
            **decoding_options,
        )
        # Clear the built-in spectrogram-to-waveform step; the separately
        # loaded vocoder below is used for waveform generation instead.
        self.tacotron_instance.spc2wav = None

        # Vocoder loading: move to the device, then switch to eval mode.
        vocoder = load_model(self.vocoder_file_checkpoints)
        vocoder = vocoder.to(torch_device)
        self.vocoder = vocoder.eval()
        self.vocoder.remove_weight_norm()
Beispiel #2
0
    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        """Set up a CPU text-to-speech worker with pretrained models.

        Args:
            worker_id: Identifier stored on the instance as ``self.id``.
            audio_dest: Stored as ``self.audio_d``.
            audio_format: Stored as ``self.audio_f``.
        """
        self.id = worker_id

        # Model selection
        self.fs = 22050
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        # Model setup
        self.d = ModelDownloader()
        acoustic_model_files = self.d.download_and_unpack(self.tag)
        # Only for Tacotron 2
        tacotron2_options = dict(
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.text2speech = Text2Speech(
            **acoustic_model_files,
            device="cpu",
            **tacotron2_options,
        )

        vocoder_checkpoint = download_pretrained_model(self.vocoder_tag)
        self.vocoder = load_model(vocoder_checkpoint).to("cpu").eval()

        # Waveforms come from the separate vocoder, so the model's own
        # spectrogram-to-waveform step is cleared.
        self.text2speech.spc2wav = None
        self.vocoder.remove_weight_norm()

        # Output settings, stored as given.
        self.audio_d = audio_dest
        self.audio_f = audio_format
def get_text2speech(
    model_name='kan-bayashi/jsut_tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_train.loss.best',
    device='cuda',
):
    """Download a pretrained model and return a ``Text2Speech`` built from it.

    Args:
        model_name: Model tag handed to ``ModelDownloader.download_and_unpack``.
        device: Device string forwarded to ``Text2Speech``. Defaults to
            ``'cuda'`` (the previously hard-coded value); pass ``'cpu'`` to
            run on a host without a GPU.

    Returns:
        The constructed ``Text2Speech`` instance.
    """
    downloader = ModelDownloader()
    model_files = downloader.download_and_unpack(model_name)
    return Text2Speech(**model_files, device=device)
    def __init__(self, model_id: str):
        """Load a pretrained ``Text2Speech`` model on CPU and record its sampling rate.

        Args:
            model_id: Identifier passed to ``Text2Speech.from_pretrained``.
        """
        self.model = Text2Speech.from_pretrained(model_id, device="cpu")

        # Use the model's ``fs`` when it has one; 16000 by default otherwise.
        self.sampling_rate = getattr(self.model, "fs", 16000)
Beispiel #5
0
def _tts(model_name):
    """Download ``model_name`` and run one synthesis call on the text "foo".

    When the model reports ``use_speech``, a zero-filled float32 array of
    length 10000 is passed along as the ``speech`` argument.
    """
    downloader = ModelDownloader()
    tts = Text2Speech(**downloader.download_and_unpack(model_name))
    dummy_speech = np.zeros((10000, ), dtype=np.float32)
    extra_inputs = {"speech": dummy_speech} if tts.use_speech else {}
    tts("foo", **extra_inputs)
def _tts(model_name):
    """Download ``model_name`` into "downloads" and run one synthesis call.

    The call always receives ``text="foo"``. Zero-filled float32 arrays are
    added for ``speech`` (when the model reports ``use_speech``) and for
    ``spembs`` (when the model's ``tts.spk_embed_dim`` is not ``None``).
    """
    downloader = ModelDownloader("downloads")
    model_files = downloader.download_and_unpack(model_name, quiet=True)
    tts = Text2Speech(**model_files)

    call_kwargs = dict(text="foo")
    if tts.use_speech:
        call_kwargs.update(speech=np.zeros((10000,), dtype=np.float32))
    if tts.tts.spk_embed_dim is not None:
        embedding_shape = (tts.tts.spk_embed_dim,)
        call_kwargs.update(spembs=np.zeros(embedding_shape, dtype=np.float32))
    tts(**call_kwargs)
    def setup_model(self):
        """Load the NLTK tokenizer data, the ESPnet TTS model and the vocoder.

        Progress messages go through ``self.output_status``. The device
        ("cuda" when available, otherwise "cpu") is stored in
        ``self.mlDevice``; the models end up in ``self.text2speech`` and
        ``self.vocoder``.

        Raises:
            HandledException: If any step fails. The original exception is
                first reported through ``self.output_err``.
        """
        try:
            self.model_reload_needed = False
            self.output_status("Loading nltk...")

            # setup nltk (imported inside the method, not at module level)
            import nltk

            nltk_dir = MODEL_DIR + "/nltk_models"
            nltk.data.path.append(nltk_dir)
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                # Tokenizer data is not on the search path yet: fetch it.
                nltk.download('punkt', download_dir=nltk_dir)

            self.output_status("Loading torch...", end=" ")

            # setup model
            import torch
            from espnet_model_zoo.downloader import ModelDownloader
            from espnet2.bin.tts_inference import Text2Speech
            from parallel_wavegan.utils import download_pretrained_model, load_model

            if torch.cuda.is_available():
                self.mlDevice = "cuda"
            else:
                self.mlDevice = "cpu"
            self.output_status(f"Running on {self.mlDevice}")

            self.output_status("Loading espnet...")

            downloader = ModelDownloader(MODEL_DIR + "/espnet_models")
            inference_options = dict(
                # Only for Tacotron 2
                threshold=0.5,
                minlenratio=0.0,
                maxlenratio=10.0,
                use_att_constraint=False,
                backward_window=1,
                forward_window=3,
                # Only for FastSpeech & FastSpeech2
                speed_control_alpha=1.0,
            )
            self.text2speech = Text2Speech(
                **downloader.download_and_unpack(self.tag),
                device=self.mlDevice,
                **inference_options,
            )
            self.text2speech.spc2wav = None  # Disable griffin-lim
            # NOTE: The download sometimes fails with "Permission denied".
            #   That is a limitation of Google Drive; retry after several hours.

            self.output_status("Loading vocoder models...")

            vocoder_checkpoint = download_pretrained_model(
                self.vocoder_tag,
                download_dir=MODEL_DIR + "/vocoder_models",
            )
            vocoder = load_model(vocoder_checkpoint)
            self.vocoder = vocoder.to(self.mlDevice).eval()
            self.vocoder.remove_weight_norm()
            self.output_status("Model setup completed.")
        except Exception as e:
            # Report the failure, then signal callers with the project's
            # "already reported" exception type.
            self.output_err("Model error", e)
            raise HandledException()
 def get_acoustic_model(self):
     """Build the ``Text2Speech`` acoustic model from the configured paths.

     Reads ``self.acoustic_model_config_path``, ``self.acoustic_model_path``
     and ``self.device``.

     Returns:
         The ``Text2Speech`` instance with its ``spc2wav`` attribute set to
         ``None``.
     """
     decoding_options = dict(
         threshold=0.5,
         minlenratio=0.0,
         maxlenratio=10.0,
         use_att_constraint=False,
         backward_window=1,
         forward_window=3,
     )
     model = Text2Speech(
         self.acoustic_model_config_path,
         self.acoustic_model_path,
         device=self.device,
         **decoding_options,
     )
     model.spc2wav = None
     return model
Beispiel #9
0
# Pretrained model identifiers: `tag` is resolved through ModelDownloader,
# `vocoder_tag` through download_pretrained_model (both used below).
tag = "kan-bayashi/ljspeech_conformer_fastspeech2"
vocoder_tag = "ljspeech_full_band_melgan.v2"

import time
import os
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

# Build the text-to-speech model on the GPU from the downloaded model files.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# Load the vocoder on the GPU in eval mode and strip its weight normalization.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

# Request loop: accept a connection, read up to 1024 bytes, decode them as
# UTF-8 text, close the connection, then start synthesis.
# NOTE(review): `s` is not defined in the visible code -- presumably a
# listening socket created elsewhere; confirm. Only the first recv() of each
# connection is read, so longer messages would be cut at 1024 bytes.
while True:
    conn, addr = s.accept()
    data = conn.recv(1024)
    encoding = 'utf-8'
    data = str(data, encoding)
    conn.close()
    # synthesis
    with torch.no_grad():
        # Timestamp taken at the start of synthesis.
        start = time.time()
def test_Text2Speech(config_file):
    """Smoke test: build ``Text2Speech`` from a train config and synthesize "aiueo"."""
    tts = Text2Speech(train_config=config_file)
    tts("aiueo")
Beispiel #11
0
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

# Sampling rate / language labels and the pretrained model identifiers used
# below (`tag` for the acoustic model, `vocoder_tag` for the vocoder).
fs, lang = 22050, "English"
tag = "kan-bayashi/ljspeech_tacotron2"
vocoder_tag = "ljspeech_multi_band_melgan.v2"

# Build the text-to-speech model on the CPU from the downloaded model files.
# NOTE(review): ModelDownloader is not imported in the visible code -- confirm
# it is imported elsewhere.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cpu",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: The download sometimes fails with "Permission denied". That is a
#   limitation of Google Drive; please retry after several hours.
# Load the vocoder on the CPU in eval mode and strip its weight normalization.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cpu").eval()
vocoder.remove_weight_norm()


def pronounce(x):
    with torch.no_grad():
Beispiel #12
0
## Specify the path to the vocoder's checkpoint.
vocoder_checkpoint = "exp/vocoder/checkpoint-400000steps.pkl"
# Load the vocoder on the GPU in eval mode and strip its weight normalization.
vocoder = load_model(vocoder_checkpoint).to("cuda").eval()
vocoder.remove_weight_norm()

## Specify the path to the main model (transformer/tacotron2/fastspeech) and its config file.
config_file = "exp/tts_train_raw_char/config.yaml"
model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth"

# Build the text-to-speech model on the GPU from the local config and weights.
text2speech = Text2Speech(
    config_file,
    model_path,
    device="cuda",
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=True,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim

# NOTE(review): get_args is not defined in the visible code -- presumably a
# command-line parser exposing a `text` attribute; confirm.
args = get_args()
sample_text = args.text
with torch.no_grad():
    # The input text is lower-cased first; the second element of the returned
    # sequence is passed to the vocoder (presumably mel features -- confirm).
    _, c_mel, *_ = text2speech(sample_text.lower())
    wav = vocoder.inference(c_mel)
## here all of your synthesized audios will be saved
Beispiel #13
0
    vocoder_tag = "ljspeech_multi_band_melgan.v2"  #@param ["ljspeech_parallel_wavegan.v1", "ljspeech_full_band_melgan.v2", "ljspeech_multi_band_melgan.v2"] {type:"string"}
    vocoder = load_model(
        download_pretrained_model(vocoder_tag)).to("cuda").eval()
    vocoder.remove_weight_norm()

    models = [
        # "transformer",
        # "tacotron2",
        # "fastspeech",
        "fastspeech2",
        # "conformer_fastspeech2",
    ]
    for model in models:
        text2speech = Text2Speech(
            "./egs2/ljspeech/tts1/exp/tts_train_{}_raw_phn_tacotron_g2p_en_no_space/config.yaml"
            .format(model),
            "./egs2/ljspeech/tts1/exp/tts_train_{}_raw_phn_tacotron_g2p_en_no_space/valid.loss.best.pth"
            .format(model),
            device="cuda")
        fs = text2speech.fs
        text2speech.spc2wav = None
        texts = [
            # "they state that they are compelled by an imperative sense of duty to advert in terms of decided condemnation to the lamentable condition of the prisons of the city of London,",
            # "they state that they are compelled by an imperative sense of duty to advert in terms of decided condemnation to the lamentable condition of the prisons of the city of London, The prison officials appear to be on the side of the inspectors, to the great dissatisfaction of the corporation, who claimed the full allegiance and support of its servants. "
            # "The Court in addition to the proper use of its judicial functions has improperly set itself up as a third house of the Congress. If, for instance, any one of the six justices of the Supreme Court now over the age of seventy should retire as provided under the plan, "
            # "If such a plan is good for the lower courts it certainly ought to be equally good for the highest court from which there is no appeal. Is it a dangerous precedent for the Congress to change the number of the justices? The Congress has always had, and will have, that power.",
            # "The prison officials appear to be on the side of the inspectors, to the great dissatisfaction of the corporation, who claimed the full allegiance and support of its servants.",
            # "in some yards",
            # "We have, therefore,",
            "The feature matching loss is a learned similarity metric measured by the difference in features of the discriminator between a ground truth sample and a generated sample",
        ]
        with torch.no_grad():