Code Example #1
    def __init__(self, device, melgan_config_path, melgan_stats_path, *args, **kwargs):
        super(MelganConverter, self).__init__(device, *args, **kwargs)
        print("Initializing MelganConverter")

        try:
            with open(melgan_config_path) as f:
                self.melgan_config = yaml.load(f, Loader=yaml.Loader)
        except (OSError, yaml.YAMLError):
            log.error(f"Unable to load yaml config at path: {melgan_config_path}; desired settings (sampling rate etc.) are unknown, exiting...")
            exit(1)
        download_pretrained_model("vctk_multi_band_melgan.v2", Config.dir_paths["melgan_download_location"]) #download model
        self.melgan_model = None
        self.melgan_stats_path = melgan_stats_path
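
The constructor above only downloads the checkpoint and defers loading (self.melgan_model starts as None). Below is a minimal lazy-loading sketch, not the project's actual code: the helper name load_melgan is hypothetical, the checkpoint filename mirrors Code Example #11, and Config is the surrounding project's config object used in the constructor above.

from parallel_wavegan.utils import load_model

def load_melgan(converter, device="cpu"):
    # Load the downloaded multi-band MelGAN checkpoint on first use only.
    if converter.melgan_model is None:
        ckpt = (Config.dir_paths["melgan_download_location"]
                + "/vctk_multi_band_melgan.v2/checkpoint-1000000steps.pkl")
        converter.melgan_model = load_model(ckpt).to(device).eval()
        converter.melgan_model.remove_weight_norm()
    return converter.melgan_model
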
Code Example #2
    def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
        self.id = worker_id
        #Model selection
        self.fs = 22050
        self.lang = "English"
        self.tag = "kan-bayashi/ljspeech_tacotron2"
        self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

        #Model setup
        self.d = ModelDownloader()
        self.text2speech = Text2Speech(
            **self.d.download_and_unpack(self.tag),
            device="cpu",
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
        )
        self.vocoder = load_model(download_pretrained_model(
            self.vocoder_tag)).to("cpu").eval()

        self.text2speech.spc2wav = None  # Disable griffin-lim
        self.vocoder.remove_weight_norm()

        self.audio_d = audio_dest
        self.audio_f = audio_format
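
For completeness, a hedged sketch of how this worker's model pair could synthesize and save audio; the function name speak is hypothetical, while the tuple-style Text2Speech output and the vocoder.inference call mirror Code Examples #6 and #10.

import torch
import soundfile as sf

def speak(worker, text, name="utt1"):
    # Tacotron 2 yields mel features c; the vocoder renders them to audio.
    with torch.no_grad():
        wav, c, *_ = worker.text2speech(text)
        wav = worker.vocoder.inference(c)
    path = worker.audio_d + name + worker.audio_f
    sf.write(path, wav.view(-1).cpu().numpy(), worker.fs)
    return path
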
Code Example #3
    def __init__(self,
                 model_name: str = 'multiband_mel_gan_vctk',
                 device='cpu'):
        super().__init__()
        assert model_name in PARAMS['models'], \
            'Model name {} is not valid! Choose from {}'.format(
                model_name, str(PARAMS['models'].keys()))

        model_name_mapping = PARAMS['models'][model_name]

        self.device = device
        self.encoder = MelSpectrogram(**PARAMS['audio'][model_name])
        self.vocoder = load_model(
            download_pretrained_model(model_name_mapping)).to(device).eval()
        self.vocoder.remove_weight_norm()

        # make stat tensors
        param_key = 'vctk' if 'vctk' in model_name else 'lj'
        stats = MULTI_BAND_MEL_GAN_PARAMS[param_key]
        self.mean = torch.FloatTensor(
            stats['mean']).unsqueeze(0).unsqueeze(-1).to(device)
        self.scale = torch.FloatTensor(
            stats['scale']).unsqueeze(0).unsqueeze(-1).to(device)

        # print params
        print('Total model params: {}.'.format(self.num_params(self.vocoder)))
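
A hedged sketch of how the stats built above are typically applied: normalize the encoder's mels with mean/scale, then vocode. It assumes the encoder yields log-mels in the (batch, n_mels, frames) layout implied by the unsqueeze pattern, and that inference() accepts (frames, n_mels) features as parallel_wavegan generators do.

import torch

def resynthesize(wrapper, audio):
    # Encode to mels, normalize with the training stats, then vocode.
    with torch.no_grad():
        mel = wrapper.encoder(audio)                  # (B, n_mels, T), assumed
        mel = (mel - wrapper.mean) / wrapper.scale    # apply training stats
        # parallel_wavegan generators accept (T, n_mels) in inference()
        return wrapper.vocoder.inference(mel.squeeze(0).transpose(0, 1))
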
Code Example #4
    def build_vocoder_from_file(
        cls,
        vocoder_config_file: Union[Path, str] = None,
        vocoder_file: Union[Path, str] = None,
        model: Optional[ESPnetTTSModel] = None,
        device: str = "cpu",
    ):
        # Build vocoder
        if vocoder_file is None:
            # If vocoder file is not provided, use griffin-lim as a vocoder
            vocoder_conf = {}
            if vocoder_config_file is not None:
                vocoder_config_file = Path(vocoder_config_file)
                with vocoder_config_file.open("r", encoding="utf-8") as f:
                    vocoder_conf = yaml.safe_load(f)
            if model.feats_extract is not None:
                vocoder_conf.update(model.feats_extract.get_parameters())
            if ("n_fft" in vocoder_conf and "n_shift" in vocoder_conf
                    and "fs" in vocoder_conf):
                return Spectrogram2Waveform(**vocoder_conf)
            else:
                logging.warning(
                    "Vocoder is not available. Skipped its building.")
                return None

        elif not Path(vocoder_file).exists():
            # Assume that the vocoder file is the tag of a pretrained model
            try:
                from parallel_wavegan.utils import download_pretrained_model

            except ImportError:
                logging.error(
                    "`parallel_wavegan` is not installed. "
                    "Please install via `pip install -U parallel_wavegan`.")
                raise

            from parallel_wavegan import __version__

            # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
            assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
                "Please install the latest parallel_wavegan "
                "via `pip install -U parallel_wavegan`.")

            logging.info(
                f"{vocoder_file} does not exist. "
                f"We assume that {vocoder_file} is the tag of a pretrained model."
            )
            vocoder = ParallelWaveGANPretrainedVocoder(
                download_pretrained_model(vocoder_file))
            return vocoder.to(device)

        elif str(vocoder_file).endswith(".pkl"):
            # If the extension is ".pkl", assume the model was trained with parallel_wavegan
            vocoder = ParallelWaveGANPretrainedVocoder(vocoder_file,
                                                       vocoder_config_file)
            return vocoder.to(device)

        else:
            raise ValueError(f"{vocoder_file} is not a supported format.")
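
To make the second branch concrete, a minimal standalone sketch: resolve a parallel_wavegan tag to a local checkpoint and wrap it. The tag is illustrative, and the import path of ParallelWaveGANPretrainedVocoder (espnet2.tts.utils) is an assumption, since the snippet does not show its import.

from parallel_wavegan.utils import download_pretrained_model
from espnet2.tts.utils import ParallelWaveGANPretrainedVocoder

# download_pretrained_model returns the path of the downloaded checkpoint
ckpt = download_pretrained_model("ljspeech_parallel_wavegan.v1")
vocoder = ParallelWaveGANPretrainedVocoder(ckpt).to("cpu")
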
Code Example #5
    def from_pretrained(
        model_tag: Optional[str] = None,
        vocoder_tag: Optional[str] = None,
        **kwargs: Optional[Any],
    ):
        """Build Text2Speech instance from the pretrained model.

        Args:
            model_tag (Optional[str]): Model tag of the pretrained models.
                Currently, the tags of espnet_model_zoo are supported.
            vocoder_tag (Optional[str]): Vocoder tag of the pretrained vocoders.
                Currently, the tags of parallel_wavegan are supported, which should
                start with the prefix "parallel_wavegan/".

        Returns:
            Text2Speech: Text2Speech instance.

        """
        if model_tag is not None:
            try:
                from espnet_model_zoo.downloader import ModelDownloader

            except ImportError:
                logging.error(
                    "`espnet_model_zoo` is not installed. "
                    "Please install via `pip install -U espnet_model_zoo`.")
                raise
            d = ModelDownloader()
            kwargs.update(**d.download_and_unpack(model_tag))

        if vocoder_tag is not None:
            if vocoder_tag.startswith("parallel_wavegan/"):
                try:
                    from parallel_wavegan.utils import download_pretrained_model

                except ImportError:
                    logging.error(
                        "`parallel_wavegan` is not installed. "
                        "Please install via `pip install -U parallel_wavegan`."
                    )
                    raise

                from parallel_wavegan import __version__

                # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
                assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
                    "Please install the latest parallel_wavegan "
                    "via `pip install -U parallel_wavegan`.")
                vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "")
                vocoder_file = download_pretrained_model(vocoder_tag)
                vocoder_config = Path(vocoder_file).parent / "config.yml"
                kwargs.update(vocoder_config=vocoder_config,
                              vocoder_file=vocoder_file)

            else:
                raise ValueError(f"{vocoder_tag} is an unsupported format.")

        return Text2Speech(**kwargs)
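
A minimal usage sketch of from_pretrained as documented above. Both tags are illustrative; note the required "parallel_wavegan/" prefix on the vocoder tag, and that recent ESPnet versions return a dict from the Text2Speech call.

import soundfile as sf
from espnet2.bin.tts_inference import Text2Speech

tts = Text2Speech.from_pretrained(
    model_tag="kan-bayashi/ljspeech_tacotron2",
    vocoder_tag="parallel_wavegan/ljspeech_parallel_wavegan.v1",
)
out = tts("Hello world")  # dict with a "wav" entry in recent versions
sf.write("out.wav", out["wav"].numpy(), tts.fs)
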
Code Example #6
    def setup_model(self):
        try:
            self.model_reload_needed = False
            self.output_status("Loading nltk...")

            # setup nltk
            import nltk
            nltk.data.path.append(MODEL_DIR + '/nltk_models')
            try:
                nltk.data.find('tokenizers/punkt')
            except LookupError:
                nltk.download('punkt', download_dir=MODEL_DIR + "/nltk_models")

            self.output_status("Loading torch...", end=" ")

            # setup model
            import torch
            from espnet_model_zoo.downloader import ModelDownloader
            from espnet2.bin.tts_inference import Text2Speech
            from parallel_wavegan.utils import download_pretrained_model
            from parallel_wavegan.utils import load_model

            self.mlDevice = "cuda" if torch.cuda.is_available() else "cpu"
            self.output_status("Running on " + self.mlDevice)

            self.output_status("Loading espnet...")

            d = ModelDownloader(MODEL_DIR + "/espnet_models")
            self.text2speech = Text2Speech(
                **d.download_and_unpack(self.tag),
                device=self.mlDevice,
                # Only for Tacotron 2
                threshold=0.5,
                minlenratio=0.0,
                maxlenratio=10.0,
                use_att_constraint=False,
                backward_window=1,
                forward_window=3,
                # Only for FastSpeech & FastSpeech2
                speed_control_alpha=1.0,
            )
            self.text2speech.spc2wav = None  # Disable griffin-lim
            # NOTE: Sometimes the download fails due to "Permission denied". That is
            #   a limitation of Google Drive. Please retry after several hours.

            self.output_status("Loading vocoder models...")

            self.vocoder = load_model(
                download_pretrained_model(self.vocoder_tag,
                                          download_dir=MODEL_DIR +
                                          "/vocoder_models")).to(
                                              self.mlDevice).eval()
            self.vocoder.remove_weight_norm()
            self.output_status("Model setup completed.")
        except Exception as e:
            self.output_err("Model error", e)
            raise HandledException()
Code Example #7
File: test_hifigan.py Project: yuekaizhang/espnet
def test_parallel_wavegan_compatibility():
    from parallel_wavegan.utils import download_pretrained_model
    from parallel_wavegan.utils import load_model

    ckpt_path = download_pretrained_model("ljspeech_hifigan.v1")
    state_dict = torch.load(ckpt_path,
                            map_location="cpu")["model"]["generator"]
    model_pwg = load_model(ckpt_path)
    model_espnet2 = HiFiGANGenerator()
    model_espnet2.load_state_dict(state_dict)
    model_pwg.eval()
    model_espnet2.eval()

    with torch.no_grad():
        c = torch.randn(5, 80)
        out_pwg = model_pwg.inference(c)
        out_espnet2 = model_espnet2.inference(c)
        np.testing.assert_array_equal(
            out_pwg.cpu().numpy(),
            out_espnet2.cpu().numpy(),
        )
Code Example #8
    """
    # get amplitude spectrogram
    x_stft = librosa.stft(audio, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window, pad_mode="reflect")
    spc = np.abs(x_stft).T  # (#frames, #bins)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin, fmax)

    return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    

download_pretrained_model("vctk_multi_band_melgan.v2", "melgan") #download model
vocoder_conf = "melgan/vctk_multi_band_melgan.v2/config.yml"
with open(vocoder_conf) as f:
    config = yaml.load(f, Loader=yaml.Loader)


#================================================Loading/preprocessing=========================================================
# audio, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav"))
audio, sr = sf.read(utility.get_full_path(".\\input\\Wouter\\6.wav"))
# trim silence
if config["trim_silence"]:
    audio, _ = librosa.effects.trim(audio,
                                    top_db=config["trim_threshold_in_db"],
                                    frame_length=config["trim_frame_size"],
                                    hop_length=config["trim_hop_size"])
Code Example #9
File: inference_gst.py Project: NolanYuu/espnet
    from espnet_model_zoo.downloader import ModelDownloader
    import soundfile as sf
    import librosa
    import numpy as np
    import os
    import kaldiio

    d = ModelDownloader()
    # tag = 'kan-bayashi/libritts_gst+xvector_transformer'
    text2speech = Text2Speech(
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/config.yaml",
        "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/train.loss.best.pth",
        device="cuda")
    # text2speech.spc2wav = None
    vocoder = load_model(
        download_pretrained_model("libritts_parallel_wavegan.v1.long")).to(
            "cuda").eval()

    vocoder.remove_weight_norm()
    spembs = None
    if text2speech.use_speech:
        speech, fs = sf.read("/nolan/VCTK-Corpus/wav48/p226/p226_001.wav")
        # speech, _ = librosa.load("/nolan/VCTK-Corpus/wav48/p225/p225_001.wav", text2speech.fs)
        speech = torch.from_numpy(speech).float().cuda()
        # speech = torch.randn(50000,)

    texts = [
        "Mostly I would recommend giving a quick look to the figures beyond the introduction.",
    ]
    for i, text in enumerate(texts):
        with torch.no_grad():
Code Example #10
File: MyTTsService.py Project: MikeyBeez/espnet
import os
import time  # used below to measure the real-time factor (RTF)
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

while True:
    conn, addr = s.accept()
    data = conn.recv(1024)
    encoding = 'utf-8'
    data = str(data, encoding)
    conn.close()
    # synthesis
    with torch.no_grad():
        start = time.time()
        wav, c, *_ = text2speech(data)
        wav = vocoder.inference(c)
    rtf = (time.time() - start) / (len(wav) / fs)
    print(f"RTF = {rtf:5f}")
Code Example #11
                          pad_mode="reflect")
    spc = np.abs(x_stft).T  # (#frames, #bins)

    # get mel basis
    fmin = 0 if fmin is None else fmin
    fmax = sampling_rate / 2 if fmax is None else fmax
    mel_basis = librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin,
                                    fmax)

    return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T)))

    # converter.output_to_wav([[mel]])


print(f"Now loading in pretrained melGAN model")
download_pretrained_model("vctk_multi_band_melgan.v2", "./vocoders/melgan")
model = load_model(
    "melgan/vctk_multi_band_melgan.v2/checkpoint-1000000steps.pkl")
model.remove_weight_norm()
model = model.eval().to(device)

vocoder_conf = "melgan/vctk_multi_band_melgan.v2/config.yml"
with open(vocoder_conf) as f:
    config = yaml.load(f, Loader=yaml.Loader)

#================================================Loading/preprocessing=========================================================
# audio, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav"))
audio, sr = sf.read(utility.get_full_path(".\\input\\Wouter\\6.wav"))
# trim silence
if config["trim_silence"]:
    audio, _ = librosa.effects.trim(audio,
Code Example #12
    device=mlDevice,
    # Only for Tacotron 2
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
# NOTE: Sometimes the download fails due to "Permission denied". That is
#   a limitation of Google Drive. Please retry after several hours.
vocoder = load_model(
    download_pretrained_model(
        vocoder_tag, download_dir='./vocoder_models')).to(mlDevice).eval()
vocoder.remove_weight_norm()

import scipy.io.wavfile as wv
import os

if os.path.isfile(out_name + ".wav"): os.remove(out_name + ".wav")

from concurrent.futures import ThreadPoolExecutor
executor = ThreadPoolExecutor(max_workers=5)


def save_wav(wav, count=-1):
    # print("Outputting wav file...")
    out_arr = wav.view(-1).cpu().numpy()
    fname = out_name + ".wav"