def __init__(self, device, melgan_config_path, melgan_stats_path, *args, **kwargs):
    """Load the MelGAN YAML config and fetch the pretrained MelGAN model.

    Args:
        device: Forwarded unchanged to the parent initializer.
        melgan_config_path: Path of the YAML file holding the MelGAN
            settings; the parsed content is stored in ``self.melgan_config``.
        melgan_stats_path: Stored as-is in ``self.melgan_stats_path``.
        *args: Extra positional arguments for the parent initializer.
        **kwargs: Extra keyword arguments for the parent initializer.

    Raises:
        SystemExit: With exit status 1 when the config file cannot be
            opened or parsed.
    """
    super(MelganConverter, self).__init__(device, *args, **kwargs)
    print("initializing melganconverter")

    try:
        with open(melgan_config_path) as f:
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects. Kept because the config may rely on it, but prefer
            # yaml.safe_load if the file only holds plain data - confirm.
            self.melgan_config = yaml.load(f, Loader=yaml.Loader)
    except Exception:
        # `Exception` (not a bare `except:`) so that Ctrl-C and SystemExit
        # are not mistaken for a config problem. It stays broad because the
        # full loader may raise more than OSError/YAMLError while building
        # objects.
        log.error(f"Unable to load in yaml config at path: {melgan_config_path}, unknown desired settings (sampling rate etc.) exiting...")
        # Non-zero status: this is a failure. Raising SystemExit directly
        # also avoids depending on the `exit` helper injected by `site`.
        raise SystemExit(1)

    download_pretrained_model("vctk_multi_band_melgan.v2", Config.dir_paths["melgan_download_location"])  # download model

    # The model itself is not loaded here; only the placeholder is created.
    self.melgan_model = None
    self.melgan_stats_path = melgan_stats_path
def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
    """Create a worker holding a text-to-speech model and a vocoder on CPU.

    Args:
        worker_id: Identifier stored in ``self.id``.
        audio_dest: Stored in ``self.audio_d`` (presumably the output
            directory for audio files - confirm against the callers).
        audio_format: Stored in ``self.audio_f`` (presumably the output
            file extension - confirm against the callers).
    """
    self.id = worker_id

    # Model selection
    self.fs = 22050
    self.lang = "English"
    self.tag = "kan-bayashi/ljspeech_tacotron2"
    self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

    # Model setup: fetch the files for `self.tag`, then build the
    # text-to-speech front end from them.
    self.d = ModelDownloader()
    unpacked_model = self.d.download_and_unpack(self.tag)
    self.text2speech = Text2Speech(
        **unpacked_model,
        device="cpu",
        # Only for Tacotron 2
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )

    # Vocoder: download the checkpoint for `self.vocoder_tag`, load it,
    # move it to CPU and switch it to eval mode.
    vocoder_checkpoint = download_pretrained_model(self.vocoder_tag)
    loaded_vocoder = load_model(vocoder_checkpoint)
    self.vocoder = loaded_vocoder.to("cpu").eval()

    # Clear the front end's own spectrogram-to-waveform converter.
    self.text2speech.spc2wav = None
    self.vocoder.remove_weight_norm()

    self.audio_d = audio_dest
    self.audio_f = audio_format
def __init__(self, model_name: str = 'multiband_mel_gan_vctk', device='cpu'):
    """Build the mel-spectrogram encoder and the pretrained vocoder.

    Args:
        model_name: Key into ``PARAMS['models']`` and ``PARAMS['audio']``
            selecting the pretrained vocoder and its audio settings.
        device: Device the vocoder and the statistics tensors are moved to.

    Raises:
        AssertionError: If ``model_name`` is not a key of
            ``PARAMS['models']``.
    """
    super().__init__()

    known_models = PARAMS['models']
    assert model_name in known_models, (
        'Model name {} is not valid! choose in {}'.format(
            model_name, str(known_models.keys())))
    pretrained_tag = known_models[model_name]

    self.device = device
    self.encoder = MelSpectrogram(**PARAMS['audio'][model_name])

    checkpoint_path = download_pretrained_model(pretrained_tag)
    self.vocoder = load_model(checkpoint_path).to(device).eval()
    self.vocoder.remove_weight_norm()

    # make stat tensors
    if 'vctk' in model_name:
        stats_key = 'vctk'
    else:
        stats_key = 'lj'
    stats = MULTI_BAND_MEL_GAN_PARAMS[stats_key]

    def _stat_tensor(values):
        # Add a leading and a trailing singleton dimension, then move the
        # tensor to the target device.
        tensor = torch.FloatTensor(values)
        return tensor.unsqueeze(0).unsqueeze(-1).to(device)

    self.mean = _stat_tensor(stats['mean'])
    self.scale = _stat_tensor(stats['scale'])

    # print params
    param_count = self.num_params(self.vocoder)
    print('Total Model {} params.'.format(param_count))
def build_vocoder_from_file(
    cls,
    vocoder_config_file: Union[Path, str] = None,
    vocoder_file: Union[Path, str] = None,
    model: Optional[ESPnetTTSModel] = None,
    device: str = "cpu",
):
    """Build a vocoder from a checkpoint file, a pretrained tag, or a config.

    Args:
        vocoder_config_file: Optional YAML config. With no ``vocoder_file``
            it supplies the Griffin-Lim settings; with a ``.pkl`` file it is
            passed along to the pretrained vocoder wrapper.
        vocoder_file: ``None`` (Griffin-Lim fallback), a path to an existing
            ``.pkl`` checkpoint, or - when the path does not exist - a
            ``parallel_wavegan`` pretrained model tag.
        model: Optional TTS model. When given and it has a feature
            extractor, that extractor's parameters are merged into the
            Griffin-Lim settings.
        device: Device the pretrained vocoder is moved to.

    Returns:
        A ``Spectrogram2Waveform``, a ``ParallelWaveGANPretrainedVocoder``
        moved to ``device``, or ``None`` when no vocoder file is given and
        the settings lack ``n_fft``, ``n_shift`` or ``fs``.

    Raises:
        ImportError: If a tag is given but ``parallel_wavegan`` is missing.
        AssertionError: If the installed ``parallel_wavegan`` is too old.
        ValueError: If ``vocoder_file`` exists but is not a ``.pkl`` file.
    """
    # Build vocoder
    if vocoder_file is None:
        # If vocoder file is not provided, use griffin-lim as a vocoder
        vocoder_conf = {}
        if vocoder_config_file is not None:
            vocoder_config_file = Path(vocoder_config_file)
            with vocoder_config_file.open("r", encoding="utf-8") as f:
                # An empty YAML document loads as None; fall back to an
                # empty mapping so the lookups below keep working.
                vocoder_conf = yaml.safe_load(f) or {}
        # `model` defaults to None, so guard before touching its attributes.
        if model is not None and model.feats_extract is not None:
            vocoder_conf.update(model.feats_extract.get_parameters())
        if ("n_fft" in vocoder_conf and "n_shift" in vocoder_conf
                and "fs" in vocoder_conf):
            return Spectrogram2Waveform(**vocoder_conf)
        else:
            logging.warning(
                "Vocoder is not available. Skipped its building.")
            return None

    elif not Path(vocoder_file).exists():
        # Assume that vocoder file is the tag of pretrained model
        try:
            from parallel_wavegan.utils import download_pretrained_model
        except ImportError:
            logging.error(
                "`parallel_wavegan` is not installed. "
                "Please install via `pip install -U parallel_wavegan`.")
            raise

        from parallel_wavegan import __version__

        # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
        assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
            "Please install the latest parallel_wavegan "
            "via `pip install -U parallel_wavegan`.")

        logging.info(
            f"{vocoder_file} does not exist. "
            f"We assume that {vocoder_file} is tag of the pretrained model."
        )
        vocoder = ParallelWaveGANPretrainedVocoder(
            download_pretrained_model(vocoder_file))
        return vocoder.to(device)

    elif str(vocoder_file).endswith(".pkl"):
        # If the extension is ".pkl", the model is trained with parallel_wavegan
        vocoder = ParallelWaveGANPretrainedVocoder(vocoder_file,
                                                   vocoder_config_file)
        return vocoder.to(device)

    else:
        raise ValueError(f"{vocoder_file} is not supported format.")
def from_pretrained(
    model_tag: Optional[str] = None,
    vocoder_tag: Optional[str] = None,
    **kwargs: Optional[Any],
):
    """Create a Text2Speech instance from pretrained model/vocoder tags.

    Args:
        model_tag (Optional[str]): Tag of a pretrained model, resolved
            through ``espnet_model_zoo``. The downloaded entries are merged
            into ``kwargs``.
        vocoder_tag (Optional[str]): Tag of a pretrained vocoder. It must
            start with the prefix "parallel_wavegan/"; the remainder is
            resolved through ``parallel_wavegan``.
        **kwargs: Additional keyword arguments passed on to ``Text2Speech``.

    Returns:
        Text2Speech: Text2Speech instance.

    Raises:
        ImportError: If a required downloader package is not installed.
        ValueError: If ``vocoder_tag`` lacks the supported prefix.
    """
    if model_tag is not None:
        try:
            from espnet_model_zoo.downloader import ModelDownloader
        except ImportError:
            logging.error(
                "`espnet_model_zoo` is not installed. "
                "Please install via `pip install -U espnet_model_zoo`.")
            raise
        downloader = ModelDownloader()
        kwargs.update(**downloader.download_and_unpack(model_tag))

    if vocoder_tag is not None:
        # Only tags carrying the parallel_wavegan prefix are understood.
        if not vocoder_tag.startswith("parallel_wavegan/"):
            raise ValueError(f"{vocoder_tag} is unsupported format.")

        try:
            from parallel_wavegan.utils import download_pretrained_model
        except ImportError:
            logging.error(
                "`parallel_wavegan` is not installed. "
                "Please install via `pip install -U parallel_wavegan`."
            )
            raise

        from parallel_wavegan import __version__

        # NOTE(kan-bayashi): Filelock download is supported from 0.5.2
        assert LooseVersion(__version__) > LooseVersion("0.5.1"), (
            "Please install the latest parallel_wavegan "
            "via `pip install -U parallel_wavegan`.")

        vocoder_tag = vocoder_tag.replace("parallel_wavegan/", "")
        vocoder_file = download_pretrained_model(vocoder_tag)
        # The config is expected next to the downloaded checkpoint.
        vocoder_config = Path(vocoder_file).parent / "config.yml"
        kwargs.update(vocoder_config=vocoder_config,
                      vocoder_file=vocoder_file)

    return Text2Speech(**kwargs)
def setup_model(self):
    """Load the NLTK tokenizer data, the text-to-speech model and the vocoder.

    Progress is reported through ``self.output_status``. Heavy packages are
    imported inside the method, after the matching status message.

    Raises:
        HandledException: If any step fails; the original error is first
            passed to ``self.output_err``.
    """
    try:
        self.model_reload_needed = False

        # setup nltk
        self.output_status("Loading nltk...")
        import nltk

        nltk_dir = MODEL_DIR + '/nltk_models'
        nltk.data.path.append(nltk_dir)
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            # Tokenizer data not found locally: fetch it into the model dir.
            nltk.download('punkt', download_dir=nltk_dir)

        # setup model
        self.output_status("Loading torch...", end=" ")
        import torch
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.tts_inference import Text2Speech
        from parallel_wavegan.utils import download_pretrained_model
        from parallel_wavegan.utils import load_model

        if torch.cuda.is_available():
            self.mlDevice = "cuda"
        else:
            self.mlDevice = "cpu"
        self.output_status("Running on " + self.mlDevice)

        self.output_status("Loading espnet...")
        downloader = ModelDownloader(MODEL_DIR + "/espnet_models")
        model_files = downloader.download_and_unpack(self.tag)
        self.text2speech = Text2Speech(
            **model_files,
            device=self.mlDevice,
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Only for FastSpeech & FastSpeech2
            speed_control_alpha=1.0,
        )
        self.text2speech.spc2wav = None  # Disable griffin-lim

        # NOTE: The download sometimes fails with "Permission denied", a
        # Google Drive limitation. Retry after several hours.
        self.output_status("Loading vocoder models...")
        vocoder_checkpoint = download_pretrained_model(
            self.vocoder_tag, download_dir=MODEL_DIR + "/vocoder_models")
        vocoder = load_model(vocoder_checkpoint)
        self.vocoder = vocoder.to(self.mlDevice).eval()
        self.vocoder.remove_weight_norm()

        self.output_status("Model setup completed.")
    except Exception as err:
        self.output_err("Model error", err)
        raise HandledException()
def test_parallel_wavegan_compatibility():
    """Check HiFiGANGenerator against the parallel_wavegan reference model.

    Both models get the generator weights of the pretrained
    "ljspeech_hifigan.v1" checkpoint and the same random input; their
    inference outputs must be exactly equal.
    """
    from parallel_wavegan.utils import download_pretrained_model, load_model

    checkpoint = download_pretrained_model("ljspeech_hifigan.v1")
    loaded = torch.load(checkpoint, map_location="cpu")
    generator_weights = loaded["model"]["generator"]

    reference = load_model(checkpoint)
    candidate = HiFiGANGenerator()
    candidate.load_state_dict(generator_weights)

    reference.eval()
    candidate.eval()

    with torch.no_grad():
        features = torch.randn(5, 80)
        expected = reference.inference(features)
        actual = candidate.inference(features)

    np.testing.assert_array_equal(
        expected.cpu().numpy(),
        actual.cpu().numpy(),
    )
""" # get amplitude spectrogram x_stft = librosa.stft(audio, n_fft=fft_size, hop_length=hop_size, win_length=win_length, window=window, pad_mode="reflect") spc = np.abs(x_stft).T # (#frames, #bins) # get mel basis fmin = 0 if fmin is None else fmin fmax = sampling_rate / 2 if fmax is None else fmax mel_basis = librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin, fmax) return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) download_pretrained_model("vctk_multi_band_melgan.v2", "melgan") #download model vocoder_conf = "melgan/vctk_multi_band_melgan.v2/config.yml" with open(vocoder_conf) as f: config = yaml.load(f, Loader=yaml.Loader) #================================================Loading/preprocessing========================================================= # audio, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav")) audio, sr = sf.read(utility.get_full_path(".\\input\\Wouter\\6.wav")) # trim silence if config["trim_silence"]: audio, _ = librosa.effects.trim(audio, top_db=config["trim_threshold_in_db"], frame_length=config["trim_frame_size"], hop_length=config["trim_hop_size"])
from espnet_model_zoo.downloader import ModelDownloader import soundfile as sf import librosa import numpy as np import os import kaldiio d = ModelDownloader() # tag = 'kan-bayashi/libritts_gst+xvector_transformer' text2speech = Text2Speech( "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/config.yaml", "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/train.loss.best.pth", device="cuda") # text2speech.spc2wav = None vocoder = load_model( download_pretrained_model("libritts_parallel_wavegan.v1.long")).to( "cuda").eval() vocoder.remove_weight_norm() spembs = None if text2speech.use_speech: speech, fs = sf.read("/nolan/VCTK-Corpus/wav48/p226/p226_001.wav") # speech, _ = librosa.load("/nolan/VCTK-Corpus/wav48/p225/p225_001.wav", text2speech.fs) speech = torch.from_numpy(speech).float().cuda() # speech = torch.randn(50000,) texts = [ "Mostly I would recommend giving a quick look to the figures beyond the introduction.", ] for i, text in enumerate(texts): with torch.no_grad():
import os
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

# Build the text-to-speech model on the GPU from the files downloaded for
# `tag`. NOTE(review): `tag`, `vocoder_tag`, `s`, `fs` and `time` are not
# defined in this excerpt - they must be provided earlier in the script.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim

# Download and load the vocoder for `vocoder_tag`, move it to the GPU and
# put it in eval mode before removing weight normalization.
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

# Serve forever: each accepted connection supplies one piece of text.
# `s` is presumably a listening socket set up earlier - confirm.
while True:
    conn, addr = s.accept()
    # A single read of at most 1024 bytes; longer requests are cut off here.
    data = conn.recv(1024)
    encoding = 'utf-8'
    data = str(data, encoding)
    # The connection is closed before synthesis; nothing is sent back on it.
    conn.close()
    # synthesis
    with torch.no_grad():
        start = time.time()
        # Only the second item returned by text2speech (`c`) is used; the
        # first `wav` is overwritten by the vocoder output below.
        wav, c, *_ = text2speech(data)
        wav = vocoder.inference(c)
    # Elapsed time divided by the audio duration (len(wav) / fs), printed
    # as "RTF".
    rtf = (time.time() - start) / (len(wav) / fs)
    print(f"RTF = {rtf:5f}")
pad_mode="reflect") spc = np.abs(x_stft).T # (#frames, #bins) # get mel basis fmin = 0 if fmin is None else fmin fmax = sampling_rate / 2 if fmax is None else fmax mel_basis = librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin, fmax) return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) # converter.output_to_wav([[mel]]) print(f"Now loading in pretrained melGAN model") download_pretrained_model("vctk_multi_band_melgan.v2", "./vocoders/melgan") model = load_model( "melgan/vctk_multi_band_melgan.v2/checkpoint-1000000steps.pkl") model.remove_weight_norm() model = model.eval().to(device) vocoder_conf = "melgan/vctk_multi_band_melgan.v2/config.yml" with open(vocoder_conf) as f: config = yaml.load(f, Loader=yaml.Loader) #================================================Loading/preprocessing========================================================= # audio, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav")) audio, sr = sf.read(utility.get_full_path(".\\input\\Wouter\\6.wav")) # trim silence if config["trim_silence"]: audio, _ = librosa.effects.trim(audio,
device=mlDevice, # Only for Tacotron 2 threshold=0.5, minlenratio=0.0, maxlenratio=10.0, use_att_constraint=False, backward_window=1, forward_window=3, # Only for FastSpeech & FastSpeech2 speed_control_alpha=1.0, ) text2speech.spc2wav = None # Disable griffin-lim # NOTE: Sometimes download is failed due to "Permission denied". That is # the limitation of google drive. Please retry after serveral hours. vocoder = load_model( download_pretrained_model( vocoder_tag, download_dir='./vocoder_models')).to(mlDevice).eval() vocoder.remove_weight_norm() import scipy.io.wavfile as wv import os if os.path.isfile(out_name + ".wav"): os.remove(out_name + ".wav") from concurrent.futures import ThreadPoolExecutor executor = ThreadPoolExecutor(max_workers=5) def save_wav(wav, count=-1): # print("Outputing wav file...") out_arr = wav.view(-1).cpu().numpy() fname = out_name + ".wav"