Exemple #1
0
 def load_phoneme_sequence(self, wav_file, text):
     file_name = os.path.basename(wav_file).split('.')[0]
     tmp_path = os.path.join(self.phoneme_cache_path,
                             file_name + '_phoneme.npy')
     if os.path.isfile(tmp_path):
         try:
             text = np.load(tmp_path)
         except:
             print(
                 " > ERROR: phoneme connot be loaded for {}. Recomputing.".
                 format(wav_file))
             text = np.asarray(phoneme_to_sequence(
                 text, [self.cleaners],
                 language=self.phoneme_language,
                 enable_eos_bos=self.enable_eos_bos),
                               dtype=np.int32)
             np.save(tmp_path, text)
     else:
         text = np.asarray(phoneme_to_sequence(
             text, [self.cleaners],
             language=self.phoneme_language,
             enable_eos_bos=self.enable_eos_bos),
                           dtype=np.int32)
         np.save(tmp_path, text)
     return text
Exemple #2
0
 def load_tts(self, model_path, model_file, model_config, use_cuda):
     tts_config = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_file)
     print(" > Loading TTS model ...")
     print(" | > model config: ", tts_config)
     print(" | > model file: ", model_file)
     self.tts_config = load_config(tts_config)
     self.use_phonemes = self.tts_config.use_phonemes
     self.ap = AudioProcessor(**self.tts_config.audio)
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.tts_config.text_cleaner], self.tts_config.phoneme_language, self.tts_config.enable_eos_bos_chars)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
     self.tts_model = setup_model(self.input_size, self.tts_config)
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
     # load the model
     self.tts_model.load_state_dict(cp['model'])
     if use_cuda:
         self.tts_model.cuda()
     self.tts_model.eval()
     self.tts_model.decoder.max_decoder_steps = 3000
def generate_phoneme_sequence(text, phoneme_file):
    phonemes = phoneme_to_sequence(text, ['phoneme_cleaners'],
                                   language='en-us',
                                   enable_eos_bos=False)
    phonemes = np.asarray(phonemes, dtype=np.int32)
    np.save(phoneme_file, phonemes)
    return phonemes
Exemple #4
0
 def load_model(self, model_path, model_config, wavernn_path, use_cuda):
     
     self.model_file = model_path
     print(" > Loading model ...")
     print(" | > model config: ", model_config)
     print(" | > model file: ", self.model_file)
     config = load_config(model_config)
     self.config = config
     self.use_cuda = use_cuda
     self.use_phonemes = config.use_phonemes
     self.ap = AudioProcessor(**config.audio)
     
     if self.use_phonemes:
         self.input_size = len(phonemes)
         self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
     else:
         self.input_size = len(symbols)
         self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
     
     self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r, attn_windowing=True)
     self.model.decoder.max_decoder_steps = 8000
     # load model state
     if use_cuda:
         cp = torch.load(self.model_file)
     else:
         cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
     # load the model
     self.model.load_state_dict(cp['model'])
     if use_cuda:
         self.model.cuda()
     self.model.eval()
     self.vocoder=WaveRNNVocoder.Vocoder()
     self.vocoder.loadWeights(wavernn_path)
     self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)
Exemple #5
0
def visualize(alignment,
              spectrogram_postnet,
              stop_tokens,
              text,
              hop_length,
              CONFIG,
              spectrogram=None,
              output_path=None):
    if spectrogram is not None:
        num_plot = 4
    else:
        num_plot = 3

    label_fontsize = 16
    fig = plt.figure(figsize=(8, 24))

    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        seq = phoneme_to_sequence(text, [CONFIG.text_cleaner],
                                  CONFIG.phoneme_language,
                                  CONFIG.enable_eos_bos_chars)
        text = sequence_to_phoneme(seq)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))

    plt.subplot(num_plot, 1, 3)
    librosa.display.specshow(spectrogram_postnet.T,
                             sr=CONFIG.audio['sample_rate'],
                             hop_length=hop_length,
                             x_axis="time",
                             y_axis="linear")
    plt.xlabel("Time", fontsize=label_fontsize)
    plt.ylabel("Hz", fontsize=label_fontsize)
    plt.tight_layout()
    plt.colorbar()

    if spectrogram is not None:
        plt.subplot(num_plot, 1, 4)
        librosa.display.specshow(spectrogram.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear")
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    if output_path:
        print(output_path)
        fig.savefig(output_path)
        plt.close()
Exemple #6
0
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.config.text_cleaner])

        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()
    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen += '.'
            print(sen)
            sen = sen.strip()
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        return out
 def load_phoneme_sequence(self, wav_file, text):
     file_name = os.path.basename(wav_file).split('.')[0]
     tmp_path = os.path.join(self.phoneme_cache_path, file_name+'_phoneme.npy')
     if os.path.isfile(tmp_path):
         text = np.load(tmp_path)
     else:
         text = np.asarray(
             phoneme_to_sequence(text, [self.cleaners], language=self.phoneme_language), dtype=np.int32)
         np.save(tmp_path, text)
     return text
def tts(model, text, ap):
    inputs = phoneme_to_sequence(text, ['phoneme_cleaners'],
                                 language='en-us',
                                 enable_eos_bos=False)
    inputs = np.asarray(inputs, dtype=np.int32)
    inputs = torch.from_numpy(inputs).unsqueeze(0).long().to(device)

    decoder_output, postnet_output, alignments, stop_tokens = \
        model.inference(inputs)
    postnet_output = postnet_output[0].data.cpu().numpy()
    wav = ap.inv_spectrogram(postnet_output.T)
    return wav
Exemple #10
0
    def _generate_and_cache_phoneme_sequence(self, text, cache_path):
        """generate a phoneme sequence from text.

        since the usage is for subsequent caching, we never add bos and
        eos chars here. Instead we add those dynamically later; based on the
        config option."""
        phonemes = phoneme_to_sequence(text, [self.cleaners],
                                       language=self.phoneme_language,
                                       enable_eos_bos=False)
        phonemes = np.asarray(phonemes, dtype=np.int32)
        np.save(cache_path, phonemes)
        return phonemes
Exemple #11
0
    def load_model(self, model_path, model_name, model_config, use_cuda):

        #build the config's path
        model_config = os.path.join(model_path, model_config)

        #build the model's path
        model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > Model config path: ", model_config)
        print(" | > Model file path: ", model_file)

        config = load_config(model_config)
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [config.text_cleaner], config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [config.text_cleaner])

        self.model = Tacotron(num_chars=config['num_chars'],
                              embedding_dim=config['embedding_size'],
                              linear_dim=self.ap.num_freq,
                              mel_dim=self.ap.num_mels,
                              r=config['r'])

        #load model state
        if use_cuda:
            cp = torch.load(model_file)
        else:
            cp = torch.load(model_file,
                            map_location=lambda storage, loc: storage)

        #load the model
        self.model.load_state_dict(cp['model'])

        #if cuda is enabled & available move tensors to GPU
        if use_cuda:
            self.model.cuda()

        #disables normalization techniques present in code
        self.model.eval()