예제 #1
0
    def say(self, text, output):
        """Synthesize ``text`` and write the resulting audio to ``output``.

        A Tacotron model and an AudioProcessor are built from ``self.CONFIG``,
        the checkpoint stored at ``self.MODEL_PATH`` is loaded into the model,
        and the chunks returned by ``self.text2audio`` are joined into one
        array that is written with ``ap.save_wav``.

        Args:
            text: Text to synthesize; handed unchanged to ``self.text2audio``.
            output: Destination handed unchanged to ``ap.save_wav``.

        Returns:
            None.
        """
        cfg = self.CONFIG
        on_gpu = self.use_cuda

        # Network built from the hyper-parameters stored in the config.
        model = Tacotron(cfg.embedding_size, cfg.num_freq, cfg.num_mels, cfg.r)

        # Audio processor; every argument is positional.
        # NOTE(review): the trailing 60 is presumably the Griffin-Lim
        # iteration count -- confirm against AudioProcessor's signature.
        ap = AudioProcessor(
            cfg.sample_rate,
            cfg.num_mels,
            cfg.min_level_db,
            cfg.frame_shift_ms,
            cfg.frame_length_ms,
            cfg.ref_level_db,
            cfg.num_freq,
            cfg.power,
            cfg.preemphasis,
            60,
        )

        # Without CUDA every storage is kept where it was deserialized
        # (CPU); with CUDA torch.load's default placement is used.
        keep_on_cpu = None if on_gpu else (lambda storage, loc: storage)
        checkpoint = torch.load(self.MODEL_PATH, map_location=keep_on_cpu)

        # Restore the weights, move to the GPU if requested, then switch
        # the module to evaluation mode.
        model.load_state_dict(checkpoint['model'])
        if on_gpu:
            model.cuda()
        model.eval()

        # Upper bound on decoder steps used for this synthesis run.
        model.decoder.max_decoder_steps = 400
        chunks = self.text2audio(text, model, cfg, on_gpu, ap)

        ap.save_wav(np.concatenate(chunks), output)
예제 #2
0
    def load_model(self, MODEL_PATH, sentence, CONFIG, use_cuda, OUT_FILE):
        """Load a Tacotron checkpoint and synthesize ``sentence`` with it.

        Args:
            MODEL_PATH: Path of the checkpoint read with ``torch.load``; its
                ``'model'`` entry is loaded as the state dict.
            sentence: Text handed to ``self.tts``.
            CONFIG: Configuration object. ``CONFIG.audio["preemphasis"]`` is
                overwritten with 0.97 as a side effect of this call.
            use_cuda: When true, the model is moved to the GPU.
            OUT_FILE: Output target handed unchanged to ``self.tts``.

        Returns:
            The fourth element of the tuple returned by ``self.tts``.
        """
        # Size of the input vocabulary depends on the phoneme switch.
        if CONFIG.use_phonemes:
            num_chars = len(phonemes)
        else:
            num_chars = len(symbols)

        model = Tacotron(
            num_chars,
            CONFIG.embedding_size,
            CONFIG.audio['num_freq'],
            CONFIG.audio['num_mels'],
            CONFIG.r,
            attn_windowing=False,
        )

        # The pre-emphasis value from the config is replaced before the
        # audio processor is created from the whole audio section.
        CONFIG.audio["preemphasis"] = 0.97
        ap = AudioProcessor(**CONFIG.audio)

        # On CPU-only runs keep every storage where it was deserialized.
        placement = None if use_cuda else (lambda storage, loc: storage)
        checkpoint = torch.load(MODEL_PATH, map_location=placement)

        model.load_state_dict(checkpoint['model'])
        if use_cuda:
            model.cuda()

        model.eval()
        model.decoder.max_decoder_steps = 1000

        # Only the last element of the synthesis result is returned.
        _align, _spec, _stop_tokens, wav_norm = self.tts(
            model, sentence, CONFIG, use_cuda, ap, OUT_FILE)
        return wav_norm
예제 #3
0
def load_tts_model():
    """Build the Tacotron model and audio processor stored under ``dirpath``.

    The checkpoint ``<dirpath>/tts_model/best_model.pth.tar`` and the config
    ``<dirpath>/tts_model/config.json`` are read; ``dirpath`` is a
    module-level name defined outside this function. CUDA is disabled
    (``use_cuda`` is fixed to ``False``), so the checkpoint is mapped to CPU.

    As a side effect ``CONFIG.audio["preemphasis"]`` is overwritten with 0.97
    before the audio processor is created.

    Returns:
        tuple: ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
    """
    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    # Build the network exactly once; its weights are overwritten by the
    # checkpoint below, so constructing it a second time is wasted work.
    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars,
                     CONFIG.embedding_size,
                     CONFIG.audio['num_freq'],
                     CONFIG.audio['num_mels'],
                     CONFIG.r,
                     attn_windowing=False)

    # The pre-emphasis value from the config is replaced before the audio
    # processor is created from the whole audio section.
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # Load the checkpoint; without CUDA every storage stays on the CPU.
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    # NOTE(review): model.eval() is not called here, so the returned model
    # is not switched to evaluation mode by this function -- confirm that
    # callers either rely on this or call eval() themselves.
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
예제 #4
0
class Synthesizer(object):
    """Load a Tacotron checkpoint and turn text into in-memory WAV audio."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load configuration, model weights and audio processor.

        Sets ``self.model_file``, ``self.config``, ``self.use_cuda``,
        ``self.model`` and ``self.ap``.

        Args:
            model_path: Directory containing the checkpoint and the config.
            model_name: Checkpoint file name inside ``model_path``.
            model_config: Config file name inside ``model_path``.
            use_cuda: When true, the model is moved to the GPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        # NOTE(review): these arguments are positional, so their order must
        # match the AudioProcessor signature in use -- verify, in particular
        # the position of ``preemphasis``.
        self.ap = AudioProcessor(config.sample_rate, config.num_mels,
                                 config.min_level_db, config.frame_shift_ms,
                                 config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq,
                                 config.power, griffin_lim_iters=60)
        # Load the checkpoint; without CUDA every storage stays on the CPU.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Peak-normalize ``wav`` to the int16 range and write it as WAV.

        Args:
            wav: Array-like of samples. It is not modified.
            path: File name or file-like object handed to
                ``scipy.io.wavfile.write``.
        """
        wav = np.asarray(wav)
        # Scale into a new array so the caller's buffer keeps its values;
        # the 1e-8 floor avoids dividing by zero for an all-zero signal.
        scaled = wav * (32767 / max(1e-8, np.max(np.abs(wav))))
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               scaled.astype(np.int16))

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence and return WAV bytes.

        ``text`` is split on ``'.'``; fragments shorter than three characters
        are skipped. Each remaining fragment is synthesized on its own and
        followed by 10000 zero samples, and the concatenation of all pieces
        is written to the returned buffer.

        Args:
            text: Text to synthesize.

        Returns:
            io.BytesIO: Buffer holding the WAV data of the whole utterance.

        Raises:
            ValueError: If no fragment of ``text`` is long enough to be
                synthesized.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            # Encode the current sentence only, not the whole input text.
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs.append(wav)
            # Zero samples appended after every sentence as a pause.
            wavs.append(np.zeros(10000))
        if not wavs:
            raise ValueError("text contains no sentence long enough to "
                             "synthesize")
        out = io.BytesIO()
        # Write every synthesized sentence, not just the last one.
        self.save_wav(np.concatenate(wavs), out)
        return out
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    """Synthesize ``text`` and save it as a WAV file under ``static/samples/``.

    Args:
        text: Text to synthesize. It is also used to derive the output file
            name (spaces become underscores, periods are removed).
        model_path: Path of the checkpoint file; its ``'model'`` entry is
            loaded as the state dict.
        config_path: Path of the configuration file read by ``load_config``.
        use_cuda: When true, the checkpoint is mapped to ``cuda:0`` and the
            model and input are moved to the GPU.

    Returns:
        str: File name (without directory) of the WAV file that was written.
    """
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    # Both branches read the same checkpoint file; only the device the
    # tensors are mapped to differs.
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # NOTE(review): element 0 of whatever model.forward returns is treated
    # as the linear spectrogram -- confirm against Tacotron.forward's
    # return value.
    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    # Cut the waveform at the end point reported by the audio processor.
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    # NOTE(review): only spaces and periods are rewritten; other characters
    # of ``text`` (e.g. path separators) end up in the file name unchanged.
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)

    return file_name
예제 #6
0
class TTS_mod():
    """Synthesize a single message with a Tacotron checkpoint.

    The checkpoint and config are read from ``./stt_models/``; the generated
    audio is written to ``out.wav`` by :meth:`tts`.
    """

    def __init__(self, message):
        """Store ``message`` and load the configuration file.

        Args:
            message: Text that :meth:`load_model` will synthesize.
        """
        self.message = message
        self.MODEL_PATH = './stt_models/best_model.pth.tar'
        self.CONFIG_PATH = './stt_models/config.json'
        self.OUT_FOLDER = '/output'
        self.CONFIG = load_config(self.CONFIG_PATH)
        self.use_cuda = False

    def tts(self, model, text, CONFIG, use_cuda, ap):
        """Run ``synthesis`` on ``text`` and save the waveform to ``out.wav``.

        Args:
            model: Model handed to ``synthesis``.
            text: Text to synthesize.
            CONFIG: Configuration object handed to ``synthesis``.
            use_cuda: CUDA flag handed to ``synthesis``.
            ap: Audio processor; also used to save the waveform.

        Returns:
            tuple: ``(alignment, spectrogram, stop_tokens)`` as produced by
            ``synthesis``.
        """
        outputs = synthesis(model, text, CONFIG, use_cuda, ap)
        waveform, alignment, spectrogram, _mel_spectrogram, stop_tokens = outputs
        ap.save_wav(waveform, 'out.wav')
        return alignment, spectrogram, stop_tokens

    def load_model(self):
        """Build the model, load its weights and synthesize ``self.message``.

        Sets ``self.num_chars``, ``self.model``, ``self.ap``, ``self.cp`` and
        ``self.sentence``, and overwrites
        ``self.CONFIG.audio["preemphasis"]`` with 0.97.
        """
        # Size of the input vocabulary depends on the phoneme switch.
        if self.CONFIG.use_phonemes:
            self.num_chars = len(phonemes)
        else:
            self.num_chars = len(symbols)
        self.model = Tacotron(
            self.num_chars,
            self.CONFIG.embedding_size,
            self.CONFIG.audio['num_freq'],
            self.CONFIG.audio['num_mels'],
            self.CONFIG.r,
            attn_windowing=False,
        )

        self.CONFIG.audio["preemphasis"] = 0.97
        self.ap = AudioProcessor(**self.CONFIG.audio)

        # On CPU-only runs keep every storage where it was deserialized.
        placement = None if self.use_cuda else (lambda storage, loc: storage)
        self.cp = torch.load(self.MODEL_PATH, map_location=placement)

        self.model.load_state_dict(self.cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.decoder.max_decoder_steps = 1000

        # Synthesize the stored message; the returned values are not used.
        self.sentence = self.message
        _alignment, _spectrogram, _stop_tokens = self.tts(
            self.model, self.sentence, self.CONFIG, self.use_cuda, self.ap)
예제 #7
0
class tts_class:
    """Sentence-wise text-to-speech with playback of the generated files."""

    def __init__(self):
        """Load config, model, audio processor and the spaCy pipeline.

        The checkpoint and config are read from below ``TTS/tts_model/`` and
        CUDA use is switched on unconditionally.
        """
        root = 'TTS/tts_model/'
        checkpoint_path = root + '/best_model.pth.tar'
        config_path = root + '/config.json'
        self.CONFIG = load_config(config_path)
        self.use_cuda = True

        cfg = self.CONFIG
        self.model = Tacotron(cfg.embedding_size, cfg.num_freq, cfg.num_mels,
                              cfg.r)

        # NOTE(review): the trailing 60 is presumably the Griffin-Lim
        # iteration count -- confirm against AudioProcessor's signature.
        self.ap = AudioProcessor(
            cfg.sample_rate,
            cfg.num_mels,
            cfg.min_level_db,
            cfg.frame_shift_ms,
            cfg.frame_length_ms,
            cfg.ref_level_db,
            cfg.num_freq,
            cfg.power,
            cfg.preemphasis,
            60,
        )

        # Without CUDA every storage is kept where it was deserialized.
        placement = None if self.use_cuda else (lambda storage, loc: storage)
        checkpoint = torch.load(checkpoint_path, map_location=placement)

        self.model.load_state_dict(checkpoint['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        self.model.decoder.max_decoder_steps = 500

        # spaCy pipeline; text2audio uses it to split text into sentences.
        self.nlp = spacy.load("en")

    def process(self, text):
        """Synthesize ``text`` and return the list of generated file names."""
        self.model.decoder.max_decoder_steps = 500
        return self.text2audio(text, self.model, self.CONFIG, self.use_cuda,
                               self.ap)

    def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True):
        """Synthesize ``text`` and save the waveform to ``wavefile``.

        The waveform is saved through ``self.ap``; ``figures`` is accepted
        but not used.
        """
        waveform, _alignment, _spectrogram, _stop_tokens = create_speech(
            model, text, CONFIG, use_cuda, ap)

        self.ap.save_wav(waveform, wavefile)

    def text2audio(self, text, model, CONFIG, use_cuda, ap):
        """Write one ``gen_<i>.wav`` file per sentence found in ``text``.

        Returns:
            list: File names in sentence order.
        """
        generated = []
        for index, sentence in enumerate(self.nlp(text).sents):
            target = "gen_{}.wav".format(index)
            self.tts(model, sentence.text.strip(), CONFIG, use_cuda, ap,
                     target)
            generated.append(target)
        return generated

    def play(self, wavefiles):
        """Play ``wavefiles`` back to back, then delete them from disk."""
        combined = AudioSegment.empty()
        for name in wavefiles:
            combined += AudioSegment.from_wav(name)

        # Module-level ``play`` function, not this method.
        play(combined)

        for name in wavefiles:
            os.remove(name)
예제 #8
0
# Audio processor built from the audio section of the config.
ap = AudioProcessor(**CONFIG.audio)

# Size of the input vocabulary depends on the phoneme switch.
if CONFIG.use_phonemes:
    num_chars = len(phonemes)
else:
    num_chars = len(symbols)
model = Tacotron(
    num_chars,
    CONFIG.embedding_size,
    ap.num_freq,
    ap.num_mels,
    CONFIG.r,
    CONFIG.memory_size,
)

# Load the checkpoint; without CUDA every storage is kept where it was
# deserialized instead of being moved to a GPU.
cp = torch.load(
    MODEL_PATH,
    map_location=None if use_cuda else (lambda storage, loc: storage),
)

# Restore the weights, move to the GPU if requested, switch to eval mode.
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

bits = 10

wavernn = Model(
    rnn_dims=512,
    fc_dims=512,
    mode=VOCODER_CONFIG.mode,
    mulaw=VOCODER_CONFIG.mulaw,
    pad=VOCODER_CONFIG.pad,
    use_aux_net=VOCODER_CONFIG.use_aux_net,
    use_upsample_net=VOCODER_CONFIG.use_upsample_net,
    upsample_factors=VOCODER_CONFIG.upsample_factors,
    feat_dims=80,
    compute_dims=128,