Example #1
0
 def load_model(self, model_path, model_name, model_config, use_cuda):
     """Create the Tacotron model and audio processor, then restore weights.

     Args:
         model_path: Directory that contains both the config file and the
             checkpoint file.
         model_name: Checkpoint file name, joined onto ``model_path``.
         model_config: Config file name, joined onto ``model_path``.
         use_cuda: When truthy the checkpoint is read with torch's default
             device mapping and the model is moved to the GPU; otherwise
             every storage is kept where ``torch.load`` deserialized it.

     Sets ``self.model_file``, ``self.config``, ``self.use_cuda``,
     ``self.model`` and ``self.ap``; the model is left in eval mode.
     """
     config_file = os.path.join(model_path, model_config)
     self.model_file = os.path.join(model_path, model_name)
     print(" > Loading model ...")
     print(" | > model config: ", config_file)
     print(" | > model file: ", self.model_file)

     cfg = load_config(config_file)
     self.config = cfg
     self.use_cuda = use_cuda

     self.model = Tacotron(
         cfg.embedding_size, cfg.num_freq, cfg.num_mels, cfg.r)
     self.ap = AudioProcessor(
         cfg.sample_rate,
         cfg.num_mels,
         cfg.min_level_db,
         cfg.frame_shift_ms,
         cfg.frame_length_ms,
         cfg.preemphasis,
         cfg.ref_level_db,
         cfg.num_freq,
         cfg.power,
         griffin_lim_iters=60)

     # Restore the checkpoint; without CUDA the storages are returned as-is
     # instead of being moved to the device they were saved from.
     if use_cuda:
         checkpoint = torch.load(self.model_file)
     else:
         checkpoint = torch.load(
             self.model_file, map_location=lambda storage, loc: storage)
     self.model.load_state_dict(checkpoint['model'])

     if use_cuda:
         self.model.cuda()
     self.model.eval()
Example #2
0
    def say(self, text, output):
        """Synthesize ``text`` and save the concatenated audio to ``output``.

        A fresh Tacotron model and audio processor are built from
        ``self.CONFIG`` and the checkpoint at ``self.MODEL_PATH`` on every
        call. Returns ``None``.
        """
        cfg = self.CONFIG

        # Network and audio processor, both configured from self.CONFIG.
        model = Tacotron(
            cfg.embedding_size, cfg.num_freq, cfg.num_mels, cfg.r)
        ap = AudioProcessor(
            cfg.sample_rate,
            cfg.num_mels,
            cfg.min_level_db,
            cfg.frame_shift_ms,
            cfg.frame_length_ms,
            cfg.ref_level_db,
            cfg.num_freq,
            cfg.power,
            cfg.preemphasis,
            60)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if self.use_cuda:
            checkpoint = torch.load(self.MODEL_PATH)
        else:
            checkpoint = torch.load(
                self.MODEL_PATH, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])

        if self.use_cuda:
            model.cuda()
        model.eval()
        model.decoder.max_decoder_steps = 400

        # text2audio yields a sequence of waveforms that are joined into one.
        pieces = self.text2audio(text, model, self.CONFIG, self.use_cuda, ap)
        ap.save_wav(np.concatenate(pieces), output)
Example #3
0
def load_tts_model():
    """Build the Tacotron model and audio processor from ``dirpath``.

    The checkpoint and config are read from ``<dirpath>/tts_model/``, where
    ``dirpath`` is resolved from the enclosing module scope.

    Returns:
        A tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
    """
    MODEL_PATH = dirpath + '/tts_model/best_model.pth.tar'
    CONFIG_PATH = dirpath + '/tts_model/config.json'
    CONFIG = load_config(CONFIG_PATH)
    use_cuda = False

    # Build the network once (the original constructed it twice and
    # discarded the first instance).
    num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
    model = Tacotron(num_chars,
                     CONFIG.embedding_size,
                     CONFIG.audio['num_freq'],
                     CONFIG.audio['num_mels'],
                     CONFIG.r,
                     attn_windowing=False)

    # Audio processor; the preemphasis value from the config is overridden.
    CONFIG.audio["preemphasis"] = 0.97
    ap = AudioProcessor(**CONFIG.audio)

    # Restore the checkpoint; without CUDA the storages are returned as-is
    # by the map_location callback.
    if use_cuda:
        cp = torch.load(MODEL_PATH)
    else:
        cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()

    # NOTE(review): model.eval() was commented out in the original and is
    # still not called here - confirm that leaving the model in its default
    # mode for inference is intentional.
    model.decoder.max_decoder_steps = 1000
    return model, ap, MODEL_PATH, CONFIG, use_cuda
Example #4
0
    def test_train_step():
        """Train a GST Tacotron for ten steps and check every parameter moved.

        Random dummy inputs are fed through the model; after optimisation
        each parameter must differ from a deep copy taken before training.
        """
        text_ids = torch.randint(0, 24, (8, 128)).long().to(device)
        text_lengths = torch.randint(100, 129, (8, )).long().to(device)
        text_lengths[-1] = 128
        mel_target = torch.rand(8, 120, c.audio['num_mels']).to(device)
        linear_target = torch.rand(8, 120, c.audio['num_freq']).to(device)
        mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
        mel_lengths[-1] = 120
        stop_targets = torch.zeros(8, 120, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        # Mark the frames from each length onward as "stop" for all rows.
        for length in mel_lengths:
            stop_targets[:, int(length.item()):, 0] = 1.0

        # Group the per-frame targets into blocks of c.r frames; a block is
        # a stop block if any frame inside it is.
        stop_targets = stop_targets.view(
            text_ids.shape[0], stop_targets.size(1) // c.r, -1)
        stop_targets = (
            stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        criterion = L1LossMasked(seq_len_norm=False).to(device)
        criterion_st = nn.BCEWithLogitsLoss().to(device)
        model = Tacotron(
            num_chars=32,
            num_speakers=5,
            gst=True,
            postnet_output_dim=c.audio['num_freq'],
            decoder_output_dim=c.audio['num_mels'],
            r=c.r,
            memory_size=c.memory_size
        ).to(device)
        model.train()
        print(model)
        print(" > Num parameters for Tacotron GST model:%s" %
              (count_parameters(model)))

        # The reference copy must start out identical to the model.
        model_ref = copy.deepcopy(model)
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param

        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for _ in range(10):
            mel_out, linear_out, _, stop_tokens = model.forward(
                text_ids, text_lengths, mel_target, mel_lengths, speaker_ids)
            optimizer.zero_grad()
            mel_loss = criterion(mel_out, mel_target, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            linear_loss = criterion(linear_out, linear_target, mel_lengths)
            total_loss = mel_loss + linear_loss + stop_loss
            total_loss.backward()
            optimizer.step()

        # Every parameter has to differ from its pre-training value.
        for count, (param, param_ref) in enumerate(
                zip(model.parameters(), model_ref.parameters())):
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
Example #5
0
    def load_model(self, MODEL_PATH, sentence, CONFIG, use_cuda, OUT_FILE):
        """Load a Tacotron checkpoint and synthesize ``sentence`` with it.

        Args:
            MODEL_PATH: Path of the checkpoint passed to ``torch.load``.
            sentence: Text forwarded to ``self.tts``.
            CONFIG: Loaded configuration; its ``audio["preemphasis"]`` entry
                is overwritten with 0.97 as a side effect.
            use_cuda: When truthy, load with the default device mapping and
                move the model to the GPU.
            OUT_FILE: Forwarded unchanged to ``self.tts``.

        Returns:
            The fourth element of the tuple returned by ``self.tts``.
        """
        audio_cfg = CONFIG.audio

        # The input vocabulary depends on whether phonemes are used.
        if CONFIG.use_phonemes:
            num_chars = len(phonemes)
        else:
            num_chars = len(symbols)
        model = Tacotron(num_chars,
                         CONFIG.embedding_size,
                         audio_cfg['num_freq'],
                         audio_cfg['num_mels'],
                         CONFIG.r,
                         attn_windowing=False)

        # Audio processor with an overridden preemphasis value.
        CONFIG.audio["preemphasis"] = 0.97
        ap = AudioProcessor(**CONFIG.audio)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if use_cuda:
            checkpoint = torch.load(MODEL_PATH)
        else:
            checkpoint = torch.load(
                MODEL_PATH, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])

        if use_cuda:
            model.cuda()
        model.eval()
        model.decoder.max_decoder_steps = 1000

        _, _, _, wav_norm = self.tts(model, sentence, CONFIG, use_cuda, ap,
                                     OUT_FILE)
        return wav_norm
Example #6
0
 def test_train_step(self):
     """Train Tacotron for five steps and check that parameters changed.

     Parameters at positions 139 and 59 are excluded from the check.
     """
     char_ids = torch.randint(0, 24, (8, 128)).long().to(device)
     mel_target = torch.rand(8, 30, c.num_mels).to(device)
     linear_target = torch.rand(8, 30, c.num_freq).to(device)
     mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
     criterion = L1LossMasked().to(device)
     model = Tacotron(
         c.embedding_size, c.num_freq, c.num_mels, c.r).to(device)
     model.train()

     # The reference copy must start out identical to the model.
     model_ref = copy.deepcopy(model)
     for param, param_ref in zip(model.parameters(),
                                 model_ref.parameters()):
         assert (param - param_ref).sum() == 0, param

     optimizer = optim.Adam(model.parameters(), lr=c.lr)
     for _ in range(5):
         mel_out, linear_out, _ = model.forward(char_ids, mel_target)
         optimizer.zero_grad()
         mel_loss = criterion(mel_out, mel_target, mel_lengths)
         linear_loss = criterion(linear_out, linear_target, mel_lengths)
         loss = 0.5 * mel_loss + 0.5 * linear_loss
         loss.backward()
         optimizer.step()

     # Every checked parameter has to differ from its pre-training value.
     skipped = (139, 59)
     for count, (param, param_ref) in enumerate(
             zip(model.parameters(), model_ref.parameters())):
         if count in skipped:
             continue
         assert (param != param_ref).any(
         ), "param {} with shape {} not updated!! \n{}\n{}".format(
             count, param.shape, param, param_ref)
Example #7
0
class Synthesizer(object):
    """Wrap a Tacotron model and audio processor for text-to-speech."""

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Create the model and audio processor, then restore the checkpoint.

        Args:
            model_path: Directory containing the config and the checkpoint.
            model_name: Checkpoint file name inside ``model_path``.
            model_config: Config file name inside ``model_path``.
            use_cuda: When truthy, load with torch's default device mapping
                and move the model to the GPU.

        Sets ``self.model_file``, ``self.config``, ``self.use_cuda``,
        ``self.model`` and ``self.ap``; the model is left in eval mode.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq,
                              config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate,
                                 config.num_mels,
                                 config.min_level_db,
                                 config.frame_shift_ms,
                                 config.frame_length_ms,
                                 config.preemphasis,
                                 config.ref_level_db,
                                 config.num_freq,
                                 config.power,
                                 griffin_lim_iters=60)
        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Peak-normalize ``wav`` to the int16 range and write it as WAV.

        Args:
            wav: Array-like of audio samples. It is not modified; the
                original rescaled the caller's array in place.
            path: File name or open file-like object handed to
                ``scipy.io.wavfile.write``.
        """
        samples = np.asarray(wav)
        # The 1e-8 floor avoids a division by zero for all-zero input.
        peak = max(1e-8, np.max(np.abs(samples)))
        scaled = samples * (32767 / peak)
        scipy.io.wavfile.write(path, self.config.sample_rate,
                               scaled.astype(np.int16))

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence into an in-memory WAV.

        The text is split on ``'.'``; pieces shorter than three characters
        are skipped. Each remaining sentence is synthesized on its own and
        followed by 10000 zero samples, and the concatenation of all pieces
        is written to the returned buffer.

        Returns:
            An ``io.BytesIO`` holding the WAV data.

        Raises:
            ValueError: If ``text`` contains no sentence to synthesize.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip() + '.'
            print(sen)
            # Encode the current sentence only (the original re-encoded the
            # full text on every iteration).
            seq = np.array(text_to_sequence(sen, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs.append(wav)
            wavs.append(np.zeros(10000))
        if not wavs:
            raise ValueError("no sentence to synthesize in text")
        # Write every sentence, not just the last one.
        out = io.BytesIO()
        self.save_wav(np.concatenate(wavs), out)
        return out
Example #8
0
    def __init__(self):
        """Load config, Tacotron checkpoint, audio processor and spaCy model.

        All paths are hard-coded relative to ``TTS/tts_model/`` and CUDA is
        always requested.
        """
        # Hard-coded locations of the model artefacts.
        ROOT_PATH = 'TTS/tts_model/'
        MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'
        CONFIG_PATH = ROOT_PATH + '/config.json'
        OUT_FOLDER = ROOT_PATH + '/test'
        self.CONFIG = load_config(CONFIG_PATH)
        self.use_cuda = True

        cfg = self.CONFIG
        self.model = Tacotron(
            cfg.embedding_size, cfg.num_freq, cfg.num_mels, cfg.r)
        self.ap = AudioProcessor(
            cfg.sample_rate,
            cfg.num_mels,
            cfg.min_level_db,
            cfg.frame_shift_ms,
            cfg.frame_length_ms,
            cfg.ref_level_db,
            cfg.num_freq,
            cfg.power,
            cfg.preemphasis,
            60)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if self.use_cuda:
            checkpoint = torch.load(MODEL_PATH)
        else:
            checkpoint = torch.load(
                MODEL_PATH, map_location=lambda storage, loc: storage)
        self.model.load_state_dict(checkpoint['model'])

        if self.use_cuda:
            self.model.cuda()
        self.model.eval()
        self.model.decoder.max_decoder_steps = 500

        self.nlp = spacy.load("en")
Example #9
0
    def load_tts_model(self):
        """Build a phoneme-based Tacotron model and restore its checkpoint.

        ``CONFIG_PATH``, ``MODEL_PATH`` and ``use_cuda`` are not defined in
        this method; they are resolved from the enclosing scope. The
        checkpoint is loaded with a map_location callback that returns each
        storage unchanged, and the model is not moved to another device.

        Returns:
            A tuple ``(model, ap, MODEL_PATH, CONFIG, use_cuda)``.
        """
        CONFIG = load_config(CONFIG_PATH)
        audio_cfg = CONFIG.audio

        model = Tacotron(
            len(phonemes),
            CONFIG.embedding_size,
            audio_cfg["num_freq"],
            audio_cfg["num_mels"],
            CONFIG.r,
            attn_windowing=False)

        # Audio processor configured straight from the config's audio section.
        ap = AudioProcessor(**audio_cfg)

        checkpoint = torch.load(
            MODEL_PATH, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint["model"])
        model.decoder.max_decoder_steps = 650

        return model, ap, MODEL_PATH, CONFIG, use_cuda
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    """Synthesize ``text`` and save it as a WAV under ``static/samples/``.

    Args:
        text: Sentence to synthesize; also used to derive the file name.
        model_path: Path of the checkpoint file.
        config_path: Path of the config file.
        use_cuda: When truthy, map the checkpoint to ``cuda:0`` and run the
            model on the GPU.

    Returns:
        The generated file name (without the ``static/samples/`` prefix).
    """
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    # Both branches read the same checkpoint; only the device mapping
    # differs. The original CUDA branch appended an unrelated file-name
    # variable to ``model_path``, which is already a file path.
    if use_cuda:
        cp = torch.load(model_path, map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    # NOTE(review): the first element of the forward result is used as the
    # linear spectrogram - confirm against this Tacotron version's forward().
    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    # Cut the waveform at the endpoint reported by the audio processor.
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    # NOTE(review): the file name is derived from the raw text; only spaces
    # and dots are rewritten, so path separators in ``text`` pass through.
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)

    return file_name
Example #11
0
    def test_train_step(self):
        """Train Tacotron with a stop-token loss and check parameter updates.

        Five optimisation steps are run on random inputs; afterwards every
        parameter except those at positions 145 and 59 must differ from a
        deep copy taken before training.
        """
        char_ids = torch.randint(0, 24, (8, 128)).long().to(device)
        mel_target = torch.rand(8, 30, c.num_mels).to(device)
        linear_target = torch.rand(8, 30, c.num_freq).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        # Mark the frames from each length onward as "stop" for all rows.
        for length in mel_lengths:
            stop_targets[:, int(length.item()):, 0] = 1.0

        # Group the per-frame targets into blocks of c.r frames; a block is
        # a stop block if any frame inside it is.
        stop_targets = stop_targets.view(
            char_ids.shape[0], stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

        criterion = L1LossMasked().to(device)
        criterion_st = nn.BCELoss().to(device)
        model = Tacotron(
            c.embedding_size, c.num_freq, c.num_mels, c.r).to(device)
        model.train()

        # The reference copy must start out identical to the model.
        model_ref = copy.deepcopy(model)
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param

        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for _ in range(5):
            mel_out, linear_out, _, stop_tokens = model.forward(
                char_ids, mel_target)
            # BCELoss needs its inputs inside [0, 1].
            assert stop_tokens.data.max() <= 1.0
            assert stop_tokens.data.min() >= 0.0
            optimizer.zero_grad()
            mel_loss = criterion(mel_out, mel_target, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            linear_loss = criterion(linear_out, linear_target, mel_lengths)
            total_loss = mel_loss + linear_loss + stop_loss
            total_loss.backward()
            optimizer.step()

        # Every checked parameter has to differ from its pre-training value.
        skipped = (145, 59)
        for count, (param, param_ref) in enumerate(
                zip(model.parameters(), model_ref.parameters())):
            if count in skipped:
                continue
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
Example #12
0
class TTS_mod():
    """Synthesize a single message with a Tacotron checkpoint."""

    def __init__(self, message):
        """Store ``message`` and load the config from ``./stt_models/``."""
        self.message = message
        self.MODEL_PATH = './stt_models/best_model.pth.tar'
        self.CONFIG_PATH = './stt_models/config.json'
        self.OUT_FOLDER = '/output'
        self.CONFIG = load_config(self.CONFIG_PATH)
        self.use_cuda = False

    def tts(self, model, text, CONFIG, use_cuda, ap):
        """Run ``synthesis`` on ``text`` and save the waveform to ``out.wav``.

        Returns:
            A tuple ``(alignment, spectrogram, stop_tokens)``.
        """
        outputs = synthesis(model, text, CONFIG, use_cuda, ap)
        waveform, alignment, spectrogram, _, stop_tokens = outputs
        ap.save_wav(waveform, 'out.wav')
        return alignment, spectrogram, stop_tokens

    def load_model(self):
        """Build the model, restore its checkpoint and synthesize the message.

        Sets ``self.num_chars``, ``self.model``, ``self.ap``, ``self.cp`` and
        ``self.sentence``, and overwrites ``self.CONFIG.audio["preemphasis"]``
        with 0.97.
        """
        cfg = self.CONFIG

        # The input vocabulary depends on whether phonemes are used.
        if cfg.use_phonemes:
            self.num_chars = len(phonemes)
        else:
            self.num_chars = len(symbols)
        self.model = Tacotron(
            self.num_chars,
            cfg.embedding_size,
            cfg.audio['num_freq'],
            cfg.audio['num_mels'],
            cfg.r,
            attn_windowing=False)

        self.CONFIG.audio["preemphasis"] = 0.97
        self.ap = AudioProcessor(**self.CONFIG.audio)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if self.use_cuda:
            self.cp = torch.load(self.MODEL_PATH)
        else:
            self.cp = torch.load(
                self.MODEL_PATH, map_location=lambda storage, loc: storage)
        self.model.load_state_dict(self.cp['model'])

        if self.use_cuda:
            self.model.cuda()
        self.model.decoder.max_decoder_steps = 1000

        self.sentence = self.message
        _, _, _ = self.tts(self.model, self.sentence, self.CONFIG,
                           self.use_cuda, self.ap)
Example #13
0
    def load_model(self):
        """Build the model, restore its checkpoint and synthesize the message.

        Sets ``self.num_chars``, ``self.model``, ``self.ap``, ``self.cp`` and
        ``self.sentence``, and overwrites ``self.CONFIG.audio["preemphasis"]``
        with 0.97.
        """
        cfg = self.CONFIG

        # The input vocabulary depends on whether phonemes are used.
        if cfg.use_phonemes:
            self.num_chars = len(phonemes)
        else:
            self.num_chars = len(symbols)
        self.model = Tacotron(
            self.num_chars,
            cfg.embedding_size,
            cfg.audio['num_freq'],
            cfg.audio['num_mels'],
            cfg.r,
            attn_windowing=False)

        self.CONFIG.audio["preemphasis"] = 0.97
        self.ap = AudioProcessor(**self.CONFIG.audio)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if self.use_cuda:
            self.cp = torch.load(self.MODEL_PATH)
        else:
            self.cp = torch.load(
                self.MODEL_PATH, map_location=lambda storage, loc: storage)
        self.model.load_state_dict(self.cp['model'])

        if self.use_cuda:
            self.model.cuda()
        self.model.decoder.max_decoder_steps = 1000

        self.sentence = self.message
        _, _, _ = self.tts(self.model, self.sentence, self.CONFIG,
                           self.use_cuda, self.ap)
Example #14
0
class tts_class:
    """Sentence-wise text-to-speech built on Tacotron, spaCy and pydub-style
    playback helpers (``AudioSegment`` / ``play``) from the enclosing module.
    """

    def __init__(self):
        """Load config, Tacotron checkpoint, audio processor and spaCy model.

        All paths are hard-coded relative to ``TTS/tts_model/`` and CUDA is
        always requested.
        """
        # Hard-coded locations of the model artefacts.
        ROOT_PATH = 'TTS/tts_model/'
        MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'
        CONFIG_PATH = ROOT_PATH + '/config.json'
        self.CONFIG = load_config(CONFIG_PATH)
        self.use_cuda = True

        self.model = Tacotron(self.CONFIG.embedding_size, self.CONFIG.num_freq,
                              self.CONFIG.num_mels, self.CONFIG.r)

        self.ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels,
                                 self.CONFIG.min_level_db,
                                 self.CONFIG.frame_shift_ms,
                                 self.CONFIG.frame_length_ms,
                                 self.CONFIG.ref_level_db,
                                 self.CONFIG.num_freq, self.CONFIG.power,
                                 self.CONFIG.preemphasis, 60)

        # Restore the checkpoint; without CUDA the storages are returned
        # unchanged by the map_location callback.
        if self.use_cuda:
            cp = torch.load(MODEL_PATH)
        else:
            cp = torch.load(MODEL_PATH,
                            map_location=lambda storage, loc: storage)

        self.model.load_state_dict(cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()

        self.model.decoder.max_decoder_steps = 500

        self.nlp = spacy.load("en")

    def process(self, text):
        """Synthesize ``text`` and return the list of generated wav files."""
        self.model.decoder.max_decoder_steps = 500
        wavefiles = self.text2audio(text, self.model, self.CONFIG,
                                    self.use_cuda, self.ap)
        return wavefiles

    def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True):
        """Synthesize ``text`` and save the waveform to ``wavefile``.

        ``figures`` is accepted for interface compatibility and is unused.
        """
        waveform, alignment, spectrogram, stop_tokens = create_speech(
            model, text, CONFIG, use_cuda, ap)

        # Save with the processor that produced the audio (the original
        # ignored the ``ap`` argument and always used ``self.ap``).
        ap.save_wav(waveform, wavefile)

    def text2audio(self, text, model, CONFIG, use_cuda, ap):
        """Split ``text`` into sentences with spaCy and synthesize each one.

        Returns:
            The list of written file names, ``gen_0.wav``, ``gen_1.wav``, ...
        """
        wavefiles = []
        base_name = "gen_{}.wav"

        doc = self.nlp(text)
        for i, sent in enumerate(doc.sents):
            sentence = sent.text.strip()
            wavefile = base_name.format(i)
            self.tts(model, sentence, CONFIG, use_cuda, ap, wavefile)
            wavefiles.append(wavefile)

        return wavefiles

    def play(self, wavefiles):
        """Play ``wavefiles`` back to back, then delete them.

        The files are removed even when decoding or playback raises, so a
        failed playback does not leave generated files behind.
        """
        try:
            voice = AudioSegment.empty()
            for wavefile in wavefiles:
                voice += AudioSegment.from_wav(wavefile)

            # Resolves to the module-level ``play`` function, not this method.
            play(voice)
        finally:
            for w in wavefiles:
                try:
                    os.remove(w)
                except FileNotFoundError:
                    # Nothing to clean up for a file that was never written.
                    pass
Example #15
0
except:
    pass

# Synthesizer configuration; CUDA is used whenever torch reports it available.
CONFIG = load_config(CONFIG_PATH)
use_cuda = torch.cuda.is_available()

# Vocoder checkpoint and configuration.
VOCODER_MODEL_PATH = 'WaveRNN/saver.pth.tar'
VOCODER_CONFIG_PATH = 'WaveRNN/config_16K.json'
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)

# One audio processor per configuration (vocoder first, then synthesizer).
ap2 = AudioProcessor(**VOCODER_CONFIG.audio)
ap = AudioProcessor(**CONFIG.audio)

# The input vocabulary depends on whether phonemes are used.
num_chars = len(symbols) if not CONFIG.use_phonemes else len(phonemes)
model = Tacotron(
    num_chars,
    CONFIG.embedding_size,
    ap.num_freq,
    ap.num_mels,
    CONFIG.r,
    CONFIG.memory_size)

# Restore the checkpoint; without CUDA the storages are returned unchanged
# by the map_location callback.
cp = (torch.load(MODEL_PATH) if use_cuda else
      torch.load(MODEL_PATH, map_location=lambda storage, loc: storage))
model.load_state_dict(cp['model'])

if use_cuda:
    model.cuda()
model.eval()

bits = 10