예제 #1
0
 def test_train_step(self):
     input = torch.randint(0, 24, (8, 128)).long().to(device)
     mel_spec = torch.rand(8, 30, c.num_mels).to(device)
     linear_spec = torch.rand(8, 30, c.num_freq).to(device)
     mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
     criterion = L1LossMasked().to(device)
     model = Tacotron(c.embedding_size, c.num_freq, c.num_mels,
                      c.r).to(device)
     model.train()
     model_ref = copy.deepcopy(model)
     count = 0
     for param, param_ref in zip(model.parameters(),
                                 model_ref.parameters()):
         assert (param - param_ref).sum() == 0, param
         count += 1
     optimizer = optim.Adam(model.parameters(), lr=c.lr)
     for i in range(5):
         mel_out, linear_out, align = model.forward(input, mel_spec)
         optimizer.zero_grad()
         loss = criterion(mel_out, mel_spec, mel_lengths)
         loss = 0.5 * loss + 0.5 * criterion(linear_out, linear_spec,
                                             mel_lengths)
         loss.backward()
         optimizer.step()
     # check parameter changes
     count = 0
     for param, param_ref in zip(model.parameters(),
                                 model_ref.parameters()):
         # ignore pre-higway layer since it works conditional
         if count not in [139, 59]:
             assert (param != param_ref).any(
             ), "param {} with shape {} not updated!! \n{}\n{}".format(
                 count, param.shape, param, param_ref)
         count += 1
예제 #2
0
    def test_train_step():
        input_dummy = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 120, c.audio['num_mels']).to(device)
        linear_spec = torch.rand(8, 120, c.audio['num_freq']).to(device)
        mel_lengths = torch.randint(20, 120, (8, )).long().to(device)
        mel_lengths[-1] = 120
        stop_targets = torch.zeros(8, 120, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(input_dummy.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) >
                        0.0).unsqueeze(2).float().squeeze()

        criterion = L1LossMasked(seq_len_norm=False).to(device)
        criterion_st = nn.BCEWithLogitsLoss().to(device)
        model = Tacotron(
            num_chars=32,
            num_speakers=5,
            gst=True,
            postnet_output_dim=c.audio['num_freq'],
            decoder_output_dim=c.audio['num_mels'],
            r=c.r,
            memory_size=c.memory_size
        ).to(device)  #FIXME: missing num_speakers parameter to Tacotron ctor
        model.train()
        print(model)
        print(" > Num parameters for Tacotron GST model:%s" %
              (count_parameters(model)))
        model_ref = copy.deepcopy(model)
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for _ in range(10):
            mel_out, linear_out, align, stop_tokens = model.forward(
                input_dummy, input_lengths, mel_spec, mel_lengths, speaker_ids)
            optimizer.zero_grad()
            loss = criterion(mel_out, mel_spec, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            loss = loss + criterion(linear_out, linear_spec,
                                    mel_lengths) + stop_loss
            loss.backward()
            optimizer.step()
        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            # ignore pre-higway layer since it works conditional
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
            count += 1
예제 #3
0
class Synthesizer(object):

    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)        
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.model = Tacotron(config.embedding_size, config.num_freq, config.num_mels, config.r)
        self.ap = AudioProcessor(config.sample_rate, config.num_mels, config.min_level_db,
                                 config.frame_shift_ms, config.frame_length_ms, config.preemphasis,
                                 config.ref_level_db, config.num_freq, config.power, griffin_lim_iters=60)  
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()       
    
    def save_wav(self, wav, path):
        wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        # sf.write(path, wav.astype(np.int32), self.config.sample_rate, format='wav')
        # wav = librosa.util.normalize(wav.astype(np.float), norm=np.inf, axis=None)
        # wav = wav / wav.max()
        # sf.write(path, wav.astype('float'), self.config.sample_rate, format='ogg')
        scipy.io.wavfile.write(path, self.config.sample_rate, wav.astype(np.int16))
        # librosa.output.write_wav(path, wav.astype(np.int16), self.config.sample_rate, norm=True)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen +='.'
            print(sen)
            sen = sen.strip()
            seq = np.array(text_to_sequence(text, text_cleaner))
            chars_var = torch.from_numpy(seq).unsqueeze(0)
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            # wav = wav[:self.ap.find_endpoint(wav)]
            out = io.BytesIO()
            wavs.append(wav)
            wavs.append(np.zeros(10000))
        self.save_wav(wav, out)
        return out
예제 #4
0
    def test_train_step(self):
        input = torch.randint(0, 24, (8, 128)).long().to(device)
        mel_spec = torch.rand(8, 30, c.num_mels).to(device)
        linear_spec = torch.rand(8, 30, c.num_freq).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(input.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()

        criterion = L1LossMasked().to(device)
        criterion_st = nn.BCELoss().to(device)
        model = Tacotron(c.embedding_size, c.num_freq, c.num_mels,
                         c.r).to(device)
        model.train()
        model_ref = copy.deepcopy(model)
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for i in range(5):
            mel_out, linear_out, align, stop_tokens = model.forward(
                input, mel_spec)
            assert stop_tokens.data.max() <= 1.0
            assert stop_tokens.data.min() >= 0.0
            optimizer.zero_grad()
            loss = criterion(mel_out, mel_spec, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            loss = loss + criterion(linear_out, linear_spec,
                                    mel_lengths) + stop_loss
            loss.backward()
            optimizer.step()
        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            # ignore pre-higway layer since it works conditional
            if count not in [145, 59]:
                assert (param != param_ref).any(
                ), "param {} with shape {} not updated!! \n{}\n{}".format(
                    count, param.shape, param, param_ref)
            count += 1
def tts(text,
        model_path='model/best_model.pth.tar',
        config_path='model/config.json',
        use_cuda=False):
    CONFIG = load_config(config_path)
    model = Tacotron(CONFIG.embedding_size, CONFIG.num_freq, CONFIG.num_mels,
                     CONFIG.r)
    if use_cuda:
        cp = torch.load(model_path + seq_to_seq_test_model_fname,
                        map_location='cuda:0')
    else:
        cp = torch.load(model_path, map_location=lambda storage, loc: storage)
    model.load_state_dict(cp['model'])
    if use_cuda:
        model.cuda()
    model.eval()
    model.decoder.max_decoder_steps = 250
    ap = AudioProcessor(CONFIG.sample_rate,
                        CONFIG.num_mels,
                        CONFIG.min_level_db,
                        CONFIG.frame_shift_ms,
                        CONFIG.frame_length_ms,
                        CONFIG.ref_level_db,
                        CONFIG.num_freq,
                        CONFIG.power,
                        CONFIG.preemphasis,
                        griffin_lim_iters=50)
    t_1 = time.time()
    text_cleaner = [CONFIG.text_cleaner]
    seq = np.array(text_to_sequence(text, text_cleaner))
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()
    linear_out = model.forward(chars_var.long())
    linear_out = linear_out[0].data.cpu().numpy()
    waveform = ap.inv_spectrogram(linear_out.T)
    waveform = waveform[:ap.find_endpoint(waveform)]
    out_path = 'static/samples/'
    os.makedirs(out_path, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(out_path, file_name)
    ap.save_wav(waveform, out_path)
    # print(" >  Run-time: {}".format(time.time() - t_1))

    return file_name