class Synthesizer(object):
    """Wrap a Tacotron checkpoint and turn text into a WAV byte stream.

    Call ``load_model`` once, then ``tts`` for every piece of text.
    """

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load the config and checkpoint and put the model in eval mode.

        Args:
            model_path: directory that holds both the config and the
                checkpoint file.
            model_name: checkpoint file name inside ``model_path``.
            model_config: config file name inside ``model_path``.
            use_cuda: when true the checkpoint is loaded as stored and the
                model is moved to the GPU; otherwise all storages are mapped
                to the CPU.
        """
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**config.audio)
        # NOTE(review): 61 is hard-coded as the first Tacotron argument;
        # presumably the size of the input symbol set -- confirm against the
        # symbol table used by the text front end.
        self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                              self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            # keep every storage on the CPU regardless of where it was saved
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        """Convert ``wav`` to a numpy array and write it via the audio processor.

        Args:
            wav: sequence of audio samples.
            path: destination handed to ``AudioProcessor.save_wav``; ``tts``
                passes an ``io.BytesIO`` here.
        """
        # NOTE: no peak normalization is applied here; samples are passed
        # through unchanged.
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    @staticmethod
    def _split_sentences(text):
        """Split ``text`` on '.' and return the cleaned sentences to synthesize.

        Fragments shorter than three raw characters are dropped, as are
        fragments that contain only whitespace; the latter would otherwise
        be synthesized as a lone ".". Every returned sentence is stripped
        and ends with a period.
        """
        sentences = []
        for fragment in text.split('.'):
            if len(fragment) < 3:
                continue
            fragment = fragment.strip()
            if not fragment:
                continue
            sentences.append(fragment + '.')
        return sentences

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text: input text; sentences are separated by '.'.

        Returns:
            The ``io.BytesIO`` object that was handed to ``save_wav``. The
            stream position is left wherever the writer left it.
        """
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in self._split_sentences(text):
            print(sen)
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # inference only: do not record an autograd graph
            with torch.no_grad():
                mel_out, linear_out, alignments, stop_tokens = \
                    self.model.forward(chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            # 10000 zero samples of silence after every sentence
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class Synthesizer(object):
    """Text-to-speech pipeline: a TTS model plus an optional WaveRNN vocoder.

    The constructor loads the TTS model described by ``config`` and, when
    ``config.wavernn_lib_path`` is set, a WaveRNN vocoder as well. ``tts``
    then converts text into a WAV byte stream.
    """

    def __init__(self, config):
        """Load all models named in ``config``.

        Args:
            config: object exposing ``use_cuda``, ``tts_path``, ``tts_file``,
                ``tts_config``, ``tts_speakers``, ``wavernn_lib_path``,
                ``wavernn_path``, ``wavernn_file``, ``wavernn_config`` and
                ``is_wavernn_batched``.
        """
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(
            ), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        """Load the TTS config and checkpoint and prepare the model.

        Args:
            model_path: directory holding the config, the checkpoint and
                (optionally) the speaker mapping.
            model_file: checkpoint file name inside ``model_path``.
            model_config: config file name inside ``model_path``.
            use_cuda: load onto / move the model to the GPU when true,
                otherwise map all storages to the CPU.
        """
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        # log the path that is actually loaded, not just the bare file name
        print(" | > model file: ", self.model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.tts_config.text_cleaner],
                self.tts_config.phoneme_language,
                self.tts_config.enable_eos_bos_chars)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.tts_config.text_cleaner])
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        # upper bound on decoder iterations at inference time
        self.tts_model.decoder.max_decoder_steps = 3000

    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        """Load the WaveRNN vocoder.

        Must be called after ``load_tts`` because the hop length and sample
        rate are read from ``self.ap``.

        Args:
            lib_path: directory appended to ``sys.path`` so that the
                ``WaveRNN`` package can be imported.
            model_path: directory holding the WaveRNN config and checkpoint.
            model_file: checkpoint file name inside ``model_path``.
            model_config: config file name inside ``model_path``.
            use_cuda: load onto / move the vocoder to the GPU when true,
                otherwise keep everything on the CPU.
        """
        # TODO: set a function in wavernn code base for model setup and call it here.
        # set this if TTS is not installed globally; avoid stacking duplicates
        # when the method is called more than once
        if lib_path not in sys.path:
            sys.path.append(lib_path)
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )
        # Respect use_cuda: the model used to be moved to the GPU
        # unconditionally and the checkpoint loaded without map_location,
        # which breaks on CPU-only machines.
        if use_cuda:
            check = torch.load(model_file)
        else:
            check = torch.load(model_file,
                               map_location=lambda storage, loc: storage)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        """Convert ``wav`` to a numpy array and write it via the audio processor.

        Args:
            wav: sequence of audio samples.
            path: destination handed to ``AudioProcessor.save_wav``; ``tts``
                passes an ``io.BytesIO`` here.
        """
        # NOTE: no peak normalization is applied here; samples are passed
        # through unchanged.
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        """Split ``text`` into sentences on '.', '?' and '!'.

        Periods that belong to abbreviations, acronyms, web sites, initials
        and similar (as matched by the module-level patterns ``prefixes``,
        ``websites``, ``alphabets``, ``acronyms``, ``starters`` and
        ``suffixes``) are temporarily replaced by ``<prd>`` so they do not
        end a sentence. Text after the last terminator is discarded.

        Returns:
            List of stripped sentences, each including its terminator.
        """
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        # move terminators outside closing quotes so the quote stays with
        # its sentence
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        # whatever follows the last terminator is not a complete sentence
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        """Synthesize ``text`` sentence by sentence.

        Args:
            text: input text. When no sentence terminator is found the whole
                text plus a trailing '.' is used as a single sentence.

        Returns:
            The ``io.BytesIO`` object that was handed to ``save_wav``.

        Raises:
            ValueError: if ``tts_config.model`` is neither "Tacotron" nor
                "Tacotron2".
        """
        wavs = []
        sens = self.split_into_sentences(text)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            if len(sen) < 3:
                continue
            sen = sen.strip()
            print(sen)
            seq = np.array(self.input_adapter(sen))
            text_hat = sequence_to_phoneme(seq)
            print(text_hat)
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # inference only: do not record an autograd graph
            with torch.no_grad():
                decoder_out, postnet_out, alignments, stop_tokens = \
                    self.tts_model.inference(chars_var)
            postnet_out = postnet_out[0].data.cpu().numpy()
            model_name = self.tts_config.model
            if model_name == "Tacotron":
                wav = self.ap.inv_spectrogram(postnet_out.T)
            elif model_name == "Tacotron2":
                if self.wavernn:
                    vocoder_input = torch.FloatTensor(
                        postnet_out.T).unsqueeze(0)
                    # only move to the GPU when the synthesizer runs on it
                    if self.use_cuda:
                        vocoder_input = vocoder_input.cuda()
                    wav = self.wavernn.generate(
                        vocoder_input,
                        batched=self.config.is_wavernn_batched,
                        target=11000,
                        overlap=550)
                else:
                    wav = self.ap.inv_mel_spectrogram(postnet_out.T)
            else:
                # previously `wav` was left unbound (or stale from the
                # previous sentence) for any other model name
                raise ValueError(
                    "Unsupported TTS model: {}".format(model_name))
            wavs += list(wav)
            # 10000 zero samples of silence after every sentence
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class TestTTSDataset(unittest.TestCase):
    """Data-loader checks for ``TTSDataset.MyDataset`` with the ljspeech preprocessor.

    Every test is skipped when the module-level ``ok_ljspeech`` flag is
    false. Batches are indexed positionally as (text_input, text_lengths,
    linear_input, mel_input, mel_lengths, stop_target, item_idx).
    """

    def __init__(self, *args, **kwargs):
        super(TestTTSDataset, self).__init__(*args, **kwargs)
        # number of batches inspected per test
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        """Build a dataset and a non-shuffling loader over it.

        Args:
            batch_size: batch size of the returned loader.
            r: forwarded as the third positional dataset argument.
            bgs: forwarded as ``batch_group_size``.

        Returns:
            Tuple ``(dataloader, dataset)``.
        """
        dataset = TTSDataset.MyDataset(
            c.data_path,
            'metadata.csv',
            r,
            c.text_cleaner,
            preprocessor=ljspeech,
            ap=self.ap,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),
            use_phonemes=False)
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers)
        return dataloader, dataset

    def test_loader(self):
        """Check batch shapes, non-negative text ids and mel value ranges."""
        if not ok_ljspeech:
            # report as skipped instead of silently passing
            self.skipTest("ok_ljspeech is false")
        batch_size = 2
        dataloader, dataset = self._create_dataloader(batch_size, c.r, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data[0]
            linear_input = data[2]
            mel_input = data[3]

            neg_values = text_input[text_input < 0]
            check_count = len(neg_values)
            assert check_count == 0, \
                " !! Negative values in text_input: {}".format(check_count)
            # TODO: more assertion here
            # compare against the batch size the loader was built with,
            # not c.batch_size, which only matched by coincidence
            assert linear_input.shape[0] == batch_size
            assert linear_input.shape[2] == self.ap.num_freq
            assert mel_input.shape[0] == batch_size
            assert mel_input.shape[2] == c.audio['num_mels']
            # check normalization ranges
            if self.ap.symmetric_norm:
                assert mel_input.max() <= self.ap.max_norm
                assert mel_input.min() >= -self.ap.max_norm
                assert mel_input.min() < 0
            else:
                assert mel_input.max() <= self.ap.max_norm
                assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        """Check that ``sort_items`` changes the first dataset item."""
        if not ok_ljspeech:
            self.skipTest("ok_ljspeech is false")
        dataloader, dataset = self._create_dataloader(2, c.r, 16)
        # NOTE(review): last_length is never updated, so the assertion
        # below only verifies that the average length is non-negative.
        last_length = 0
        frames = dataset.items
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            mel_lengths = data[4]
            avg_length = mel_lengths.numpy().mean()
            assert avg_length >= last_length
        dataloader.dataset.sort_items()
        assert frames[0] != dataloader.dataset.items[0]

    def test_padding_and_spec(self):
        """Check spectrogram consistency, zero padding and stop targets."""
        if not ok_ljspeech:
            self.skipTest("ok_ljspeech is false")
        dataloader, dataset = self._create_dataloader(1, 1, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            linear_input = data[2]
            mel_input = data[3]
            mel_lengths = data[4]
            stop_target = data[5]
            item_idx = data[6]

            # check mel_spec consistency
            wav = self.ap.load_wav(item_idx[0])
            mel = self.ap.melspectrogram(wav)
            mel_dl = mel_input[0].cpu().numpy()
            assert (
                abs(mel.T).astype("float32") - abs(mel_dl[:-1])).sum() == 0

            # check mel-spec correctness
            mel_spec = mel_input[0].cpu().numpy()
            wav = self.ap.inv_mel_spectrogram(mel_spec.T)
            self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
            shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')

            # check linear-spec
            linear_spec = linear_input[0].cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_spec.T)
            self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
            shutil.copy(item_idx[0],
                        OUTPATH + '/linear_target_dataloader.wav')

            # check the last time step to be zero padded
            assert linear_input[0, -1].sum() == 0
            assert linear_input[0, -2].sum() != 0
            assert mel_input[0, -1].sum() == 0
            assert mel_input[0, -2].sum() != 0
            assert stop_target[0, -1] == 1
            assert stop_target[0, -2] == 0
            assert stop_target.sum() == 1
            assert len(mel_lengths.shape) == 1
            assert mel_lengths[0] == linear_input[0].shape[0]
            assert mel_lengths[0] == mel_input[0].shape[0]

        # Test for batch size 2
        dataloader, dataset = self._create_dataloader(2, 1, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            linear_input = data[2]
            mel_input = data[3]
            mel_lengths = data[4]
            stop_target = data[5]

            # idx points at the longer item of the pair
            if mel_lengths[0] > mel_lengths[1]:
                idx = 0
            else:
                idx = 1

            # check the first item in the batch
            assert linear_input[idx, -1].sum() == 0
            assert linear_input[idx, -2].sum() != 0, linear_input
            assert mel_input[idx, -1].sum() == 0
            assert mel_input[idx, -2].sum() != 0, mel_input
            assert stop_target[idx, -1] == 1
            assert stop_target[idx, -2] == 0
            assert stop_target[idx].sum() == 1
            assert len(mel_lengths.shape) == 1
            assert mel_lengths[idx] == mel_input[idx].shape[0]
            assert mel_lengths[idx] == linear_input[idx].shape[0]

            # check the second item in the batch
            assert linear_input[1 - idx, -1].sum() == 0
            assert mel_input[1 - idx, -1].sum() == 0
            assert stop_target[1 - idx, -1] == 1
            assert len(mel_lengths.shape) == 1

            # check batch conditions
            assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
            assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
class Synthesizer(object):
    """Wrap a Tacotron checkpoint and turn text into a WAV byte stream.

    Call ``load_model`` once, then ``tts`` for every piece of text.
    """

    def load_model(self, model_path, model_name, model_config, use_cuda):
        """Load the config and checkpoint and prepare the model for inference.

        Args:
            model_path: directory holding both the config and the checkpoint.
            model_name: checkpoint file name inside ``model_path``.
            model_config: config file name inside ``model_path``.
            use_cuda: GPU flag; when false all checkpoint storages are mapped
                to the CPU and the model stays there.
        """
        # build the config's path
        model_config = os.path.join(model_path, model_config)
        # build the model's path
        model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > Model config path: ", model_config)
        print(" | > Model file path: ", model_file)
        config = load_config(model_config)
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)
        # input_adapter maps one sentence to a sequence of symbol ids
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [config.text_cleaner], config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [config.text_cleaner])
        # NOTE(review): the model is sized from config['num_chars'] while
        # self.input_size is computed above but not used here -- confirm
        # that the two agree.
        self.model = Tacotron(num_chars=config['num_chars'],
                              embedding_dim=config['embedding_size'],
                              linear_dim=self.ap.num_freq,
                              mel_dim=self.ap.num_mels,
                              r=config['r'])
        # load model state
        if use_cuda:
            cp = torch.load(model_file)
        else:
            cp = torch.load(model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        # move the parameters to the GPU when requested
        if use_cuda:
            self.model.cuda()
        # switch the module to evaluation mode
        self.model.eval()

    def save_wav(self, wav, path):
        """Save the wav at the given destination.

        Args:
            wav: sequence of audio samples.
            path: destination handed to ``AudioProcessor.save_wav``; ``tts``
                passes both an ``io.BytesIO`` and a file name.
        """
        # NOTE: no peak normalization is applied here; samples are passed
        # through unchanged.
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def tts(self, text, gl_mode=None):
        """Split ``text`` into sentences, run the model and return the audio.

        Args:
            text: input text; sentences are separated by '.'.
            gl_mode: forwarded unchanged as the second argument of
                ``AudioProcessor.inv_spectrogram``.

        Returns:
            The ``io.BytesIO`` object that was handed to ``save_wav``.
        """
        wavs = []
        # split the input in sentences
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            # a whitespace-only fragment (e.g. trailing blanks after the
            # last period) would otherwise be synthesized as a lone "."
            if not sen:
                continue
            sen += '.'
            # sentence => symbol ids
            seq = np.array(self.input_adapter(sen))
            # numpy to pytorch tensor with a leading batch dimension
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            # inference only: do not record an autograd graph
            with torch.no_grad():
                mel_out, linear_out, alignments, stop_tokens = \
                    self.model.forward(chars_var)
            # move output tensor to cpu
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T, gl_mode)
            wavs += list(wav)
            # 10000 zero samples of silence after every sentence
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        # NOTE(review): every call also writes 'gla.wav' relative to the
        # current working directory; this looks like a debugging aid and
        # concurrent calls overwrite each other -- confirm it is wanted.
        self.save_wav(wavs, 'gla.wav')
        return out
class TestTTSDataset(unittest.TestCase):
    """Data-loader checks for ``TTSDataset.MyDataset`` fed with ljspeech metadata.

    Every test is skipped when the module-level ``ok_ljspeech`` flag is
    false. Batches are indexed positionally as (text_input, text_lengths,
    speaker_name, linear_input, mel_input, mel_lengths, stop_target,
    item_idx).
    """

    def __init__(self, *args, **kwargs):
        super(TestTTSDataset, self).__init__(*args, **kwargs)
        # number of batches inspected per test
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        """Build a dataset and a non-shuffling loader over it.

        Args:
            batch_size: batch size of the returned loader.
            r: forwarded as the first positional dataset argument.
            bgs: forwarded as ``batch_group_size``.

        Returns:
            Tuple ``(dataloader, dataset)``.
        """
        items = ljspeech(c.data_path, 'metadata.csv')
        dataset = TTSDataset.MyDataset(r,
                                       c.text_cleaner,
                                       ap=self.ap,
                                       meta_data=items,
                                       batch_group_size=bgs,
                                       min_seq_len=c.min_seq_len,
                                       max_seq_len=float("inf"),
                                       use_phonemes=False)
        dataloader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                collate_fn=dataset.collate_fn,
                                drop_last=True,
                                num_workers=c.num_loader_workers)
        return dataloader, dataset

    def test_loader(self):
        """Check batch shapes, speaker names, text ids and mel value ranges."""
        if not ok_ljspeech:
            # report as skipped instead of silently passing
            self.skipTest("ok_ljspeech is false")
        batch_size = 2
        dataloader, dataset = self._create_dataloader(batch_size, c.r, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            text_input = data[0]
            speaker_name = data[2]
            linear_input = data[3]
            mel_input = data[4]

            neg_values = text_input[text_input < 0]
            check_count = len(neg_values)
            assert check_count == 0, \
                " !! Negative values in text_input: {}".format(check_count)
            # TODO: more assertion here
            assert isinstance(speaker_name[0], str)
            # compare against the batch size the loader was built with,
            # not c.batch_size, which only matched by coincidence
            assert linear_input.shape[0] == batch_size
            assert linear_input.shape[2] == self.ap.num_freq
            assert mel_input.shape[0] == batch_size
            assert mel_input.shape[2] == c.audio['num_mels']
            # check normalization ranges
            if self.ap.symmetric_norm:
                assert mel_input.max() <= self.ap.max_norm
                assert mel_input.min() >= -self.ap.max_norm
                assert mel_input.min() < 0
            else:
                assert mel_input.max() <= self.ap.max_norm
                assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        """Check that ``sort_items`` changes the order of the dataset items."""
        if not ok_ljspeech:
            self.skipTest("ok_ljspeech is false")
        dataloader, dataset = self._create_dataloader(2, c.r, 16)
        # NOTE(review): last_length is never updated, so the assertion
        # below only verifies that the average length is non-negative.
        last_length = 0
        frames = dataset.items
        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            mel_lengths = data[5]
            avg_length = mel_lengths.numpy().mean()
            assert avg_length >= last_length
        dataloader.dataset.sort_items()
        # at least one position must hold a different item after sorting
        is_items_reordered = any(
            item != frame
            for item, frame in zip(dataloader.dataset.items, frames))
        assert is_items_reordered

    def test_padding_and_spec(self):
        """Check spectrogram consistency, zero padding and stop targets."""
        if not ok_ljspeech:
            self.skipTest("ok_ljspeech is false")
        dataloader, dataset = self._create_dataloader(1, 1, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            linear_input = data[3]
            mel_input = data[4]
            mel_lengths = data[5]
            stop_target = data[6]
            item_idx = data[7]

            # check mel_spec consistency
            wav = np.asarray(self.ap.load_wav(item_idx[0]),
                             dtype=np.float32)
            mel = self.ap.melspectrogram(wav).astype('float32')
            mel = torch.FloatTensor(mel).contiguous()
            mel_dl = mel_input[0]
            # NOTE: Below needs to check == 0 but due to an unknown reason
            # there is a slight difference between two matrices.
            # TODO: Check this assert cond more in detail.
            mel_diff = (abs(mel.T) - abs(mel_dl[:-1])).sum()
            assert abs(mel_diff) < 1e-5, mel_diff

            # check mel-spec correctness
            mel_spec = mel_input[0].cpu().numpy()
            wav = self.ap.inv_mel_spectrogram(mel_spec.T)
            self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
            shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')

            # check linear-spec
            linear_spec = linear_input[0].cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_spec.T)
            self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
            shutil.copy(item_idx[0],
                        OUTPATH + '/linear_target_dataloader.wav')

            # check the last time step to be zero padded
            assert linear_input[0, -1].sum() == 0
            assert linear_input[0, -2].sum() != 0
            assert mel_input[0, -1].sum() == 0
            assert mel_input[0, -2].sum() != 0
            assert stop_target[0, -1] == 1
            assert stop_target[0, -2] == 0
            assert stop_target.sum() == 1
            assert len(mel_lengths.shape) == 1
            assert mel_lengths[0] == linear_input[0].shape[0]
            assert mel_lengths[0] == mel_input[0].shape[0]

        # Test for batch size 2
        dataloader, dataset = self._create_dataloader(2, 1, 0)

        for i, data in enumerate(dataloader):
            if i == self.max_loader_iter:
                break
            linear_input = data[3]
            mel_input = data[4]
            mel_lengths = data[5]
            stop_target = data[6]

            # idx points at the longer item of the pair
            if mel_lengths[0] > mel_lengths[1]:
                idx = 0
            else:
                idx = 1

            # check the first item in the batch
            assert linear_input[idx, -1].sum() == 0
            assert linear_input[idx, -2].sum() != 0, linear_input
            assert mel_input[idx, -1].sum() == 0
            assert mel_input[idx, -2].sum() != 0, mel_input
            assert stop_target[idx, -1] == 1
            assert stop_target[idx, -2] == 0
            assert stop_target[idx].sum() == 1
            assert len(mel_lengths.shape) == 1
            assert mel_lengths[idx] == mel_input[idx].shape[0]
            assert mel_lengths[idx] == linear_input[idx].shape[0]

            # check the second item in the batch
            assert linear_input[1 - idx, -1].sum() == 0
            assert mel_input[1 - idx, -1].sum() == 0
            assert stop_target[1 - idx, -1] == 1
            assert len(mel_lengths.shape) == 1

            # check batch conditions
            assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
            assert (mel_input * stop_target.unsqueeze(2)).sum() == 0