def load_model(self, model_path, model_name, model_config, use_cuda):
    model_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > model config: ", model_config)
    print(" | > model file: ", self.model_file)
    config = load_config(model_config)
    self.config = config
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [self.config.text_cleaner], self.config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [self.config.text_cleaner])

    self.model = Tacotron(self.input_size, config.embedding_size,
                          self.ap.num_freq, self.ap.num_mels, config.r)
    # load model state
    if use_cuda:
        cp = torch.load(self.model_file)
    else:
        cp = torch.load(self.model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    if use_cuda:
        self.model.cuda()
    self.model.eval()
def load_tts(self, model_path, model_file, model_config, use_cuda):
    tts_config = os.path.join(model_path, model_config)
    self.model_file = os.path.join(model_path, model_file)
    print(" > Loading TTS model ...")
    print(" | > model config: ", tts_config)
    print(" | > model file: ", model_file)
    self.tts_config = load_config(tts_config)
    self.use_phonemes = self.tts_config.use_phonemes
    self.ap = AudioProcessor(**self.tts_config.audio)
    if self.use_phonemes:
        self.input_size = len(phonemes)
    else:
        self.input_size = len(symbols)
    # load speakers
    if self.config.tts_speakers is not None:
        self.tts_speakers = load_speaker_mapping(
            os.path.join(model_path, self.config.tts_speakers))
        num_speakers = len(self.tts_speakers)
    else:
        num_speakers = 0
    self.tts_model = setup_model(self.input_size,
                                 num_speakers=num_speakers,
                                 c=self.tts_config)
    # load model state
    cp = torch.load(self.model_file)
    # load the model
    self.tts_model.load_state_dict(cp['model'])
    if use_cuda:
        self.tts_model.cuda()
    self.tts_model.eval()
    self.tts_model.decoder.max_decoder_steps = 3000
    if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
        self.tts_model.decoder.set_r(cp['r'])
class Synthesizer(object):
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**config.audio)
        # 61 is the hardcoded size of the input symbol set this checkpoint
        # was trained with
        self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                              self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen += '.'
            print(sen)
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
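# Usage sketch for the Synthesizer above. The paths and file names are
# hypothetical, and the no-argument constructor is an assumption
# (load_model does all of the setup); this is not the server's actual
# entry point.
if __name__ == "__main__":
    synthesizer = Synthesizer()
    synthesizer.load_model("models/tacotron",            # hypothetical dir
                           "checkpoint_261000.pth.tar",  # hypothetical file
                           "config.json",
                           use_cuda=False)
    audio_buffer = synthesizer.tts("Hello world. This is a test.")
    with open("output.wav", "wb") as f:
        f.write(audio_buffer.getvalue())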
def main():
    ap = AudioProcessor()

    # load model
    num_chars = len(phonemes)
    model = Tacotron(num_chars).to(device)
    cp = torch.load(args.model_path)
    model.load_state_dict(cp['model'])
    model.eval()

    print('Text: {}'.format(args.text))
    wav = tts(model, args.text, ap)

    file_name = args.text.replace(' ', '_') + '.wav'
    out_path = os.path.join(args.out_path, file_name)
    ap.save_wav(wav, out_path)
def load_model(self, model_path, model_name, model_config, use_cuda):
    # build the config's path
    model_config = os.path.join(model_path, model_config)
    # build the model's path
    model_file = os.path.join(model_path, model_name)
    print(" > Loading model ...")
    print(" | > Model config path: ", model_config)
    print(" | > Model file path: ", model_file)
    config = load_config(model_config)
    self.use_cuda = use_cuda
    self.use_phonemes = config.use_phonemes
    self.ap = AudioProcessor(**config.audio)

    if self.use_phonemes:
        self.input_size = len(phonemes)
        self.input_adapter = lambda sen: phoneme_to_sequence(
            sen, [config.text_cleaner], config.phoneme_language)
    else:
        self.input_size = len(symbols)
        self.input_adapter = lambda sen: text_to_sequence(
            sen, [config.text_cleaner])

    self.model = Tacotron(num_chars=config['num_chars'],
                          embedding_dim=config['embedding_size'],
                          linear_dim=self.ap.num_freq,
                          mel_dim=self.ap.num_mels,
                          r=config['r'])
    # load model state
    if use_cuda:
        cp = torch.load(model_file)
    else:
        cp = torch.load(model_file,
                        map_location=lambda storage, loc: storage)
    # load the model
    self.model.load_state_dict(cp['model'])
    # if CUDA is enabled and available, move the model to the GPU
    if use_cuda:
        self.model.cuda()
    # switch layers such as dropout and batch norm to inference mode
    self.model.eval()
class TestAudio(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestAudio, self).__init__(*args, **kwargs)
        self.ap = AudioProcessor(**c.audio)

    def test_audio_synthesis(self):
        """ 1. load wav
            2. set normalization parameters
            3. extract mel-spec
            4. invert to wav and save the output
        """
        print(" > Sanity check for the process wav -> mel -> wav")

        def _test(max_norm, signal_norm, symmetric_norm, clip_norm):
            self.ap.max_norm = max_norm
            self.ap.signal_norm = signal_norm
            self.ap.symmetric_norm = symmetric_norm
            self.ap.clip_norm = clip_norm
            wav = self.ap.load_wav(INPUTPATH + "/example_1.wav")
            mel = self.ap.melspectrogram(wav)
            wav_ = self.ap.inv_mel_spectrogram(mel)
            file_name = "/audio_test-melspec_max_norm_{}-signal_norm_{}-symmetric_{}-clip_norm_{}.wav"\
                .format(max_norm, signal_norm, symmetric_norm, clip_norm)
            print(" | > Creating wav file at : ", file_name)
            self.ap.save_wav(wav_, OUTPATH + file_name)

        # maxnorm = 1.0
        _test(1., False, False, False)
        _test(1., True, False, False)
        _test(1., True, True, False)
        _test(1., True, False, True)
        _test(1., True, True, True)
        # maxnorm = 4.0
        _test(4., False, False, False)
        _test(4., True, False, False)
        _test(4., True, True, False)
        _test(4., True, False, True)
        _test(4., True, True, True)

    def test_normalize(self):
        """Check normalization and denormalization for range values and
        consistency."""
        print(" > Testing normalization and denormalization.")
        wav = self.ap.load_wav(INPUTPATH + "/example_1.wav")
        self.ap.signal_norm = False
        x = self.ap.melspectrogram(wav)
        x_old = x

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= 0 - 1, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = False
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm + 1, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm - 2, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.clip_norm = True
        self.ap.max_norm = 4.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        # check value range
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() <= 0, x_norm.min()
        # check denorm.
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3, (x - x_).mean()

        self.ap.signal_norm = True
        self.ap.symmetric_norm = False
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3

        self.ap.signal_norm = True
        self.ap.symmetric_norm = True
        self.ap.max_norm = 1.0
        x_norm = self.ap._normalize(x)
        print(x_norm.max(), " -- ", x_norm.min())
        assert (x_old - x).sum() == 0
        assert x_norm.max() <= self.ap.max_norm, x_norm.max()
        assert x_norm.min() >= -self.ap.max_norm, x_norm.min()
        assert x_norm.min() < 0, x_norm.min()
        x_ = self.ap._denormalize(x_norm)
        assert (x - x_).sum() < 1e-3
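# The value ranges asserted above follow from min-max scaling the
# log-magnitude spectrogram. A minimal sketch of that transform, mirroring
# the flags the tests toggle (an assumed re-implementation for
# illustration, not the project's exact AudioProcessor._normalize):
import numpy as np

def normalize(S, min_level_db=-100.0, max_norm=4.0, symmetric_norm=True,
              clip_norm=True):
    """Map a log-magnitude spectrogram S (values in [min_level_db, 0])
    to [0, max_norm], or to [-max_norm, max_norm] when symmetric."""
    S_norm = (S - min_level_db) / -min_level_db        # roughly [0, 1]
    if symmetric_norm:
        S_norm = 2 * max_norm * S_norm - max_norm      # roughly [-max_norm, max_norm]
        return np.clip(S_norm, -max_norm, max_norm) if clip_norm else S_norm
    S_norm = max_norm * S_norm                         # roughly [0, max_norm]
    return np.clip(S_norm, 0, max_norm) if clip_norm else S_norm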
class Synthesizer(object):
    def load_model(self, model_path, model_config, wavernn_path, use_cuda):
        self.model_file = model_path
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.config.text_cleaner])

        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r,
                              attn_windowing=True)
        self.model.decoder.max_decoder_steps = 8000
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

        self.vocoder = WaveRNNVocoder.Vocoder()
        self.vocoder.loadWeights(wavernn_path)
        self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False,
                                    fs=16000)

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def ttmel(self, text):
        """Generate a mel spectrogram from text.

        Splits text into chunks that are smaller than maxlen,
        preferably on punctuation."""
        mel_ret = []
        text_list = split_text(text, maxlen)
        for t in text_list:
            if len(t) < 3:
                continue
            seq = np.array(self.input_adapter(t))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, _, alignments, stop_tokens = self.model.forward(
                chars_var)
            mel_out = mel_out[0].data.cpu().numpy().T
            mel_ret.append(mel_out)
        return np.hstack(mel_ret)

    def tts(self, mel):
        wav = self.vocoder.melToWav(mel)
        return wav
    '--text_gst_prediction',
    type=bool,  # note: argparse's type=bool treats any non-empty string as True
    default=True,
    help='Predict style from the text itself for more dynamic speech.')
args = parser.parse_args()

if args.vocoder_path != "":
    from WaveRNN.models.wavernn import Model as VocoderModel

# load the config
C = load_config(args.config_path)
C.forward_attn_mask = True

# load the audio processor
ap = AudioProcessor(**C.audio)

# load speakers
if args.speakers_json != '':
    speakers = json.load(open(args.speakers_json, 'r'))
    num_speakers = len(speakers)
else:
    num_speakers = 0

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(args.model_path)
model.load_state_dict(cp['model'])
model.r = cp['r']
model.decoder.r = cp['r']
class LJSpeechDataset(Dataset):
    def __init__(self, csv_file, root_dir, outputs_per_step, sample_rate,
                 text_cleaner, num_mels, min_level_db, frame_shift_ms,
                 frame_length_ms, preemphasis, ref_level_db, num_freq, power,
                 min_seq_len=0):
        with open(csv_file, "r", encoding="utf8") as f:
            self.frames = [line.split('|') for line in f]
        self.root_dir = root_dir
        self.outputs_per_step = outputs_per_step
        self.sample_rate = sample_rate
        self.cleaners = text_cleaner
        self.min_seq_len = min_seq_len
        self.ap = AudioProcessor(sample_rate, num_mels, min_level_db,
                                 frame_shift_ms, frame_length_ms, preemphasis,
                                 ref_level_db, num_freq, power)
        print(" > Reading LJSpeech from - {}".format(root_dir))
        print(" | > Number of instances : {}".format(len(self.frames)))
        self._sort_frames()

    def load_wav(self, filename):
        try:
            audio = librosa.core.load(filename, sr=self.sample_rate)
            return audio
        except RuntimeError as e:
            print(" !! Cannot read file : {}".format(filename))

    def _sort_frames(self):
        r"""Sort sequences in ascending order"""
        lengths = np.array([len(ins[1]) for ins in self.frames])
        print(" | > Max length sequence {}".format(np.max(lengths)))
        print(" | > Min length sequence {}".format(np.min(lengths)))
        print(" | > Avg length sequence {}".format(np.mean(lengths)))

        idxs = np.argsort(lengths)
        new_frames = []
        ignored = []
        for i, idx in enumerate(idxs):
            length = lengths[idx]
            if length < self.min_seq_len:
                ignored.append(idx)
            else:
                new_frames.append(self.frames[idx])
        print(" | > {} instances are ignored by min_seq_len ({})".format(
            len(ignored), self.min_seq_len))
        self.frames = new_frames

    def __len__(self):
        return len(self.frames)

    def __getitem__(self, idx):
        wav_name = os.path.join(self.root_dir, self.frames[idx][0]) + '.wav'
        text = self.frames[idx][1]
        text = np.asarray(text_to_sequence(text, [self.cleaners]),
                          dtype=np.int32)
        wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
        sample = {'text': text, 'wav': wav, 'item_idx': self.frames[idx][0]}
        return sample

    def collate_fn(self, batch):
        r"""Perform preprocessing and create a final data batch:
        1. PAD sequences with the longest sequence in the batch
        2. Convert Audio signal to Spectrograms.
        3. PAD sequences that can be divided by r.
        4. Convert Numpy to Torch tensors.
        """
        # Puts each data field into a tensor with outer dimension batch size
        if isinstance(batch[0], collections.Mapping):
            wav = [d['wav'] for d in batch]
            item_idxs = [d['item_idx'] for d in batch]
            text = [d['text'] for d in batch]

            text_lenghts = np.array([len(x) for x in text])
            max_text_len = np.max(text_lenghts)

            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
            mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

            # compute 'stop token' targets
            stop_targets = [
                np.array([0.] * (mel_len - 1)) for mel_len in mel_lengths
            ]

            # PAD stop targets
            stop_targets = prepare_stop_target(stop_targets,
                                               self.outputs_per_step)

            # PAD sequences with largest length of the batch
            text = prepare_data(text).astype(np.int32)
            wav = prepare_data(wav)

            # PAD features with largest length + a zero frame
            linear = prepare_tensor(linear, self.outputs_per_step)
            mel = prepare_tensor(mel, self.outputs_per_step)
            assert mel.shape[2] == linear.shape[2]
            timesteps = mel.shape[2]

            # B x T x D
            linear = linear.transpose(0, 2, 1)
            mel = mel.transpose(0, 2, 1)

            # convert things to pytorch
            text_lenghts = torch.LongTensor(text_lenghts)
            text = torch.LongTensor(text)
            linear = torch.FloatTensor(linear)
            mel = torch.FloatTensor(mel)
            mel_lengths = torch.LongTensor(mel_lengths)
            stop_targets = torch.FloatTensor(stop_targets)

            return text, text_lenghts, linear, mel, mel_lengths, \
                stop_targets, item_idxs[0]

        raise TypeError(("batch must contain tensors, numbers, dicts or "
                         "lists; found {}".format(type(batch[0]))))
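# For reference: the stop targets built in collate_fn are zeros for every
# decoder frame, then padded with ones so the batch is rectangular and
# divisible by r. A minimal sketch of what prepare_stop_target is assumed
# to do (hypothetical re-implementation, not the repository's own helper):
import numpy as np

def prepare_stop_target(inputs, out_steps):
    """Pad each 0/1 stop vector with 1s up to the batch max length,
    rounded up to a multiple of out_steps (the decoder's r)."""
    max_len = max(len(x) for x in inputs)
    remainder = max_len % out_steps
    pad_len = max_len + (out_steps - remainder) if remainder else max_len
    return np.stack([
        np.pad(x, (0, pad_len - len(x)), mode='constant', constant_values=1.0)
        for x in inputs
    ])

# e.g. two items of lengths 5 and 3 with r=2 -> both padded to length 6,
# trailing frames marked 1.0 so the stopnet learns where speech ends
print(prepare_stop_target([np.zeros(5), np.zeros(3)], 2))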
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), \
                "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.tts_config.text_cleaner],
                self.tts_config.phoneme_language,
                self.tts_config.enable_eos_bos_chars)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.tts_config.text_cleaner])
        self.tts_model = setup_model(self.input_size, self.tts_config)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000

    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            pad=2,
            upsample_factors=self.wavernn_config.
            upsample_factors,  # set this depending on dataset
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )
        # load the checkpoint on CPU first so this also works without a GPU
        if use_cuda:
            check = torch.load(model_file)
        else:
            check = torch.load(model_file,
                               map_location=lambda storage, loc: storage)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        if len(sens) == 0:
            sens = [text + '.']
        for sen in sens:
            if len(sen) < 3:
                continue
            sen = sen.strip()
            print(sen)
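# Quick behavioral check of the splitter. The module-level regexes
# (prefixes, websites, alphabets, acronyms, suffixes, starters) are assumed
# to follow the common sentence-splitting recipe this method is based on,
# and `config` is whatever the server loaded:
synth = Synthesizer(config)
for s in synth.split_into_sentences("Dr. Smith arrived. Was he late? No!"):
    print(s)
# expected (with the usual prefix list):
#   Dr. Smith arrived. / Was he late? / No!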
def main(args):
    dataset = importlib.import_module('datasets.' + c.dataset)
    Dataset = getattr(dataset, 'MyDataset')
    audio = importlib.import_module('utils.' + c.audio_processor)
    AudioProcessor = getattr(audio, 'AudioProcessor')

    ap = AudioProcessor(sample_rate=c.sample_rate,
                        num_mels=c.num_mels,
                        min_level_db=c.min_level_db,
                        frame_shift_ms=c.frame_shift_ms,
                        frame_length_ms=c.frame_length_ms,
                        ref_level_db=c.ref_level_db,
                        num_freq=c.num_freq,
                        power=c.power,
                        preemphasis=c.preemphasis)

    # Setup the dataset
    train_dataset = Dataset(c.data_path,
                            c.meta_file_train,
                            c.r,
                            c.text_cleaner,
                            ap=ap,
                            min_seq_len=c.min_seq_len)
    train_loader = DataLoader(train_dataset,
                              batch_size=c.batch_size,
                              shuffle=False,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=False,
                              num_workers=c.num_loader_workers,
                              pin_memory=True)

    if c.run_eval:
        val_dataset = Dataset(c.data_path,
                              c.meta_file_val,
                              c.r,
                              c.text_cleaner,
                              ap=ap)
        val_loader = DataLoader(val_dataset,
                                batch_size=c.eval_batch_size,
                                shuffle=False,
                                collate_fn=val_dataset.collate_fn,
                                drop_last=False,
                                num_workers=4,
                                pin_memory=True)
    else:
        val_loader = None

    model = Tacotron(c.embedding_size, ap.num_freq, c.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        # optimizer_st.load_state_dict(checkpoint['optimizer_st'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    scheduler = AnnealLR(optimizer, warmup_steps=c.warmup_steps)
    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         train_loader, optimizer,
                                         optimizer_st, scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, val_loader, ap,
                            current_step)
        print(" | > Train Loss: {:.5f} Validation Loss: {:.5f}".format(
            train_loss, val_loss), flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
class TestTTSDataset(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        super(TestTTSDataset, self).__init__(*args, **kwargs)
        self.max_loader_iter = 4
        self.ap = AudioProcessor(**c.audio)

    def _create_dataloader(self, batch_size, r, bgs):
        dataset = TTSDataset.MyDataset(
            c.data_path,
            'metadata.csv',
            r,
            c.text_cleaner,
            preprocessor=ljspeech,
            ap=self.ap,
            batch_group_size=bgs,
            min_seq_len=c.min_seq_len,
            max_seq_len=float("inf"),
            use_phonemes=False)
        dataloader = DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=True,
            num_workers=c.num_loader_workers)
        return dataloader, dataset

    def test_loader(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                linear_input = data[2]
                mel_input = data[3]
                mel_lengths = data[4]
                stop_target = data[5]
                item_idx = data[6]

                neg_values = text_input[text_input < 0]
                check_count = len(neg_values)
                assert check_count == 0, \
                    " !! Negative values in text_input: {}".format(check_count)
                # TODO: more assertion here
                assert linear_input.shape[0] == c.batch_size
                assert linear_input.shape[2] == self.ap.num_freq
                assert mel_input.shape[0] == c.batch_size
                assert mel_input.shape[2] == c.audio['num_mels']
                # check normalization ranges
                if self.ap.symmetric_norm:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= -self.ap.max_norm
                    assert mel_input.min() < 0
                else:
                    assert mel_input.max() <= self.ap.max_norm
                    assert mel_input.min() >= 0

    def test_batch_group_shuffle(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(2, c.r, 16)
            last_length = 0
            frames = dataset.items
            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                mel_lengths = data[4]
                avg_length = mel_lengths.numpy().mean()
                assert avg_length >= last_length
            dataloader.dataset.sort_items()
            assert frames[0] != dataloader.dataset.items[0]

    def test_padding_and_spec(self):
        if ok_ljspeech:
            dataloader, dataset = self._create_dataloader(1, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                linear_input = data[2]
                mel_input = data[3]
                mel_lengths = data[4]
                stop_target = data[5]
                item_idx = data[6]

                # check mel_spec consistency
                wav = self.ap.load_wav(item_idx[0])
                mel = self.ap.melspectrogram(wav)
                mel_dl = mel_input[0].cpu().numpy()
                assert (abs(mel.T).astype("float32") -
                        abs(mel_dl[:-1])).sum() == 0

                # check mel-spec correctness
                mel_spec = mel_input[0].cpu().numpy()
                wav = self.ap.inv_mel_spectrogram(mel_spec.T)
                self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
                shutil.copy(item_idx[0],
                            OUTPATH + '/mel_target_dataloader.wav')

                # check linear-spec
                linear_spec = linear_input[0].cpu().numpy()
                wav = self.ap.inv_spectrogram(linear_spec.T)
                self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
                shutil.copy(item_idx[0],
                            OUTPATH + '/linear_target_dataloader.wav')

                # check the last time step to be zero padded
                assert linear_input[0, -1].sum() == 0
                assert linear_input[0, -2].sum() != 0
                assert mel_input[0, -1].sum() == 0
                assert mel_input[0, -2].sum() != 0
                assert stop_target[0, -1] == 1
                assert stop_target[0, -2] == 0
                assert stop_target.sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[0] == linear_input[0].shape[0]
                assert mel_lengths[0] == mel_input[0].shape[0]

            # Test for batch size 2
            dataloader, dataset = self._create_dataloader(2, 1, 0)

            for i, data in enumerate(dataloader):
                if i == self.max_loader_iter:
                    break
                text_input = data[0]
                text_lengths = data[1]
                linear_input = data[2]
                mel_input = data[3]
                mel_lengths = data[4]
                stop_target = data[5]
                item_idx = data[6]

                if mel_lengths[0] > mel_lengths[1]:
                    idx = 0
                else:
                    idx = 1

                # check the first item in the batch
                assert linear_input[idx, -1].sum() == 0
                assert linear_input[idx, -2].sum() != 0, linear_input
                assert mel_input[idx, -1].sum() == 0
                assert mel_input[idx, -2].sum() != 0, mel_input
                assert stop_target[idx, -1] == 1
                assert stop_target[idx, -2] == 0
                assert stop_target[idx].sum() == 1
                assert len(mel_lengths.shape) == 1
                assert mel_lengths[idx] == mel_input[idx].shape[0]
                assert mel_lengths[idx] == linear_input[idx].shape[0]

                # check the second item in the batch
                assert linear_input[1 - idx, -1].sum() == 0
                assert mel_input[1 - idx, -1].sum() == 0
                assert stop_target[1 - idx, -1] == 1
                assert len(mel_lengths.shape) == 1

                # check batch conditions
                assert (linear_input * stop_target.unsqueeze(2)).sum() == 0
                assert (mel_input * stop_target.unsqueeze(2)).sum() == 0
def main():
    ap = AudioProcessor()

    train_dataset = TTSDataset('data/LJSpeech-1.1', 'train.list',
                               outputs_per_step=r)
    valid_dataset = TTSDataset('data/LJSpeech-1.1', 'valid.list',
                               outputs_per_step=r)
    print('train data:', len(train_dataset))
    print('valid data:', len(valid_dataset))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        drop_last=False,
        num_workers=0,
        pin_memory=False)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=valid_dataset.collate_fn,
        drop_last=False,
        num_workers=0,
        pin_memory=False)

    # Create models
    num_chars = len(phonemes)
    model = Tacotron(num_chars, r=r).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.0)
    # StopNet is a binary classification task, so it gets its own optimizer
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=lr,
                              weight_decay=0.0)

    criterion = L1LossMasked()
    criterion_st = nn.BCEWithLogitsLoss()

    num_params = count_parameters(model)
    print('Model has {} parameters'.format(num_params))

    # Training
    best_loss = float('inf')
    global_step = 0
    for epoch in range(0, epochs + 1):
        train_loss, global_step = train(train_loader, model, criterion,
                                        criterion_st, optimizer, optimizer_st,
                                        ap, global_step, epoch)
        valid_loss = evaluate(valid_loader, model, criterion, criterion_st,
                              ap, global_step, epoch)
        print('Epoch [{}/{}] train_loss: {:.5f} valid_loss: {:.5f}'.format(
            epoch, epochs, train_loss, valid_loss))

        if valid_loss < best_loss:
            print(' => valid_loss improved from {:.5f} to {:.5f}!'.format(
                best_loss, valid_loss))
            new_state_dict = model.state_dict()
            state = {
                'model': new_state_dict,
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'linear_loss': valid_loss
            }
            best_loss = valid_loss
            best_model_path = os.path.join(writer.logdir, 'best_model.pth')
            torch.save(state, best_model_path)
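# The training loops above use L1LossMasked, which averages the L1 error
# only over valid (non-padded) decoder frames. A minimal sketch of that
# idea (an assumed re-implementation for illustration, not the
# repository's exact module):
import torch
import torch.nn as nn

class L1LossMasked(nn.Module):
    """L1 loss averaged only over frames inside each sequence length."""

    def forward(self, x, target, lengths):
        # x, target: (B, T, D); lengths: (B,)
        max_len = target.size(1)
        ids = torch.arange(max_len, device=target.device).unsqueeze(0)
        # mask: (B, T, 1) -> 1.0 for valid frames, 0.0 for padding
        mask = (ids < lengths.unsqueeze(1)).float().unsqueeze(2)
        loss = torch.abs(x - target) * mask
        # normalize by the number of valid elements, not the padded size
        return loss.sum() / (mask.sum() * target.size(2))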
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), \
                "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.tts_config.text_cleaner],
                self.tts_config.phoneme_language,
                self.tts_config.enable_eos_bos_chars)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.tts_config.text_cleaner])
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000

    def load_wavernn(self, lib_path, model_path, model_file, model_config,
                     use_cuda):
        # TODO: set a function in wavernn code base for model setup and call
        # it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )
        # load the checkpoint on CPU first so this also works without a GPU
        if use_cuda:
            check = torch.load(model_file)
        else:
            check = torch.load(model_file,
                               map_location=lambda storage, loc: storage)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            if len(sen) < 3:
                continue
            sen = sen.strip()
            print(sen)

            seq = np.array(self.input_adapter(sen))
            text_hat = sequence_to_phoneme(seq)
            print(text_hat)

            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            decoder_out, postnet_out, alignments, stop_tokens = \
                self.tts_model.inference(chars_var)
            postnet_out = postnet_out[0].data.cpu().numpy()
            if self.tts_config.model == "Tacotron":
                wav = self.ap.inv_spectrogram(postnet_out.T)
            elif self.tts_config.model == "Tacotron2":
                if self.wavernn:
                    wav = self.wavernn.generate(
                        torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(),
                        batched=self.config.is_wavernn_batched,
                        target=11000,
                        overlap=550)
                else:
                    wav = self.ap.inv_mel_spectrogram(postnet_out.T)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
parser.add_argument("--ignore_errors",
                    type=bool,
                    default=False,
                    help="ignore bad files.")
args = parser.parse_args()

config_path = args.config_path
CONFIG = load_config(config_path)

if args.data_path != '':
    CONFIG.data_path = args.data_path

if type(CONFIG.mode) is int:
    CONFIG.audio['bits'] = CONFIG.mode

ap = AudioProcessor(**CONFIG.audio)

SEG_PATH = CONFIG.data_path
# OUT_PATH = os.path.join(args.out_path, CONFIG.run_name, "data/")
OUT_PATH = args.out_path
QUANT_PATH = os.path.join(OUT_PATH, "quant/")
MEL_PATH = os.path.join(OUT_PATH, "mel/")
os.makedirs(OUT_PATH, exist_ok=True)
os.makedirs(QUANT_PATH, exist_ok=True)
os.makedirs(MEL_PATH, exist_ok=True)

wav_files = get_files(SEG_PATH)
print(" > Number of audio files : {}".format(len(wav_files)))

wav_file = wav_files[1]
m, quant, wav = process_file(wav_file)
def generate(self, mels, batched, target, overlap):
    self.eval()
    output = []
    start = time.time()
    rnn1 = self.get_gru_cell(self.rnn1)
    rnn2 = self.get_gru_cell(self.rnn2)

    with torch.no_grad():
        # mels = torch.FloatTensor(mels).cuda().unsqueeze(0)
        wave_len = (mels.size(-1) - 1) * self.hop_length
        mels = self.pad_tensor(mels.transpose(1, 2), pad=self.pad,
                               side='both')
        mels, aux = self.upsample(mels.transpose(1, 2))

        if batched:
            mels = self.fold_with_overlap(mels, target, overlap)
            if aux is not None:
                aux = self.fold_with_overlap(aux, target, overlap)

        b_size, seq_len, _ = mels.size()

        h1 = torch.zeros(b_size, self.rnn_dims).cuda()
        h2 = torch.zeros(b_size, self.rnn_dims).cuda()
        x = torch.zeros(b_size, 1).cuda()

        if self.use_aux_net:
            d = self.aux_dims
            aux_split = [aux[:, :, d * i:d * (i + 1)] for i in range(4)]

        for i in range(seq_len):
            m_t = mels[:, i, :]

            if self.use_aux_net:
                a1_t, a2_t, a3_t, a4_t = (a[:, i, :] for a in aux_split)

            x = torch.cat([x, m_t, a1_t], dim=1) if self.use_aux_net \
                else torch.cat([x, m_t], dim=1)
            x = self.I(x)
            h1 = rnn1(x, h1)

            x = x + h1
            inp = torch.cat([x, a2_t], dim=1) if self.use_aux_net else x
            h2 = rnn2(inp, h2)

            x = x + h2
            x = torch.cat([x, a3_t], dim=1) if self.use_aux_net else x
            x = F.relu(self.fc1(x))

            x = torch.cat([x, a4_t], dim=1) if self.use_aux_net else x
            x = F.relu(self.fc2(x))

            logits = self.fc3(x)

            if self.mode == 'mold':
                sample = sample_from_discretized_mix_logistic(
                    logits.unsqueeze(0).transpose(1, 2))
                output.append(sample.view(-1))
                x = sample.transpose(0, 1).cuda()
            elif self.mode == 'gauss':
                sample = sample_from_gaussian(
                    logits.unsqueeze(0).transpose(1, 2))
                output.append(sample.view(-1))
                x = sample.transpose(0, 1).cuda()
            elif type(self.mode) is int:
                posterior = F.softmax(logits, dim=1)
                distrib = torch.distributions.Categorical(posterior)
                sample = 2 * distrib.sample().float() / \
                    (self.n_classes - 1.) - 1.
                output.append(sample)
                x = sample.unsqueeze(-1)
            else:
                raise RuntimeError("Unknown model mode value - ", self.mode)

            if i % 100 == 0:
                self.gen_display(i, seq_len, b_size, start)

    output = torch.stack(output).transpose(0, 1)
    output = output.cpu().numpy()
    output = output.astype(np.float64)

    if batched:
        output = self.xfade_and_unfold(output, target, overlap)
    else:
        output = output[0]

    if self.mulaw and type(self.mode) == int:
        # `ap` is expected to be a module-level AudioProcessor here
        output = ap.mulaw_decode(output, self.mode)

    # Fade-out at the end to avoid signal cutting out suddenly
    fade_out = np.linspace(1, 0, 20 * self.hop_length)
    output = output[:wave_len]
    output[-20 * self.hop_length:] *= fade_out

    self.train()
    return output
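# The `batched` path above trades latency for throughput: the upsampled
# conditioning frames are folded into overlapping segments, generated in
# parallel along the batch dimension, then cross-faded back together by
# xfade_and_unfold. A toy illustration of the segment arithmetic
# (self-contained sketch, not the model's actual fold_with_overlap):
def fold_lengths(total, target, overlap):
    """Number of segments when folding `total` steps into chunks of
    `target` steps that share `overlap` steps with their neighbors."""
    num_folds = (total - overlap) // (target + overlap)
    extended = num_folds * (target + overlap) + overlap
    if extended < total:
        num_folds += 1  # one extra (padded) segment for the remainder
    return num_folds

# e.g. 44000 upsampled steps with target=11000, overlap=550 (the values
# the Synthesizer above passes to generate) -> 4 parallel segments
# instead of one 44000-step sequential loop
print(fold_lengths(44000, 11000, 550))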
    type=str,
    help='Path to save final wav file.',
)
args = parser.parse_args()

try:
    path = os.path.realpath(os.path.dirname(__file__))
except NameError as e:
    path = './'

C = load_config(os.path.join(path, 'pretrained_models/TTS/config.json'))
C.forward_attn_mask = False
C.windowing = True

# load the audio processor
ap = AudioProcessor(**C.audio)
num_speakers = 0

# load the model
num_chars = len(phonemes) if C.use_phonemes else len(symbols)
model = setup_model(num_chars, num_speakers, C)
cp = torch.load(os.path.join(path,
                             'pretrained_models/TTS/best_model.pth.tar'),
                map_location='cpu')
model.load_state_dict(cp['model'], strict=False)
model.r = cp['r']
model.decoder.r = cp['r']
model.eval()
if use_cuda:
    model.cuda()
def main(args):  # pylint: disable=redefined-outer-name
    # Audio processor
    ap = AudioProcessor(**c.audio)

    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)

    if c.use_speaker_embedding:
        speakers = get_speakers(c.data_path, c.meta_file_train, c.dataset)
        if args.restore_path:
            prev_out_path = os.path.dirname(args.restore_path)
            speaker_mapping = load_speaker_mapping(prev_out_path)
            assert all([speaker in speaker_mapping
                        for speaker in speakers]), \
                "As of now, you cannot introduce new speakers to a " \
                "previously trained model."
        else:
            speaker_mapping = {name: i for i, name in enumerate(speakers)}
        save_speaker_mapping(OUT_PATH, speaker_mapping)
        num_speakers = len(speaker_mapping)
        print("Training with {} speakers: {}".format(num_speakers,
                                                     ", ".join(speakers)))
    else:
        num_speakers = 0

    model = setup_model(num_chars, num_speakers, c)

    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    # optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer = Ranger(model.parameters(), lr=c.lr, weight_decay=c.wd)
    optimizer_gst = Ranger(model.textgst.parameters(), lr=c.lr,
                           weight_decay=c.wd) if c.text_gst else None

    if c.stopnet and c.separate_stopnet:
        optimizer_st = Ranger(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        optimizer_st = None

    if c.loss_masking:
        criterion = L1LossMasked() if c.model in [
            "Tacotron", "TacotronGST"
        ] else MSELossMasked()
    else:
        criterion = nn.L1Loss() if c.model in [
            "Tacotron", "TacotronGST"
        ] else nn.MSELoss()
    criterion_st = nn.BCEWithLogitsLoss() if c.stopnet else None
    criterion_gst = nn.L1Loss() if c.text_gst else None

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            # TODO: fix optimizer init, model.cuda() needs to be called
            # before optimizer restore
            # optimizer.load_state_dict(checkpoint['optimizer'])
            if c.reinit_layers:
                raise RuntimeError
            model.load_state_dict(checkpoint['model'])
        except:
            print(" > Partial model initialization.")
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
            del model_dict
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0

    if use_cuda:
        model = model.cuda()
        criterion.cuda()
        if criterion_st:
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    global_step = args.restore_step
    for epoch in range(0, c.epochs):
        # set gradual training
        if c.gradual_training is not None:
            r, c.batch_size = gradual_training_scheduler(global_step, c)
            c.r = r
            model.decoder.set_r(r)
        print(" > Number of outputs per iteration:", model.decoder.r)

        train_loss, global_step = train(model, criterion, criterion_st,
                                        optimizer, optimizer_st, scheduler,
                                        ap, global_step, epoch,
                                        criterion_gst=criterion_gst,
                                        optimizer_gst=optimizer_gst)
        if epoch % 5 == 0:
            val_loss = evaluate(model, criterion, criterion_st,
                                criterion_gst, ap, global_step, epoch)
            print(" | > Training Loss: {:.5f} Validation Loss: {:.5f}".format(
                train_loss, val_loss), flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, optimizer_st,
                                    optimizer_gst, target_loss, best_loss,
                                    OUT_PATH, global_step, epoch)
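# The gradual-training hook above swaps the decoder's reduction factor r
# and the batch size as training progresses. A minimal sketch of the
# scheduler it calls, assuming c.gradual_training is a list of
# [start_step, r, batch_size] triples (an assumption inferred from how the
# return values are unpacked above):
def gradual_training_scheduler(global_step, c):
    """Return (r, batch_size) from the last schedule entry whose start
    step has already been reached."""
    new_values = None
    for values in c.gradual_training:  # e.g. [[0, 7, 32], [10000, 5, 32]]
        if global_step >= values[0]:
            new_values = values
    return new_values[1], new_values[2]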
    raise FileNotFoundError('{} not found'.format(metadata_file))

melspec_dir = os.path.join(args.data_root, 'melspec')
if not os.path.exists(melspec_dir):
    os.makedirs(melspec_dir, exist_ok=True)

spec_dir = os.path.join(args.data_root, 'spec')
if not os.path.exists(spec_dir):
    os.makedirs(spec_dir, exist_ok=True)

phoneme_dir = os.path.join(args.data_root, 'phoneme')
if not os.path.exists(phoneme_dir):
    os.makedirs(phoneme_dir, exist_ok=True)

items = load_metadata(metadata_file)
ap = AudioProcessor()

for text, wav_file in tqdm(items):
    prefix = wav_file.replace('.wav', '')

    # generate the phoneme sequence
    generate_phoneme_sequence(text,
                              os.path.join(phoneme_dir, prefix + '.npy'))

    wav = np.array(ap.load_wav(os.path.join(wav_dir, wav_file)),
                   dtype=np.float32)

    # generate the mel spectrogram
    melspec = ap.melspectrogram(wav).astype('float32')
    np.save(os.path.join(melspec_dir, prefix + '.npy'), melspec)
def __init__(self, *args, **kwargs):
    super(TestTTSDatasetCached, self).__init__(*args, **kwargs)
    self.max_loader_iter = 4
    self.c = load_config(os.path.join(c.data_path_cache, 'config.json'))
    self.ap = AudioProcessor(**self.c.audio)
parser.add_argument("--num_procs",
                    type=int,
                    default=4,
                    help="number of parallel processes.")
parser.add_argument("--data_path",
                    type=str,
                    default='',
                    help="data path to overwrite config.json.")
args = parser.parse_args()

config_path = args.config_path
CONFIG = load_config(config_path)

if args.data_path != '':
    CONFIG.data_path = args.data_path

ap = AudioProcessor(**CONFIG.audio)

# Point SEG_PATH to a folder containing your training wavs.
# Any corpus layout works (LJSpeech, CMU Arctic, etc.).
SEG_PATH = CONFIG.data_path
OUT_PATH = os.path.join(CONFIG.out_path, CONFIG.run_name, "data/")
QUANT_PATH = os.path.join(OUT_PATH, "quant/")
MEL_PATH = os.path.join(OUT_PATH, "mel/")
os.makedirs(OUT_PATH, exist_ok=True)
os.makedirs(QUANT_PATH, exist_ok=True)
os.makedirs(MEL_PATH, exist_ok=True)

wav_files = get_files(SEG_PATH)
print(" > Number of audio files : {}".format(len(wav_files)))

wav_file = wav_files[1]
if args.restore_path:
    new_fields["restore_path"] = args.restore_path
new_fields["github_branch"] = get_git_branch()
copy_config_file(args.config_path, os.path.join(OUT_PATH, 'config.json'),
                 new_fields)
os.chmod(AUDIO_PATH, 0o775)
os.chmod(OUT_PATH, 0o775)

if args.rank == 0:
    LOG_DIR = OUT_PATH
    tb_logger = Logger(LOG_DIR)

# Conditional imports
preprocessor = importlib.import_module('datasets.preprocess')
preprocessor = getattr(preprocessor, c.dataset.lower())

# Audio processor
ap = AudioProcessor(**c.audio)

try:
    main(args)
except KeyboardInterrupt:
    try:
        sys.exit(0)
    except SystemExit:
        os._exit(0)
except Exception:
    remove_experiment_folder(OUT_PATH)
    traceback.print_exc()
    sys.exit(1)
if args.data_path != "":
    CONFIG.data_path = args.data_path
DATA_PATH = CONFIG.data_path

# DISTRIBUTED
if num_gpus > 1:
    init_distributed(
        args.rank,
        num_gpus,
        args.group_id,
        CONFIG.distributed["backend"],
        CONFIG.distributed["url"],
    )

global ap
ap = AudioProcessor(**CONFIG.audio)
mode = CONFIG.mode

# setup output paths and read configs
_ = os.path.dirname(os.path.realpath(__file__))
if args.data_path != "":
    CONFIG.data_path = args.data_path

if args.output_path == "":
    OUT_PATH = os.path.join(_, CONFIG.output_path)
else:
    OUT_PATH = args.output_path

if args.group_id == "":
    OUT_PATH = create_experiment_folder(OUT_PATH, CONFIG.model_name)