def train(self, model: Tacotron, optimizer: Optimizer) -> None: for i, session_params in enumerate(hp.tts_schedule, 1): r, lr, max_step, bs = session_params if model.get_step() < max_step: train_set, val_set = get_tts_datasets( path=self.paths.data, batch_size=bs, r=r, model_type='tacotron') session = TTSSession( index=i, r=r, lr=lr, max_step=max_step, bs=bs, train_set=train_set, val_set=val_set) self.train_session(model, optimizer, session)
def train(self, model: Tacotron, optimizer: Optimizer) -> None: tts_schedule = self.train_cfg['schedule'] tts_schedule = parse_schedule(tts_schedule) for i, session_params in enumerate(tts_schedule, 1): r, lr, max_step, bs = session_params if model.get_step() < max_step: train_set, val_set = get_tts_datasets( path=self.paths.data, batch_size=bs, r=r, model_type='tacotron', max_mel_len=self.train_cfg['max_mel_len'], filter_attention=False) session = TTSSession(index=i, r=r, lr=lr, max_step=max_step, bs=bs, train_set=train_set, val_set=val_set) self.train_session(model, optimizer, session=session)
lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout).cuda() paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) model.restore(paths.tts_latest_weights) # model.reset_step() # model.set_r(hp.tts_r) optimiser = optim.Adam(model.parameters()) current_step = model.get_step() if not force_gta: for session in hp.tts_schedule: r, lr, max_step, batch_size = session if current_step < max_step: train_set, attn_example = get_tts_dataset( paths.data, batch_size, r) model.set_r(r) training_steps = max_step - current_step
num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) tts_model.load('quick_start/tts_weights/latest_weights.pyt') if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('sentences.txt') as f: inputs = [ text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f ] voc_k = voc_model.get_step() // 1000 tts_k = tts_model.get_step() // 1000 r = tts_model.r simple_table([('WaveRNN', str(voc_k) + 'k'), (f'Tacotron(r={r})', str(tts_k) + 'k'), ('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', 11_000 if batched else 'N/A'), ('Overlap Samples', 550 if batched else 'N/A')]) for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') _, m, attention = tts_model.generate(x) if input_text:
def main(): # Parse Arguments parser = argparse.ArgumentParser(description='Train Tacotron TTS') parser.add_argument('--force_train', '-f', action='store_true', help='Forces the model to train past total steps') parser.add_argument('--force_gta', '-g', action='store_true', help='Force the model to create GTA features') parser.add_argument( '--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment') parser.add_argument('--hp_file', metavar='FILE', default='hparams.py', help='The file to use for the hyperparameters') args = parser.parse_args() hp.configure(args.hp_file) # Load hparams from file paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) force_train = args.force_train force_gta = args.force_gta if not args.force_cpu and torch.cuda.is_available(): device = torch.device('cuda') for session in hp.tts_schedule: _, _, _, batch_size = session if batch_size % torch.cuda.device_count() != 0: raise ValueError( '`batch_size` must be evenly divisible by n_gpus!') else: device = torch.device('cpu') print('Using device:', device) # Instantiate Tacotron Model print('\nInitialising Tacotron Model...\n') model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) optimizer = optim.Adam(model.parameters()) restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True) if not force_gta: for i, session in enumerate(hp.tts_schedule): current_step = model.get_step() r, lr, max_step, batch_size = session training_steps = max_step - current_step # Do we need to change to the next session? if current_step >= max_step: # Are there no further sessions than the current one? if i == len(hp.tts_schedule) - 1: # There are no more sessions. Check if we force training. if force_train: # Don't finish the loop - train forever training_steps = 999_999_999 else: # We have completed training. Breaking is same as continue break else: # There is a following session, go to it continue model.r = r simple_table([('Steps with r=%s' % (repr1(r)), str(training_steps // 1000) + 'k Steps'), ('Batch Size', batch_size), ('Learning Rate', lr), ('Outputs/Step (r)', model.r)]) train_set, attn_example = get_tts_datasets(paths.data, batch_size, r) tts_train_loop(paths, model, optimizer, train_set, lr, training_steps, attn_example) print('Training Complete.') print( 'To continue training increase tts_total_steps in hparams.py or use --force_train\n' ) print('Creating Ground Truth Aligned Dataset...\n') train_set, attn_example = get_tts_datasets(paths.data, 8, model.r) create_gta_features(model, train_set, paths.gta) print( '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n' )
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example): device = next( model.parameters()).device # use same device as model parameters for g in optimizer.param_groups: g['lr'] = lr total_iters = len(train_set) epochs = train_steps // total_iters + 1 for e in range(1, epochs + 1): start = time.time() running_loss = 0 # Perform 1 epoch for i, (x, m, ids, _) in enumerate(train_set, 1): x, m = x.to(device), m.to(device) # Parallelize model onto GPUS using workaround due to python bug if device.type == 'cuda' and torch.cuda.device_count() > 1: m1_hat, m2_hat, attention = data_parallel_workaround( model, x, m) else: m1_hat, m2_hat, attention = model(x, m) m1_loss = F.l1_loss(m1_hat, m) m2_loss = F.l1_loss(m2_hat, m) loss = m1_loss + m2_loss optimizer.zero_grad() loss.backward() if hp.tts_clip_grad_norm is not None: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hp.tts_clip_grad_norm) if np.isnan(grad_norm): print('grad_norm was NaN!') optimizer.step() running_loss += loss.item() avg_loss = running_loss / i speed = i / (time.time() - start) step = model.get_step() k = step // 1000 if step % hp.tts_checkpoint_every == 0: ckpt_name = 'taco_step%sK' % (repr1(k)) save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True) if attn_example in ids: idx = ids.index(attn_example) save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / '%s' % (repr1(step))) save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / '%s' % (repr1(step)), 600) msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % ( repr1(e), repr1(epochs), repr1(i), repr1(total_iters), avg_loss, speed, repr1(k)) stream(msg) # Must save latest optimizer state to ensure that resuming training # doesn't produce artifacts save_checkpoint('tts', paths, model, optimizer, is_silent=True) model.log(paths.tts_log, msg) print(' ')
def train_session(self, model: Tacotron, optimizer: Optimizer, session: TTSSession) -> None: current_step = model.get_step() training_steps = session.max_step - current_step total_iters = len(session.train_set) epochs = training_steps // total_iters + 1 model.r = session.r simple_table([(f'Steps with r={session.r}', str(training_steps // 1000) + 'k Steps'), ('Batch Size', session.bs), ('Learning Rate', session.lr), ('Outputs/Step (r)', model.r)]) for g in optimizer.param_groups: g['lr'] = session.lr loss_avg = Averager() duration_avg = Averager() device = next( model.parameters()).device # use same device as model parameters for e in range(1, epochs + 1): for i, (x, m, ids, x_lens, mel_lens) in enumerate(session.train_set, 1): start = time.time() model.train() x, m = x.to(device), m.to(device) m1_hat, m2_hat, attention = model(x, m) m1_loss = F.l1_loss(m1_hat, m) m2_loss = F.l1_loss(m2_hat, m) loss = m1_loss + m2_loss optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm) optimizer.step() loss_avg.add(loss.item()) step = model.get_step() k = step // 1000 duration_avg.add(time.time() - start) speed = 1. / duration_avg.get() msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \ f'| {speed:#.2} steps/s | Step: {k}k | ' if step % hp.tts_checkpoint_every == 0: ckpt_name = f'taco_step{k}K' save_checkpoint('tts', self.paths, model, optimizer, name=ckpt_name, is_silent=True) if step % hp.tts_plot_every == 0: self.generate_plots(model, session) _, att_score = attention_score(attention, mel_lens) att_score = torch.mean(att_score) self.writer.add_scalar('Attention_Score/train', att_score, model.get_step()) self.writer.add_scalar('Loss/train', loss, model.get_step()) self.writer.add_scalar('Params/reduction_factor', session.r, model.get_step()) self.writer.add_scalar('Params/batch_size', session.bs, model.get_step()) self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step()) stream(msg) val_loss, val_att_score = self.evaluate(model, session.val_set) self.writer.add_scalar('Loss/val', val_loss, model.get_step()) self.writer.add_scalar('Attention_Score/val', val_att_score, model.get_step()) save_checkpoint('tts', self.paths, model, optimizer, is_silent=True) loss_avg.reset() duration_avg.reset() print(' ')
class TaiwaneseTacotron(): def __init__(self): # Parse Arguments parser = argparse.ArgumentParser(description='TTS') self.args = parser.parse_args() self.args.vocoder = 'wavernn' self.args.hp_file = 'hparams.py' self.args.voc_weights = False self.args.tts_weights = False self.args.save_attn = False self.args.batched = True self.args.target = None self.args.overlap = None self.args.force_cpu = False #================ vocoder ================# if self.args.vocoder in ['griffinlim', 'gl']: self.args.vocoder = 'griffinlim' elif self.args.vocoder in ['wavernn', 'wr']: self.args.vocoder = 'wavernn' else: raise argparse.ArgumentError('Must provide a valid vocoder type!') hp.configure(self.args.hp_file) # Load hparams from file # set defaults for any arguments that depend on hparams if self.args.vocoder == 'wavernn': if self.args.target is None: self.args.target = hp.voc_target if self.args.overlap is None: self.args.overlap = hp.voc_overlap if self.args.batched is None: self.args.batched = hp.voc_gen_batched #================ others ================# paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) print("hello") print(paths.base) if not self.args.force_cpu and torch.cuda.is_available(): device = torch.device('cuda') else: device = torch.device('cpu') print('Using device:', device) # === Wavernn === # if self.args.vocoder == 'wavernn': print('\nInitialising WaveRNN Model...\n') self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode=hp.voc_mode).to(device) voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights #print(paths.voc_latest_weights) self.voc_model.load(voc_load_path) # === Tacotron === # if hp.tts_model == 'tacotron': print('\nInitialising Tacotron Model...\n') self.tts_model = Tacotron( embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout, stop_threshold=hp.tts_stop_threshold).to(device) tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights self.tts_model.load(tts_load_path) # === Tacotron2 === # elif hp.tts_model == 'tacotron2': print('\nInitializing Tacotron2 Model...\n') self.tts_model = Tacotron2().to(device) tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights self.tts_model.load(tts_load_path) # === Infomation === # if hp.tts_model == 'tacotron': if self.args.vocoder == 'wavernn': voc_k = self.voc_model.get_step() // 1000 tts_k = self.tts_model.get_step() // 1000 simple_table([ ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r), ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'), ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'), ('Target Samples', self.args.target if self.args.batched else 'N/A'), ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A') ]) elif self.args.vocoder == 'griffinlim': tts_k = self.tts_model.get_step() // 1000 simple_table([('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r), ('Vocoder Type', 'Griffin-Lim'), ('GL Iters', self.args.iters)]) elif hp.tts_model == 'tacotron2': if self.args.vocoder == 'wavernn': voc_k = self.voc_model.get_step() // 1000 tts_k = self.tts_model.get_step() // 1000 simple_table([ ('Tacotron2', str(tts_k) + 'k'), ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'), ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'), ('Target Samples', self.args.target if self.args.batched else 'N/A'), ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A') ]) elif self.args.vocoder == 'griffinlim': tts_k = self.tts_model.get_step() // 1000 simple_table([('Tacotron2', str(tts_k) + 'k'), ('Vocoder Type', 'Griffin-Lim'), ('GL Iters', self.args.iters)]) def generate(self, 華, input_text): inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])] if hp.tts_model == 'tacotron2': self.gen_tacotron2(華, inputs) elif hp.tts_model == 'tacotron': self.gen_tacotron(華, inputs) else: print(f"Wrong tts model type {{{tts_model_type}}}") print('\n\nDone.\n') # custom function def gen_tacotron2(self, 華, inputs): for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') print(x) x = np.array(x)[None, :] x = torch.autograd.Variable(torch.from_numpy(x)).cuda().long() self.tts_model.eval() mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference( x) if self.args.vocoder == 'griffinlim': v_type = self.args.vocoder elif self.args.vocoder == 'wavernn' and self.args.batched: v_type = 'wavernn_batched' else: v_type = 'wavernn_unbatched' # == define output name == # if len(華) == 0: output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0] elif 1 <= len(華) <= 9: output_name = 華[:-1] elif 9 < len(華): output_name = 華[:8] print(output_name) save_path = "output/{}.wav".format(output_name) ## if self.args.vocoder == 'wavernn': m = mel_outputs_postnet self.voc_model.generate(m, save_path, self.args.batched, hp.voc_target, hp.voc_overlap, hp.mu_law) elif self.args.vocoder == 'griffinlim': m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy() wav = reconstruct_waveform(m, n_iter=self.args.iters) save_wav(wav, save_path) # custom function def gen_tacotron(self, 華, inputs): for i, x in enumerate(inputs, 1): print(f'\n| Generating {i}/{len(inputs)}') _, m, attention = self.tts_model.generate(x) # Fix mel spectrogram scaling to be from 0 to 1 m = (m + 4) / 8 np.clip(m, 0, 1, out=m) if self.args.vocoder == 'griffinlim': v_type = self.args.vocoder elif self.args.vocoder == 'wavernn' and self.args.batched: v_type = 'wavernn_batched' else: v_type = 'wavernn_unbatched' # == define output name == # if len(華) == 0: output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0] elif 1 <= len(華) <= 9: output_name = 華[:-1] elif 9 < len(華): output_name = 華[:8] print(output_name) save_path = "output/{}.wav".format(output_name) ## if self.args.vocoder == 'wavernn': m = torch.tensor(m).unsqueeze(0) self.voc_model.generate(m, save_path, self.args.batched, hp.voc_target, hp.voc_overlap, hp.mu_law) elif self.args.vocoder == 'griffinlim': wav = reconstruct_waveform(m, n_iter=self.args.iters) save_wav(wav, save_path)
def TTS_Wave(self): os.makedirs('quick_start/tts_weights/', exist_ok=True) os.makedirs('quick_start/voc_weights/', exist_ok=True) zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') zip_ref.extractall('quick_start/voc_weights/') zip_ref.close() zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') zip_ref.extractall('quick_start/tts_weights/') zip_ref.close() # Parse Arguments parser = argparse.ArgumentParser(description='TTS Generator') parser.add_argument('-name', metavar='name', type=str,help='name of pdf') parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!') parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)') parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)') parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index') parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples') parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment') parser.set_defaults(batched=hp.voc_gen_batched) parser.set_defaults(target=hp.voc_target) parser.set_defaults(overlap=hp.voc_overlap) parser.set_defaults(input_text=None) parser.set_defaults(weights_path=None) args = parser.parse_args() batched = args.batched target = args.target overlap = args.overlap input_text = args.input_text weights_path = args.weights_path if not args.force_cpu and torch.cuda.is_available(): device = torch.device('cuda') torch.cuda.set_device(0) else: device = torch.device('cpu') print('Using device:', device) print('\nInitialising WaveRNN Model...\n') # Instantiate WaveRNN Model voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims, fc_dims=hp.voc_fc_dims, bits=hp.bits, pad=hp.voc_pad, upsample_factors=hp.voc_upsample_factors, feat_dims=hp.num_mels, compute_dims=hp.voc_compute_dims, res_out_dims=hp.voc_res_out_dims, res_blocks=hp.voc_res_blocks, hop_length=hp.hop_length, sample_rate=hp.sample_rate, mode='MOL').to(device) voc_model.restore('quick_start/voc_weights/latest_weights.pyt') print('\nInitialising Tacotron Model...\n') # Instantiate Tacotron Model tts_model = Tacotron(embed_dims=hp.tts_embed_dims, num_chars=len(symbols), encoder_dims=hp.tts_encoder_dims, decoder_dims=hp.tts_decoder_dims, n_mels=hp.num_mels, fft_bins=hp.num_mels, postnet_dims=hp.tts_postnet_dims, encoder_K=hp.tts_encoder_K, lstm_dims=hp.tts_lstm_dims, postnet_K=hp.tts_postnet_K, num_highways=hp.tts_num_highways, dropout=hp.tts_dropout).to(device) tts_model.restore('quick_start/tts_weights/latest_weights.pyt') if input_text: inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)] else: with open('final.txt') as f: inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f] voc_k = voc_model.get_step() // 1000 tts_k = tts_model.get_step() // 1000 r = tts_model.get_r() simple_table([('WaveRNN', str(voc_k) + 'k'), (f'Tacotron(r={r})', str(tts_k) + 'k'), ('Generation Mode', 'Batched' if batched else 'Unbatched'), ('Target Samples', target if batched else 'N/A'), ('Overlap Samples', overlap if batched else 'N/A')]) for i, x in enumerate(inputs, 1): print("f'\n| Generating {i}/{len(inputs)}'") _, m, attention = tts_model.generate(x) if input_text: save_path = './output_audio/'+str(i)+'.wav' else: save_path = './output_audio/'+str(i)+'.wav' # save_attention(attention, save_path) m = torch.tensor(m).unsqueeze(0) m = (m + 4) / 8 voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law) if i == 2: temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav") temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav") combined_sounds = temp1 + temp2 os.remove("./output_audio/"+str(i-1)+".wav") os.remove("./output_audio/"+str(i)+".wav") combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav") elif i > 2: preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav") newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav") combined_sounds = preTemp + newTemp os.remove("./output_audio/"+self.path[:-4]+".wav") os.remove("./output_audio/"+str(i)+".wav") combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav") print("Done")
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example): device = next( model.parameters()).device # use same device as model parameters for g in optimizer.param_groups: g['lr'] = lr total_iters = len(train_set) epochs = train_steps // total_iters + 1 for e in range(1, epochs + 1): start = time.time() running_loss = 0 # Perform 1 epoch for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1): x, m = x.to(device), m.to(device) # Parallelize model onto GPUS using workaround due to python bug if device.type == 'cuda' and torch.cuda.device_count() > 1: m1_hat, m2_hat, attention, r = data_parallel_workaround( model, x, m) else: m1_hat, m2_hat, attention, r = model(x, m) #print(att_guides.shape) orig_attention = attention n = int(len(att_guides[0]) / r) #print("n", n) #reduce guide by r factor ga = [a[t] for a in att_guides for t in range(0, len(a), r)] assert n == len(attention[0]) guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)] attention = np_now(attention) attention = [ pad2d_nonzero(x, n, len(att_guides[0][0])) for x in attention ] guided_attention = torch.tensor(guided_attention) guided_attention = guided_attention.to(device) attention = torch.tensor(attention) attention = attention.to(device) #create attention mask attention_masks = torch.ne(attention, -1).type(torch.FloatTensor) attention_masks = torch.tensor(attention_masks) attention_masks = attention.to(device) multiply = torch.abs( attention * guided_attention) * attention_masks attention_loss = torch.sum(multiply) mask_sum = torch.sum(attention_masks) attention_loss /= mask_sum m1_loss = F.l1_loss(m1_hat, m) m2_loss = F.l1_loss(m2_hat, m) #print("mask sum", mask_sum) #print("attention loss", attention_loss) #print("m losses", m1_loss, m2_loss) prev_loss = m1_loss + m2_loss #print("prev loss", prev_loss) loss = m1_loss + m2_loss + attention_loss #print("loss + att", loss) optimizer.zero_grad() loss.backward() if hp.tts_clip_grad_norm is not None: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hp.tts_clip_grad_norm) if np.isnan(grad_norm): print('grad_norm was NaN!') optimizer.step() running_loss += loss.item() avg_loss = running_loss / i speed = i / (time.time() - start) step = model.get_step() k = step // 1000 if step % hp.tts_checkpoint_every == 0: ckpt_name = f'taco_step{k}K' save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True) if attn_example in ids: idx = ids.index(attn_example) save_attention(np_now(orig_attention[idx][:, :160]), paths.tts_attention / f'{step}') save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600) msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | ' stream(msg) # Must save latest optimizer state to ensure that resuming training # doesn't produce artifacts save_checkpoint('tts', paths, model, optimizer, is_silent=True) model.log(paths.tts_log, msg) print(' ')
def tts_train_loop_af_offline(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example, hp=None): # setattr(model, 'mode', 'attention_forcing') # import pdb def smooth(d, eps=float(1e-10)): u = 1.0 / float(d.size()[2]) return eps * u + (1 - eps) * d device = next( model.parameters()).device # use same device as model parameters for g in optimizer.param_groups: g['lr'] = lr total_iters = len(train_set) epochs = train_steps // total_iters + 1 for e in range(1, epochs + 1): start = time.time() running_loss_out, running_loss_attn = 0, 0 # Perform 1 epoch for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1): # print(x.size()) # print(m.size()) # print(attn_ref.size()) # # print(m1_hat.size(), m2_hat.size()) # # print(attention.size(), attention.size(1)*model.r) # pdb.set_trace() x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device) # Parallelize model onto GPUS using workaround due to python bug if device.type == 'cuda' and torch.cuda.device_count() > 1: m1_hat, m2_hat, attention = data_parallel_workaround( model, x, m, False, attn_ref) else: m1_hat, m2_hat, attention = model(x, m, generate_gta=False, attn_ref=attn_ref) m1_loss = F.l1_loss(m1_hat, m) m2_loss = F.l1_loss(m2_hat, m) # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean') # 'batchmean' attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref)) loss_out = m1_loss + m2_loss loss_attn = attn_loss * hp.attn_loss_coeff loss = loss_out + loss_attn optimizer.zero_grad() loss.backward() if hp.tts_clip_grad_norm is not None: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hp.tts_clip_grad_norm) if np.isnan(grad_norm): print('grad_norm was NaN!') optimizer.step() running_loss_out += loss_out.item() avg_loss_out = running_loss_out / i running_loss_attn += loss_attn.item() avg_loss_attn = running_loss_attn / i speed = i / (time.time() - start) step = model.get_step() k = step // 1000 if step % hp.tts_checkpoint_every == 0: ckpt_name = f'taco_step{k}K' save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True) if attn_example in ids: idx = ids.index(attn_example) save_attention(np_now(attn_ref[idx][:, :160]), paths.tts_attention / f'{step}_tf') save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}_af') save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600) msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; Output_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | ' stream(msg) # Must save latest optimizer state to ensure that resuming training # doesn't produce artifacts save_checkpoint('tts', paths, model, optimizer, is_silent=True) model.log(paths.tts_log, msg) print(' ')
paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id) model.restore(paths.tts_latest_weights) optimiser = optim.Adam(model.parameters()) train_set = get_tts_dataset(paths.data, batch_size) if not force_gta: total_steps = 10_000_000 if force_train else hp.tts_total_steps simple_table([ ('Remaining', str( (total_steps - model.get_step()) // 1000) + 'k Steps'), ('Batch Size', batch_size), ('Learning Rate', lr) ]) tts_train_loop(model, optimiser, train_set, lr, total_steps) print('Training Complete.') print( 'To continue training increase tts_total_steps in hparams.py or use --force_train\n' ) print('Creating Ground Truth Aligned Dataset...\n') create_gta_features(model, train_set, paths.gta) print(
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr, train_steps, attn_example, max_y, max_x): device = next( model.parameters()).device # use same device as model parameters for g in optimizer.param_groups: g['lr'] = lr total_iters = len(train_set) epochs = train_steps // total_iters + 1 for e in range(1, epochs + 1): start = time.time() running_loss = 0 # Perform 1 epoch for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1): x, m = x.to(device), m.to(device) # Parallelize model onto GPUS using workaround due to python bug if device.type == 'cuda' and torch.cuda.device_count() > 1: m1_hat, m2_hat, attention, r = data_parallel_workaround( model, x, m) else: m1_hat, m2_hat, attention, r = model(x, m) reduced_guides = [] att_guide_path = hp.attention_path for j, item_id in enumerate(ids): att = np.load(f'{att_guide_path}/{item_id}.npy') reduced = att[0::r] pred_attention = attention[j] n_frames = pred_attention.shape[0] n_phones = pred_attention.shape[-1] # pred_attention = torch.tensor(pred_attention) # reduced = torch.tensor(reduced) padded_guides = pad2d_nonzero(reduced, n_frames, n_phones) #padded_guides = torch.tensor(padded_guides) reduced_guides.append(padded_guides) reduced_guides = torch.tensor(reduced_guides) mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor) mask = torch.tensor(mask) padded_guides = [ pad2d_zero(x, n_frames, n_phones) for x in reduced_guides ] padded_guides = torch.tensor(padded_guides) padded_guides = padded_guides.to(device) attention = attention.to(device) mask = mask.to(device) attention = attention * mask print("guide att shape", att.shape) print(att) print("reduced guide", padded_guides.shape) # print("attention size",n_frames, n_phones) print("mask", mask.shape) print(mask) print(padded_guides.shape, attention.shape, mask.shape) print(attention) print(padded_guides) multiply = torch.pow((attention - padded_guides), 2) print(multiply) #multiply = torch.pow((pred_attention - padded_guides),2)* mask #print(multiply) attention_loss = torch.sum(multiply) print(attention_loss) mask_sum1 = torch.sum(mask) attention_loss /= mask_sum1 print(attention_loss) # batch_attention_losses.append(attention_loss) m1_loss = F.l1_loss(m1_hat, m) m2_loss = F.l1_loss(m2_hat, m) #average_att_loss = sum(batch_attention_losses)/len(batch_attention_losses) #print("attention loss", average_att_loss) #print("m losses", m1_loss, m2_loss) prev_loss = m1_loss + m2_loss print("prev loss", prev_loss) loss = m1_loss + m2_loss + attention_loss print("loss + att", loss) #exit() optimizer.zero_grad() loss.backward() if hp.tts_clip_grad_norm is not None: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hp.tts_clip_grad_norm) if np.isnan(grad_norm): print('grad_norm was NaN!') optimizer.step() running_loss += loss.item() avg_loss = running_loss / i speed = i / (time.time() - start) step = model.get_step() k = step // 1000 if step % hp.tts_checkpoint_every == 0: ckpt_name = f'taco_step{k}K' save_checkpoint('tts', paths, model, optimizer, name=ckpt_name, is_silent=True) if attn_example in ids: idx = ids.index(attn_example) save_attention(np_now(attention[idx][:, :160]), paths.tts_attention / f'{step}') save_spectrogram(np_now(m2_hat[idx]), paths.tts_mel_plot / f'{step}', 600) msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | ' stream(msg) # Must save latest optimizer state to ensure that resuming training # doesn't produce artifacts save_checkpoint('tts', paths, model, optimizer, is_silent=True) model.log(paths.tts_log, msg) print(' ')