def save(self, iteration):
    """Save model checkpoint.

    Args:
        iteration (int): iteration number of the model to be saved.

    Returns:
        None
    """
    io.save_parameters(self.checkpoint_dir, iteration, self.waveflow,
                       self.optimizer)
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()
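# --- Illustrative note (not part of the original script) ---------------------
# The AdamOptimizer above passes 1 / (warm_up_step * learning_rate**2) as the
# first argument of dg.NoamDecay. Assuming NoamDecay implements the standard
# schedule from "Attention Is All You Need",
#     lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5),
# this choice makes the peak learning rate (reached at the end of warm-up)
# equal to the configured learning_rate. A small check with hypothetical
# config values:
import numpy as np

warm_up_step = 4000      # hypothetical cfg['train']['warm_up_step']
learning_rate = 0.001    # hypothetical cfg['train']['learning_rate']
d_model = 1 / (warm_up_step * learning_rate**2)

def noam_lr(step):
    return d_model**-0.5 * min(step**-0.5, step * warm_up_step**-1.5)

print(np.isclose(noam_lr(warm_up_step), learning_rate))  # True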
def train(args, config):
    model = create_model(config)
    train_loader, valid_loader = create_data(config, args.input)
    optim = create_optimizer(model, config)

    global global_step
    max_iteration = config["max_iteration"]

    iterator = iter(tqdm.tqdm(train_loader))
    while global_step <= max_iteration:
        # get inputs
        try:
            batch = next(iterator)
        except StopIteration:
            iterator = iter(tqdm.tqdm(train_loader))
            batch = next(iterator)

        # unzip it
        text_seqs, text_lengths, specs, mels, num_frames = batch

        # forward & backward
        model.train()
        outputs = model(text_seqs, text_lengths, speakers=None, mel=mels)
        decoded, refined, attentions, final_state = outputs

        causal_mel_loss = model.spec_loss(decoded, mels, num_frames)
        non_causal_mel_loss = model.spec_loss(refined, mels, num_frames)
        loss = causal_mel_loss + non_causal_mel_loss
        loss.backward()

        # update
        optim.minimize(loss)
        optim.clear_gradients()  # dygraph does not clear gradients automatically

        # logging
        tqdm.tqdm.write(
            "[train] step: {}\tloss: {:.6f}\tcausal:{:.6f}\tnon_causal:{:.6f}".format(
                global_step,
                loss.numpy()[0],
                causal_mel_loss.numpy()[0],
                non_causal_mel_loss.numpy()[0]))
        writer.add_scalar("loss/causal_mel_loss",
                          causal_mel_loss.numpy()[0], step=global_step)
        writer.add_scalar("loss/non_causal_mel_loss",
                          non_causal_mel_loss.numpy()[0], step=global_step)
        writer.add_scalar("loss/loss", loss.numpy()[0], step=global_step)

        if global_step % config["report_interval"] == 0:
            text_length = int(text_lengths.numpy()[0])
            num_frame = int(num_frames.numpy()[0])

            tag = "train_mel/ground-truth"
            img = cm.viridis(normalize(mels.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            tag = "train_mel/decoded"
            img = cm.viridis(normalize(decoded.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            tag = "train_mel/refined"
            img = cm.viridis(normalize(refined.numpy()[0, :num_frame].T))
            writer.add_image(tag, img, step=global_step)

            vocoder = WaveflowVocoder()
            vocoder.model.eval()

            tag = "train_audio/ground-truth-waveflow"
            wav = vocoder(F.transpose(mels[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(
                tag, wav.numpy()[0], step=global_step, sample_rate=22050)

            tag = "train_audio/decoded-waveflow"
            wav = vocoder(F.transpose(decoded[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(
                tag, wav.numpy()[0], step=global_step, sample_rate=22050)

            tag = "train_audio/refined-waveflow"
            wav = vocoder(F.transpose(refined[0:1, :num_frame, :], (0, 2, 1)))
            writer.add_audio(
                tag, wav.numpy()[0], step=global_step, sample_rate=22050)

            attentions_np = attentions.numpy()
            attentions_np = attentions_np[:, 0, :num_frame // 4, :text_length]
            for i, attention_layer in enumerate(
                    np.rot90(attentions_np, axes=(1, 2))):
                tag = "train_attention/layer_{}".format(i)
                img = cm.viridis(normalize(attention_layer))
                writer.add_image(
                    tag, img, step=global_step, dataformats="HWC")

        if global_step % config["save_interval"] == 0:
            save_parameters(writer.logdir, global_step, model, optim)

        # global step +1
        global_step += 1
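# --- Illustrative helper (not part of the original script) -------------------
# normalize() is imported from elsewhere in the project. A minimal sketch,
# assuming it is a simple min-max rescale to [0, 1] so that cm.viridis can map
# spectrogram values to RGBA images:
import numpy as np

def normalize(x, eps=1e-8):
    """Min-max rescale an array to [0, 1] for colormapping (illustrative)."""
    x = np.asarray(x, dtype=np.float32)
    return (x - x.min()) / (x.max() - x.min() + eps)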
while global_step <= max_iterations:
    try:
        batch = next(iterator)
    except StopIteration as e:
        iterator = iter(tqdm.tqdm(train_loader))
        batch = next(iterator)

    audio_clips, mel_specs, audio_starts = batch
    model.train()
    y_var = model(audio_clips, mel_specs, audio_starts)
    loss_var = model.loss(y_var, audio_clips)
    loss_var.backward()

    loss_np = loss_var.numpy()
    writer.add_scalar("loss", loss_np[0], global_step)
    writer.add_scalar("learning_rate",
                      optim._learning_rate.step().numpy()[0], global_step)
    optim.minimize(loss_var)
    optim.clear_gradients()
    print("global_step: {}\tloss: {:<8.6f}".format(global_step, loss_np[0]))

    if global_step % snap_interval == 0:
        valid_model(model, valid_loader, writer, global_step, sample_rate)

    if global_step % checkpoint_interval == 0:
        io.save_parameters(checkpoint_dir, global_step, model, optim)

    global_step += 1
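# --- Illustrative helper (not part of the original script) -------------------
# valid_model() is defined elsewhere. A minimal sketch of what such a
# validation pass could look like, assuming the model exposes the same
# forward/loss interface used in the training loop above; the real helper may
# also synthesize and log audio samples.
import numpy as np

def valid_model(model, valid_loader, writer, global_step, sample_rate):
    """Average the loss over the validation set and log it (illustrative)."""
    model.eval()
    losses = []
    for audio_clips, mel_specs, audio_starts in valid_loader:
        y_var = model(audio_clips, mel_specs, audio_starts)
        loss_var = model.loss(y_var, audio_clips)
        losses.append(loss_var.numpy()[0])
    writer.add_scalar("valid_loss", float(np.mean(losses)), global_step)
    model.train()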
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
# record learning rate before updating
if env.local_rank == 0:
    writer.add_scalar("learning_rate",
                      optim._learning_rate.step().numpy(), global_step)
optim.minimize(l)
optim.clear_gradients()

# record step losses
step_loss = {k: v.numpy()[0] for k, v in losses.items()}
if env.local_rank == 0:
    tqdm.tqdm.write("[Train] global_step: {}\tloss: {}".format(
        global_step, step_loss["loss"]))
    for k, v in step_loss.items():
        writer.add_scalar(k, v, global_step)

# train state saving, the first sentence in the batch
if env.local_rank == 0 and global_step % snap_interval == 0:
    input_specs = (mel_specs, lin_specs)
    state_saver(outputs, input_specs, global_step)

# evaluation
if env.local_rank == 0 and global_step % eval_interval == 0:
    evaluator(model, global_step)

# save checkpoint
if env.local_rank == 0 and global_step % save_interval == 0:
    save_parameters(ckpt_dir, global_step, model, optim)

global_step += 1
]

for idx, sent in enumerate(sentences):
    wav, attn = eval_model(dv3, sent, replace_pronounciation_prob,
                           min_level_db, ref_level_db, power, n_iter,
                           win_length, hop_length, preemphasis)
    wav_path = os.path.join(
        state_dir, "waveform",
        "eval_sample_{:09d}.wav".format(global_step))
    sf.write(wav_path, wav, sample_rate)
    writer.add_audio(
        "eval_sample_{}".format(idx),
        wav,
        global_step,
        sample_rate=sample_rate)
    attn_path = os.path.join(
        state_dir, "alignments",
        "eval_sample_attn_{:09d}.png".format(global_step))
    plot_alignment(attn, attn_path)
    writer.add_image(
        "eval_sample_attn{}".format(idx),
        cm.viridis(attn),
        global_step,
        dataformats="HWC")

# save checkpoint
if global_step % save_interval == 0:
    io.save_parameters(ckpt_dir, global_step, dv3, optim)

global_step += 1
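# --- Illustrative helper (not part of the original script) -------------------
# plot_alignment() is imported from the project's utilities. A minimal sketch,
# assuming it renders the (decoder_steps, encoder_steps) attention matrix as a
# heatmap and saves it to the given path:
import matplotlib
matplotlib.use("Agg")  # headless backend for training machines
import matplotlib.pyplot as plt

def plot_alignment(alignment, path):
    """Save an attention matrix as a heatmap PNG (illustrative)."""
    fig, ax = plt.subplots()
    im = ax.imshow(alignment, aspect="auto", origin="lower",
                   interpolation="none")
    fig.colorbar(im, ax=ax)
    ax.set_xlabel("Encoder timestep")
    ax.set_ylabel("Decoder timestep")
    fig.savefig(path)
    plt.close(fig)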
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(), global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(), global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(), global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(), global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step, x, i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step, x, i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

        global_step += 1

    if local_rank == 0:
        writer.close()
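# --- Illustrative helper (not part of the original script) -------------------
# cross_entropy() above is a project utility, not fluid.layers.cross_entropy.
# A minimal sketch of a weighted binary cross entropy for stop-token
# prediction, assuming stop_preds are raw logits and stop_tokens are 0/1
# labels; the up-weighting of the (rare) positive frames is an assumption:
import paddle.fluid.layers as layers

def cross_entropy(logits, labels, weight=1.0, epsilon=1e-8):
    """Weighted binary cross entropy on stop-token logits (illustrative)."""
    probs = layers.clip(layers.sigmoid(logits), epsilon, 1.0 - epsilon)
    loss = -(weight * labels * layers.log(probs) +
             (1.0 - labels) * layers.log(1.0 - probs))
    return layers.reduce_mean(loss)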
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: training did not converge when the stop-token loss was used.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars('training_loss', {
                    'mel_loss': mel_loss.numpy(),
                    'post_mel_loss': post_mel_loss.numpy()
                }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss',
                                      stop_loss.numpy(), global_step)

                if parallel:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model._layers.encoder.alpha.numpy(),
                        'decoder_alpha': model._layers.decoder.alpha.numpy(),
                    }, global_step)
                else:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model.encoder.alpha.numpy(),
                        'decoder_alpha': model.decoder.alpha.numpy(),
                    }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_enc_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_dec_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()
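# --- Illustrative note (not part of the original script) ---------------------
# The stop-token label above is built as (pos_mel == 0). Assuming pos_mel holds
# 1-based frame positions with 0 used for padding, the label marks padded
# (post-end) frames as stop targets. A tiny numpy illustration of that
# assumption:
import numpy as np

pos_mel = np.array([[1, 2, 3, 4, 0, 0],    # utterance padded to 6 frames
                    [1, 2, 3, 4, 5, 6]])   # full-length utterance
label = (pos_mel == 0).astype(np.float32)
print(label)
# [[0. 0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0. 0.]]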