def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, num_workers=2): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 # HACK: setup separate training and eval sets training_files = data_config['training_files'] eval_files = data_config['eval_files'] del data_config['training_files'] del data_config['eval_files'] data_config['audio_files'] = training_files trainset = Mel2Samp(**data_config) data_config['audio_files'] = eval_files evalset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== print("Creating dataloaders with " + str(num_workers) + " workers") train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=True, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=True, sampler=eval_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger_train = SummaryWriter( os.path.join(output_directory, 'logs', 'train')) logger_eval = SummaryWriter( os.path.join(output_directory, 'logs', 'eval')) epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): model.train() with tqdm(total=len(train_loader)) as train_pbar: for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() train_pbar.set_description( "Epoch {} Iter {} Loss {:.3f}".format( epoch, iteration, reduced_loss)) if with_tensorboard and rank == 0 and iteration % 10 == 0: logger_train.add_scalar('loss', reduced_loss, i + len(train_loader) * epoch) # adding logging for GPU utilization and memory usage gpu_memory_used, gpu_utilization = get_gpu_stats() k = 'gpu' + str(0) logger_train.add_scalar(k + '/memory', gpu_memory_used, iteration) logger_train.add_scalar(k + '/load', gpu_utilization, iteration) logger_train.flush() if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 train_pbar.update(1) # Eval model.eval() torch.cuda.empty_cache() with torch.no_grad(): tensorboard_mel, tensorboard_audio = None, None loss_accum = [] with tqdm(total=len(eval_loader)) as eval_pbar: for i, batch in enumerate(eval_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs).item() loss_accum.append(loss) eval_pbar.set_description("Epoch {} Eval {:.3f}".format( epoch, loss)) outputs = None # use the first batch for tensorboard audio samples if i == 0: tensorboard_mel = mel tensorboard_audio = audio eval_pbar.update(1) if with_tensorboard and rank == 0: loss_avg = statistics.mean(loss_accum) tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg)) logger_eval.add_scalar('loss', loss_avg, iteration) # log audio samples to tensorboard tensorboard_audio_generated = model.infer(tensorboard_mel) for i in range(0, 5): ta = tensorboard_audio[i].cpu().numpy() tag = tensorboard_audio_generated[i].cpu().numpy() logger_eval.add_audio("sample " + str(i) + "/orig", ta, epoch, sample_rate=data_config['sampling_rate']) logger_eval.add_audio("sample " + str(i) + "/gen", tag, epoch, sample_rate=data_config['sampling_rate']) logger_eval.flush()
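# NOTE: the loops above (and most loops below) call reduce_tensor() to average
# the loss across ranks before logging. The helper itself is not shown in this
# section; the following is a minimal sketch of what such a helper typically
# looks like (an assumption, not necessarily the exact implementation used here).
import torch
import torch.distributed as dist


def reduce_tensor(tensor, n_gpus):
    # Sum the tensor over all processes, then divide by the world size so that
    # every rank logs the same averaged value.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt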
        except KeyboardInterrupt:
            self.Save_Checkpoint()
            exit(1)

        self.tqdm.close()
        logging.info('Finished training.')


if __name__ == '__main__':
    argParser = argparse.ArgumentParser()
    argParser.add_argument('-hp', '--hyper_parameters', required=True, type=str)
    argParser.add_argument('-s', '--steps', default=0, type=int)
    argParser.add_argument('-p', '--port', default=54321, type=int)
    argParser.add_argument('-r', '--local_rank', default=0, type=int)
    args = argParser.parse_args()

    hp = Recursive_Parse(yaml.load(
        open(args.hyper_parameters, encoding='utf-8'),
        Loader=yaml.Loader
        ))

    os.environ['CUDA_VISIBLE_DEVICES'] = hp.Device

    if hp.Use_Multi_GPU:
        init_distributed(
            rank=int(os.getenv('RANK', '0')),
            num_gpus=int(os.getenv("WORLD_SIZE", '1')),
            dist_backend='nccl',
            dist_url='tcp://127.0.0.1:{}'.format(args.port))

    new_Trainer = Trainer(hp_path=args.hyper_parameters, steps=args.steps)
    new_Trainer.Train()
def train( num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, ): torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level="O1") # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader( trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True, ) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, "logs")) # fixed for visualization real_mels, real_audios = zip(*[trainset[i] for i in range(8)]) real_mel = torch.cat(real_mels, dim=-1) real_audio = torch.cat(real_audios, dim=0) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if with_tensorboard and rank == 0: step = i + len(train_loader) * epoch logger.add_scalar("training_loss", reduced_loss, step) if step % 500 == 0: # select the first eight data sample model.eval() with torch.no_grad(): device = mel.device fake_audio = (model.infer( torch.stack(real_mels).to(device)).flatten( 0, 1).cpu()) model.train() fake_mel = trainset.get_mel(fake_audio) logger.add_image( "training_mel_real", plot_spectrogram_to_numpy(real_mel), step, dataformats="HWC", ) logger.add_audio( "training_audio_real", real_audio, step, 22050, ) logger.add_image( "training_mel_fake", plot_spectrogram_to_numpy(fake_mel), step, dataformats="HWC", ) logger.add_audio( "training_audio_fake", fake_audio, step, 22050, ) logger.flush() if iteration % iters_per_checkpoint == 0: if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
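# NOTE: the trainer above logs spectrogram images with
# plot_spectrogram_to_numpy(...) and dataformats="HWC". That helper is not
# defined in this section; below is a rough sketch of a common Tacotron2-style
# plotting utility under that assumption (the real helper may differ).
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np


def plot_spectrogram_to_numpy(spectrogram):
    # Render a mel spectrogram to an HxWx3 uint8 array suitable for
    # SummaryWriter.add_image(..., dataformats="HWC").
    fig, ax = plt.subplots(figsize=(12, 3))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)
    ax.set_xlabel("Frames")
    ax.set_ylabel("Channels")
    fig.canvas.draw()
    data = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    plt.close(fig)
    return data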
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, num_workers=4): print("num_workers", num_workers) torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=1, gamma=0.96) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) evalset = Mel2Samp(**eval_data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=False, sampler=eval_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) epoch_offset = max(1, int(iteration / len(train_loader))) start_time = datetime.datetime.now() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print('Epoch:', epoch, 'LR:', scheduler.get_lr()) elapsed = datetime.datetime.now() - start_time print("Epoch: [{}][els: {}] {}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch)) model.train() total_loss = 0. 
for i, batch in enumerate(train_loader): model.zero_grad() if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: mel, audio, spk_embed_or_id = batch spk_embed_or_id = torch.autograd.Variable( spk_embed_or_id.cuda()) else: mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) if waveglow_config["multi_speaker_config"]["use_multi_speaker"]: outputs = model((mel, audio, spk_embed_or_id)) else: outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() total_loss += reduced_loss if i > 0 and i % 10 == 0: elapsed = datetime.datetime.now() - start_time print( "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}" .format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, iteration, i, len(train_loader), reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 elapsed = datetime.datetime.now() - start_time print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format( datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed, epoch, total_loss / len(train_loader))) scheduler.step() eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch, waveglow_config["multi_speaker_config"]["use_multi_speaker"])
def train(n_gpus, rank, output_directory, epochs, optim_algo, learning_rate, weight_decay, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path, ignore_layers, include_layers, finetune_layers, warmstart_checkpoint_path, with_tensorboard, grad_clip_val, fp16_run, tensorboard_path=None): fp16_run = bool(fp16_run) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpus > 1: init_distributed(rank, n_gpus, **dist_config) criterion = FlowtronLoss(sigma, bool(model_config['n_components']), bool(model_config['use_gate_layer'])) model = Flowtron(**model_config).cuda() if len(finetune_layers): for name, param in model.named_parameters(): if name in finetune_layers: param.requires_grad = True else: param.requires_grad = False print("Initializing %s optimizer" % (optim_algo)) if optim_algo == 'Adam': optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) elif optim_algo == 'RAdam': optimizer = RAdam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) else: print("Unrecognized optimizer %s!" % (optim_algo)) exit(1) # Load checkpoint if one exists iteration = 0 if warmstart_checkpoint_path != "": model = warmstart(warmstart_checkpoint_path, model) if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, ignore_layers) iteration += 1 # next iteration is iteration + 1 if n_gpus > 1: model = apply_gradient_allreduce(model) print(model) scaler = amp.GradScaler(enabled=fp16_run) train_loader, valset, collate_fn = prepare_dataloaders( data_config, n_gpus, batch_size) # Get shared output_directory ready if rank == 0 and not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("Output directory", output_directory) if with_tensorboard and rank == 0: tboard_out_path = tensorboard_path if tensorboard_path is None: tboard_out_path = os.path.join(output_directory, "logs/run1") print("Setting up Tensorboard log in %s" % (tboard_out_path)) logger = FlowtronLogger(tboard_out_path) # force set the learning rate to what is specified for param_group in optimizer.param_groups: param_group['lr'] = learning_rate model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for batch in train_loader: model.zero_grad() mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda( ), text.cuda() in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda( ), gate_target.cuda() attn_prior = attn_prior.cuda() if valset.use_attn_prior else None with amp.autocast(enabled=fp16_run): z, log_s_list, gate_pred, attn, mean, log_var, prob = model( mel, speaker_vecs, text, in_lens, out_lens, attn_prior) loss_nll, loss_gate = criterion( (z, log_s_list, gate_pred, mean, log_var, prob), gate_target, out_lens) loss = loss_nll + loss_gate if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() reduced_gate_loss = reduce_tensor(loss_gate.data, n_gpus).item() reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item() else: reduced_loss = loss.item() reduced_gate_loss = loss_gate.item() reduced_nll_loss = loss_nll.item() scaler.scale(loss).backward() if grad_clip_val > 0: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_val) scaler.step(optimizer) scaler.update() if rank == 0: print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('training_loss_gate', reduced_gate_loss, iteration) logger.add_scalar('training_loss_nll', reduced_nll_loss, iteration) logger.add_scalar('learning_rate', learning_rate, iteration) if iteration % iters_per_checkpoint == 0: val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss( model, criterion, valset, collate_fn, batch_size, n_gpus) if rank == 0: print("Validation loss {}: {:9f} ".format( iteration, val_loss)) if with_tensorboard: logger.log_validation(val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target, iteration) checkpoint_path = "{}/model_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path, ignore_layers, include_layers, warmstart_checkpoint_path, with_tensorboard, fp16_run): torch.manual_seed(seed) torch.cuda.manual_seed(seed) if n_gpus > 1: init_distributed(rank, n_gpus, **dist_config) criterion = FlowtronLoss(sigma, bool(model_config['n_components']), model_config['use_gate_layer']) model = Flowtron(**model_config).cuda() optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # Load checkpoint if one exists iteration = 0 if warmstart_checkpoint_path != "": model = warmstart(warmstart_checkpoint_path, model) if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, ignore_layers) iteration += 1 # next iteration is iteration + 1 if n_gpus > 1: model = apply_gradient_allreduce(model) print(model) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') train_loader, valset, collate_fn = prepare_dataloaders( data_config, n_gpus, batch_size) # Get shared output_directory ready if rank == 0 and not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: logger = FlowtronLogger(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for batch in train_loader: model.zero_grad() mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda( ), text.cuda() in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda( ), gate_target.cuda() z, log_s_list, gate_pred, attn, mean, log_var, prob = model( mel, speaker_vecs, text, in_lens, out_lens) loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob), gate_target, out_lens) if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() if rank == 0: print("{}:\t{:.9f}".format(iteration, reduced_loss), flush=True) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('learning_rate', learning_rate, iteration) if (iteration % iters_per_checkpoint == 0): val_loss, attns, gate_pred, gate_target = compute_validation_loss( model, criterion, valset, collate_fn, batch_size, n_gpus) if rank == 0: print("Validation loss {}: {:9f} ".format( iteration, val_loss)) if with_tensorboard: logger.log_validation(val_loss, attns, gate_pred, gate_target, iteration) checkpoint_path = "{}/model_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
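# NOTE: both Flowtron variants above call warmstart() to initialize from a
# pre-trained checkpoint while respecting include_layers / ignore_layers. The
# helper is not shown here; this is a speculative sketch of a typical
# partial-loading routine, not the repository's actual implementation.
import torch


def warmstart(checkpoint_path, model, include_layers=None, ignore_layers=None):
    # Copy only parameters whose names and shapes match the current model,
    # optionally restricted to include_layers and excluding ignore_layers.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    pretrained = checkpoint.get("model", checkpoint) if isinstance(checkpoint, dict) else checkpoint
    if hasattr(pretrained, "state_dict"):
        pretrained = pretrained.state_dict()

    model_dict = model.state_dict()
    filtered = {}
    for name, param in pretrained.items():
        if include_layers and not any(k in name for k in include_layers):
            continue
        if ignore_layers and any(k in name for k in ignore_layers):
            continue
        if name in model_dict and model_dict[name].shape == param.shape:
            filtered[name] = param
    model_dict.update(filtered)
    model.load_state_dict(model_dict)
    return model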
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(data_config['training_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=False) if 'testing_files' in data_config: testset = Mel2Samp(data_config['testing_files'], data_config['segment_length'], data_config['filter_length'], data_config['hop_length'], data_config['win_length'], data_config['sampling_rate'], data_config['mel_fmin'], data_config['mel_fmax'], debug=True) else: testset = None # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) else: logger = None model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start = time.perf_counter() model.zero_grad() print("train batch loaded, {} ({} of {})".format( iteration, i, len(train_loader))) mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() is_overflow = False if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), 1.0) is_overflow = math.isnan(grad_norm) optimizer.step() duration = time.perf_counter() - start print( "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format( iteration, i, len(train_loader), reduced_loss, duration)) if logger: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) logger.add_scalar('duration', duration, i + len(train_loader) * epoch) if testset and not is_overflow and (iteration % iters_per_checkpoint == 0): if testset: validate(model, criterion, testset, iteration, batch_size, num_gpus, logger) if rank == 0: rotate_checkpoints(output_directory) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
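# NOTE: every trainer in this section calls
# save_checkpoint(model, optimizer, learning_rate, iteration, path) and
# load_checkpoint(path, model, optimizer). The helpers are not part of this
# section; the sketch below assumes they simply serialize state dicts plus the
# iteration counter, which may differ from the actual implementation.
import torch


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    torch.save({"model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "learning_rate": learning_rate,
                "iteration": iteration}, filepath)


def load_checkpoint(checkpoint_path, model, optimizer):
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
    iteration = checkpoint["iteration"]
    print("Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration))
    return model, optimizer, iteration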
def __init__(self, opt=None, train_dt=None, train_dt_warm=None, dis_list=[], val_dt_warm=None): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.opt = opt self.visualizer = Visualizer(opt) num_gpus = torch.cuda.device_count() #dis_list[1] print(dis_list) #torch.cuda.device_count() self.rank = dis_list[0] print(self.rank) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: #init_distributed(rank, num_gpus, group_name, **dist_config) dist_config = dis_list[3] init_distributed(dis_list[0], dis_list[1], dis_list[2], **dist_config) #=====END: ADDED FOR DISTRIBUTED====== if opt.ge_net == "srfeat": self.netG = model.G() elif opt.ge_net == "carn": self.netG = model.G1() elif opt.ge_net == "carnm": self.netG = model.G2() else: raise Exception("unknow ") self.netD_vgg = model.D(input_c=512, input_width=18) self.netD = model.D() if opt.vgg_type == "style": self.vgg = load_vgg16(opt.vgg_model_path + '/models') elif opt.vgg_type == "classify": self.vgg = model.vgg19_withoutbn_customefinetune() self.vgg.eval() for param in self.vgg.parameters(): param.requires_grad = False # for p in self.vgg.parameters(): # p.requires_grad = False init_weights(self.netD, init_type=opt.init) init_weights(self.netD_vgg, init_type=opt.init) init_weights(self.netG, init_type=opt.init) self.vgg = self.vgg.to(self.device) self.netD = self.netD.to(self.device) self.netD_vgg = self.netD_vgg.to(self.device) self.netG = self.netG.to(self.device) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: #self.vgg = apply_gradient_allreduce(self.vgg) self.netD_vgg = apply_gradient_allreduce(self.netD_vgg) self.netD = apply_gradient_allreduce(self.netD) self.netG = apply_gradient_allreduce(self.netG) #=====END: ADDED FOR DISTRIBUTED====== print(opt) self.optim_G= torch. 
optim.Adam(filter(lambda p: p.requires_grad, self.netG.parameters()),\ lr=opt.warm_opt.lr, betas=opt.warm_opt.betas, weight_decay=0.0) # self.optim_G= torch.optim.Adam(filter(lambda p: p.requires_grad, self.netG.parameters()),\ # lr=opt.gen.lr, betas=opt.gen.betas, weight_decay=0.0) if opt.dis.optim == "sgd": self.optim_D= torch.optim.SGD( filter(lambda p: p.requires_grad, \ itertools.chain(self.netD_vgg.parameters(),self.netD.parameters() ) ),\ lr=opt.dis.lr, ) elif opt.dis.optim == "adam": self.optim_D= torch.optim.Adam( filter(lambda p: p.requires_grad, \ itertools.chain(self.netD_vgg.parameters(),self.netD.parameters() ) ),\ lr=opt.dis.lr,betas=opt.dis.betas, weight_decay=0.0 ) else: raise Exception("unknown") print("create schedule ") lr_sc_G = get_scheduler(self.optim_G, opt.gen) lr_sc_D = get_scheduler(self.optim_D, opt.dis) self.schedulers = [] self.schedulers.append(lr_sc_G) self.schedulers.append(lr_sc_D) # =====START: ADDED FOR DISTRIBUTED====== train_dt = torch.utils.data.ConcatDataset([train_dt, train_dt_warm]) train_sampler = DistributedSampler(train_dt) if num_gpus > 1 else None val_sampler_warm = DistributedSampler( val_dt_warm) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== kw = { "pin_memory": True, "num_workers": 8 } if torch.cuda.is_available() else {} dl_c =t_data.DataLoader(train_dt ,batch_size=opt.batch_size,\ sampler=train_sampler , drop_last=True, **kw ) dl_val_warm = t_data.DataLoader( val_dt_warm, batch_size=opt.batch_size if not hasattr(opt, "batch_size_warm") else opt.batch_size_warm, sampler=val_sampler_warm, drop_last=True, **kw) self.dt_train = dl_c self.dt_val_warm = dl_val_warm if opt.warm_opt.loss_fn == "mse": self.critic_pixel = torch.nn.MSELoss() elif opt.warm_opt.loss_fn == "l1": self.critic_pixel = torch.nn.L1Loss() elif opt.warm_opt.loss_fn == "smooth_l1": self.critic_pixel = torch.nn.SmoothL1Loss() else: raise Exception("unknown") self.critic_pixel = self.critic_pixel.to(self.device) self.gan_loss = GANLoss(gan_mode=opt.gan_loss_fn).to(self.device) print("init ....") self.save_dir = os.path.dirname(self.visualizer.log_name)
def train(n_gpus, rank, group_name): if n_gpus > 1: if rank == 0: print('Synchronizing distributed flow...') init_distributed(rank, n_gpus, group_name, config['dist_config']) torch.manual_seed(config['seed']) torch.cuda.manual_seed(config['seed']) if rank == 0: print('Initializing model, optimizer and loss...') model = Tacotron2(config).cuda() criterion = Tacotron2Loss() learning_rate = config['learning_rate'] optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=config['weight_decay']) if config['fp16_run']: if rank == 0: print('Using FP16...') from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') if rank == 0: print('Preparing dirs, data loaders and logger...') logger = prepare_directories_and_logger(config['output_directory'], config['log_directory'], rank) train_loader, valset, collate_fn = prepare_dataloaders( config['training_files'], config['validation_files'], config['n_frames_per_step'], n_gpus) iteration = 0 epoch_offset = 0 if not config['warm_up_checkpoint'] is None: if rank == 0: print('Loading checkpoint from {}...'.format( config['warm_up_checkpoint'])) model = load_checkpoint(config['warm_up_checkpoint'], model, optimizer) iteration += 1 # next iteration is iteration + 1 epoch_offset = max(0, int(iteration / len(train_loader))) model.compress_factorize(config=config['compress_config']) model.train() # Main training loop for epoch in range(epoch_offset, config['epochs']): print("Epoch: {}".format(epoch)) for _, batch in enumerate(train_loader): start = time.perf_counter() x, y = model.parse_batch(batch) y_pred = model(x) loss = criterion(y_pred, y) if n_gpus > 1: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() if config['fp16_run']: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if iteration % config['iters_per_grad_acc'] == 0: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), config['grad_clip_thresh']) optimizer.step() model.zero_grad() if rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it". format(iteration, reduced_loss, grad_norm, duration)) logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration) if iteration % config['iters_per_validation'] == 0: validate(model, criterion, valset, iteration, config['batch_size'], n_gpus, collate_fn, logger, rank) if iteration % config['iters_per_checkpoint'] == 0: if rank == 0: checkpoint_path = os.path.join( config['output_directory'], "checkpoint_{}".format(iteration)) save_checkpoint(model, optimizer, iteration, checkpoint_path) iteration += 1
def fit(a, epochs): if h.num_gpus > 1: init_distributed(a.rank, h.num_gpus, a.group_name, h.dist_config['dist_backend'], h.dist_config['dist_url']) generator = Generator().to(device) discriminator = MultiScaleDiscriminator().to(device) if h.num_gpus > 1: generator = apply_gradient_allreduce(generator) discriminator = apply_gradient_allreduce(discriminator) g_optim = torch.optim.Adam(generator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]) d_optim = torch.optim.Adam(discriminator.parameters(), h.learning_rate, betas=[h.adam_b1, h.adam_b2]) steps = 0 if a.cp_g != "" and a.cp_d != "": generator, g_optim, steps = load_checkpoint(a.cp_g, generator, g_optim) discriminator, d_optim, steps = load_checkpoint(a.cp_d, discriminator, d_optim) steps += 1 with open(a.input_train_metafile, 'r', encoding='utf-8') as fi: training_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') for x in fi.read().split('\n') if len(x) > 0] with open(a.input_valid_metafile, 'r', encoding='utf-8') as fi: validation_files = [os.path.join(a.input_wavs_dir, x.split('|')[0] + '.wav') for x in fi.read().split('\n') if len(x) > 0] trainset = MelDataset(training_files, h.segment_size, h.n_fft, h.num_mels, h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax) train_sampler = DistributedSampler(trainset) if h.num_gpus > 1 else None train_loader = DataLoader(trainset, num_workers=h.num_workers, shuffle=False, sampler=train_sampler, batch_size=h.batch_size, pin_memory=False, drop_last=True) if a.rank == 0: validset = MelDataset(validation_files, h.segment_size, h.n_fft, h.num_mels, h.hop_size, h.win_size, h.sampling_rate, h.fmin, h.fmax, False, False, n_cache_reuse=0) valid_loader = DataLoader(validset, num_workers=1, shuffle=False, sampler=None, batch_size=1, pin_memory=False, drop_last=True) if a.rank == 0: os.makedirs(a.cps, exist_ok=True) print("checkpoints directory : ", a.cps) sw = SummaryWriter(os.path.join(a.cps, 'logs')) epoch_offset = max(0, int(steps / len(train_loader))) generator.train() discriminator.train() for epoch in range(epoch_offset, epochs): start = time.time() if a.rank == 0: print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): start_b = time.time() x, y, _ = batch x = torch.autograd.Variable(x.to(device)) y = torch.autograd.Variable(y.to(device)) y = y.unsqueeze(1) g_optim.zero_grad() y_ghat = generator(x) y_dhat_r, y_dhat_g, fmap_r, fmap_g = discriminator(y, y_ghat) loss_fm = feature_loss(fmap_r, fmap_g) loss_gen = generator_loss(y_dhat_g) + loss_fm if h.num_gpus > 1: reduced_loss_gen = reduce_tensor(loss_gen.data, h.num_gpus).item() else: reduced_loss_gen = loss_gen.item() loss_gen.backward() g_optim.step() d_optim.zero_grad() y_ghat = y_ghat.detach() y_dhat_r, y_dhat_g, _, _ = discriminator(y, y_ghat) loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_dhat_r, y_dhat_g) if h.num_gpus > 1: reduced_loss_disc = reduce_tensor(loss_disc.data, h.num_gpus).item() else: reduced_loss_disc = loss_disc.item() loss_disc.backward() d_optim.step() if a.rank == 0 and steps % a.stdout_interval == 0: print('Steps : {:d}, Gen Loss : {:4.3f}, Disc Loss : {:4.3f}, s/b : {:4.3f}'. 
format(steps, reduced_loss_gen, reduced_loss_disc, time.time() - start_b)) if a.rank == 0 and steps % a.checkpoint_interval == 0 and steps != 0: checkpoint_path = "{}/g_{:08d}".format(a.cps, steps) save_checkpoint(generator, g_optim, h.learning_rate, steps, checkpoint_path) checkpoint_path = "{}/d_{:08d}".format(a.cps, steps) save_checkpoint(discriminator, d_optim, h.learning_rate, steps, checkpoint_path) if a.rank == 0 and steps % a.summary_interval == 0: sw.add_scalar("training/gen_loss", reduced_loss_gen, steps) sw.add_scalar("training/disc_loss", reduced_loss_disc, steps) for i, (r, g) in enumerate(zip(losses_disc_r, losses_disc_g)): sw.add_scalar("training/disc{:d}_loss_r".format(i+1), r, steps) sw.add_scalar("training/disc{:d}_loss_g".format(i+1), g, steps) for i, (r, g) in enumerate(zip(y_dhat_r, y_dhat_g)): sw.add_histogram("training/disc{:d}_r_output".format(i+1), r, steps) sw.add_histogram("training/disc{:d}_g_output".format(i+1), g, steps) sw.add_histogram("training/gen_output", y_ghat, steps) sw.add_audio('training_gt/y', y[0], steps, h.sampling_rate) sw.add_audio('training_predicted/y_hat', y_ghat[0], steps, h.sampling_rate) if a.rank == 0 and steps % a.validation_interval == 0: # and steps != 0: for i, batch in enumerate(valid_loader): x, y, _ = batch y_ghat = generator(x.to(device)) sw.add_audio('validation_gt/y_{}'.format(i), y[0], steps, h.sampling_rate) sw.add_audio('validation_predicted/y_hat_{}'.format(i), y_ghat[0], steps, h.sampling_rate) # print(plot_spectrogram(x[i])) sw.add_figure('validation_gt/y_spec_{}'.format(i), plot_spectrogram(x[0]), steps) y_hat_spec = mel_spectrogram(y_ghat.detach().cpu().numpy()[0][0], h.n_fft, h.num_mels, h.sampling_rate, h.hop_size, h.win_size, h.fmin, h.fmax, center=False) sw.add_figure('validation_predicted/y_hat_spec_{}'.format(i), plot_spectrogram(y_hat_spec), steps) if i == 4: break steps += 1 if a.rank == 0: print('Time taken for epoch {} is {} sec\n'.format(epoch + 1, int(time.time()-start)))
def train(num_gpus, rank, group_name, output_directory, epochs, g_learning_rate, d_learning_rate, adv_ag, adv_fd, lamda_adv, lamda_feat, warmup_steps, decay_learning_rate, iters_per_checkpoint, batch_size, seed, checkpoint_path): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== model = torch.nn.Module() model.add_module('encoder', Encoder(**encoder_config)) model.add_module('generator', Generator(sum(encoder_config['n_out_channels']))) model.add_module('discriminator', MultiScaleDiscriminator(**discriminator_config)) model.add_module( 'disentangler', Disentangler(encoder_config['n_out_channels'][0], sum(encoder_config['n_out_channels'][1:]))) model = model.cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== # Using RAdam as optimizer # Lookahead has resume training issues: # lr schedule doesn't affect nested RAdam of Lookahead g_parameters = list(model.generator.parameters()) g_parameters = list(model.encoder.parameters()) + g_parameters g_optimizer = RAdam(g_parameters, lr=g_learning_rate) d_parameters = list(model.discriminator.parameters()) d_parameters = list(model.disentangler.parameters()) + d_parameters d_optimizer = RAdam(d_parameters, lr=d_learning_rate) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, g_optimizer, d_optimizer, iteration = load_checkpoint( checkpoint_path, model, g_optimizer, d_optimizer) iteration += 1 # next iteration is iteration + 1 customer_g_optimizer = Optimizer(g_optimizer, g_learning_rate, iteration, warmup_steps, decay_learning_rate) customer_d_optimizer = Optimizer(d_optimizer, d_learning_rate, iteration, warmup_steps, decay_learning_rate) criterion = nn.MSELoss() l1_loss = nn.L1Loss() stft_criterion = MultiResolutionSTFTLoss() trainset = Dataset(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=(train_sampler is None), sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) logdir = os.path.join( output_directory, time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())) os.makedirs(logdir, exist_ok=True) writer = SummaryWriter(logdir=logdir) anchors = [ 'loss_g', 'loss_g_sc', 'loss_g_mag', 'loss_g_adv', 'loss_g_feat', 'loss_g_fd', 'loss_d', 'loss_d_real', 'loss_d_fake', 'loss_d_fd' ] meters = { x: LossMeter(x, writer, 100, iteration, True) for x in anchors } model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): train_sampler.set_epoch(epoch) if train_sampler is not None else None tbar = tqdm( enumerate(train_loader)) if rank == 0 else enumerate(train_loader) for i, batch in tbar: model.zero_grad() cond, a = [to_gpu(x) for x in batch] # Get generator outputs x = model.encoder(cond) g_outputs = model.generator(x) losses = {} # Get Discrimiantor loss customer_d_optimizer.zero_grad() d_loss = [] # Adversarial training for audio generation if adv_ag == True: real_scores, _ = model.discriminator(a.unsqueeze(1)) fake_scores, _ = model.discriminator(g_outputs.detach()) d_loss_fake_list, d_loss_real_list = [], [] for (real_score, fake_score) in zip(real_scores, fake_scores): d_loss_real_list.append( criterion(real_score, torch.ones_like(real_score))) d_loss_fake_list.append( criterion(fake_score, torch.zeros_like(fake_score))) d_loss_real = sum(d_loss_real_list) / len(d_loss_real_list) d_loss_fake = sum(d_loss_fake_list) / len(d_loss_fake_list) d_loss = d_loss + [d_loss_real, d_loss_fake] losses.update({ 'loss_d_real': d_loss_real, 'loss_d_fake': d_loss_fake }) # Adversarial training for feature disentanglement if adv_fd == True: split_x = torch.split(x.detach(), encoder_config['n_out_channels'], dim=1) pred = model.disentangler(split_x[0]) d_loss_fd = F.l1_loss(pred, torch.cat((split_x[1:]), dim=1)) d_loss = d_loss + [d_loss_fd] losses.update({'loss_d_fd': d_loss_fd}) if len(d_loss) > 0: d_loss = sum(d_loss) d_loss.backward() nn.utils.clip_grad_norm_(d_parameters, max_norm=10) customer_d_optimizer.step_and_update_lr() losses.update({'loss_d': d_loss}) # Get generator loss customer_g_optimizer.zero_grad() g_clip_norm_scale = 10 # STFT Loss sc_loss, mag_loss = stft_criterion(g_outputs.squeeze(1), a) g_loss = sc_loss + mag_loss losses.update({'loss_g_sc': sc_loss, 'loss_g_mag': mag_loss}) # Adversarial training for audio generation if adv_ag == True: fake_scores, fake_feats = model.discriminator(g_outputs) real_scores, real_feats = model.discriminator(a.unsqueeze(1)) adv_loss_list, feat_loss_list = [], [] for i, fake_score in enumerate(fake_scores): adv_loss_list.append( criterion(fake_score, torch.ones_like(fake_score))) adv_loss = sum(adv_loss_list) / len(adv_loss_list) for i in range(len(fake_feats)): for j in range(len(fake_feats[i])): feat_loss_list.append( l1_loss(fake_feats[i][j], real_feats[i][j].detach())) feat_loss = sum(feat_loss_list) / len(feat_loss_list) g_loss = g_loss + adv_loss * lamda_adv + feat_loss * lamda_feat losses.update({'loss_g_adv': adv_loss}) losses.update({'loss_g_feat': feat_loss}) g_clip_norm_scale = 0.5 # Adversarial training for feature disentanglement if adv_fd == True: split_x = torch.split(x, encoder_config['n_out_channels'], dim=1) pred = model.disentangler(split_x[0]) g_loss_fd = F.l1_loss(pred, torch.cat((split_x[1:]), dim=1).detach()) g_loss = g_loss + (-1.0) * g_loss_fd losses.update({'loss_g_fd': g_loss_fd}) g_loss.backward() nn.utils.clip_grad_norm_(g_parameters, max_norm=g_clip_norm_scale) customer_g_optimizer.step_and_update_lr() losses.update({'loss_g': g_loss}) # only output log of 0-th GPU if rank == 0: tbar.set_description("{:>7}: ".format(iteration) + ', '.join([ "{}: {:.1e}".format(x[5:], losses[x].item()) for x in losses.keys() ])) for x in losses: meters[x].add(losses[x].item()) if (iteration % iters_per_checkpoint == 0): checkpoint_path = "{}/model_{}".format( output_directory, iteration) save_checkpoint(model, g_optimizer, d_optimizer, iteration, checkpoint_path) iteration += 1
def train(model_name, train_list, max_seq_len, batch_size, train_epoch, learning_rate, iters_per_checkpoint, iters_per_eval, n_warm_up_epoch, warm_up_lr, checkpoint_dir, use_f0=True, preload_data=False, checkpoint_path="", seed=12345, num_gpus=1, rank=0, group_name=""): torch.manual_seed(seed) if num_gpus > 1: init_distributed(rank=rank, num_gpus=num_gpus, group_name=group_name, **dist_configs) timestamp = strftime("%Y%m%d_%H%M_" + checkpoint_dir, localtime()) output_path = join("checkpoints/", timestamp) dataset = MelCepstrumDataset(train_list, use_f0=use_f0, preload_data=preload_data) if rank == 0: print("Checkpoint dir: %s" % output_path) if not exists(output_path): os.makedirs(output_path) subprocess.run(["cp", "-r", args.config, "modules", "models", output_path]) with open(join(output_path, "speaker_label.json"), "w") as f: json.dump(dataset.speaker_label, f) train_sampler = DistributedSampler(dataset) if num_gpus > 1 else None print("Data directory: ", train_list) print("No. training data: ", len(dataset)) print("No. speakers:", dataset.n_speaker) print("Normalize: ", model_configs["norm"]) print("Use F0: ", use_f0) collate_fn = MelCepstrumCollateFn(max_seq_len=max_seq_len) dataloader = DataLoader(dataset=dataset, sampler=train_sampler, batch_size=batch_size//num_gpus, collate_fn=collate_fn, num_workers=4, pin_memory=True, shuffle=False) model = None if model_name == "VQVAE3Stage": model = VQVAE3Stage(n_speaker=dataset.n_speaker, **model_configs).cuda() elif model_name == "VQVAE2Stage": model = VQVAE2Stage(n_speaker=dataset.n_speaker, **model_configs).cuda() elif model_name == "VQVAE1Stage": model = VQVAE1Stage(n_speaker=dataset.n_speaker, **model_configs).cuda() else: print("Unsupported model name: %s" % model_name) if checkpoint_path != "": print(checkpoint_path) model.load_state_dict(torch.load(checkpoint_path)) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = Adam(model.parameters(), lr=warm_up_lr) if rank == 0: logger = DataLogger(logdir=join(output_path, "logs")) validator = Validator(logger=logger, speaker_label=dataset.speaker_label, use_f0=use_f0, **validation_configs) else: logger = None validator = None iteration = 0 for epoch in range(train_epoch): model.train() if train_sampler is not None: train_sampler.set_epoch(epoch) if rank == 0: iterator = progressbar(dataloader, redirect_stdout=True) else: iterator = dataloader for batch in iterator: model.zero_grad() batch = [batch[0].cuda(), batch[1].cuda()] loss, loss_components = model(batch) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() for i in range(len(loss_components)): if isinstance(loss_components[i], list): for j in range(len(loss_components[i])): loss_components[i][j] = reduce_tensor(loss_components[i][j].data, num_gpus).item() else: loss_components[i] = reduce_tensor(loss_components[i].data, num_gpus).item() else: reduced_loss = loss.item() for i in range(len(loss_components)): if isinstance(loss_components[i], list): for j in range(len(loss_components[i])): loss_components[i][j] = loss_components[i][j].item() else: loss_components[i] = loss_components[i].item() loss.backward() optimizer.step() if rank == 0: rc_loss, mel_loss, vq_loss, commitment_loss, perplexity = loss_components print("%d|%d: loss=%.2e, rc_loss=%.2e, mel_loss=%.2e, vq_loss=%.2e" % (epoch, iteration, reduced_loss, rc_loss, mel_loss, vq_loss)) perplexity_tag = ["training/perplexity"] + [str(i) for i in 
                    range(len(perplexity))]
                if logger is not None:
                    logger.log_training(
                        [reduced_loss, rc_loss, mel_loss, vq_loss, perplexity],
                        ["training/loss", "training/rc_loss", "training/mel_loss",
                         "training/vq_loss", perplexity_tag],
                        iteration)

                if (iteration % iters_per_eval) == 0:
                    torch.save(model.state_dict(),
                               join(output_path, "weight_latest.pt"))
                    if validator is not None:
                        validator(model, iteration)

                if (iteration % iters_per_checkpoint) == 0 and iteration > 0:
                    torch.save(model.state_dict(),
                               join(output_path, "weight_%d.pt" % iteration))

            iteration += 1

        if epoch < n_warm_up_epoch:
            lr = min(learning_rate,
                     warm_up_lr - epoch * (warm_up_lr - learning_rate) / n_warm_up_epoch)
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    print("Finished!")
    return
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, warm_start): torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = Over9000(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') else: amp = None # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer, warm_start) if fp16_run and not warm_start: amp.load_state_dict(torch.load(checkpoint_path)['amp']) iteration += 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=16, shuffle=True, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.999, patience=250, cooldown=250, verbose=True, min_lr=1e-5) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = mel.cuda() audio = audio.cuda() outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), 1.0) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), 1.0) optimizer.step() if epoch > 1: scheduler.step(loss) print("{}:\t{:.9f}\t{:.9f}".format(iteration, reduced_loss, grad_norm)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, amp, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, iters_per_checkpoint, batch_size, seed, checkpoint_path): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = CrossEntropyLoss() model = WaveNet(**wavenet_config).cpu() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 print(f"receptive_field: {model.receptive_field()}") trainset = WavenetDataset( dataset_file='data/dataset.npz', item_length=model.receptive_field() + 1000 + model.output_length - 1, target_length=model.output_length, file_location='data/', test_stride=500, ) print(trainset._length) print('the dataset has ' + str(len(trainset)) + ' items') train_loader = DataLoader( trainset, batch_size=batch_size, shuffle=True, pin_memory=False, ) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== start = time.time() for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() y, target = batch y = to_gpu(y).float() target = to_gpu(target) y_pred = model((None, y)) loss = criterion(y_pred[:, :, -model.output_length:], target) loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, loss)) print_etr(start, total_iterations=(epochs - epoch_offset) * len(train_loader), current_iteration=epoch * len(train_loader) + i + 1) writer.add_scalar('Loss/train', loss, global_step=iteration) if (iteration % iters_per_checkpoint == 0): y_choice = y_pred[0].detach().cpu().transpose(0, 1) y_prob = F.softmax(y_choice, dim=1) y_prob_collapsed = torch.multinomial(y_prob, num_samples=1).squeeze(1) y_pred_audio = mu_law_decode_numpy(y_prob_collapsed.numpy(), model.n_out_channels) import torchaudio y_audio = mu_law_decode_numpy(y.numpy(), model.n_out_channels) torchaudio.save("test_in.wav", torch.tensor(y_audio), 16000) torchaudio.save("test_out.wav", torch.tensor(y_pred_audio), 16000) writer.add_audio('Audio', y_pred_audio, global_step=iteration, sample_rate=data_config['sampling_rate']) checkpoint_path = "{}/wavenet_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) writer.flush() iteration += 1
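# NOTE: the WaveNet trainer above converts sampled class indices back to audio
# with mu_law_decode_numpy(output, n_channels). That helper is not included in
# this section; below is a small sketch of standard inverse mu-law decoding,
# assumed to match the behaviour intended here.
import numpy as np


def mu_law_decode_numpy(output, channels=256):
    # Map quantized class indices in [0, channels) back to waveform samples in
    # [-1, 1] via the inverse mu-law companding transform.
    mu = channels - 1
    signal = 2.0 * (output.astype(np.float64) / mu) - 1.0
    return np.sign(signal) / mu * ((1.0 + mu) ** np.abs(signal) - 1.0)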
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    #trainset = Mel2SampOnehot(**data_config)
    trainset = DeepMels(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        total_loss = 0
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            total_loss += reduced_loss
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
        print("epoch:{}, total epoch loss:{}".format(epoch, total_loss))
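# NOTE: the two WaveNet trainers above move batches to the GPU with to_gpu().
# The helper is not defined in this section; this is a minimal sketch of the
# usual device-transfer helper, assumed rather than taken from the source.
import torch
from torch.autograd import Variable


def to_gpu(x):
    # Make the tensor contiguous, move it to the default CUDA device when one
    # is available, and wrap it for autograd (Variable is a no-op wrapper on
    # recent PyTorch versions but matches the calling code's style).
    x = x.contiguous()
    if torch.cuda.is_available():
        x = x.cuda(non_blocking=True)
    return Variable(x)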
def train(num_gpus, rank, group_name, device, output_directory, epochs, learning_rate, iters_per_checkpoint, batch_size, seed, checkpoint_path, use_scheduled_sampling=False, use_wavenet_autoencoder=True, use_variational_autoencoder=False, diversity_scale=0.005, use_logistic_mixtures=False, n_mixtures=3, audio_hz=16000, midi_hz=250, aggressive_loss_threshold=3.0, encoder_error_thresh=0.0): assert use_wavenet_autoencoder is True if num_gpus > 1: device = init_distributed(rank, num_gpus, group_name, **dist_config) device = torch.device(device) torch.manual_seed(seed) torch.cuda.manual_seed(seed) if use_logistic_mixtures: sampler = DML.SampleDiscretizedMixLogistics() criterion = DML.DiscretizedMixLogisticLoss() else: sampler = utils.CategoricalSampler() criterion = CrossEntropyLoss() model = WavenetAutoencoder(wavenet_config, cond_wavenet_config, use_variational_autoencoder).to(device) if use_variational_autoencoder: diversity_loss = L2DiversityLoss() if num_gpus > 1: model = apply_gradient_allreduce(model) if use_scheduled_sampling: scheduled_sampler = ScheduledSamplerWithPatience( model, sampler, **scheduled_sampler_config) encoder_optimizer = torch.optim.Adam(model.encoder_wavenet.parameters(), lr=learning_rate) decoder_optimizer = torch.optim.Adam(model.wavenet.parameters(), lr=learning_rate) # Train state params aggressive = True train_encoder = False # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, encoder_optimizer, decoder_optimizer, aggressive, iteration = load_checkpoint( checkpoint_path, model, encoder_optimizer, decoder_optimizer) iteration += 1 # Dataloader trainset = MaestroDataloader(**data_config) if num_gpus > 1: train_sampler = DistributedSampler(trainset) else: train_sampler = None train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready for distributed if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) # Initialize training variables epoch_offset = max(0, int(iteration / len(train_loader))) start_iter = iteration loss_idx = 0 loss_sum = 0 prev_loss = 999999999 print("output directory: " + output_directory) # write loss to csv file loss_writer = DictWriter(open(output_directory + "/train.csv", 'w', newline=''), fieldnames=['iteration', 'loss']) loss_writer.writeheader() signal_writer = DictWriter(open(output_directory + "/signal.csv", "w", newline=''), fieldnames=[ 'iteration', 'cosim', 'p-dist', 'forwardMagnitude', 'midiMagnitude' ]) signal_writer.writeheader() model.train() # ================ MAIN TRAINING LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() x, y = batch x = as_variable(x, device) y = as_variable(y, device) y_true = y.clone() if use_scheduled_sampling: y = scheduled_sampler(x, y) y_preds = model((x, y)) if use_wavenet_autoencoder: q_bar = y_preds[1] y_preds = y_preds[0] loss = criterion(y_preds, y_true) if use_variational_autoencoder: div_loss = diversity_loss(q_bar) loss = loss + (diversity_scale * div_loss) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.data.item() loss.backward() if aggressive and train_encoder: encoder_optimizer.step() print("Encoder step") elif aggressive: decoder_optimizer.step() print("Decoder step") else: # normal training encoder_optimizer.step() decoder_optimizer.step() print("total loss: {}:\t{:.9f}".format(iteration, reduced_loss)) if use_variational_autoencoder: print(" diversity loss: {:.9f}".format(div_loss)) if use_scheduled_sampling: scheduled_sampler.update(reduced_loss) # record running average of loss loss_sum += reduced_loss loss_idx += 1 if (iteration % 10 == 0): loss_avg = loss_sum / loss_idx print("floating avg of 10: " + str(loss_avg)) #loss_writer.writerow({"iteration": str(i), # "loss": str(reduced_loss)}) if aggressive and loss_avg < aggressive_loss_threshold: aggressive = False elif aggressive and train_encoder and loss_avg >= ( prev_loss + encoder_error_thresh): train_encoder = False elif aggressive: train_encoder = True prev_loss = loss_avg loss_sum = 0 loss_idx = 0 # save model if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/wavenet_{}".format( output_directory, iteration) save_checkpoint_autoencoder(model, device, use_variational_autoencoder, encoder_optimizer, decoder_optimizer, aggressive, learning_rate, iteration, checkpoint_path) iteration += 1 del loss
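# Note: the autoencoder trainer above instantiates either DML.SampleDiscretizedMixLogistics()
# or utils.CategoricalSampler(); neither class is defined in this file. A hypothetical minimal
# sketch of the categorical case, sampling class indices from WaveNet logits (an assumption,
# not the original utils implementation):
import torch

class CategoricalSampler:
    def __call__(self, logits):
        # logits: (batch, n_classes, time) network output; draw one class index per
        # time step from the softmax distribution over classes.
        probs = torch.softmax(logits, dim=1)
        dist = torch.distributions.Categorical(probs.transpose(1, 2))  # event dim = classes
        return dist.sample()  # (batch, time) LongTensor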
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Load checkpoint if one exists iteration = 0 print("checkpoint path", checkpoint_path) #model = warm_load_checkpoint(checkpoint_path, model) model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=True, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() loss.backward() optimizer.step() if (iteration % iters_per_checkpoint == 0): print("{}:\t{:.9f}".format(iteration, reduced_loss)) checkpoint_path = "{}/waveglow".format(output_directory) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, stage, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, logdirname, datedlogdir, warm_start=False, optimizer='ADAM', start_zero=False): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== from model import HiFiGAN, HiFiGANLoss criterion = HiFiGANLoss(**hifigan_config).cuda() model = HiFiGAN(**hifigan_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) if stage >= 2: criterion = apply_gradient_allreduce(criterion) #=====END: ADDED FOR DISTRIBUTED====== criterion, optimizer_d = get_optimizer(criterion, optimizer, fp16_run, optimizer_fused=True) if stage >= 2 else (criterion, None) model, optimizer = get_optimizer(model, optimizer, fp16_run, optimizer_fused=True) ## LEARNING RATE SCHEDULER if True: from torch.optim.lr_scheduler import ReduceLROnPlateau min_lr = 1e-8 factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs') print("ReduceLROnPlateau used as Learning Rate Scheduler.") else: scheduler=False # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, criterion, optimizer_d, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, criterion, optimizer_d, scheduler, fp16_run, stage, warm_start=warm_start) iteration += 1 # next iteration is iteration + 1 if start_zero: iteration = 0 trainset = Mel2Samp(**data_config, check_files=True) speaker_lookup = trainset.speaker_ids # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: train_sampler = DistributedSampler(trainset, shuffle=True) shuffle = False else: train_sampler = None shuffle = True # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter if datedlogdir: timestr = time.strftime("%Y_%m_%d-%H_%M_%S") log_directory = os.path.join(output_directory, logdirname, timestr) else: log_directory = os.path.join(output_directory, logdirname) logger = SummaryWriter(log_directory) moving_average = int(min(len(train_loader), 200)) # average loss over entire Epoch rolling_sum = StreamingMovingAverage(moving_average) start_time = time.time() start_time_iter = time.time() start_time_dekaiter = time.time() model.train() # best (averaged) training loss if os.path.exists(os.path.join(output_directory, "best_model")+".txt"): best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_model_loss = 9e9 # best (validation) MSE on inferred spectrogram. 
if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"): best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_MSE = 9e9 epoch_offset = max(0, int(iteration / len(train_loader))) print_params(model, name='generator') print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}") training = True while training: try: if rank == 0: epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch") else: epochs_iterator = range(epoch_offset, epochs) # ================ MAIN TRAINING LOOP! =================== for epoch in epochs_iterator: print(f"Epoch: {epoch}") if num_gpus > 1: train_sampler.set_epoch(epoch) if rank == 0: iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True) else: iters_iterator = enumerate(train_loader) for i, batch in iters_iterator: # run external code every iter, allows the run to be adjusted without restarts if (i==0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time} exec(internal_text, globals(), ldict) else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." 
for param_group in optimizer.param_groups: param_group['lr'] = learning_rate if optimizer_d is not None: for param_group in optimizer_d.param_groups: param_group['lr'] = learning_rate*d_lr_scale else: scheduler.patience = scheduler_patience scheduler.cooldown = scheduler_cooldown if override_scheduler_last_lr: scheduler._last_lr = override_scheduler_last_lr if override_scheduler_best: scheduler.best = override_scheduler_best if override_scheduler_last_lr or override_scheduler_best: print(f"scheduler._last_lr = {scheduler._last_lr} scheduler.best = {scheduler.best} |", end='') model.zero_grad() noisy_audio, gt_audio, speaker_ids = batch noisy_audio = torch.autograd.Variable(noisy_audio.cuda(non_blocking=True)) gt_audio = torch.autograd.Variable(gt_audio.cuda(non_blocking=True)) speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1) pred_audio = model(noisy_audio)#, speaker_ids) metrics = criterion(pred_audio, gt_audio, amp, model, optimizer, optimizer_d, num_gpus, use_grad_clip, grad_clip_thresh) if not metrics['is_overflow'] and rank == 0: # get current Loss Scale of first optimizer loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768 if with_tensorboard: if (iteration % 100000 == 0): # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') logger.add_histogram(tag, value.data.cpu().numpy(), iteration) for key, value in metrics.items(): if key not in ['is_overflow',]: logger.add_scalar(key, value, iteration) if (iteration % 20 == 0): logger.add_scalar('learning.rate', learning_rate, iteration) if (iteration % 10 == 0): logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration) logged_loss = metrics['g_train_loss'] if stage >= 2 else metrics['train_loss'] grad_norm = metrics['grad_norm'] average_loss = rolling_sum.process(logged_loss) if (iteration % 10 == 0): tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus))) start_time_dekaiter = time.time() else: tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, logged_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale)) start_time_iter = time.time() if rank == 0 and (len(rolling_sum.values) > moving_average-2): if (average_loss+best_model_margin) < best_model_loss: checkpoint_path = os.path.join(output_directory, "best_model") try: save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(average_loss)+"\n"+str(iteration)) text_file.close() best_model_loss = average_loss #Only save the model if X better than the current loss. 
if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))): checkpoint_path = f"{output_directory}/waveglow_{iteration}" save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) if (os.path.exists(save_file_check_path)): os.remove(save_file_check_path) if iteration%validation_interval == 0: if rank == 0: MSE, MAE = validate(model, trainset, logger, iteration, data_config['validation_files'], speaker_lookup, output_directory, data_config) if scheduler: MSE = torch.tensor(MSE, device='cuda') if num_gpus > 1: broadcast(MSE, 0) scheduler.step(MSE.item()) if MSE < best_MSE: checkpoint_path = os.path.join(output_directory, "best_val_model") try: save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, criterion, optimizer_d, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(MSE.item())+"\n"+str(iteration)) text_file.close() best_MSE = MSE.item() else: if scheduler: MSE = torch.zeros(1, device='cuda') broadcast(MSE, 0) scheduler.step(MSE.item()) iteration += 1 training = False # exit the training While loop except LossExplosion as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) print(ex) # print Loss checkpoint_path = os.path.join(output_directory, "best_model") assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts" # clearing VRAM for load checkpoint audio = mel = speaker_ids = loss = None torch.cuda.empty_cache() model.eval() model, optimizer, criterion, optimizer_d, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, criterion, optimizer_d, scheduler, fp16_run, stage) learning_rate = optimizer.param_groups[0]['lr'] epoch_offset = max(0, int(iteration / len(train_loader))) model.train() iteration += 1 pass # and continue training.
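# Note: this trainer tracks a rolling training-loss average via StreamingMovingAverage(window)
# with .process(value) and a .values list, but the class is not part of this file. A minimal
# sketch consistent with that usage (an assumption, not the original implementation):
class StreamingMovingAverage:
    def __init__(self, window_size):
        self.window_size = window_size
        self.values = []
        self.sum = 0.0

    def process(self, value):
        # Append the newest value, drop the oldest once the window is full,
        # and return the current average.
        self.values.append(value)
        self.sum += value
        if len(self.values) > self.window_size:
            self.sum -= self.values.pop(0)
        return self.sum / len(self.values)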
def train(num_gpus, rank, group_name, output_directory, log_directory, checkpoint_path, hparams): torch.manual_seed(hparams.seed) torch.cuda.manual_seed(hparams.seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(hparams.sigma) model = WaveGlow(hparams).cuda() Taco2 = load_pretrained_taco('tacotron2.pt', hparams) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== learning_rate = hparams.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if hparams.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = TextMelLoader(hparams.training_files, hparams) collate_fn = TextMelCollate() # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== batch_size = hparams.batch_size train_loader = DataLoader(trainset, num_workers=0, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) # Get shared output_directory readya if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if hparams.with_tensorboard and rank == 0: logger = prepare_directories_and_logger(output_directory, log_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) print("Total Epochs: {}".format(hparams.epochs)) print("Batch Size: {}".format(hparams.batch_size)) print("learning rate: {}".format(hparams.learning_rate)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, hparams.epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch( batch) with torch.no_grad(): enc_outputs, alignments = Taco2( (text_padded, input_lengths, mel_padded, max_len, output_lengths)) # mel_padded = mel_padded.transpose(1, 2) # mel_padded = mel_padded / torch.abs(mel_padded).max().item() mel_pos = torch.arange(1000) mel_pos = to_gpu(mel_pos).long().unsqueeze(0) mel_pos = mel_pos.expand(hparams.batch_size, -1) src_pos = torch.arange(hparams.n_position) src_pos = to_gpu(src_pos).long().unsqueeze(0) src_pos = src_pos.expand(hparams.batch_size, -1) mel_padded = (mel_padded + 5) / 10 z, log_s_list, log_det_w_list, dec_enc_attn = model( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) outputs = (z, log_s_list, log_det_w_list, dec_enc_attn) loss = criterion(outputs, alignments) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if hparams.with_tensorboard and rank == 0: logger.log_training(reduced_loss, grad_norm, learning_rate, iteration) if (iteration % hparams.iters_per_checkpoint == 0): if rank == 0: mel_predict, test_attn = model.test( mel_padded, enc_outputs, mel_pos, src_pos, input_lengths) logger.log_alignment(model, dec_enc_attn, alignments, mel_padded, mel_predict, test_attn, iteration) checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, batch_size, seed, checkpoint_path, hparams): torch.manual_seed(seed) torch.cuda.manual_seed(seed) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) # =====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) if num_gpus >= 1: model = WaveGlow(**waveglow_config, hparams=hparams).cuda() else: model = WaveGlow(**waveglow_config, hparams=hparams) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) # =====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) # Load checkpoint if one exists iteration, eval_iteration = 0, 0 if checkpoint_path != "": model, optimizer, iteration, eval_iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 eval_iteration += 1 # trainset = Mel2Samp(**data_config) trainset = TextMelLoader( audiopaths_and_text='./filelists/ljs_audio_text_train_filelist.txt', hparams=hparams) testset = TextMelLoader( audiopaths_and_text='./filelists/ljs_audio_text_test_filelist.txt', hparams=hparams) collate_fn = TextMelCollate(hparams, fixed_length=True) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, collate_fn=collate_fn, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) test_loader = DataLoader(testset, num_workers=1, collate_fn=collate_fn, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) log_path = os.path.join(output_directory, 'log-event') os.makedirs(log_path, exist_ok=True) logger = WaveGlowLogger(log_path) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() tacotron2 = Tacotron2(hparams) batch_parser = tacotron2.parse_batch # we use tacotron-2's pipeline epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) model.train() for i, batch in enumerate(train_loader): model.zero_grad() x, y = batch_parser(batch) text_padded, input_lengths, mel_padded, max_len, output_lengths = x # print(text_padded.size(), mel_padded.size()) mel_padded, gate_padded = y outputs = model((text_padded, mel_padded)) loss = criterion(outputs) logger.log_loss('train/loss', loss, iteration) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) iteration += 1 # model.eval() # for i, batch in enumerate(test_loader): # x, y = batch_parser(batch) # text_padded, input_lengths, mel_padded, max_len, output_lengths = x # mel_padded, gate_padded = y # outputs = model((text_padded, mel_padded)) # loss = criterion(outputs) # logger.log_loss('eval/loss', loss, iteration) # eval_iteration += 1 if rank == 0: checkpoint_path = "{}/waveglow_epoch_{}".format(output_directory, epoch) save_checkpoint(model, optimizer, learning_rate, iteration, eval_iteration, checkpoint_path, hparams=hparams)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, loss_empthasis, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, logdirname, datedlogdir, warm_start=False, optimizer='ADAM', start_zero=False): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== global WaveGlow global WaveGlowLoss ax = True # this is **really** bad coding practice :D if ax: from efficient_model_ax import WaveGlow from efficient_loss import WaveGlowLoss else: if waveglow_config["yoyo"]: # efficient_mode # TODO: Add to Config File from efficient_model import WaveGlow from efficient_loss import WaveGlowLoss else: from glow import WaveGlow, WaveGlowLoss criterion = WaveGlowLoss(sigma, loss_empthasis) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== STFTs = [STFT.TacotronSTFT(filter_length=window, hop_length=data_config['hop_length'], win_length=window, sampling_rate=data_config['sampling_rate'], n_mel_channels=160, mel_fmin=0, mel_fmax=16000) for window in data_config['validation_windows']] loader_STFT = STFT.TacotronSTFT(filter_length=data_config['filter_length'], hop_length=data_config['hop_length'], win_length=data_config['win_length'], sampling_rate=data_config['sampling_rate'], n_mel_channels=data_config['n_mel_channels'] if 'n_mel_channels' in data_config.keys() else 160, mel_fmin=data_config['mel_fmin'], mel_fmax=data_config['mel_fmax']) #optimizer = "Adam" optimizer = optimizer.lower() optimizer_fused = bool( 0 ) # use Apex fused optimizer, should be identical to normal but slightly faster and only works on RTX cards if optimizer_fused: from apex import optimizers as apexopt if optimizer == "adam": optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate) elif optimizer == "lamb": optimizer = apexopt.FusedLAMB(model.parameters(), lr=learning_rate, max_grad_norm=200) else: if optimizer == "adam": optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) elif optimizer == "lamb": from lamb import Lamb as optLAMB optimizer = optLAMB(model.parameters(), lr=learning_rate) #import torch_optimizer as optim #optimizer = optim.Lamb(model.parameters(), lr=learning_rate) #raise# PyTorch doesn't currently include LAMB optimizer. 
if fp16_run: global amp from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') else: amp = None ## LEARNING RATE SCHEDULER if True: from torch.optim.lr_scheduler import ReduceLROnPlateau min_lr = 1e-8 factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True, threshold=0.0001, threshold_mode='abs') print("ReduceLROnPlateau used as Learning Rate Scheduler.") else: scheduler=False # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run, warm_start=warm_start) iteration += 1 # next iteration is iteration + 1 if start_zero: iteration = 0 trainset = Mel2Samp(**data_config, check_files=True) speaker_lookup = trainset.speaker_ids # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: train_sampler = DistributedSampler(trainset, shuffle=True) shuffle = False else: train_sampler = None shuffle = True # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter if datedlogdir: timestr = time.strftime("%Y_%m_%d-%H_%M_%S") log_directory = os.path.join(output_directory, logdirname, timestr) else: log_directory = os.path.join(output_directory, logdirname) logger = SummaryWriter(log_directory) moving_average = int(min(len(train_loader), 100)) # average loss over entire Epoch rolling_sum = StreamingMovingAverage(moving_average) start_time = time.time() start_time_iter = time.time() start_time_dekaiter = time.time() model.train() # best (averaged) training loss if os.path.exists(os.path.join(output_directory, "best_model")+".txt"): best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_model_loss = -6.20 # best (validation) MSE on inferred spectrogram. if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"): best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0]) else: best_MSE = 9e9 epoch_offset = max(0, int(iteration / len(train_loader))) pytorch_total_params = sum(p.numel() for p in model.parameters()) print("{:,} total parameters in model".format(pytorch_total_params)) pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print("{:,} trainable parameters.".format(pytorch_total_params)) print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}") training = True while training: try: if rank == 0: epochs_iterator = tqdm(range(epoch_offset, epochs), initial=epoch_offset, total=epochs, smoothing=0.01, desc="Epoch", position=1, unit="epoch") else: epochs_iterator = range(epoch_offset, epochs) # ================ MAIN TRAINING LOOP! 
=================== for epoch in epochs_iterator: print(f"Epoch: {epoch}") if num_gpus > 1: train_sampler.set_epoch(epoch) if rank == 0: iters_iterator = tqdm(enumerate(train_loader), desc=" Iter", smoothing=0, total=len(train_loader), position=0, unit="iter", leave=True) else: iters_iterator = enumerate(train_loader) for i, batch in iters_iterator: # run external code every iter, allows the run to be adjusted without restarts if (i==0 or iteration % param_interval == 0): try: with open("run_every_epoch.py") as f: internal_text = str(f.read()) if len(internal_text) > 0: #code = compile(internal_text, "run_every_epoch.py", 'exec') ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time} exec(internal_text, globals(), ldict) else: print("No Custom code found, continuing without changes.") except Exception as ex: print(f"Custom code FAILED to run!\n{ex}") globals().update(ldict) locals().update(ldict) if show_live_params: print(internal_text) if not iteration % 50: # check actual learning rate every 20 iters (because I sometimes see learning_rate variable go out-of-sync with real LR) learning_rate = optimizer.param_groups[0]['lr'] # Learning Rate Schedule if custom_lr: old_lr = learning_rate if iteration < warmup_start: learning_rate = warmup_start_lr elif iteration < warmup_end: learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations. else: if iteration < decay_start: learning_rate = A_ + C_ else: iteration_adjusted = iteration - decay_start learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_ assert learning_rate > -1e-8, "Negative Learning Rate." if old_lr != learning_rate: for param_group in optimizer.param_groups: param_group['lr'] = learning_rate else: scheduler.patience = scheduler_patience scheduler.cooldown = scheduler_cooldown if override_scheduler_last_lr: scheduler._last_lr = override_scheduler_last_lr if override_scheduler_best: scheduler.best = override_scheduler_best if override_scheduler_last_lr or override_scheduler_best: print("scheduler._last_lr =", scheduler._last_lr, "scheduler.best =", scheduler.best, " |", end='') model.zero_grad() mel, audio, speaker_ids = batch mel = torch.autograd.Variable(mel.cuda(non_blocking=True)) audio = torch.autograd.Variable(audio.cuda(non_blocking=True)) speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1) outputs = model(mel, audio, speaker_ids) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (reduced_loss > LossExplosionThreshold) or (math.isnan(reduced_loss)): model.zero_grad() raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") if use_grad_clip: if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) if type(grad_norm) == torch.Tensor: grad_norm = grad_norm.item() is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: is_overflow = False; grad_norm=0.00001 optimizer.step() if not is_overflow and rank == 0: # get current Loss Scale of first optimizer loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768 
if with_tensorboard: if (iteration % 100000 == 0): # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') logger.add_histogram(tag, value.data.cpu().numpy(), iteration) logger.add_scalar('training_loss', reduced_loss, iteration) logger.add_scalar('training_loss_samples', reduced_loss, iteration*batch_size) if (iteration % 20 == 0): logger.add_scalar('learning.rate', learning_rate, iteration) if (iteration % 10 == 0): logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration) average_loss = rolling_sum.process(reduced_loss) if (iteration % 10 == 0): tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus))) start_time_dekaiter = time.time() else: tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale)) start_time_iter = time.time() if rank == 0 and (len(rolling_sum.values) > moving_average-2): if (average_loss+best_model_margin) < best_model_loss: checkpoint_path = os.path.join(output_directory, "best_model") try: save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(average_loss)+"\n"+str(iteration)) text_file.close() best_model_loss = average_loss #Only save the model if X better than the current loss. if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))): checkpoint_path = f"{output_directory}/waveglow_{iteration}" save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) if (os.path.exists(save_file_check_path)): os.remove(save_file_check_path) if (iteration % validation_interval == 0): if rank == 0: MSE, MAE = validate(model, loader_STFT, STFTs, logger, iteration, data_config['validation_files'], speaker_lookup, sigma, output_directory, data_config) if scheduler: MSE = torch.tensor(MSE, device='cuda') if num_gpus > 1: broadcast(MSE, 0) scheduler.step(MSE.item()) if MSE < best_MSE: checkpoint_path = os.path.join(output_directory, "best_val_model") try: save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) except KeyboardInterrupt: # Avoid corrupting the model. save_checkpoint(model, optimizer, learning_rate, iteration, amp, scheduler, speaker_lookup, checkpoint_path) text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8") text_file.write(str(MSE.item())+"\n"+str(iteration)) text_file.close() best_MSE = MSE.item() #Only save the model if X better than the current loss. 
else: if scheduler: MSE = torch.zeros(1, device='cuda') broadcast(MSE, 0) scheduler.step(MSE.item()) learning_rate = optimizer.param_groups[0]['lr'] #check actual learning rate (because I sometimes see learning_rate variable go out-of-sync with real LR) iteration += 1 training = False # exit the While loop except LossExplosion as ex: # print Exception and continue from checkpoint. (turns out it takes < 4 seconds to restart like this, f*****g awesome) print(ex) # print Loss checkpoint_path = os.path.join(output_directory, "best_model") assert os.path.exists(checkpoint_path), "best_model must exist for automatic restarts" # clearing VRAM for load checkpoint audio = mel = speaker_ids = loss = None torch.cuda.empty_cache() model.eval() model, optimizer, iteration, scheduler = load_checkpoint(checkpoint_path, model, optimizer, scheduler, fp16_run) learning_rate = optimizer.param_groups[0]['lr'] epoch_offset = max(0, int(iteration / len(train_loader))) model.train() iteration += 1 pass # and continue training.
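# Note: the custom_lr branches in these loops compute the learning rate in-line from the
# globals injected by run_every_epoch.py (warmup_start, warmup_end, warmup_start_lr,
# decay_start, A_, B_, C_). The same schedule, pulled out into a standalone helper for
# clarity (a sketch restating the in-line logic; parameter names mirror those globals):
from math import e

def custom_lr_schedule(iteration, warmup_start, warmup_end, warmup_start_lr,
                       decay_start, A_, B_, C_):
    if iteration < warmup_start:
        return warmup_start_lr
    if iteration < warmup_end:
        # linear ramp from warmup_start_lr up to A_ + C_
        return (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) / (warmup_end - warmup_start) + warmup_start_lr
    if iteration < decay_start:
        return A_ + C_
    # exponential decay from A_ + C_ towards the floor C_
    return A_ * (e ** (-(iteration - decay_start) / B_)) + C_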
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard, weight_sharing, optimizer_type, dataloader_type): ws = weight_sharing torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer_type = optimizer_type.lower() if optimizer_type == "sgd": optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) elif optimizer_type == "adam": optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) else: print("Unsupported optimizer: %s. Aborting." % optimizer_type) return None if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 dataloader_type = dataloader_type.lower() if dataloader_type == "vanilla": trainset = Mel2Samp(**data_config) elif dataloader_type == "split": trainset = Mel2SampSplit(**data_config) else: print("Unsupported dataloader type: %s. Aborting." % dataloader_type) return None # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=(num_gpus == 1), sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type, dataloader_type, batch_size) if learning_rate != 1e-4: name = name + "_lr{:.0e}".format(learning_rate) if num_gpus > 1: name = name + "_x%d" % num_gpus if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join("./logs", name)) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== stime2 = None for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) stime = time() for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() if (iteration % 100 == 0): if not stime2 is None: tot_time2 = time() - stime2 print("{}:\t{:.9f}, time: {}".format( iteration, reduced_loss, int(tot_time2))) stime2 = time() if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}_{}".format( output_directory, name, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1 tot_time = time() - stime print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
def train(num_gpus, rank, group_name, output_directory, log_directory, checkpoint_path): # Get device device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch.manual_seed(hp.seed) torch.cuda.manual_seed(hp.seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(hp.sigma) model = WaveGlow().cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== learning_rate = hp.learning_rate optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if hp.fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path: model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 # Get dataset dataset = FastSpeechDataset() # Get training loader print("Get Training Loader") training_loader = DataLoader(dataset, batch_size=hp.batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True, num_workers=cpu_count()) if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if hp.with_tensorboard and rank == 0: logger = prepare_directories_and_logger(output_directory, log_directory) model = model.train() epoch_offset = max(0, int(iteration / len(training_loader))) beta = hp.batch_size print("Total Epochs: {}".format(hp.epochs)) print("Batch Size: {}".format(hp.batch_size)) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, hp.epochs): print("Epoch: {}".format(epoch)) for i, data_of_batch in enumerate(training_loader): model.zero_grad() if not hp.pre_target: # Prepare Data src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"] mel_tgt = data_of_batch["mels"] src_seq = torch.from_numpy(src_seq).long().to(device) src_pos = torch.from_numpy(src_pos).long().to(device) mel_tgt = torch.from_numpy(mel_tgt).float().to(device) alignment_target = get_alignment(src_seq, tacotron2).float().to(device) # For Data Parallel mel_max_len = mel_tgt.size(1) else: # Prepare Data src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"] mel_tgt = data_of_batch["mels"] alignment_target = data_of_batch["alignment"] src_seq = torch.from_numpy(src_seq).long().to(device) src_pos = torch.from_numpy(src_pos).long().to(device) mel_tgt = torch.from_numpy(mel_tgt).float().to(device) alignment_target = torch.from_numpy( alignment_target).float().to(device) # For Data Parallel mel_max_len = mel_tgt.size(1) outputs = model(src_seq, src_pos, mel_tgt, mel_max_len, alignment_target) _, _, _, duration_predictor = outputs mel_tgt = mel_tgt.transpose(1, 2) max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt) if beta > 1 and iteration % 10000 == 0: beta = beta // 2 loss = max_like + dur_loss if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if hp.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() #grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh) optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if hp.with_tensorboard and rank == 0: logger.log_training(reduced_loss, dur_loss, learning_rate, iteration) if 
(iteration % hp.save_step == 0): if rank == 0: # logger.log_alignment(model, mel_predict, mel_tgt, iteration) checkpoint_path = "{}/TTSglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, iters_per_checkpoint, iters_per_eval, batch_size, seed, checkpoint_path, log_dir, ema_decay=0.9999): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== if train_data_config["no_chunks"]: criterion = MaskedCrossEntropyLoss() else: criterion = CrossEntropyLoss() model = WaveNet(**wavenet_config).cuda() ema = ExponentialMovingAverage(ema_decay) for name, param in model.named_parameters(): if param.requires_grad: ema.register(name, param.data) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) scheduler = StepLR(optimizer, step_size=200000, gamma=0.5) # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, scheduler, iteration, ema = load_checkpoint(checkpoint_path, model, optimizer, scheduler, ema) iteration += 1 # next iteration is iteration + 1 trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True, **train_data_config) validset = Mel2SampOnehot(audio_config=audio_config, verbose=False, **valid_data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== print(train_data_config) if train_data_config["no_chunks"]: collate_fn = utils.collate_fn else: collate_fn = torch.utils.data.dataloader.default_collate train_loader = DataLoader(trainset, num_workers=1, shuffle=False, collate_fn=collate_fn, sampler=train_sampler, batch_size=batch_size, pin_memory=True, drop_last=True) valid_loader = DataLoader(validset, num_workers=1, shuffle=False, sampler=valid_sampler, batch_size=1, pin_memory=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) writer = SummaryWriter(log_dir) print("Checkpoints writing to: {}".format(log_dir)) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): if low_memory: torch.cuda.empty_cache() scheduler.step() model.zero_grad() if train_data_config["no_chunks"]: x, y, seq_lens = batch seq_lens = to_gpu(seq_lens) else: x, y = batch x = to_gpu(x).float() y = to_gpu(y) x = (x, y) # auto-regressive takes outputs as inputs y_pred = model(x) if train_data_config["no_chunks"]: loss = criterion(y_pred, y, seq_lens) else: loss = criterion(y_pred, y) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus)[0] else: reduced_loss = loss.data[0] loss.backward() optimizer.step() for name, param in model.named_parameters(): if name in ema.shadow: ema.update(name, param.data) print("{}:\t{:.9f}".format(iteration, reduced_loss)) if rank == 0: writer.add_scalar('loss', reduced_loss, iteration) if (iteration % iters_per_checkpoint == 0 and iteration): if rank == 0: checkpoint_path = "{}/wavenet_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, scheduler, learning_rate, iteration, checkpoint_path, ema, wavenet_config) if (iteration % iters_per_eval == 0 and iteration > 0 and not config["no_validation"]): if low_memory: torch.cuda.empty_cache() if rank == 0: model_eval = nv_wavenet.NVWaveNet(**(model.export_weights())) for j, valid_batch in enumerate(valid_loader): mel, audio = valid_batch mel = to_gpu(mel).float() cond_input = model.get_cond_input(mel) predicted_audio = model_eval.infer(cond_input, nv_wavenet.Impl.AUTO) predicted_audio = utils.mu_law_decode_numpy(predicted_audio[0, :].cpu().numpy(), 256) writer.add_audio("valid/predicted_audio_{}".format(j), predicted_audio, iteration, 22050) audio = utils.mu_law_decode_numpy(audio[0, :].cpu().numpy(), 256) writer.add_audio("valid_true/audio_{}".format(j), audio, iteration, 22050) if low_memory: torch.cuda.empty_cache() iteration += 1
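# Note: the EMA-tracking WaveNet trainer above relies on an ExponentialMovingAverage helper
# exposing register(), update() and a .shadow dict; the class is not shown in this file.
# A minimal sketch consistent with that usage (an assumption, not the original code):
class ExponentialMovingAverage:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, value):
        # Keep a copy of the parameter as the initial shadow value.
        self.shadow[name] = value.clone()

    def update(self, name, value):
        # shadow = decay * shadow + (1 - decay) * current_value
        assert name in self.shadow
        self.shadow[name] = (self.decay * self.shadow[name] + (1.0 - self.decay) * value).clone()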
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr, final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): os.makedirs(output_directory, exist_ok=True) torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=init_lr) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists epoch_offset = 1 if checkpoint_path != "": model, optimizer, epoch_offset = load_checkpoint( checkpoint_path, model, optimizer) epoch_offset += 1 # next epoch is epoch_offset + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=8, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs + 1): print(f'Epoch: {epoch}') adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs) for i, batch in enumerate(tqdm.tqdm(train_loader)): optimizer.zero_grad() batch = model.pre_process(batch) outputs = model(batch) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + 1 + len(train_loader) * epoch) if epoch % epochs_per_checkpoint == 0: if rank == 0: # Keep only one checkpoint last_chkpt = os.path.join( output_directory, f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt') if os.path.exists(last_chkpt): os.remove(last_chkpt) checkpoint_path = os.path.join(output_directory, f'waveglow_{epoch:06d}.pt') save_checkpoint(model, optimizer, epoch, checkpoint_path)
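# Note: adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs) is called once per
# epoch above but is not defined in this file. One plausible sketch is a geometric
# interpolation from init_lr down to final_lr over the run; the exact schedule used by the
# original code is an assumption:
def adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs):
    # epoch is 1-based here: epoch 1 uses init_lr, the final epoch uses final_lr.
    fraction = (epoch - 1) / max(epochs - 1, 1)
    lr = init_lr * (final_lr / init_lr) ** fraction
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr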
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path, with_tensorboard, with_wandb): torch.manual_seed(seed) torch.cuda.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cuda() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 if pretrained_path != "": model = load_pretrained(pretrained_path, model) trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: train_sampler = DistributedSampler(trainset) shuffle_at_dataloader = False else: train_sampler = None shuffle_at_dataloader = True # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle_at_dataloader, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! 
=================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): iter_start = time.perf_counter() float_epoch = float(iteration) / len(train_loader) model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cuda()) audio = torch.autograd.Variable(audio.cuda()) outputs = model((mel, audio)) loss, etc = criterion(outputs) (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() is_overflow = False if fp16_run: grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), grad_clip_thresh) is_overflow = math.isnan(grad_norm) if not is_overflow: clipped_grad_norm = get_clip_grad_norm( grad_norm, grad_clip_thresh) else: grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), grad_clip_thresh) clipped_grad_norm = get_clip_grad_norm(grad_norm, grad_clip_thresh) optimizer.step() iter_duration = time.perf_counter() - iter_start print("{}:\t{:.9f}".format(iteration, reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if with_wandb and rank == 0: wandb.log( { 'iteration': iteration, 'epoch': float_epoch, 'iter_duration': iter_duration, 'training_loss': reduced_loss, 'training_loss/z_L2_normalized': z_L2_normalized, 'training_loss/neg_log_s_total': neg_log_s_total, 'training_loss/neg_log_det_W_total': neg_log_det_W_total, }, step=iteration) if not is_overflow: wandb.log( { 'grad_norm': grad_norm, 'clipped_grad_norm': clipped_grad_norm, }, step=iteration) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/{}/{}/waveglow_{}".format( output_directory, prj_name, run_name, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
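# Note: the trainer above logs clipped_grad_norm = get_clip_grad_norm(grad_norm, grad_clip_thresh),
# but that helper is not included here. clip_grad_norm_ returns the total norm *before* clipping,
# so a plausible sketch simply reports the norm after the cap (an assumption about the original):
def get_clip_grad_norm(grad_norm, grad_clip_thresh):
    return min(float(grad_norm), float(grad_clip_thresh))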
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed, fp16_run, checkpoint_path, with_tensorboard): torch.manual_seed(seed) #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: init_distributed(rank, num_gpus, group_name, **dist_config) #=====END: ADDED FOR DISTRIBUTED====== criterion = WaveGlowLoss(sigma) model = WaveGlow(**waveglow_config).cpu() #=====START: ADDED FOR DISTRIBUTED====== if num_gpus > 1: model = apply_gradient_allreduce(model) #=====END: ADDED FOR DISTRIBUTED====== optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) if fp16_run: from apex import amp model, optimizer = amp.initialize(model, optimizer, opt_level='O1') # Load checkpoint if one exists iteration = 0 if checkpoint_path != "": model, optimizer, iteration = load_checkpoint(checkpoint_path, model, optimizer) iteration += 1 # next iteration is iteration + 1 trainset = Mel2Samp(**data_config) # =====START: ADDED FOR DISTRIBUTED====== train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None # =====END: ADDED FOR DISTRIBUTED====== train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=batch_size, pin_memory=False, drop_last=True) # Get shared output_directory ready if rank == 0: if not os.path.isdir(output_directory): os.makedirs(output_directory) os.chmod(output_directory, 0o775) print("output directory", output_directory) if with_tensorboard and rank == 0: from tensorboardX import SummaryWriter logger = SummaryWriter(os.path.join(output_directory, 'logs')) model.train() epoch_offset = max(0, int(iteration / len(train_loader))) # ================ MAIN TRAINNIG LOOP! =================== for epoch in range(epoch_offset, epochs): print("Epoch: {}".format(epoch)) for i, batch in enumerate(train_loader): model.zero_grad() mel, audio = batch mel = torch.autograd.Variable(mel.cpu()) audio = torch.autograd.Variable(audio.cpu()) outputs = model((mel, audio)) loss = criterion(outputs) if num_gpus > 1: reduced_loss = reduce_tensor(loss.data, num_gpus).item() else: reduced_loss = loss.item() if fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() optimizer.step() print("{}:\t{:.9f}".format(iteration, reduced_loss)) if with_tensorboard and rank == 0: logger.add_scalar('training_loss', reduced_loss, i + len(train_loader) * epoch) if (iteration % iters_per_checkpoint == 0): if rank == 0: checkpoint_path = "{}/waveglow_{}".format( output_directory, iteration) save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path) iteration += 1
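# Note: nearly every trainer in this file ends an iteration with save_checkpoint(model, optimizer,
# learning_rate, iteration, filepath) (some variants add extra arguments), but the function body is
# not included. A minimal sketch of the plain version, assuming a standard torch.save state dict:
import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(iteration, filepath))
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                'iteration': iteration}, filepath)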
def train(output_directory, log_directory, checkpoint_path, warm_start,
          warm_start_force, n_gpus, rank, group_name, hparams):
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        init_distributed(rank, n_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    model, criterion = getCore(hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    STFT = [
        TacotronSTFT(filter_length=window,
                     hop_length=hparams.hop_length,
                     win_length=window,
                     sampling_rate=hparams.sampling_rate,
                     n_mel_channels=160,
                     mel_fmin=hparams.mel_fmin,
                     mel_fmax=hparams.mel_fmax)
        for window in hparams.validation_windows
    ]

    optimizer = getOptimizer(model, hparams)

    if hparams.fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=hparams.fp16_opt_level,
                                          min_loss_scale=2.0)
    else:
        amp = None

    # LEARNING RATE SCHEDULER
    if hparams.LRScheduler.lower() == "ReduceLROnPlateau".lower():
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-5
        # amount to scale the LR by on Validation Loss plateau
        factor = 0.1**(1 / 5)
        scheduler = ReduceLROnPlateau(optimizer, 'min',
                                      factor=factor,
                                      patience=20,
                                      cooldown=2,
                                      min_lr=min_lr,
                                      verbose=True)
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = None

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration, scheduler = load_checkpoint(
            warm_start, warm_start_force, checkpoint_path, model, optimizer,
            scheduler, hparams.fp16_run)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(hparams)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if n_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset,
                              num_workers=hparams.n_dataloader_workers,
                              shuffle=shuffle,
                              sampler=train_sampler,
                              batch_size=hparams.batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if rank == 0:
        from tensorboardX import SummaryWriter
        if False:  # dated and separated log dirs for each run
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, log_directory,
                                         timestr)
        else:
            log_directory = os.path.join(output_directory, log_directory)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader), 100))  # average loss over 100 iters
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_single_batch = time.time()

    model.train()

    if os.path.exists(os.path.join(output_directory, "best_train_model")):
        best_model_loss = float(
            str(open(os.path.join(output_directory, "best_train_model") + ".txt",
                     "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -4.20
    if os.path.exists(os.path.join(output_directory, "best_val_model")):
        best_MSE = float(
            str(open(os.path.join(output_directory, "best_val_model") + ".txt",
                     "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9

    epoch_offset = max(0, int(iteration / len(train_loader)))

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters.".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters()
                               if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))

    learning_rate = hparams.learning_rate
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in get_progress_bar(range(epoch_offset, hparams.epochs),
                                  dict(initial=epoch_offset,
                                       total=hparams.epochs,
                                       smoothing=0.01,
                                       desc="Epoch",
                                       position=1,
                                       unit="epoch"),
                                  hparams, rank=rank):
        cprint(f"Epoch: {epoch}", b_tqdm=hparams.tqdm)
        if n_gpus > 1:
            train_sampler.set_epoch(epoch)

        for i, batch in get_progress_bar(enumerate(train_loader),
                                         dict(desc=" Iter", smoothing=0,
                                              total=len(train_loader),
                                              position=0, unit="iter",
                                              leave=True),
                                         hparams, rank=rank):
            # Run external code every iter; this allows the run to be adjusted
            # without restarts. Names such as param_interval, show_live_params,
            # custom_lr, warmup_*, A_, B_, C_, decay_start, scheduler_*,
            # best_model_margin, save_file_check_path and validation_interval
            # are defined by the exec'd hparams_realtime.py.
            if (i == 0 or iteration % param_interval == 0):
                try:
                    with open("hparams_realtime.py") as f:
                        internal_text = str(f.read())
                        ldict = {'iteration': iteration}
                        exec(internal_text, globals(), ldict)
                except Exception as ex:
                    cprint(f"Custom code FAILED to run!\n{ex}",
                           b_tqdm=hparams.tqdm)
                globals().update(ldict)
                locals().update(ldict)
                if show_live_params:
                    cprint(internal_text, b_tqdm=hparams.tqdm)

            assert warmup_start <= iteration, "Current iteration less than warmup_start."

            # Learning Rate Schedule
            if custom_lr:
                old_lr = learning_rate
                if iteration < warmup_end:
                    # learning rate increases from warmup_start_lr to A_
                    # linearly over (warmup_end - warmup_start) iterations.
                    learning_rate = (iteration - warmup_start) * (
                        (A_ + C_) - warmup_start_lr) / (
                            warmup_end - warmup_start) + warmup_start_lr
                else:
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_
                assert learning_rate > -1e-8, "Negative Learning Rate."
                if old_lr != learning_rate:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            else:
                scheduler.patience = scheduler_patience
                scheduler.cooldown = scheduler_cooldown
                if override_scheduler_last_lr:
                    scheduler._last_lr = override_scheduler_last_lr
                    cprint("Scheduler last_lr overridden. scheduler._last_lr =",
                           scheduler._last_lr, b_tqdm=hparams.tqdm)
                if not iteration % 20:
                    # check actual learning rate every 20 iters (because I
                    # sometimes see the learning_rate variable go out-of-sync
                    # with the real LR)
                    learning_rate = optimizer.param_groups[0]['lr']
                if override_scheduler_best:
                    scheduler.best = override_scheduler_best
                    cprint("Scheduler best metric overridden. scheduler.best =",
                           override_scheduler_best, b_tqdm=hparams.tqdm)

            model.zero_grad()
            mel, audio, speaker_ids = batch
            mel = torch.autograd.Variable(mel.cuda(non_blocking=True))
            audio = torch.autograd.Variable(audio.cuda(non_blocking=True))
            if model.multispeaker:
                speaker_ids = torch.autograd.Variable(
                    speaker_ids.cuda(non_blocking=True)).long().squeeze(1)
                outputs = model(mel, audio, speaker_ids)
            else:
                outputs = model(mel, audio)

            loss = criterion(outputs)
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            assert reduced_loss < 1e5, "Model Diverged. Loss > 1e5"

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.b_grad_clip:
                if hparams.fp16_run:
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), hparams.grad_clip_thresh)
                else:
                    grad_norm = torch.nn.utils.clip_grad_norm_(
                        model.parameters(), hparams.grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                is_overflow = False
                grad_norm = 0.00001

            optimizer.step()

            if not is_overflow and rank == 0:
                if (iteration % 100000 == 0):
                    # plot distribution of parameters
                    for tag, value in model.named_parameters():
                        tag = tag.replace('.', '/')
                        logger.add_histogram(tag, value.data.cpu().numpy(),
                                             iteration)
                logger.add_scalar('training_loss', reduced_loss, iteration)
                if (iteration % 20 == 0):
                    logger.add_scalar('learning.rate', learning_rate, iteration)
                if (iteration % 10 == 0):
                    logger.add_scalar('duration',
                                      ((time.time() - start_time) / 10),
                                      iteration)
                    start_time_single_batch = time.time()

            average_loss = rolling_sum.process(reduced_loss)
            if rank == 0:
                if (iteration % 10 == 0):
                    cprint(
                        "{} {}: {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item"
                        .format(
                            time.strftime("%H:%M:%S"), iteration, reduced_loss,
                            average_loss, round(grad_norm, 3), learning_rate,
                            min((hparams.grad_clip_thresh / grad_norm) * learning_rate,
                                learning_rate),
                            (time.time() - start_time) / 10,
                            ((time.time() - start_time) / 10) /
                            (hparams.batch_size * n_gpus)),
                        b_tqdm=hparams.tqdm)
                    start_time = time.time()
                else:
                    cprint(
                        "{} {}: {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective)"
                        .format(
                            time.strftime("%H:%M:%S"), iteration, reduced_loss,
                            average_loss, round(grad_norm, 3), learning_rate,
                            min((hparams.grad_clip_thresh / grad_norm) * learning_rate,
                                learning_rate)),
                        b_tqdm=hparams.tqdm)

            if rank == 0 and (len(rolling_sum.values) > moving_average - 2):
                if (average_loss + best_model_margin) < best_model_loss:
                    checkpoint_path = os.path.join(output_directory,
                                                   "best_train_model")
                    try:
                        save_checkpoint(model, optimizer, hparams, learning_rate,
                                        iteration, amp, scheduler,
                                        speaker_lookup, checkpoint_path)
                    except KeyboardInterrupt:
                        # Avoid corrupting the model.
                        save_checkpoint(model, optimizer, hparams, learning_rate,
                                        iteration, amp, scheduler,
                                        speaker_lookup, checkpoint_path)
                    text_file = open((f"{checkpoint_path}.txt"), "w",
                                     encoding="utf-8")
                    text_file.write(str(average_loss) + "\n" + str(iteration))
                    text_file.close()
                    # Only save the model if X better than the current loss.
                    best_model_loss = average_loss

            if rank == 0 and ((iteration % hparams.iters_per_checkpoint == 0) or
                              (os.path.exists(save_file_check_path))):
                checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                save_checkpoint(model, optimizer, hparams, learning_rate,
                                iteration, amp, scheduler, speaker_lookup,
                                checkpoint_path)
                start_time_single_batch = time.time()
                if (os.path.exists(save_file_check_path)):
                    os.remove(save_file_check_path)

            if (iteration % validation_interval == 0):
                if rank == 0:
                    MSE, MAE = validate(model, STFT, logger, iteration,
                                        speaker_lookup, hparams,
                                        output_directory)
                    if scheduler and n_gpus > 1:
                        MSE = torch.tensor(MSE, device='cuda')
                        broadcast(MSE, 0)
                        scheduler.step(MSE.item())
                    if MSE < best_MSE:
                        checkpoint_path = os.path.join(output_directory,
                                                       "best_val_model")
                        try:
                            save_checkpoint(model, optimizer, hparams,
                                            learning_rate, iteration, amp,
                                            scheduler, speaker_lookup,
                                            checkpoint_path)
                        except KeyboardInterrupt:
                            # Avoid corrupting the model.
                            save_checkpoint(model, optimizer, hparams,
                                            learning_rate, iteration, amp,
                                            scheduler, speaker_lookup,
                                            checkpoint_path)
                        text_file = open((f"{checkpoint_path}.txt"), "w",
                                         encoding="utf-8")
                        text_file.write(str(MSE.item()) + "\n" + str(iteration))
                        text_file.close()
                        # Only save the model if X better than the current loss.
                        best_MSE = MSE.item()
                else:
                    if scheduler:
                        MSE = torch.zeros(1, device='cuda')
                        broadcast(MSE, 0)
                        scheduler.step(MSE.item())

            iteration += 1
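# ---------------------------------------------------------------------------
# The hparams-based train() above smooths the training loss with
# StreamingMovingAverage, which is imported from elsewhere in that repo. A
# minimal sketch consistent with how it is used (process() returns the running
# average, .values exposes the current window); the real class may differ:
class StreamingMovingAverage:
    def __init__(self, window_size):
        self.window_size = window_size
        self.values = []
        self.sum = 0.0

    def process(self, value):
        # Append the newest loss, drop the oldest once the window is full,
        # and return the average over the values currently in the window.
        self.values.append(value)
        self.sum += value
        if len(self.values) > self.window_size:
            self.sum -= self.values.pop(0)
        return self.sum / len(self.values)
# ---------------------------------------------------------------------------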