def compute_validation_loss(model, criterion, valset, collate_fn, batch_size, n_gpus):
    """Compute average validation losses (total, NLL, gate) over `valset`.

    Puts the model in eval mode, runs every validation batch without
    gradients, averages the (optionally all-reduced) losses, restores
    train mode, and returns the averages plus the last batch's attention,
    gate prediction, and gate target for logging/plotting.

    NOTE(review): if `val_loader` yields no batches, `i` is never bound and
    the division below raises NameError — assumes a non-empty valset.
    """
    model.eval()
    with torch.no_grad():
        # One sampler per rank so each GPU scores a disjoint shard.
        val_sampler = DistributedSampler(valset) if n_gpus > 1 else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)
        val_loss, val_loss_nll, val_loss_gate = 0.0, 0.0, 0.0
        for i, batch in enumerate(val_loader):
            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(), gate_target.cuda()
            # Attention prior is optional; dataset flag decides whether it is used.
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None
            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens, attn_prior)
            loss_nll, loss_gate = criterion(
                (z, log_s_list, gate_pred, mean, log_var, prob),
                gate_target, out_lens)
            loss = loss_nll + loss_gate
            if n_gpus > 1:
                # Average each component across ranks before accumulating.
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_val_loss_nll = reduce_tensor(loss_nll.data, n_gpus).item()
                reduced_val_loss_gate = reduce_tensor(loss_gate.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
                reduced_val_loss_nll = loss_nll.item()
                reduced_val_loss_gate = loss_gate.item()
            val_loss += reduced_val_loss
            val_loss_nll += reduced_val_loss_nll
            val_loss_gate += reduced_val_loss_gate
        # Average over the number of batches seen (i is the last index).
        val_loss = val_loss / (i + 1)
        val_loss_nll = val_loss_nll / (i + 1)
        val_loss_gate = val_loss_gate / (i + 1)
        # Diagnostic dump of the last batch's latent stats.
        print("Mean {}\nLogVar {}\nProb {}".format(mean, log_var, prob))
    model.train()
    return val_loss, val_loss_nll, val_loss_gate, attn, gate_pred, gate_target
def validation(model, data_loader, criterion, rank, world_size, epoch, num_epoch):
    """Run one validation pass and return the running-average loss meter.

    Args:
        model: network under evaluation (set to eval mode here).
        data_loader: validation DataLoader yielding (imgs, labels).
        criterion: loss function.
        rank: this process's device index / distributed rank.
        world_size: number of distributed processes (for loss all-reduce).
        epoch, num_epoch: current / total epochs, used only for the progress bar.

    Returns:
        AverageMeter holding the batch-size-weighted average reduced loss.
    """
    model.eval()
    summary_loss = AverageMeter()
    if rank == 0:
        pbar = tqdm(total=len(data_loader), unit="batch")
        pbar.set_description(f"Epoch[{epoch+1}/{num_epoch}].Val")
    # Fix: the original built autograd graphs during validation; disable
    # gradient tracking to save memory and time.
    with torch.no_grad():
        for imgs, labels in data_loader:
            batch_size = imgs.shape[0]
            imgs = imgs.to(rank)
            labels = labels.to(rank)
            output = model(imgs)
            loss = criterion(output, labels)
            # Average the loss across all ranks so every process logs the same value.
            reduced_loss = reduce_tensor(loss, world_size)
            summary_loss.update(reduced_loss.detach().item(), batch_size)
            if rank == 0:
                pbar.set_postfix({"loss": summary_loss.avg})
                pbar.update(1)
    if rank == 0:
        pbar.close()  # fix: release the progress bar so later output isn't garbled
    return summary_loss
def Train_Step(self, features):
    """Run one mixed-precision training step for the speaker-embedding model.

    Forward + loss under autocast, scaled backward, optional gradient-norm
    clipping, optimizer/scheduler step, then accumulate the (all-reduced)
    loss into the training scalar dict.
    """
    loss_dict = {}
    features = features.to(self.device, non_blocking=True)
    # Autocast only when mixed precision is enabled in the hyperparameters.
    with torch.cuda.amp.autocast(enabled=self.hp.Use_Mixed_Precision):
        embeddings = self.model(features)
        loss_dict['Embedding'] = self.criterion(
            embeddings, self.hp.Train.Batch.Train.Pattern_per_Speaker)
    self.optimizer.zero_grad()
    self.scaler.scale(loss_dict['Embedding']).backward()
    if self.hp.Train.Gradient_Norm > 0.0:
        # Unscale before clipping so the clip threshold applies to true fp32 grads.
        self.scaler.unscale_(self.optimizer)
        torch.nn.utils.clip_grad_norm_(
            parameters=self.model.parameters(),
            max_norm=self.hp.Train.Gradient_Norm)
    # scaler.step skips the update on inf/nan grads; update() adjusts the scale.
    self.scaler.step(self.optimizer)
    self.scaler.update()
    self.scheduler.step()
    self.steps += 1
    self.tqdm.update(1)
    for tag, loss in loss_dict.items():
        # Average across GPUs when distributed, else just read the scalar.
        loss = reduce_tensor(
            loss.data, self.num_gpus).item() if self.num_gpus > 1 else loss.item()
        self.scalar_dict['Train']['Loss/{}'.format(tag)] += loss
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, rank):
    """Handles all the validation scoring and printing.

    Runs the whole validation set without gradients, averages the
    (optionally all-reduced) loss over batches, and on rank 0 prints and
    logs the averaged loss.

    Returns:
        The batch-averaged validation loss (float).
    """
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if n_gpus > 1 else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)
        val_loss = 0.0
        for i, batch in enumerate(val_loader):
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() \
                if n_gpus > 1 else loss.item()
            val_loss += reduced_val_loss
        val_loss = val_loss / (i + 1)
    model.train()
    if rank == 0:
        # Fix: report the averaged validation loss; the original printed and
        # logged `reduced_val_loss`, which is only the LAST batch's loss.
        print("Validation loss {}: {:9f} ".format(iteration, val_loss))
        logger.log_validation(val_loss, model, y, y_pred, iteration)
    return val_loss
def Evaluation_Step(self, tokens, notes, durations, token_lengths, features,
                    feature_lengths):
    """Score one evaluation batch of the singing-voice model.

    Computes pre-net/post-net MSE against ground-truth features plus a
    guided-attention loss, accumulates the (all-reduced) values into the
    evaluation scalar dict, and returns the raw outputs for visualization.

    NOTE(review): the model is called with is_training=True even though this
    is an evaluation step — presumably to keep teacher forcing on; confirm
    against the model's forward.
    """
    loss_dict = {}
    tokens = tokens.to(self.device, non_blocking=True)
    notes = notes.to(self.device, non_blocking=True)
    durations = durations.to(self.device, non_blocking=True)
    token_lengths = token_lengths.to(self.device, non_blocking=True)
    features = features.to(self.device, non_blocking=True)
    feature_lengths = feature_lengths.to(self.device, non_blocking=True)
    pre_features, post_features, alignments = self.model(
        tokens=tokens,
        notes=notes,
        durations=durations,
        token_lengths=token_lengths,
        features=features,
        feature_lengths=feature_lengths,
        is_training=True)
    loss_dict['Pre'] = self.criterion_dict['MSE'](pre_features, features)
    loss_dict['Post'] = self.criterion_dict['MSE'](post_features, features)
    loss_dict['Guided_Attention'] = self.criterion_dict['GAL'](
        alignments, feature_lengths, token_lengths)
    loss_dict['Total'] = loss_dict['Pre'] + loss_dict['Post'] + loss_dict[
        'Guided_Attention']
    for tag, loss in loss_dict.items():
        # Average across GPUs when distributed, else just read the scalar.
        loss = reduce_tensor(
            loss.data, self.num_gpus).item() if self.num_gpus > 1 else loss.item()
        self.scalar_dict['Evaluation']['Loss/{}'.format(tag)] += loss
    return pre_features, post_features, alignments
def train_one_epoch(model, train_loader, optimizer, criterion, rank,
                    world_size, epoch, num_epoch):
    """Train for a single epoch; return the batch-weighted average loss meter.

    Only rank 0 drives a tqdm progress bar; every rank all-reduces its batch
    loss so the logged average is consistent across processes.
    """
    model.train()
    summary_loss = AverageMeter()
    is_main = rank == 0
    if is_main:
        pbar = tqdm(total=len(train_loader), unit="batch")
        pbar.set_description(f"Epoch[{epoch+1}/{num_epoch}].Train")
    for imgs, labels in train_loader:
        n_samples = imgs.shape[0]
        imgs, labels = imgs.to(rank), labels.to(rank)
        preds = model(imgs)
        loss = criterion(preds, labels)
        # Standard step: clear grads, backprop, update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Average the loss across ranks before recording it.
        reduced = reduce_tensor(loss, world_size)
        summary_loss.update(reduced.detach().item(), n_samples)
        if is_main:
            pbar.set_postfix({"loss": summary_loss.avg})
            pbar.update(1)
    return summary_loss
def Evaluation_Step(self, features):
    """Score one evaluation batch and accumulate the embedding loss.

    Forward pass only (no optimizer); the loss is all-reduced across GPUs
    when running distributed and added into the evaluation scalar dict.
    """
    features = features.to(self.device, non_blocking=True)
    embeddings = self.model(features)
    embedding_loss = self.criterion(
        embeddings, self.hp.Train.Batch.Eval.Pattern_per_Speaker)
    loss_dict = {'Embedding': embedding_loss}
    for tag, loss in loss_dict.items():
        if self.num_gpus > 1:
            loss = reduce_tensor(loss.data, self.num_gpus).item()
        else:
            loss = loss.item()
        self.scalar_dict['Evaluation']['Loss/{}'.format(tag)] += loss
def Train_Step(self, tokens, notes, durations, token_lengths, features,
               feature_lengths):
    """Run one mixed-precision training step of the singing-voice model.

    Forward + losses (pre/post MSE and guided attention) under autocast,
    scaled backward on the total loss, optional gradient clipping,
    optimizer/scheduler step, then accumulate (all-reduced) losses into the
    training scalar dict.
    """
    loss_dict = {}
    tokens = tokens.to(self.device, non_blocking=True)
    notes = notes.to(self.device, non_blocking=True)
    durations = durations.to(self.device, non_blocking=True)
    token_lengths = token_lengths.to(self.device, non_blocking=True)
    features = features.to(self.device, non_blocking=True)
    feature_lengths = feature_lengths.to(self.device, non_blocking=True)
    # Autocast only when mixed precision is enabled in the hyperparameters.
    with torch.cuda.amp.autocast(enabled=self.hp.Use_Mixed_Precision):
        pre_features, post_features, alignments = self.model(
            tokens=tokens,
            notes=notes,
            durations=durations,
            token_lengths=token_lengths,
            features=features,
            feature_lengths=feature_lengths,
            is_training=True)
        loss_dict['Pre'] = self.criterion_dict['MSE'](pre_features, features)
        loss_dict['Post'] = self.criterion_dict['MSE'](post_features, features)
        loss_dict['Guided_Attention'] = self.criterion_dict['GAL'](
            alignments, feature_lengths, token_lengths)
        loss_dict['Total'] = loss_dict['Pre'] + loss_dict[
            'Post'] + loss_dict['Guided_Attention']
    self.optimizer.zero_grad()
    self.scaler.scale(loss_dict['Total']).backward()
    if self.hp.Train.Gradient_Norm > 0.0:
        # Unscale before clipping so the clip threshold applies to true fp32 grads.
        self.scaler.unscale_(self.optimizer)
        torch.nn.utils.clip_grad_norm_(
            parameters=self.model.parameters(),
            max_norm=self.hp.Train.Gradient_Norm)
    # scaler.step skips the update on inf/nan grads; update() adjusts the scale.
    self.scaler.step(self.optimizer)
    self.scaler.update()
    self.scheduler.step()
    self.steps += 1
    self.tqdm.update(1)
    for tag, loss in loss_dict.items():
        # Average across GPUs when distributed, else just read the scalar.
        loss = reduce_tensor(
            loss.data, self.num_gpus).item() if self.num_gpus > 1 else loss.item()
        self.scalar_dict['Train']['Loss/{}'.format(tag)] += loss
def eval(eval_loader, model, criterion, num_gpus, start_time, epoch,
         use_multi_speaker=False):
    """Evaluate the vocoder on `eval_loader`, printing periodic and final
    average losses (no value returned; output is console-only).

    NOTE(review): torch.autograd.Variable is a deprecated no-op wrapper in
    modern PyTorch; .cuda() alone would suffice — left as-is here.
    """
    print("[%s] start evaluation" %
          datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
    with torch.no_grad():
        model.eval()
        total_loss = 0.
        for i, batch in enumerate(eval_loader):
            # zero_grad is harmless here (no backward happens under no_grad).
            model.zero_grad()
            if use_multi_speaker:
                mel, audio, spk_embed_or_id = batch
                spk_embed_or_id = torch.autograd.Variable(
                    spk_embed_or_id.cuda())
            else:
                mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            if use_multi_speaker:
                outputs = model((mel, audio, spk_embed_or_id))
            else:
                outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                # Average across ranks so every process reports the same value.
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            total_loss += reduced_loss
            # Progress report every 100 batches.
            if i > 0 and i % 100 == 0:
                elapsed = datetime.datetime.now() - start_time
                print("[{}][els: {}] eval {}/{} steps:\t{:.9f}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                    elapsed, i, len(eval_loader), reduced_loss))
        elapsed = datetime.datetime.now() - start_time
        print("[{}][els: {}] {} epoch :\t eval avg loss {:.9f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch, total_loss / len(eval_loader)))
def validate(model, criterion, testset, iteration, batch_size, num_gpus, logger):
    """Score `testset` without gradients, print per-batch and average loss,
    and log the average to tensorboard when a logger is supplied.

    The model is restored to train mode before returning; nothing is
    returned (console/tensorboard output only).
    """
    model.eval()
    with torch.no_grad():
        print("validation {}:".format(iteration))
        # Shrink the batch size so each of the num_gpus shards still fills
        # at least one (drop_last) batch.
        batch_size = min(batch_size, int(len(testset) / num_gpus))
        test_sampler = DistributedSampler(testset) if num_gpus > 1 else None
        test_loader = DataLoader(testset, num_workers=1, shuffle=False,
                                 sampler=test_sampler, batch_size=batch_size,
                                 pin_memory=False, drop_last=True)
        # Fix: removed a stale commented-out debug loop that re-iterated the
        # whole test loader before validation.
        val_loss = 0.0
        for j, test_batch in enumerate(test_loader):
            print("\tval batch loaded, {} of {}".format(
                j + 1, len(test_loader)))
            mel, audio = test_batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                # Average across ranks so every process reports the same value.
                reduced_val_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            print("\tval batch done, {} of {}: {:.9f}".format(
                j + 1, len(test_loader), reduced_val_loss))
        val_loss = val_loss / len(test_loader)
        print("val loss: {}:\t{:.9f}".format(iteration, val_loss))
        if logger:
            logger.add_scalar('test_loss', val_loss, iteration)
    model.train()
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=4):
    """Main WaveGlow training loop (multi-speaker variant).

    Builds model/optimizer/scheduler, optionally resumes from a checkpoint,
    then trains epoch-by-epoch with periodic console logging, tensorboard
    scalars, checkpointing every `iters_per_checkpoint` iterations, and a
    per-epoch evaluation pass.

    Relies on module-level config dicts (dist_config, waveglow_config,
    data_config, eval_data_config) and helpers (init_distributed,
    apply_gradient_allreduce, load/save_checkpoint, reduce_tensor, eval).
    """
    print("num_workers", num_workers)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # LR decays by 4% every epoch (scheduler.step() is called per epoch below).
    scheduler = StepLR(optimizer, step_size=1, gamma=0.96)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    evalset = Mel2Samp(**eval_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=num_workers, shuffle=False,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)
    eval_loader = DataLoader(evalset, num_workers=num_workers, shuffle=False,
                             sampler=eval_sampler, batch_size=batch_size,
                             pin_memory=False, drop_last=True)

    # Get shared output_directory ready (rank 0 creates it for everyone).
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    # NOTE(review): epoch_offset uses max(1, ...) so epoch 0 is skipped even
    # on a fresh run — presumably intentional 1-based epochs; confirm.
    epoch_offset = max(1, int(iteration / len(train_loader)))
    start_time = datetime.datetime.now()
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print('Epoch:', epoch, 'LR:', scheduler.get_lr())
        elapsed = datetime.datetime.now() - start_time
        print("Epoch: [{}][els: {}] {}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch))
        model.train()
        total_loss = 0.
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            # Multi-speaker runs carry a speaker embedding/ID as a third item.
            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                mel, audio, spk_embed_or_id = batch
                spk_embed_or_id = torch.autograd.Variable(
                    spk_embed_or_id.cuda())
            else:
                mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            if waveglow_config["multi_speaker_config"]["use_multi_speaker"]:
                outputs = model((mel, audio, spk_embed_or_id))
            else:
                outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                # Average across ranks for consistent logging.
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            total_loss += reduced_loss
            # Console progress every 10 iterations.
            if i > 0 and i % 10 == 0:
                elapsed = datetime.datetime.now() - start_time
                print(
                    "[{}][els: {}] epoch {},total steps{}, {}/{} steps:\t{:.9f}"
                    .format(
                        datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"),
                        elapsed, epoch, iteration, i, len(train_loader),
                        reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
            # Periodic checkpointing (rank 0 only writes the file).
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
            iteration += 1
        elapsed = datetime.datetime.now() - start_time
        print("[{}][els: {}] {} epoch :\tavg loss {:.9f}".format(
            datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S"), elapsed,
            epoch, total_loss / len(train_loader)))
        # Decay LR once per epoch, then run the evaluation pass.
        scheduler.step()
        eval.eval(eval_loader, model, criterion, num_gpus, start_time, epoch,
                  waveglow_config["multi_speaker_config"]["use_multi_speaker"])
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, loss_empthasis, iters_per_checkpoint, batch_size, seed,
          fp16_run, checkpoint_path, with_tensorboard, logdirname, datedlogdir,
          warm_start=False, optimizer='ADAM', start_zero=False):
    """WaveGlow fork training loop with live-tunable hyperparameters.

    Features beyond a plain loop: dynamic model/loss import, Apex fp16,
    ReduceLROnPlateau or a custom warmup/decay schedule, hot-reloading of
    hyperparameters from "run_every_epoch.py" via exec, best-model and
    best-validation-MSE checkpointing, and automatic restart from the best
    checkpoint on a LossExplosion exception.

    NOTE(review): many names used below (param_interval, show_live_params,
    custom_lr, warmup_*, A_, B_, C_, e, decay_start, scheduler_patience,
    scheduler_cooldown, override_scheduler_*, LossExplosionThreshold,
    use_grad_clip, grad_clip_thresh, best_model_margin, save_file_check_path,
    validation_interval) are injected into globals by the exec'd
    run_every_epoch.py — they are NOT defined in this function.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    # Pick the model/loss implementation at runtime (rebinds module globals).
    global WaveGlow
    global WaveGlowLoss
    ax = True  # this is **really** bad coding practice :D
    if ax:
        from efficient_model_ax import WaveGlow
        from efficient_loss import WaveGlowLoss
    else:
        if waveglow_config["yoyo"]:  # efficient_mode # TODO: Add to Config File
            from efficient_model import WaveGlow
            from efficient_loss import WaveGlowLoss
        else:
            from glow import WaveGlow, WaveGlowLoss

    criterion = WaveGlowLoss(sigma, loss_empthasis)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    # One STFT per validation window size, plus the loader's own STFT config.
    STFTs = [STFT.TacotronSTFT(filter_length=window,
                               hop_length=data_config['hop_length'],
                               win_length=window,
                               sampling_rate=data_config['sampling_rate'],
                               n_mel_channels=160,
                               mel_fmin=0, mel_fmax=16000)
             for window in data_config['validation_windows']]
    loader_STFT = STFT.TacotronSTFT(
        filter_length=data_config['filter_length'],
        hop_length=data_config['hop_length'],
        win_length=data_config['win_length'],
        sampling_rate=data_config['sampling_rate'],
        n_mel_channels=data_config['n_mel_channels'] if 'n_mel_channels' in data_config.keys() else 160,
        mel_fmin=data_config['mel_fmin'],
        mel_fmax=data_config['mel_fmax'])

    #optimizer = "Adam"
    optimizer = optimizer.lower()  # `optimizer` is rebound: str -> optimizer object
    optimizer_fused = bool( 0 ) # use Apex fused optimizer, should be identical to normal but slightly faster and only works on RTX cards
    if optimizer_fused:
        from apex import optimizers as apexopt
        if optimizer == "adam":
            optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            optimizer = apexopt.FusedLAMB(model.parameters(), lr=learning_rate,
                                          max_grad_norm=200)
    else:
        if optimizer == "adam":
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        elif optimizer == "lamb":
            from lamb import Lamb as optLAMB
            optimizer = optLAMB(model.parameters(), lr=learning_rate)
            #import torch_optimizer as optim
            #optimizer = optim.Lamb(model.parameters(), lr=learning_rate)
            #raise# PyTorch doesn't currently include LAMB optimizer.

    if fp16_run:
        global amp
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    else:
        amp = None

    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-8
        factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor,
                                      patience=20, cooldown=2, min_lr=min_lr,
                                      verbose=True, threshold=0.0001,
                                      threshold_mode='abs')
        print("ReduceLROnPlateau used as Learning Rate Scheduler.")
    else:
        scheduler = False

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration, scheduler = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, fp16_run,
            warm_start=warm_start)
        iteration += 1  # next iteration is iteration + 1
    if start_zero:
        iteration = 0

    trainset = Mel2Samp(**data_config, check_files=True)
    speaker_lookup = trainset.speaker_ids
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset, shuffle=True)
        shuffle = False  # sampler already shuffles; DataLoader must not
    else:
        train_sampler = None
        shuffle = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=3, shuffle=shuffle,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready (rank 0 creates it for everyone).
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        if datedlogdir:
            timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
            log_directory = os.path.join(output_directory, logdirname, timestr)
        else:
            log_directory = os.path.join(output_directory, logdirname)
        logger = SummaryWriter(log_directory)

    moving_average = int(min(len(train_loader), 100)) # average loss over entire Epoch
    rolling_sum = StreamingMovingAverage(moving_average)
    start_time = time.time()
    start_time_iter = time.time()
    start_time_dekaiter = time.time()
    model.train()

    # best (averaged) training loss, persisted next to the best_model checkpoint
    if os.path.exists(os.path.join(output_directory, "best_model")+".txt"):
        best_model_loss = float(str(open(os.path.join(output_directory, "best_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_model_loss = -6.20
    # best (validation) MSE on inferred spectrogram.
    if os.path.exists(os.path.join(output_directory, "best_val_model")+".txt"):
        best_MSE = float(str(open(os.path.join(output_directory, "best_val_model")+".txt", "r", encoding="utf-8").read()).split("\n")[0])
    else:
        best_MSE = 9e9

    epoch_offset = max(0, int(iteration / len(train_loader)))
    pytorch_total_params = sum(p.numel() for p in model.parameters())
    print("{:,} total parameters in model".format(pytorch_total_params))
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("{:,} trainable parameters.".format(pytorch_total_params))
    print(f"Segment Length: {data_config['segment_length']:,}\nBatch Size: {batch_size:,}\nNumber of GPUs: {num_gpus:,}\nSamples/Iter: {data_config['segment_length']*batch_size*num_gpus:,}")

    training = True
    while training:
        try:
            if rank == 0:
                epochs_iterator = tqdm(range(epoch_offset, epochs),
                                       initial=epoch_offset, total=epochs,
                                       smoothing=0.01, desc="Epoch",
                                       position=1, unit="epoch")
            else:
                epochs_iterator = range(epoch_offset, epochs)
            # ================ MAIN TRAINING LOOP! ===================
            for epoch in epochs_iterator:
                print(f"Epoch: {epoch}")
                if num_gpus > 1:
                    train_sampler.set_epoch(epoch)  # reshuffle shards each epoch
                if rank == 0:
                    iters_iterator = tqdm(enumerate(train_loader), desc=" Iter",
                                          smoothing=0, total=len(train_loader),
                                          position=0, unit="iter", leave=True)
                else:
                    iters_iterator = enumerate(train_loader)
                for i, batch in iters_iterator:
                    # run external code every iter, allows the run to be adjusted without restarts
                    if (i==0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'seconds_elapsed': time.time()-start_time}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("No Custom code found, continuing without changes.")
                        except Exception as ex:
                            print(f"Custom code FAILED to run!\n{ex}")
                        # NOTE(review): if the open() itself fails, ldict is
                        # unbound here and this raises NameError.
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    # Re-read the true LR every 50 iters in case it drifts
                    # from the local `learning_rate` variable.
                    if not iteration % 50:
                        learning_rate = optimizer.param_groups[0]['lr']
                    # Learning Rate Schedule
                    if custom_lr:
                        old_lr = learning_rate
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            # linear ramp warmup_start_lr -> (A_+C_) over the warmup window
                            learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                # exponential decay toward C_ after decay_start
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if old_lr != learning_rate:
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = learning_rate
                    else:
                        # Live-tune the plateau scheduler from the exec'd config.
                        scheduler.patience = scheduler_patience
                        scheduler.cooldown = scheduler_cooldown
                        if override_scheduler_last_lr:
                            scheduler._last_lr = override_scheduler_last_lr
                        if override_scheduler_best:
                            scheduler.best = override_scheduler_best
                        if override_scheduler_last_lr or override_scheduler_best:
                            print("scheduler._last_lr =", scheduler._last_lr,
                                  "scheduler.best =", scheduler.best, " |", end='')

                    model.zero_grad()
                    mel, audio, speaker_ids = batch
                    mel = torch.autograd.Variable(mel.cuda(non_blocking=True))
                    audio = torch.autograd.Variable(audio.cuda(non_blocking=True))
                    speaker_ids = speaker_ids.cuda(non_blocking=True).long().squeeze(1)
                    outputs = model(mel, audio, speaker_ids)
                    loss = criterion(outputs)
                    if num_gpus > 1:
                        reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                    else:
                        reduced_loss = loss.item()
                    if fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    # Abort this iteration (and trigger the restart handler)
                    # when the loss explodes or goes NaN.
                    if (reduced_loss > LossExplosionThreshold) or (math.isnan(reduced_loss)):
                        model.zero_grad()
                        raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
                    if use_grad_clip:
                        if fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), grad_clip_thresh)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                model.parameters(), grad_clip_thresh)
                        if type(grad_norm) == torch.Tensor:
                            grad_norm = grad_norm.item()
                        is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
                    else:
                        is_overflow = False; grad_norm = 0.00001
                    optimizer.step()

                    if not is_overflow and rank == 0:
                        # get current Loss Scale of first optimizer
                        loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if fp16_run else 32768
                        if with_tensorboard:
                            if (iteration % 100000 == 0):
                                # plot distribution of parameters
                                for tag, value in model.named_parameters():
                                    tag = tag.replace('.', '/')
                                    logger.add_histogram(tag, value.data.cpu().numpy(), iteration)
                            logger.add_scalar('training_loss', reduced_loss, iteration)
                            logger.add_scalar('training_loss_samples', reduced_loss, iteration*batch_size)
                            if (iteration % 20 == 0):
                                logger.add_scalar('learning.rate', learning_rate, iteration)
                            if (iteration % 10 == 0):
                                logger.add_scalar('duration', ((time.time() - start_time_dekaiter)/10), iteration)
                        average_loss = rolling_sum.process(reduced_loss)
                        if (iteration % 10 == 0):
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {:.2f}s/iter {:.4f}s/item".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), (time.time() - start_time_dekaiter)/10, ((time.time() - start_time_dekaiter)/10)/(batch_size*num_gpus)))
                            start_time_dekaiter = time.time()
                        else:
                            tqdm.write("{} {}: {:.3f} {:.3f} {:.3f} {:08.3F} {:.8f}LR ({:.8f} Effective) {}LS".format(time.strftime("%H:%M:%S"), iteration, reduced_loss, average_loss, best_MSE, round(grad_norm,3), learning_rate, min((grad_clip_thresh/grad_norm)*learning_rate,learning_rate), loss_scale))
                        start_time_iter = time.time()

                    # Save "best_model" once the rolling window is nearly full
                    # and the average beats the previous best by the margin.
                    if rank == 0 and (len(rolling_sum.values) > moving_average-2):
                        if (average_loss+best_model_margin) < best_model_loss:
                            checkpoint_path = os.path.join(output_directory, "best_model")
                            try:
                                save_checkpoint(model, optimizer, learning_rate,
                                                iteration, amp, scheduler,
                                                speaker_lookup, checkpoint_path)
                            except KeyboardInterrupt: # Avoid corrupting the model.
                                save_checkpoint(model, optimizer, learning_rate,
                                                iteration, amp, scheduler,
                                                speaker_lookup, checkpoint_path)
                            text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                            text_file.write(str(average_loss)+"\n"+str(iteration))
                            text_file.close()
                            best_model_loss = average_loss #Only save the model if X better than the current loss.

                    # Periodic checkpoint; save_file_check_path acts as a
                    # "save now" sentinel file that is deleted after use.
                    if rank == 0 and iteration > 0 and ((iteration % iters_per_checkpoint == 0) or (os.path.exists(save_file_check_path))):
                        checkpoint_path = f"{output_directory}/waveglow_{iteration}"
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, amp, scheduler,
                                        speaker_lookup, checkpoint_path)
                        if (os.path.exists(save_file_check_path)):
                            os.remove(save_file_check_path)

                    if (iteration % validation_interval == 0):
                        if rank == 0:
                            MSE, MAE = validate(model, loader_STFT, STFTs,
                                                logger, iteration,
                                                data_config['validation_files'],
                                                speaker_lookup, sigma,
                                                output_directory, data_config)
                            if scheduler:
                                MSE = torch.tensor(MSE, device='cuda')
                                if num_gpus > 1:
                                    # Share rank 0's MSE so all ranks step identically.
                                    broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                                if MSE < best_MSE:
                                    checkpoint_path = os.path.join(output_directory, "best_val_model")
                                    try:
                                        save_checkpoint(model, optimizer,
                                                        learning_rate, iteration,
                                                        amp, scheduler,
                                                        speaker_lookup,
                                                        checkpoint_path)
                                    except KeyboardInterrupt: # Avoid corrupting the model.
                                        save_checkpoint(model, optimizer,
                                                        learning_rate, iteration,
                                                        amp, scheduler,
                                                        speaker_lookup,
                                                        checkpoint_path)
                                    text_file = open((f"{checkpoint_path}.txt"), "w", encoding="utf-8")
                                    text_file.write(str(MSE.item())+"\n"+str(iteration))
                                    text_file.close()
                                    best_MSE = MSE.item() #Only save the model if X better than the current loss.
                        else:
                            if scheduler:
                                # Non-zero ranks receive rank 0's MSE via broadcast.
                                MSE = torch.zeros(1, device='cuda')
                                broadcast(MSE, 0)
                                scheduler.step(MSE.item())
                        learning_rate = optimizer.param_groups[0]['lr'] #check actual learning rate (because I sometimes see learning_rate variable go out-of-sync with real LR)
                    iteration += 1
            training = False # exit the While loop

        except LossExplosion as ex:
            # Print exception and continue from the best checkpoint.
            # (turns out it takes < 4 seconds to restart like this — awesome)
            print(ex) # print Loss
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(checkpoint_path), "best_val_model must exist for automatic restarts"
            # clearing VRAM for load checkpoint
            audio = mel = speaker_ids = loss = None
            torch.cuda.empty_cache()
            model.eval()
            model, optimizer, iteration, scheduler = load_checkpoint(
                checkpoint_path, model, optimizer, scheduler, fp16_run)
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            iteration += 1
            pass # and continue training.
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Basic WaveGlow training loop.

    Builds model/optimizer (optionally Apex fp16), optionally resumes from a
    checkpoint, then trains with per-batch console logging, tensorboard
    scalars, and validation + rotated checkpoints every
    `iters_per_checkpoint` iterations (only when a testset is configured).

    Relies on module-level config dicts (dist_config, waveglow_config,
    data_config) and helpers (init_distributed, apply_gradient_allreduce,
    load/save/rotate_checkpoints, reduce_tensor, validate).
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(data_config['training_files'],
                        data_config['segment_length'],
                        data_config['filter_length'],
                        data_config['hop_length'],
                        data_config['win_length'],
                        data_config['sampling_rate'],
                        data_config['mel_fmin'],
                        data_config['mel_fmax'],
                        debug=False)
    # Validation set is optional; skipped entirely when not configured.
    if 'testing_files' in data_config:
        testset = Mel2Samp(data_config['testing_files'],
                           data_config['segment_length'],
                           data_config['filter_length'],
                           data_config['hop_length'],
                           data_config['win_length'],
                           data_config['sampling_rate'],
                           data_config['mel_fmin'],
                           data_config['mel_fmax'],
                           debug=True)
    else:
        testset = None
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler, batch_size=batch_size,
                              pin_memory=False, drop_last=True)

    # Get shared output_directory ready (rank 0 creates it for everyone).
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    else:
        logger = None

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            model.zero_grad()
            print("train batch loaded, {} ({} of {})".format(
                iteration, i, len(train_loader)))
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                # Average across ranks for consistent logging.
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # Under fp16, clip master grads and detect overflow (NaN norm);
            # overflow skips the checkpoint/validation below.
            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), 1.0)
                is_overflow = math.isnan(grad_norm)
            optimizer.step()
            duration = time.perf_counter() - start
            print(
                "train batch done, {} ({} of {}): {:.9f} (took {:.2f})".format(
                    iteration, i, len(train_loader), reduced_loss, duration))
            if logger:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
                logger.add_scalar('duration', duration,
                                  i + len(train_loader) * epoch)
            if testset and not is_overflow and (iteration % iters_per_checkpoint == 0):
                if testset:
                    validate(model, criterion, testset, iteration, batch_size,
                             num_gpus, logger)
                if rank == 0:
                    # Keep only the most recent checkpoints, then save this one.
                    rotate_checkpoints(output_directory)
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
            iteration += 1
def forward(self, pred_audio, gt_audio, amp, model, optimizer, optimizer_d,
            num_gpus, use_grad_clip, grad_clip_thresh):  # optional cond input
    """Run ONE full optimization step for a GAN-style vocoder loss.

    pred_audio: [B, T] generated waveform
    gt_audio:   [B, T] ground-truth waveform

    backward() and optimizer.step() happen INSIDE this method:
      * self.stage >= 2 -> adversarial phase: a generator step on
        `optimizer` followed by a discriminator step on `optimizer_d`.
      * otherwise       -> plain L1 reconstruction (log-mel + waveform).

    Returns a dict of scalar metrics ('g_train_loss'/'d_train_loss' or
    'train_loss', plus 'grad_norm' and 'is_overflow').

    NOTE(review): generator gradients are never zeroed here (only
    optimizer_d.zero_grad() is called) — presumably the caller does
    model.zero_grad() before invoking forward(); confirm at call site.
    """
    metrics = {}
    # Mel spectrograms of both waveforms, compressed to log magnitudes.
    pred_spect = self.MRS.get_mel(pred_audio)
    pred_spect = dynamic_range_compression(
        pred_spect)  # linear -> log magnitudes
    gt_spec = self.MRS.get_mel(gt_audio)
    gt_spec = dynamic_range_compression(
        gt_spec)  # linear -> log magnitudes
    if self.stage >= 2:
        # Label convention: 0.0 == real, 1.0 == fake ("fakeness" targets).
        real_labels = torch.zeros(gt_audio.shape[0],
                                  device=gt_audio.device,
                                  dtype=gt_audio.dtype)  # [B]
        fake_labels = torch.ones(gt_audio.shape[0],
                                 device=gt_audio.device,
                                 dtype=gt_audio.dtype)  # [B]
        if False:  # (optional) mask frequencies that humans can't hear in STFT
            pred_spect *= self.mask
            gt_spec *= self.mask
        #############################
        ###    Generator Stuff    ###
        #############################
        mel_fake_pred_fakeness = self.discriminatorS(
            pred_spect)  # [B] predict fakeness of generated spectrogram
        wav_fake_pred_fakeness = self.discriminatorW(
            pred_audio)  # [B] predict fakeness of generated audio
        fake_pred_fakeness = (
            mel_fake_pred_fakeness + wav_fake_pred_fakeness
        ).sigmoid()  # Average and range between 0.0 and 1.0
        loss = nn.BCELoss(
        )(fake_pred_fakeness,
          real_labels)  # [B] -> [] calc loss to decrease fakeness of model
        metrics['g_train_loss'] = reduce_tensor(
            loss.data, num_gpus).item() if num_gpus > 1 else loss.item()
        if amp is not None:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if use_grad_clip:
            if amp is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
            if type(grad_norm) == torch.Tensor:
                grad_norm = grad_norm.item()
            metrics['is_overflow'] = math.isinf(grad_norm) or math.isnan(
                grad_norm)
            if not metrics['is_overflow']:
                metrics['grad_norm'] = grad_norm
        else:
            metrics['is_overflow'] = False
            grad_norm = 1e-6
        optimizer.step()
        #############################
        ###  Discriminator Stuff  ###
        #############################
        optimizer_d.zero_grad()
        mel_real_pred_fakeness = self.discriminatorS(
            gt_spec)  # [B] predict fakeness of real spectrogram
        wav_real_pred_fakeness = self.discriminatorW(
            gt_audio)  # [B] predict fakeness of real audio
        real_pred_fakeness = (
            mel_real_pred_fakeness + wav_real_pred_fakeness
        ).sigmoid()  # Average and range between 0.0 and 1.0
        real_d_loss = nn.BCELoss()(
            real_pred_fakeness, real_labels
        )  # [B] -> [] loss to decrease distriminated fakeness of real samples
        # .detach() so discriminator gradients don't flow into the generator.
        mel_fake_pred_fakeness = self.discriminatorS(pred_spect.detach(
        ))  # [B] predict fakeness of generated spectrogram
        wav_fake_pred_fakeness = self.discriminatorW(
            pred_audio.detach())  # [B] predict fakeness of generated audio
        fake_pred_fakeness = (
            mel_fake_pred_fakeness + wav_fake_pred_fakeness
        ).sigmoid()  # Average and range between 0.0 and 1.0
        fake_d_loss = nn.BCELoss()(
            fake_pred_fakeness, fake_labels
        )  # [B] -> [] loss to increase distriminated fakeness of fake samples
        d_loss = (real_d_loss + fake_d_loss) / 2
        metrics['d_train_loss'] = reduce_tensor(
            d_loss.data, num_gpus).item() if num_gpus > 1 else d_loss.item()
        if amp is not None:
            with amp.scale_loss(d_loss, optimizer_d) as scaled_d_loss:
                scaled_d_loss.backward()
        else:
            d_loss.backward()
        if use_grad_clip:
            if amp is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer_d), grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    self.parameters(), grad_clip_thresh)
            if type(grad_norm) == torch.Tensor:
                grad_norm = grad_norm.item()
            # NOTE(review): this overwrites the generator's 'is_overflow' /
            # 'grad_norm' metrics recorded above — only the discriminator's
            # values survive in the returned dict.
            metrics['is_overflow'] = math.isinf(grad_norm) or math.isnan(
                grad_norm)
            if not metrics['is_overflow']:
                metrics['grad_norm'] = grad_norm
        else:
            metrics['is_overflow'] = False
            grad_norm = 1e-6
        optimizer_d.step()
    else:
        # Reconstruction-only phase: L1 in log-mel space plus waveform L1.
        loss = F.l1_loss(pred_spect, gt_spec)
        loss += F.l1_loss(pred_audio, gt_audio)
        metrics['train_loss'] = reduce_tensor(
            loss.data, num_gpus).item() if num_gpus > 1 else loss.item()
        if amp is not None:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        if use_grad_clip:
            if amp is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
            if type(grad_norm) == torch.Tensor:
                grad_norm = grad_norm.item()
            metrics['is_overflow'] = math.isinf(grad_norm) or math.isnan(
                grad_norm)
            if not metrics['is_overflow']:
                metrics['grad_norm'] = grad_norm
        else:
            metrics['is_overflow'] = False
            grad_norm = 0.00001
        optimizer.step()
    return metrics
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, weight_sharing, optimizer_type,
          dataloader_type):
    """WaveGlow training with a selectable optimizer ("sgd"/"adam") and
    dataloader ("vanilla"/"split"); logs go to ./logs/<run-name> and
    checkpoints carry a descriptive run name.

    Returns None (after printing a message) on an unsupported optimizer or
    dataloader type.
    """
    ws = weight_sharing
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer_type = optimizer_type.lower()
    if optimizer_type == "sgd":
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif optimizer_type == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    else:
        print("Unsupported optimizer: %s. Aborting." % optimizer_type)
        return None
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    dataloader_type = dataloader_type.lower()
    if dataloader_type == "vanilla":
        trainset = Mel2Samp(**data_config)
    elif dataloader_type == "split":
        trainset = Mel2SampSplit(**data_config)
    else:
        print("Unsupported dataloader type: %s. Aborting." % dataloader_type)
        return None
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    # Shuffle only in the single-GPU case: DataLoader forbids combining a
    # sampler with shuffle=True, and DistributedSampler shuffles on its own.
    train_loader = DataLoader(trainset, num_workers=1,
                              shuffle=(num_gpus == 1),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    # Run name encodes the main hyperparameters for log/checkpoint naming.
    name = "waveglow_ws%d_%s_%s_batch%d" % (ws, optimizer_type,
                                            dataloader_type, batch_size)
    if learning_rate != 1e-4:
        name = name + "_lr{:.0e}".format(learning_rate)
    if num_gpus > 1:
        name = name + "_x%d" % num_gpus
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join("./logs", name))
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    stime2 = None
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        stime = time()
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            # Console progress every 100 iterations (timed between prints).
            if (iteration % 100 == 0):
                if not stime2 is None:
                    tot_time2 = time() - stime2
                    print("{}:\t{:.9f}, time: {}".format(
                        iteration, reduced_loss, int(tot_time2)))
                stime2 = time()
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}_{}".format(
                        output_directory, name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
            iteration += 1
        tot_time = time() - stime
        print("Epoch %d completed. Time: %d seconds" % (epoch, int(tot_time)))
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path):
    """Resume WaveGlow training from an existing checkpoint.

    A checkpoint is required here: load_checkpoint() is called
    unconditionally. The same checkpoint file ("<output_directory>/waveglow")
    is overwritten in place every `iters_per_checkpoint` iterations, so only
    one rolling checkpoint is kept.

    BUGFIX: the original passed shuffle=True together with a
    DistributedSampler, which makes DataLoader raise
    "sampler option is mutually exclusive with shuffle" whenever
    num_gpus > 1. Shuffle is now enabled only when no sampler is used
    (DistributedSampler shuffles on its own each epoch).
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Load checkpoint (mandatory for this variant)
    iteration = 0
    print("checkpoint path", checkpoint_path)
    #model = warm_load_checkpoint(checkpoint_path, model)
    model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                  optimizer)
    iteration += 1  # next iteration is iteration + 1
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    # BUGFIX: shuffle only when there is no sampler (mutually exclusive).
    train_loader = DataLoader(trainset, num_workers=1,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()
            if (iteration % iters_per_checkpoint == 0):
                print("{}:\t{:.9f}".format(iteration, reduced_loss))
                # Single rolling checkpoint: same path overwritten each time.
                checkpoint_path = "{}/waveglow".format(output_directory)
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, init_lr,
          final_lr, sigma, epochs_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Epoch-based WaveGlow training with a learning-rate schedule from
    init_lr to final_lr; keeps only the most recent checkpoint on disk.

    Resume/checkpoint granularity is per-EPOCH here (epoch_offset), unlike
    the per-iteration variants elsewhere in this file.
    """
    os.makedirs(output_directory, exist_ok=True)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=init_lr)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # Load checkpoint if one exists (checkpoints store the epoch number)
    epoch_offset = 1
    if checkpoint_path != "":
        model, optimizer, epoch_offset = load_checkpoint(
            checkpoint_path, model, optimizer)
        epoch_offset += 1  # next epoch is epoch_offset + 1
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    # NOTE(review): shuffle=False even in the single-GPU case, so sample
    # order is fixed each epoch — confirm this is intentional.
    train_loader = DataLoader(trainset, num_workers=8, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    model.train()
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs + 1):
        print(f'Epoch: {epoch}')
        # LR interpolated between init_lr and final_lr as epochs progress.
        adjust_learning_rate(optimizer, epoch, init_lr, final_lr, epochs)
        for i, batch in enumerate(tqdm.tqdm(train_loader)):
            optimizer.zero_grad()
            batch = model.pre_process(batch)
            outputs = model(batch)
            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + 1 + len(train_loader) * epoch)
        if epoch % epochs_per_checkpoint == 0:
            if rank == 0:
                # Keep only one checkpoint
                last_chkpt = os.path.join(
                    output_directory,
                    f'waveglow_{epoch - epochs_per_checkpoint:06d}.pt')
                if os.path.exists(last_chkpt):
                    os.remove(last_chkpt)
                checkpoint_path = os.path.join(output_directory,
                                               f'waveglow_{epoch:06d}.pt')
                save_checkpoint(model, optimizer, epoch, checkpoint_path)
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, batch_size, seed, checkpoint_path):
    """Train a WaveNet on the DeepMels dataset with cross-entropy loss,
    accumulating and printing a per-epoch total loss.

    BUGFIX: the original used `reduce_tensor(loss.data, num_gpus)[0]` and
    `loss.data[0]` — indexing a 0-dim tensor has been an error since
    PyTorch 0.4/0.5. Replaced with `.item()`, which is the supported way to
    extract a Python scalar and is what every other train() in this file
    already uses.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    #trainset = Mel2SampOnehot(**data_config)
    trainset = DeepMels(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        total_loss = 0
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            loss = criterion(y_pred, y)
            # BUGFIX: .item() instead of [0]-indexing a 0-dim tensor.
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()
            total_loss += reduced_loss
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
            iteration += 1
        print("epoch:{}, total epoch loss:{}".format(epoch, total_loss))
def train(num_gpus, rank, group_name, device, output_directory, epochs,
          learning_rate, iters_per_checkpoint, batch_size, seed,
          checkpoint_path, use_scheduled_sampling=False,
          use_wavenet_autoencoder=True, use_variational_autoencoder=False,
          diversity_scale=0.005, use_logistic_mixtures=False, n_mixtures=3,
          audio_hz=16000, midi_hz=250, aggressive_loss_threshold=3.0,
          encoder_error_thresh=0.0):
    """Train a WaveNet autoencoder with optional 'aggressive' alternating
    encoder/decoder optimization, scheduled sampling, and (for the VAE
    variant) an extra diversity loss.

    BUGFIX: when the running-average loss dropped below
    `aggressive_loss_threshold`, the original assigned to a misspelled local
    `agressive`, so aggressive alternation never actually switched off.
    Fixed to assign `aggressive = False`.
    """
    assert use_wavenet_autoencoder is True
    if num_gpus > 1:
        device = init_distributed(rank, num_gpus, group_name, **dist_config)
    device = torch.device(device)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Output distribution: discretized mix of logistics vs. categorical.
    if use_logistic_mixtures:
        sampler = DML.SampleDiscretizedMixLogistics()
        criterion = DML.DiscretizedMixLogisticLoss()
    else:
        sampler = utils.CategoricalSampler()
        criterion = CrossEntropyLoss()
    model = WavenetAutoencoder(wavenet_config, cond_wavenet_config,
                               use_variational_autoencoder).to(device)
    if use_variational_autoencoder:
        diversity_loss = L2DiversityLoss()
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    if use_scheduled_sampling:
        scheduled_sampler = ScheduledSamplerWithPatience(
            model, sampler, **scheduled_sampler_config)
    # Separate optimizers so encoder/decoder can be stepped independently.
    encoder_optimizer = torch.optim.Adam(model.encoder_wavenet.parameters(),
                                         lr=learning_rate)
    decoder_optimizer = torch.optim.Adam(model.wavenet.parameters(),
                                         lr=learning_rate)
    # Train state params
    aggressive = True
    train_encoder = False
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, encoder_optimizer, decoder_optimizer, aggressive, iteration = \
            load_checkpoint(checkpoint_path, model, encoder_optimizer,
                            decoder_optimizer)
        iteration += 1
    # Dataloader
    trainset = MaestroDataloader(**data_config)
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
    else:
        train_sampler = None
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready for distributed
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    # Initialize training variables
    epoch_offset = max(0, int(iteration / len(train_loader)))
    start_iter = iteration
    loss_idx = 0
    loss_sum = 0
    prev_loss = 999999999
    print("output directory: " + output_directory)
    # write loss to csv file
    loss_writer = DictWriter(open(output_directory + "/train.csv", 'w',
                                  newline=''),
                             fieldnames=['iteration', 'loss'])
    loss_writer.writeheader()
    signal_writer = DictWriter(open(output_directory + "/signal.csv", "w",
                                    newline=''),
                               fieldnames=[
                                   'iteration', 'cosim', 'p-dist',
                                   'forwardMagnitude', 'midiMagnitude'
                               ])
    signal_writer.writeheader()
    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            x, y = batch
            x = as_variable(x, device)
            y = as_variable(y, device)
            y_true = y.clone()
            if use_scheduled_sampling:
                y = scheduled_sampler(x, y)
            y_preds = model((x, y))
            if use_wavenet_autoencoder:
                q_bar = y_preds[1]
                y_preds = y_preds[0]
            loss = criterion(y_preds, y_true)
            if use_variational_autoencoder:
                div_loss = diversity_loss(q_bar)
                loss = loss + (diversity_scale * div_loss)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.data.item()
            loss.backward()
            # Aggressive mode alternates which half of the model is stepped;
            # normal mode steps both.
            if aggressive and train_encoder:
                encoder_optimizer.step()
                print("Encoder step")
            elif aggressive:
                decoder_optimizer.step()
                print("Decoder step")
            else:  # normal training
                encoder_optimizer.step()
                decoder_optimizer.step()
            print("total loss: {}:\t{:.9f}".format(iteration, reduced_loss))
            if use_variational_autoencoder:
                print(" diversity loss: {:.9f}".format(div_loss))
            if use_scheduled_sampling:
                scheduled_sampler.update(reduced_loss)
            # record running average of loss
            loss_sum += reduced_loss
            loss_idx += 1
            if (iteration % 10 == 0):
                loss_avg = loss_sum / loss_idx
                print("floating avg of 10: " + str(loss_avg))
                #loss_writer.writerow({"iteration": str(i),
                #                      "loss": str(reduced_loss)})
                # BUGFIX: was `agressive = False` (typo creating a dead
                # local), so aggressive training never switched off.
                if aggressive and loss_avg < aggressive_loss_threshold:
                    aggressive = False
                elif aggressive and train_encoder and loss_avg >= (
                        prev_loss + encoder_error_thresh):
                    train_encoder = False
                elif aggressive:
                    train_encoder = True
                prev_loss = loss_avg
                loss_sum = 0
                loss_idx = 0
            # save model
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint_autoencoder(model, device,
                                                use_variational_autoencoder,
                                                encoder_optimizer,
                                                decoder_optimizer, aggressive,
                                                learning_rate, iteration,
                                                checkpoint_path)
            iteration += 1
            del loss
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=2):
    """WaveGlow training with a held-out eval pass (and generated-audio
    samples logged to TensorBoard) at the end of every epoch.

    BUGFIX: DataLoader forbids passing both sampler= and shuffle=True; the
    original always passed shuffle=True, which raises ValueError whenever
    num_gpus > 1 (DistributedSampler in use). Shuffle is now enabled only
    when no sampler is given — DistributedSampler shuffles on its own.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    # HACK: setup separate training and eval sets
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print("Creating dataloaders with " + str(num_workers) + " workers")
    # BUGFIX: shuffle only when there is no sampler (mutually exclusive).
    train_loader = DataLoader(trainset, num_workers=num_workers,
                              shuffle=(train_sampler is None),
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset, num_workers=num_workers,
                             shuffle=(eval_sampler is None),
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()
                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))
                loss = criterion(outputs)
                if num_gpus > 1:
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()
                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()
                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)
                iteration += 1
                train_pbar.update(1)
        # Eval
        model.eval()
        torch.cuda.empty_cache()
        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None
                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)
            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)
                # log audio samples to tensorboard
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                for i in range(0, 5):
                    ta = tensorboard_audio[i].cpu().numpy()
                    tag = tensorboard_audio_generated[i].cpu().numpy()
                    logger_eval.add_audio(
                        "sample " + str(i) + "/orig", ta, epoch,
                        sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio(
                        "sample " + str(i) + "/gen", tag, epoch,
                        sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
def train(num_gpus, rank, group_name, prj_name, run_name, output_directory,
          epochs, learning_rate, sigma, iters_per_checkpoint, batch_size,
          seed, fp16_run, grad_clip_thresh, checkpoint_path, pretrained_path,
          with_tensorboard, with_wandb):
    """WaveGlow training variant with gradient clipping, optional warm start
    from a pretrained model, and Weights & Biases logging of per-term losses.

    Checkpoints go to <output_directory>/<prj_name>/<run_name>/waveglow_<it>.
    NOTE(review): only output_directory itself is created here — the nested
    <prj_name>/<run_name> directory is assumed to already exist.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    # Warm start: loads weights only; optimizer state/iteration untouched.
    if pretrained_path != "":
        model = load_pretrained(pretrained_path, model)
    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    # Sampler and shuffle are mutually exclusive in DataLoader; the sampler
    # performs its own per-epoch shuffling in the distributed case.
    if num_gpus > 1:
        train_sampler = DistributedSampler(trainset)
        shuffle_at_dataloader = False
    else:
        train_sampler = None
        shuffle_at_dataloader = True
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1,
                              shuffle=shuffle_at_dataloader,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            iter_start = time.perf_counter()
            float_epoch = float(iteration) / len(train_loader)
            model.zero_grad()
            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))
            # Criterion also returns the individual loss terms for logging.
            loss, etc = criterion(outputs)
            (z_L2_normalized, neg_log_s_total, neg_log_det_W_total) = etc
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # Overflow can only occur in the fp16 (amp loss scaling) path.
            is_overflow = False
            if fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
                if not is_overflow:
                    clipped_grad_norm = get_clip_grad_norm(
                        grad_norm, grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                clipped_grad_norm = get_clip_grad_norm(grad_norm,
                                                       grad_clip_thresh)
            optimizer.step()
            iter_duration = time.perf_counter() - iter_start
            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)
            if with_wandb and rank == 0:
                wandb.log(
                    {
                        'iteration': iteration,
                        'epoch': float_epoch,
                        'iter_duration': iter_duration,
                        'training_loss': reduced_loss,
                        'training_loss/z_L2_normalized': z_L2_normalized,
                        'training_loss/neg_log_s_total': neg_log_s_total,
                        'training_loss/neg_log_det_W_total':
                        neg_log_det_W_total,
                    },
                    step=iteration)
                if not is_overflow:
                    wandb.log(
                        {
                            'grad_norm': grad_norm,
                            'clipped_grad_norm': clipped_grad_norm,
                        },
                        step=iteration)
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/{}/{}/waveglow_{}".format(
                        output_directory, prj_name, run_name, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed,
          checkpoint_path, log_dir, ema_decay=0.9999):
    """WaveNet training with EMA weight shadowing, StepLR decay, and a
    periodic audio-generating validation pass via nv_wavenet inference.

    BUGFIX: the original used `reduce_tensor(...)[0]` / `loss.data[0]` —
    indexing a 0-dim tensor has been an error since PyTorch 0.4/0.5.
    Replaced with `.item()` (the same scalar-extraction idiom used by the
    other train() variants in this file).
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    # Un-chunked (variable-length) data needs a length-masked loss.
    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()
    model = WaveNet(**wavenet_config).cuda()
    # Shadow copy of all trainable weights for EMA-smoothed checkpoints.
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1
    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True,
                              **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False,
                              **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1,
                              pin_memory=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            # NOTE(review): scheduler.step() before optimizer.step() follows
            # the pre-1.1 PyTorch API; left unchanged to preserve the exact
            # LR-decay timing this code was tuned with.
            scheduler.step()
            model.zero_grad()
            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            # BUGFIX: .item() instead of [0]-indexing a 0-dim tensor.
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()
            # Fold the freshly updated weights into the EMA shadow.
            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)
            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler, learning_rate,
                                    iteration, checkpoint_path, ema,
                                    wavenet_config)
            if (iteration % iters_per_eval == 0 and iteration > 0
                    and not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    # Export to the nv-wavenet CUDA inference engine and log
                    # generated vs. ground-truth audio per validation sample.
                    model_eval = nv_wavenet.NVWaveNet(
                        **(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(
                            cond_input, nv_wavenet.Impl.AUTO)
                        predicted_audio = utils.mu_law_decode_numpy(
                            predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio(
                            "valid/predicted_audio_{}".format(j),
                            predicted_audio, iteration, 22050)
                        audio = utils.mu_law_decode_numpy(
                            audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio, iteration, 22050)
                if low_memory:
                    torch.cuda.empty_cache()
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path):
    """Train a FastSpeech-style glow model with a WaveGlow loss.

    Args:
        num_gpus: number of GPUs; >1 enables the distributed code paths.
        rank: this process's rank (rank 0 does dir creation and logging).
        group_name: distributed process-group name.
        output_directory: where checkpoints are written.
        log_directory: tensorboard log location (used when hp.with_tensorboard).
        checkpoint_path: optional checkpoint to resume from (falsy = fresh run).

    Runs until hp.epochs; saves "TTSglow_<iter>" checkpoints every hp.save_step.
    """
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.manual_seed(hp.seed)
    torch.cuda.manual_seed(hp.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hp.sigma)
    model = WaveGlow().cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    learning_rate = hp.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hp.fp16_run:
        # apex O1 mixed precision; amp is also referenced in the loop below.
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    # Get dataset
    dataset = FastSpeechDataset()

    # Get training loader
    print("Get Training Loader")
    training_loader = DataLoader(dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn,
                                 drop_last=True,
                                 num_workers=cpu_count())

    # Only rank 0 creates the shared output directory.
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hp.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model = model.train()
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(training_loader)))
    # NOTE(review): `beta` is halved every 10000 iterations below but is never
    # used in the loss computation here — looks like dead annealing logic;
    # confirm against the criterion before removing.
    beta = hp.batch_size
    print("Total Epochs: {}".format(hp.epochs))
    print("Batch Size: {}".format(hp.batch_size))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, hp.epochs):
        print("Epoch: {}".format(epoch))
        for i, data_of_batch in enumerate(training_loader):
            model.zero_grad()
            if not hp.pre_target:
                # Prepare Data: alignments computed on the fly from a
                # pretrained tacotron2 (presumably module-level; verify).
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]
                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = get_alignment(
                    src_seq, tacotron2).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)
            else:
                # Prepare Data: alignments precomputed and shipped in batch.
                src_seq = data_of_batch["texts"]
                src_pos = data_of_batch["pos"]
                mel_tgt = data_of_batch["mels"]
                alignment_target = data_of_batch["alignment"]
                src_seq = torch.from_numpy(src_seq).long().to(device)
                src_pos = torch.from_numpy(src_pos).long().to(device)
                mel_tgt = torch.from_numpy(mel_tgt).float().to(device)
                alignment_target = torch.from_numpy(
                    alignment_target).float().to(device)
                # For Data Parallel
                mel_max_len = mel_tgt.size(1)

            outputs = model(src_seq, src_pos, mel_tgt, mel_max_len,
                            alignment_target)
            # NOTE(review): duration_predictor is unpacked but never used.
            _, _, _, duration_predictor = outputs
            mel_tgt = mel_tgt.transpose(1, 2)
            max_like, dur_loss = criterion(outputs, alignment_target, mel_tgt)
            if beta > 1 and iteration % 10000 == 0:
                beta = beta // 2
            loss = max_like + dur_loss

            # Average the loss across workers for logging only.
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hp.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            #grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hp.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, dur_loss, learning_rate,
                                    iteration)

            if (iteration % hp.save_step == 0):
                if rank == 0:
                    # logger.log_alignment(model, mel_predict, mel_tgt, iteration)
                    # NOTE: reuses the `checkpoint_path` parameter name.
                    checkpoint_path = "{}/TTSglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1
def compute_validation_loss(model, criterion, valset, batch_size, n_gpus,
                            apply_ctc):
    """Run one pass over the validation set and return averaged losses.

    Builds its own DataCollate/DataLoader over `valset` (distributed sampler
    when n_gpus > 1), averages NLL, gate and CTC losses over all batches, and
    returns them together with the last batch's attention, gate prediction
    and gate target for logging.
    """
    def as_scalar(t):
        # All-reduce across workers when distributed, else plain .item().
        if n_gpus > 1:
            return reduce_tensor(t.data, n_gpus).item()
        return t.item()

    model.eval()
    with torch.no_grad():
        collate_fn = DataCollate(
            n_frames_per_step=1, use_attn_prior=valset.use_attn_prior)
        sampler = DistributedSampler(valset) if n_gpus > 1 else None
        loader = DataLoader(valset, sampler=sampler, num_workers=1,
                            shuffle=False, batch_size=batch_size,
                            pin_memory=False, collate_fn=collate_fn)

        total = total_nll = total_gate = total_ctc = 0.0
        n_batches = len(loader)
        for batch in loader:
            mel, spk_ids, txt, in_lens, out_lens, gate_target, attn_prior = batch
            mel = mel.cuda()
            spk_ids = spk_ids.cuda()
            txt = txt.cuda()
            in_lens = in_lens.cuda()
            out_lens = out_lens.cuda()
            gate_target = gate_target.cuda()
            if attn_prior is not None:
                attn_prior = attn_prior.cuda()

            (z, log_s_list, gate_pred, attn, attn_logprob,
             mean, log_var, prob) = model(mel, spk_ids, txt, in_lens,
                                          out_lens, attn_prior)
            loss_nll, loss_gate, loss_ctc = criterion(
                (z, log_s_list, gate_pred, attn, attn_logprob,
                 mean, log_var, prob),
                gate_target, in_lens, out_lens, is_validation=True)

            loss = loss_nll + loss_gate
            if apply_ctc:
                loss += loss_ctc * criterion.ctc_loss_weight

            total += as_scalar(loss)
            total_nll += as_scalar(loss_nll)
            total_gate += as_scalar(loss_gate)
            total_ctc += as_scalar(loss_ctc)

        val_loss = total / n_batches
        val_loss_nll = total_nll / n_batches
        val_loss_gate = total_gate / n_batches
        val_loss_ctc = total_ctc / n_batches
        print("Mean {}\nLogVar {}\nProb {}".format(mean, log_var, prob))

    model.train()
    return (val_loss, val_loss_nll, val_loss_gate, val_loss_ctc, attn,
            gate_pred, gate_target)
def train(n_gpus, rank, output_directory, epochs, optim_algo, learning_rate,
          weight_decay, sigma, iters_per_checkpoint, batch_size, seed,
          checkpoint_path, ignore_layers, include_layers, finetune_layers,
          warmstart_checkpoint_path, with_tensorboard, grad_clip_val,
          fp16_run, tensorboard_path=None):
    """Train a Flowtron model using native torch.cuda.amp mixed precision.

    Supports warm-starting, checkpoint resume, layer-wise fine-tuning,
    distributed training (n_gpus > 1) and periodic validation/checkpointing
    every `iters_per_checkpoint` iterations.
    """
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    # When fine-tuning, freeze every parameter not listed in finetune_layers.
    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(), lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer,
                                                      ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    # GradScaler is a no-op when fp16_run is False.
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = tensorboard_path
        if tensorboard_path is None:
            tboard_out_path = os.path.join(output_directory, "logs/run1")
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()
            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
                ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
                ), gate_target.cuda()
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None

            # Forward + loss under autocast for mixed precision.
            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)
                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            # Cross-worker averages for logging only.
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data,
                                                  n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data,
                                                 n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                # Must unscale before clipping so the threshold applies to
                # true gradient magnitudes.
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_val)
            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss,
                                  iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss,
                                  iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if iteration % iters_per_checkpoint == 0:
                # All ranks run validation; only rank 0 logs and checkpoints.
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f} ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll,
                                              val_loss_gate, attns, gate_pred,
                                              gate_target, iteration)
                    # NOTE: reuses the `checkpoint_path` parameter name.
                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    """Train a Flowtron model (apex AMP variant).

    Supports warm-starting, checkpoint resume, distributed training
    (n_gpus > 1) and periodic validation/checkpointing every
    `iters_per_checkpoint` iterations.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer,
                                                      ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)

    if fp16_run:
        # apex O1 mixed precision; amp is also used in the loop below.
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()
            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
                ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
                ), gate_target.cuda()
            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            # Cross-worker average for logging only.
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if (iteration % iters_per_checkpoint == 0):
                # All ranks run validation; only rank 0 logs and checkpoints.
                val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f} ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, attns, gate_pred,
                                              gate_target, iteration)
                    # NOTE: reuses the `checkpoint_path` parameter name.
                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1
def train(n_gpus, rank, group_name):
    """Train a (compressed/factorized) Tacotron2 model driven by the
    module-level `config` dict.

    Uses gradient accumulation: the optimizer steps only every
    config['iters_per_grad_acc'] iterations.
    """
    if n_gpus > 1:
        if rank == 0:
            print('Synchronizing distributed flow...')
        init_distributed(rank, n_gpus, group_name, config['dist_config'])
    torch.manual_seed(config['seed'])
    torch.cuda.manual_seed(config['seed'])

    if rank == 0:
        print('Initializing model, optimizer and loss...')
    model = Tacotron2(config).cuda()
    criterion = Tacotron2Loss()
    learning_rate = config['learning_rate']
    optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate,
                                 weight_decay=config['weight_decay'])
    if config['fp16_run']:
        if rank == 0:
            print('Using FP16...')
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    if rank == 0:
        print('Preparing dirs, data loaders and logger...')
    logger = prepare_directories_and_logger(config['output_directory'],
                                            config['log_directory'], rank)
    train_loader, valset, collate_fn = prepare_dataloaders(
        config['training_files'], config['validation_files'],
        config['n_frames_per_step'], n_gpus)

    iteration = 0
    epoch_offset = 0
    if not config['warm_up_checkpoint'] is None:
        if rank == 0:
            print('Loading checkpoint from {}...'.format(
                config['warm_up_checkpoint']))
        # NOTE(review): only the model is captured here; other train()
        # variants in this codebase also restore optimizer/iteration —
        # confirm load_checkpoint's return contract.
        model = load_checkpoint(config['warm_up_checkpoint'], model,
                                optimizer)
        iteration += 1  # next iteration is iteration + 1
        epoch_offset = max(0, int(iteration / len(train_loader)))

    # Apply compression/factorization before training starts.
    model.compress_factorize(config=config['compress_config'])
    model.train()

    # Main training loop
    for epoch in range(epoch_offset, config['epochs']):
        print("Epoch: {}".format(epoch))
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if config['fp16_run']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            # Gradient accumulation: clip + step + zero only every
            # iters_per_grad_acc iterations; gradients accumulate otherwise.
            if iteration % config['iters_per_grad_acc'] == 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config['grad_clip_thresh'])
                optimizer.step()
                model.zero_grad()
            if rank == 0:
                duration = time.perf_counter() - start
                # NOTE(review): grad_norm is only reassigned on accumulation
                # steps, so between steps this prints/logs a stale value
                # (and would raise NameError if the first iteration ever
                # skipped the step branch).
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".
                      format(iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)
            if iteration % config['iters_per_validation'] == 0:
                validate(model, criterion, valset, iteration,
                         config['batch_size'], n_gpus, collate_fn, logger,
                         rank)
            if iteration % config['iters_per_checkpoint'] == 0:
                if rank == 0:
                    checkpoint_path = os.path.join(
                        config['output_directory'],
                        "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, iteration,
                                    checkpoint_path)
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, log_directory,
          checkpoint_path, hparams):
    """Train a WaveGlow-style flow model conditioned on encoder outputs from
    a frozen pretrained Tacotron2 ('tacotron2.pt').

    Checkpoints "waveglow_<iter>" every hparams.iters_per_checkpoint; also
    logs alignments from model.test() at those points (rank 0 only).
    """
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(hparams.sigma)
    model = WaveGlow(hparams).cuda()
    # Pretrained Tacotron2 used only for conditioning (wrapped in no_grad
    # in the loop below).
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path:
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = TextMelLoader(hparams.training_files, hparams)
    collate_fn = TextMelCollate()
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    batch_size = hparams.batch_size
    train_loader = DataLoader(trainset, num_workers=0, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True, collate_fn=collate_fn)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if hparams.with_tensorboard and rank == 0:
        logger = prepare_directories_and_logger(output_directory,
                                                log_directory)

    model.train()
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(train_loader)))
    print("Total Epochs: {}".format(hparams.epochs))
    print("Batch Size: {}".format(hparams.batch_size))
    print("learning rate: {}".format(hparams.learning_rate))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()
            text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(
                batch)
            # Conditioning comes from the frozen Tacotron2.
            with torch.no_grad():
                enc_outputs, alignments = Taco2(
                    (text_padded, input_lengths, mel_padded, max_len,
                     output_lengths))
            # mel_padded = mel_padded.transpose(1, 2)
            # mel_padded = mel_padded / torch.abs(mel_padded).max().item()
            # NOTE(review): 1000 caps the mel positions — presumably the
            # maximum decoder length; confirm against hparams.
            mel_pos = torch.arange(1000)
            mel_pos = to_gpu(mel_pos).long().unsqueeze(0)
            mel_pos = mel_pos.expand(hparams.batch_size, -1)
            src_pos = torch.arange(hparams.n_position)
            src_pos = to_gpu(src_pos).long().unsqueeze(0)
            src_pos = src_pos.expand(hparams.batch_size, -1)
            # Squash mels into roughly [-?, ?] range before the flow.
            mel_padded = (mel_padded + 5) / 10
            z, log_s_list, log_det_w_list, dec_enc_attn = model(
                mel_padded, enc_outputs, mel_pos, src_pos, input_lengths)
            outputs = (z, log_s_list, log_det_w_list, dec_enc_attn)
            loss = criterion(outputs, alignments)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if hparams.with_tensorboard and rank == 0:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    iteration)

            if (iteration % hparams.iters_per_checkpoint == 0):
                if rank == 0:
                    mel_predict, test_attn = model.test(
                        mel_padded, enc_outputs, mel_pos, src_pos,
                        input_lengths)
                    logger.log_alignment(model, dec_enc_attn, alignments,
                                         mel_padded, mel_predict, test_attn,
                                         iteration)
                    # NOTE: reuses the `checkpoint_path` parameter name.
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    """Train a WaveGlow model on CPU from (mel, audio) pairs.

    Args:
        num_gpus: worker count; >1 enables the distributed code paths.
        rank: this process's rank (rank 0 creates dirs and checkpoints).
        group_name: distributed process-group name.
        output_directory: where "waveglow_<iter>" checkpoints are written.
        epochs, learning_rate, sigma: standard training hyperparameters.
        iters_per_checkpoint: checkpoint period in iterations.
        seed: torch RNG seed.
        fp16_run: enable apex O1 mixed precision.
        checkpoint_path: checkpoint to resume from ("" = fresh run).
        with_tensorboard: enable tensorboardX scalar logging (rank 0).

    Fix applied: the deprecated ``torch.autograd.Variable`` wrappers around
    the batch tensors were removed — since PyTorch 0.4 ``Variable(t)``
    simply returns a ``Tensor``, so this is a pure no-op cleanup.
    """
    torch.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cpu()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready (rank 0 only).
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            # Variable() wrappers removed: tensors already carry autograd.
            mel = mel.cpu()
            audio = audio.cpu()
            outputs = model((mel, audio))

            loss = criterion(outputs)
            # Cross-worker average for logging only.
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                # Logged against a recomputed step rather than `iteration`
                # (kept as-is: changes the x-axis only when resuming).
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            # NOTE: triggers at iteration 0 too (other variants in this file
            # guard with `and iteration`); behavior kept as-is.
            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, batch_size, seed, checkpoint_path, hparams):
    """Train a WaveGlow model on (text, mel) batches via Tacotron2's batch
    parser; saves one "waveglow_epoch_<epoch>" checkpoint per epoch (rank 0).

    Args:
        num_gpus: worker count; >1 enables the distributed code paths.
        rank: this process's rank.
        group_name: distributed process-group name.
        output_directory: checkpoint/log output root.
        epochs, learning_rate, sigma, batch_size, seed: hyperparameters.
        checkpoint_path: checkpoint to resume from ("" = fresh run).
        hparams: Tacotron2-style hyperparameter object.

    Fix applied: the test DataLoader previously reused ``train_sampler`` — a
    ``DistributedSampler`` built over *trainset* — so in distributed runs it
    would index testset with trainset-sized indices. The test loader now gets
    its own sampler over testset.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    if num_gpus >= 1:
        model = WaveGlow(**waveglow_config, hparams=hparams).cuda()
    else:
        model = WaveGlow(**waveglow_config, hparams=hparams)

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Load checkpoint if one exists
    iteration, eval_iteration = 0, 0
    if checkpoint_path != "":
        model, optimizer, iteration, eval_iteration = load_checkpoint(
            checkpoint_path, model, optimizer)
        iteration += 1  # next iteration is iteration + 1
        eval_iteration += 1

    # trainset = Mel2Samp(**data_config)
    trainset = TextMelLoader(
        audiopaths_and_text='./filelists/ljs_audio_text_train_filelist.txt',
        hparams=hparams)
    testset = TextMelLoader(
        audiopaths_and_text='./filelists/ljs_audio_text_test_filelist.txt',
        hparams=hparams)
    collate_fn = TextMelCollate(hparams, fixed_length=True)

    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # FIX: give the test set its own sampler — reusing train_sampler would
    # sample testset with indices drawn from trainset's length.
    test_sampler = DistributedSampler(testset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, collate_fn=collate_fn,
                              shuffle=False, sampler=train_sampler,
                              batch_size=batch_size, pin_memory=False,
                              drop_last=True)
    test_loader = DataLoader(testset, num_workers=1, collate_fn=collate_fn,
                             shuffle=False, sampler=test_sampler,
                             batch_size=batch_size, pin_memory=False,
                             drop_last=True)

    log_path = os.path.join(output_directory, 'log-event')
    os.makedirs(log_path, exist_ok=True)
    logger = WaveGlowLogger(log_path)

    # Get shared output_directory ready (rank 0 only).
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    tacotron2 = Tacotron2(hparams)
    batch_parser = tacotron2.parse_batch  # we use tacotron-2's pipeline
    # Resume mid-run: skip epochs already covered by `iteration`.
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        model.train()
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            x, y = batch_parser(batch)
            text_padded, input_lengths, mel_padded, max_len, output_lengths = x
            # print(text_padded.size(), mel_padded.size())
            mel_padded, gate_padded = y
            outputs = model((text_padded, mel_padded))
            loss = criterion(outputs)
            logger.log_loss('train/loss', loss, iteration)

            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()
            loss.backward()
            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            iteration += 1

        # model.eval()
        # for i, batch in enumerate(test_loader):
        #     x, y = batch_parser(batch)
        #     text_padded, input_lengths, mel_padded, max_len, output_lengths = x
        #     mel_padded, gate_padded = y
        #     outputs = model((text_padded, mel_padded))
        #     loss = criterion(outputs)
        #     logger.log_loss('eval/loss', loss, iteration)
        #     eval_iteration += 1

        if rank == 0:
            checkpoint_path = "{}/waveglow_epoch_{}".format(output_directory,
                                                            epoch)
            save_checkpoint(model, optimizer, learning_rate, iteration,
                            eval_iteration, checkpoint_path, hparams=hparams)