def validate(model, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=num_workers_,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0):  # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            # text_padded, input_lengths, mel_padded, max_len, output_lengths, speaker_ids = x
            # mel_out, mel_out_postnet, gate_outputs, alignments = y_pred
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
    # end torch.no_grad()
    model.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f} Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        #logger.log_validation(val_loss, model, y, y_pred, iteration)
        if iteration != 0:
            logger.log_validation(val_loss, model, y, y_pred, iteration, diagonality, avg_prob)
    return val_loss, avg_prob
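
# --- sketch: the `reduce_tensor` helper assumed above ------------------------------
# `validate` (and the training loop below) call `reduce_tensor(loss.data, n_gpus)` to
# average a loss across GPUs when distributed_run is enabled. The helper itself is not
# shown in this file; a minimal sketch using the standard torch.distributed all-reduce
# pattern (the repo's actual implementation may differ slightly):
import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    """Sum `tensor` across all ranks, then divide by the number of GPUs to get the mean."""
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt
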
def validate(model, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger, distributed_run, rank,
             val_teacher_force_till, val_p_teacher_forcing, teacher_force=1):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        if teacher_force == 1:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 1.0
        elif teacher_force == 2:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 0.0
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0):  # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing)
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            loss, gate_loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
        # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
    # end torch.no_grad()
    model.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f} Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        #logger.log_validation(val_loss, model, y, y_pred, iteration)
        if True:  # iteration != 0:
            if teacher_force == 1:
                logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            elif teacher_force == 2:
                logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            else:
                logger.log_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
    return val_loss
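
# --- sketch: what `alignment_metric` is expected to return -------------------------
# The validation and training loops treat `alignment_metric(x, y_pred)` as returning a
# per-batch "diagonality" rate and the average max attention weight. The real helper is
# defined elsewhere in the repo and takes the full (x, y_pred) tuples; the hypothetical
# sketch below only shows the core computation, assuming `alignments` of shape
# [batch, mel_steps, text_steps] has already been pulled out of y_pred:
def alignment_metric_sketch(alignments):
    # mean of the strongest attention weight at each decoder step (attention "confidence")
    avg_prob = alignments.max(dim=2)[0].mean()
    # average jump of the attended text position between consecutive decoder steps;
    # values far from the ideal slope (text_steps / mel_steps) suggest skipping or jitter
    argmax_pos = alignments.argmax(dim=2).float()   # [batch, mel_steps]
    diagonality = (argmax_pos[:, 1:] - argmax_pos[:, :-1]).abs().mean()
    return diagonality, avg_prob
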
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    # setup distributed
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    # reproducibility setup
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # initialize blank model
    model = load_model(hparams)
    model.eval()
    learning_rate = hparams.learning_rate

    # (optional) show the names of each layer in the model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape) for i, x in enumerate(list(model.named_parameters()))], sep="\n")

    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(layer.startswith(module) for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")

    # define optimizer (any params without requires_grad are ignored)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=learning_rate, weight_decay=hparams.weight_decay)
    #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    # Load checkpoint if one exists
    best_validation_loss = 0.8  # used to decide when "best_model" should be saved; load_checkpoint will update it to the last best value.
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None
    if checkpoint_path is not None:
        if warm_start:
            model, iteration, saved_lookup = warm_start_model(checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint(checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')

    # define datasets/dataloaders
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids

    # load and/or generate global_mean
    if hparams.drop_frame_rate > 0.:
        if rank != 0:  # if global_mean not yet calculated, wait for main thread to do it
            while not os.path.exists(hparams.global_mean_npy):
                time.sleep(1)
        global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
        model.global_mean = global_mean

    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10)

    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus,
                            collate_fn, logger, hparams.distributed_run, rank)
        raise Exception("Finished Validation")

    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs,
                      desc="Epoch:", position=1, unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))

        if hparams.distributed_run:  # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader), desc="Iter: ", smoothing=0,
                             total=len(train_loader), position=0, unit="iter"):
            # run external code every epoch, allows the run to be adjusted without restarts
            if iteration % 1000 == 0 or i == 0:
                try:
                    with open("run_every_epoch.py") as f:
                        internal_text = str(f.read())
                        if len(internal_text) > 0:
                            print(internal_text)
                            #code = compile(internal_text, "run_every_epoch.py", 'exec')
                            ldict = {'iteration': iteration}
                            exec(internal_text, globals(), ldict)
                            print("Custom code executed\nPlease remove code if it was intended to be run once.")
                        else:
                            print("No Custom code found, continuing without changes.")
                except Exception as ex:
                    print(f"Custom code FAILED to run!\n{ex}")
                globals().update(ldict)
                locals().update(ldict)
                print("decay_start is ", decay_start)
                print("A_ is ", A_)
                print("B_ is ", B_)
                print("C_ is ", C_)
                print("min_learning_rate is ", min_learning_rate)
                print("epochs_between_updates is ", epochs_between_updates)
                print("drop_frame_rate is ", drop_frame_rate)
                print("p_teacher_forcing is ", p_teacher_forcing)
                print("teacher_force_till is ", teacher_force_till)
                print("val_p_teacher_forcing is ", val_p_teacher_forcing)
                print("val_teacher_force_till is ", val_teacher_force_till)
                print("grad_clip_thresh is ", grad_clip_thresh)
                if epoch % epochs_between_updates == 0 or epoch_offset == epoch:
                #if None:
                    tqdm.write("Old learning rate [{:.6f}]".format(learning_rate))
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_ * (e ** (-iteration_adjusted / B_))) + C_
                    learning_rate = max(min_learning_rate, learning_rate)  # enforce the minimum learning rate
                    tqdm.write("Changing Learning Rate to [{:.6f}]".format(learning_rate))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            # /run external code every epoch, allows the run to be adjusted without restarts/

            model.zero_grad()
            x, y = model.parse_batch(batch)  # move batch to GPU (async)
            y_pred = model(x, teacher_force_till=teacher_force_till, p_teacher_forcing=p_teacher_forcing,
                           drop_frame_rate=drop_frame_rate)

            loss, gate_loss = criterion(y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(gate_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = gate_loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)  # also catch NaN/Inf gradients in fp32 runs

            optimizer.step()

            for j, param_group in enumerate(optimizer.param_groups):
                learning_rate = float(param_group['lr'])
                break
            if iteration < decay_start:
                learning_rate = A_ + C_
            else:
                iteration_adjusted = iteration - decay_start
                learning_rate = (A_ * (e ** (-iteration_adjusted / B_))) + C_
            learning_rate = max(min_learning_rate, learning_rate)  # enforce the minimum learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Gate_loss {:.4f}] [Grad Norm {:.4f}] "
                           "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format(
                               iteration, reduced_loss, average_loss, reduced_gate_loss, grad_norm,
                               duration, (duration / (hparams.batch_size * n_gpus)), learning_rate))
                if iteration % 20 == 0:
                    diagonality, avg_prob = alignment_metric(x, y_pred)
                    logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration,
                                        teacher_force_till, p_teacher_forcing,
                                        diagonality=diagonality, avg_prob=avg_prob)
                else:
                    logger.log_training(reduced_loss, grad_norm, learning_rate, duration, iteration,
                                        teacher_force_till, p_teacher_forcing)
                start_time = time.time()
            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")

            if not is_overflow and ((iteration % (hparams.iters_per_checkpoint/1) == 0) or (os.path.exists(save_file_check_path))):
                # save model checkpoint like normal
                if rank == 0:
                    checkpoint_path = os.path.join(output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss,
                                    average_loss, speaker_lookup, checkpoint_path)

            if not is_overflow and ((iteration % int((hparams.iters_per_validation)/1) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))):
                if rank == 0 and os.path.exists(save_file_check_path):
                    os.remove(save_file_check_path)
                # perform validation and save "best_model" depending on validation loss
                val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn,
                                    logger, hparams.distributed_run, rank, val_teacher_force_till,
                                    val_p_teacher_forcing, teacher_force=1)  # teacher_force
                val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn,
                                    logger, hparams.distributed_run, rank, val_teacher_force_till,
                                    val_p_teacher_forcing, teacher_force=2)  # infer
                val_loss = validate(model, criterion, valset, iteration, hparams.val_batch_size, n_gpus, collate_fn,
                                    logger, hparams.distributed_run, rank, val_teacher_force_till,
                                    val_p_teacher_forcing, teacher_force=0)  # validate (0.8 forcing)
                if use_scheduler:
                    scheduler.step(val_loss)
                if val_loss < best_validation_loss:
                    best_validation_loss = val_loss
                    if rank == 0:
                        checkpoint_path = os.path.join(output_directory, "best_model")
                        save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss,
                                        average_loss, speaker_lookup, checkpoint_path)

            iteration += 1
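
# --- sketch: the learning-rate schedule used in the loop above ---------------------
# The in-loop update amounts to an exponential decay with a floor: the rate is flat at
# A_ + C_ until `decay_start`, then decays as A_ * exp(-(iteration - decay_start) / B_) + C_,
# clamped to `min_learning_rate`. Restated as a standalone function (names match the
# globals that run_every_epoch.py is expected to define):
from math import e

def decayed_learning_rate(iteration, decay_start, A_, B_, C_, min_learning_rate):
    if iteration < decay_start:
        lr = A_ + C_
    else:
        lr = A_ * (e ** (-(iteration - decay_start) / B_)) + C_
    return max(min_learning_rate, lr)
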
def validate(hparams, model, model_d, criterion, valset, iteration, batch_size, n_gpus, collate_fn, logger,
             distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1, p_emotionnet_embed=0.0):
    """Handles all the validation scoring and printing"""
    model.eval()
    model_d.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=num_workers_,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        if teacher_force == 1:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 1.0
            p_emotionnet_embed = 1.0
        elif teacher_force == 2:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 0.0
            p_emotionnet_embed = 0.0
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        loss_terms_arr = []
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0):  # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            with torch.random.fork_rng(devices=[0,]):
                torch.random.manual_seed(0)  # use the same seed during validation so results are more consistent and comparable.
                y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing,
                               p_emotionnet_embed=p_emotionnet_embed)
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            criterion_dict = {
                "amp": None,
                "n_gpus": n_gpus,
                "model": model,
                "model_d": model_d,
                "hparams": hparams,
                "optimizer": None,
                "optimizer_d": None,
                "grad_clip_thresh": 0.0,
            }
            loss, gate_loss, loss_terms, reduced_val_loss, reduced_gate_loss, grad_norm, is_overflow = criterion(
                y_pred, y, criterion_dict, 0)
            loss_terms_arr.append(loss_terms)
            val_loss += reduced_val_loss
        # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
    # end torch.no_grad()
    model.train()
    model_d.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f} Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        if iteration != 0:
            if teacher_force == 1:
                logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            elif teacher_force == 2:
                logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            else:
                loss_terms = average_loss_terms(loss_terms_arr)
                logger.log_validation(val_loss, model, y, y_pred, iteration, loss_terms, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
    return val_loss
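
# --- sketch: the `StreamingMovingAverage` helper assumed by the training loop ------
# `train` builds `rolling_loss = StreamingMovingAverage(...)` and calls
# `rolling_loss.process(reduced_loss)` to get a windowed average of recent losses.
# A minimal sketch consistent with that usage (the repo's own class may track extra state):
from collections import deque

class StreamingMovingAverage:
    def __init__(self, window_size):
        self.values = deque(maxlen=window_size)  # keep only the most recent `window_size` values

    def process(self, value):
        self.values.append(value)
        return sum(self.values) / len(self.values)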