Example #1
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset,
                                sampler=val_sampler,
                                num_workers=num_workers_,  # num_workers_ is a module-level setting defined elsewhere in this file
                                shuffle=False,
                                batch_size=batch_size,
                                pin_memory=False,
                                drop_last=True,
                                collate_fn=collate_fn)

        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        for i, batch in tqdm(
                enumerate(val_loader),
                desc="Validation",
                total=len(val_loader),
                smoothing=0):  # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            # text_padded, input_lengths, mel_padded, max_len, output_lengths, speaker_ids = x
            # mel_out, mel_out_postnet, gate_outputs, alignments = y_pred
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
        # end torch.no_grad()

    model.train()
    if rank == 0:
        tqdm.write(
            "Validation loss {}: {:9f}  Average Max Attention: {:9f}".format(
                iteration, val_loss, avg_prob))
        #logger.log_validation(val_loss, model, y, y_pred, iteration)
        if iteration != 0:
            logger.log_validation(val_loss, model, y, y_pred, iteration,
                                  diagonality, avg_prob)
    return val_loss, avg_prob
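The validate() function above relies on a reduce_tensor helper defined elsewhere in the module to average the per-batch loss across GPUs. A minimal sketch of the usual pattern, assuming torch.distributed has already been initialised, is shown below; treat it as an illustration rather than the repository's exact implementation.

import torch
import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    """Average a tensor across all distributed ranks (illustrative sketch)."""
    rt = tensor.clone()                        # keep the caller's tensor untouched
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum the copies held by every rank
    rt /= n_gpus                               # divide by world size to get the mean
    return rt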
Example #2
def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        if teacher_force == 1:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 1.0
        elif teacher_force == 2:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 0.0
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0): # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing)
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            loss, gate_loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
        # end torch.no_grad()
    model.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f}  Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        #logger.log_validation(val_loss, model, y, y_pred, iteration)
        if True:#iteration != 0:
            if teacher_force == 1:
                logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            elif teacher_force == 2:
                logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            else:
                logger.log_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
    return val_loss
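The teacher_force flag only selects how much ground truth the decoder sees during validation. A self-contained sketch of that mapping follows; forcing_settings is a hypothetical helper introduced here for illustration, and the argument values in the example calls are made up.

def forcing_settings(teacher_force, val_teacher_force_till, val_p_teacher_forcing):
    # teacher_force == 1: fully teacher-forced validation
    # teacher_force == 2: free-running inference (no forcing)
    # anything else:      keep the mixed settings passed by the caller
    if teacher_force == 1:
        return 0, 1.0
    elif teacher_force == 2:
        return 0, 0.0
    return val_teacher_force_till, val_p_teacher_forcing

print(forcing_settings(1, 30, 0.8))  # (0, 1.0)
print(forcing_settings(2, 30, 0.8))  # (0, 0.0)
print(forcing_settings(0, 30, 0.8))  # (30, 0.8)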
Example #3
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters, parsed from a comma-separated list of "name=value" pairs
    """
    # setup distributed
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)
    
    # reproducibility setup
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    
    # initialize blank model
    model = load_model(hparams)
    model.eval()
    learning_rate = hparams.learning_rate
    
    # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n")
    
    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(layer.startswith(module) for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")
    
    # define optimizer (any params without requires_grad are ignored)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay)
    #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)
    
    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    
    criterion = Tacotron2Loss(hparams)
    
    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)
    
    # Load checkpoint if one exists
    best_validation_loss = 0.8 # used to decide when "best_model" should be saved; load_checkpoint will update this to the last best value.
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None
    if checkpoint_path is not None:
        if warm_start:
            model, iteration, saved_lookup = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(
                checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')
    
    # define datasets/dataloaders
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids
    
    # load and/or generate global_mean
    if hparams.drop_frame_rate > 0.:
        if rank != 0: # if global_mean not yet calculated, wait for main thread to do it
            while not os.path.exists(hparams.global_mean_npy): time.sleep(1)
        global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
        model.global_mean = global_mean
    
    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10)
    
    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
            hparams.batch_size, n_gpus, collate_fn, logger,
            hparams.distributed_run, rank)
        raise Exception("Finished Validation")
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    
    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))
        
        if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader), desc="Iter:  ", smoothing=0, total=len(train_loader), position=0, unit="iter"):
            # run external code every 1000 iterations (and at the start of each epoch),
            # allowing the run to be adjusted without restarts
            if (iteration % 1000 == 0 or i==0):
                ldict = {'iteration': iteration}  # defined up-front so the update below is safe even if the read fails
                try:
                    with open("run_every_epoch.py") as f:
                        internal_text = str(f.read())
                        if len(internal_text) > 0:
                            print(internal_text)
                            #code = compile(internal_text, "run_every_epoch.py", 'exec')
                            ldict = {'iteration': iteration}
                            exec(internal_text, globals(), ldict)
                            print("Custom code excecuted\nPlease remove code if it was intended to be ran once.")
                        else:
                            print("No Custom code found, continuing without changes.")
                except Exception as ex:
                    print(f"Custom code FAILED to run!\n{ex}")
                globals().update(ldict)
                locals().update(ldict)
                print("decay_start is ",decay_start)
                print("A_ is ",A_)
                print("B_ is ",B_)
                print("C_ is ",C_)
                print("min_learning_rate is ",min_learning_rate)
                print("epochs_between_updates is ",epochs_between_updates)
                print("drop_frame_rate is ",drop_frame_rate)
                print("p_teacher_forcing is ",p_teacher_forcing)
                print("teacher_force_till is ",teacher_force_till)
                print("val_p_teacher_forcing is ",val_p_teacher_forcing)
                print("val_teacher_force_till is ",val_teacher_force_till)
                print("grad_clip_thresh is ",grad_clip_thresh)
                if epoch % epochs_between_updates == 0 or epoch_offset == epoch:
                #if None:
                    tqdm.write("Old learning rate [{:.6f}]".format(learning_rate))
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                    learning_rate = max(min_learning_rate, learning_rate) # clamp to the minimum learning rate
                    tqdm.write("Changing Learning Rate to [{:.6f}]".format(learning_rate))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            # /run external code every 1000 iterations, allowing the run to be adjusted without restarts/
            
            model.zero_grad()
            x, y = model.parse_batch(batch) # move batch to GPU (async)
            y_pred = model(x, teacher_force_till=teacher_force_till, p_teacher_forcing=p_teacher_forcing, drop_frame_rate=drop_frame_rate)
            
            loss, gate_loss = criterion(y_pred, y)
            
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(gate_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = gate_loss.item()
            
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            
            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
            
            optimizer.step()
            
            learning_rate = float(optimizer.param_groups[0]['lr'])  # read back the current LR
            
            if iteration < decay_start:
                learning_rate = A_ + C_
            else:
                iteration_adjusted = iteration - decay_start
                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
            learning_rate = max(min_learning_rate, learning_rate) # clamp to the minimum learning rate
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
            
            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Gate_loss {:.4f}] [Grad Norm {:.4f}] "
                      "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format(
                    iteration, reduced_loss, average_loss, reduced_gate_loss, grad_norm, duration, (duration/(hparams.batch_size*n_gpus)), learning_rate))
                if iteration % 20 == 0:
                    diagonality, avg_prob = alignment_metric(x, y_pred)
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, diagonality=diagonality, avg_prob=avg_prob)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing)
                start_time = time.time()
            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")
            
            if not is_overflow and ((iteration % (hparams.iters_per_checkpoint/1) == 0) or (os.path.exists(save_file_check_path))):
                # save model checkpoint like normal
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path)
            
            if not is_overflow and ((iteration % int((hparams.iters_per_validation)/1) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))):
                if rank == 0 and os.path.exists(save_file_check_path):
                    os.remove(save_file_check_path)
                # perform validation and save "best_model" depending on validation loss
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1) #teacher_force
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=2) #infer
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=0) #validate (0.8 forcing)
                if use_scheduler:
                    scheduler.step(val_loss)
                if (val_loss < best_validation_loss):
                    best_validation_loss = val_loss
                    if rank == 0:
                        checkpoint_path = os.path.join(output_directory, "best_model")
                        save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path)
            
            iteration += 1
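The learning-rate schedule applied in the loop above keeps the rate flat at A_ + C_ until decay_start, then decays it exponentially towards C_ with time constant B_, never letting it fall below min_learning_rate. Those constants are injected at runtime from run_every_epoch.py; the default values in the sketch below are placeholders for illustration only.

from math import e

def decayed_learning_rate(iteration, A_=1e-3, B_=30000, C_=0.0,
                          decay_start=20000, min_learning_rate=1e-5):
    """Sketch of the schedule used in the training loop (defaults are illustrative)."""
    if iteration < decay_start:
        lr = A_ + C_                                              # flat before decay_start
    else:
        lr = A_ * (e ** (-(iteration - decay_start) / B_)) + C_   # exponential decay
    return max(min_learning_rate, lr)                             # clamp at the floor

print(decayed_learning_rate(0))       # 0.001
print(decayed_learning_rate(100000))  # ~6.9e-05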
Example #4
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)

            optimizer.step()
            
            learning_rate = float(optimizer.param_groups[0]['lr'])  # read back the current LR

            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Gate_loss {:.4f}] [Grad Norm {:.4f}] "
                      "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format(
                    iteration, reduced_loss, average_loss, reduced_gate_loss, grad_norm, duration, (duration/(hparams.batch_size*n_gpus)), learning_rate))
                if iteration % 20 == 0:
                    diagonality, avg_prob = alignment_metric(x, y_pred)
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, diagonality=diagonality, avg_prob=avg_prob)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing)
                start_time = time.time()
            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")

            if not is_overflow and ((iteration % (hparams.iters_per_checkpoint/1) == 0) or (os.path.exists(save_file_check_path))):
                # save model checkpoint like normal
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path)
Example #5
def validate(hparams, model, model_d, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1, p_emotionnet_embed=0.0):
    """Handles all the validation scoring and printing"""
    model.eval()
    model_d.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=num_workers_,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        if teacher_force == 1:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 1.0
            p_emotionnet_embed = 1.0
        elif teacher_force == 2:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 0.0
            p_emotionnet_embed = 0.0
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        loss_terms_arr = []
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0): # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            with torch.random.fork_rng(devices=[0,]):
                torch.random.manual_seed(0)# use same seed during validation so results are more consistent and comparable.
                y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing, p_emotionnet_embed=p_emotionnet_embed)
            
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            criterion_dict = { "amp": None,
                            "n_gpus": n_gpus,
                             "model": model,
                           "model_d": model_d,
                           "hparams": hparams,
                         "optimizer": None,
                       "optimizer_d": None,
                  "grad_clip_thresh": 0.0,   }
            loss, gate_loss, loss_terms, reduced_val_loss, reduced_gate_loss, grad_norm, is_overflow = criterion(
                 y_pred, y, criterion_dict, 0)
            
            loss_terms_arr.append(loss_terms)
            val_loss += reduced_val_loss
            # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
        # end torch.no_grad()
    model.train()
    model_d.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f}  Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        if iteration != 0:
            if teacher_force == 1:
                logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            elif teacher_force == 2:
                logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            else:
                loss_terms = average_loss_terms(loss_terms_arr)
                logger.log_validation(val_loss, model, y, y_pred, iteration, loss_terms, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
    return val_loss
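Example #5 seeds the generator inside torch.random.fork_rng so every validation pass draws the same random numbers (making runs comparable) while leaving the training RNG state untouched. A self-contained illustration of that trick follows; it forks only the CPU generator, whereas the code above also forks CUDA device 0.

import torch

outer_state = torch.random.get_rng_state()
with torch.random.fork_rng(devices=[]):   # devices=[] -> fork the CPU generator only
    torch.random.manual_seed(0)
    a = torch.rand(3)
with torch.random.fork_rng(devices=[]):
    torch.random.manual_seed(0)
    b = torch.rand(3)

assert torch.equal(a, b)                                        # same seed, same draws
assert torch.equal(outer_state, torch.random.get_rng_state())   # outer RNG state restored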