Example #1
 def __init__(self, device='cpu', jit=False):
     """ Required """
     self.device = device
     self.jit = jit
     self.hparams = self.create_hparams()
     self.model = load_model(self.hparams).to(device=device)
     self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.hparams.learning_rate,
                              weight_decay=self.hparams.weight_decay)
     self.criterion = Tacotron2Loss().to(device=device)
     train_loader, valset, collate_fn = prepare_dataloaders(self.hparams)
     self.example_input, self.target = self.model.parse_batch(list(train_loader)[0], device=self.device)
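
Example #1 is the constructor of a benchmark-style wrapper that caches one parsed batch for repeated timing runs. A minimal sketch of how such a wrapper might be driven follows; the class name Benchmark and the device selection are assumptions, not part of the original code, and create_hparams, load_model, Tacotron2Loss and prepare_dataloaders are taken to behave as above.

# Hypothetical driver for a wrapper like Example #1 (Benchmark name assumed).
import torch

bench = Benchmark(device='cuda' if torch.cuda.is_available() else 'cpu')
bench.model.train()
bench.optimizer.zero_grad()
y_pred = bench.model(bench.example_input)      # forward pass on the cached batch
loss = bench.criterion(y_pred, bench.target)   # Tacotron2Loss on the cached target
loss.backward()
bench.optimizer.step()
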
Example #2
def load_Tacotron2(hparams, device=torch.device('cuda')):
    model = Tacotron2(hparams).to(device)
    if hparams.fp16_run:
        model = batchnorm_to_float(model.half())
        model = lstmcell_to_float(model)
        model.decoder.attention_layer.score_mask_value = float(
            finfo('float16').min)

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    return model, Tacotron2Loss()
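
A short usage sketch for the loader above; it assumes the standard create_hparams() from the surrounding Tacotron 2 code and runs with fp16 and distributed execution disabled.

# Sketch: build model and loss together (assumes create_hparams() exists and
# that fp16_run / distributed_run are plain boolean hparams, as used above).
import torch

hparams = create_hparams()
hparams.fp16_run = False
hparams.distributed_run = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, criterion = load_Tacotron2(hparams, device=device)
model.eval()
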
Example #3
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    GPU_NUM = 0  # set the desired GPU index
    device = torch.device(
        f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(device)  # change allocation of current GPU
    print('Current cuda device ', torch.cuda.current_device())  # check

    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn, train_loader2, train_loader3 = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))

        # pick the per-epoch loader without clobbering the original train_loader
        if epoch % 3 == 0:
            epoch_loader = train_loader3
            print('3sentence synth')
        elif epoch % 3 == 1:
            epoch_loader = train_loader2
            print('2sentence synth')
        else:
            epoch_loader = train_loader
            print('1sentence synth')

        for i, batch in enumerate(epoch_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss, d_loss = criterion(y_pred, y)

            #for plot encoder attention
            # if iteration % 1000 == 0:
            #     _, mel, _, _, attns, attns_dec = y_pred
            #     import matplotlib.pylab as plt
            #     plt.figure()
            #     plt.imshow(mel[0].T.cpu().detach().numpy())
            #     alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Encoder_alignment",
            #                                   "mel_{}".format(iteration))
            #     plt.savefig(alignment_path)
            #     for j in range(3):
            #         for i in range(4):
            #             plt.imshow(attns[j][i*hparams.batch_size].T.cpu().detach().numpy())
            #             alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Encoder_alignment","alignment_{}_{}_{}".format(iteration, j, i))
            #             plt.savefig(alignment_path)
            # plt.imshow(attns_dec[2][i*hparams.batch_size].T.cpu().detach().numpy())
            # alignment_path = os.path.join("/media/qw/data/Experiment/Encoder_selfAtt/outdir/Decoder_alignment","alignment_{}_{}".format(iteration, i))
            # plt.savefig(alignment_path)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration, d_loss.item())

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
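
The checkpoint-resume arithmetic used above (and in most of the later examples) converts the restored iteration counter into an epoch offset. A tiny self-contained illustration with hypothetical numbers:

# Illustration of the resume arithmetic (numbers are hypothetical).
iteration = 12345            # value restored from the checkpoint
steps_per_epoch = 500        # stand-in for len(train_loader)
iteration += 1               # training resumes with the next step
epoch_offset = max(0, iteration // steps_per_epoch)
print(epoch_offset)          # 24 -> the epoch loop starts at epoch 24
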
Example #4
def train(output_directory, log_directory, checkpoint_path, warm_start,
          warm_start_force, n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    # setup distributed
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    # reproducibility setup
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # initialize blank model
    model = load_model(hparams)
    model.eval()
    learning_rate = hparams.learning_rate

    # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[
            f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape)
            for i, x in enumerate(list(model.named_parameters()))
        ],
              sep="\n")

    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(
                    layer.startswith(module)
                    for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")

    # define optimizer (any params without requires_grad are ignored)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    # Load checkpoint if one exists
    best_validation_loss = 0.8  # threshold for saving "best_model"; load_checkpoint will update it to the last best value.
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None
    if checkpoint_path is not None:
        if warm_start:
            model, iteration, saved_lookup = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(
                checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')

    # define datasets/dataloaders
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(
        hparams, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids

    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=0.1**(1 / 5),
                                      patience=10)

    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
                            hparams.batch_size, n_gpus, collate_fn, logger,
                            hparams.distributed_run, rank)
        raise Exception("Finished Validation")

    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs),
                      initial=epoch_offset,
                      total=hparams.epochs,
                      desc="Epoch:",
                      position=1,
                      unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))

        if hparams.distributed_run:  # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader),
                             desc="Iter:  ",
                             smoothing=0,
                             total=len(train_loader),
                             position=0,
                             unit="iter"):
            # run external code every iter, allows the run to be adjusted without restarts
            if (i == 0 or iteration % param_interval == 0):
                try:
                    with open("run_every_epoch.py") as f:
                        internal_text = str(f.read())
                        if len(internal_text) > 0:
                            #code = compile(internal_text, "run_every_epoch.py", 'exec')
                            ldict = {'iteration': iteration}
                            exec(internal_text, globals(), ldict)
                        else:
                            print(
                                "No Custom code found, continuing without changes."
                            )
                except Exception as ex:
                    print(f"Custom code FAILED to run!\n{ex}")
                globals().update(ldict)
                locals().update(ldict)
                if show_live_params:
                    print(internal_text)
            if not iteration % 50:  # re-read the actual learning rate every 50 iters (the learning_rate variable can drift out of sync with the real LR)
                learning_rate = optimizer.param_groups[0]['lr']
            # Learning Rate Schedule
            if custom_lr:
                old_lr = learning_rate
                if iteration < warmup_start:
                    learning_rate = warmup_start_lr
                elif iteration < warmup_end:
                    learning_rate = (iteration - warmup_start) * (
                        (A_ + C_) - warmup_start_lr
                    ) / (
                        warmup_end - warmup_start
                    ) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                else:
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_ *
                                         (e**(-iteration_adjusted / B_))) + C_
                assert learning_rate > -1e-8, "Negative Learning Rate."
                if old_lr != learning_rate:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            # /run external code every iter, allows the run to be adjusted without restarts/

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss, len_loss, loss_z, loss_w, loss_s, loss_att = criterion(
                y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_len_loss = reduce_tensor(len_loss.data, n_gpus).item()
                reduced_loss_z = reduce_tensor(loss_z.data, n_gpus).item()
                reduced_loss_w = reduce_tensor(loss_w.data, n_gpus).item()
                reduced_loss_s = reduce_tensor(loss_s.data, n_gpus).item()
                reduced_loss_att = reduce_tensor(
                    loss_att.data, n_gpus).item() if (loss_att
                                                      is not None) else 0
            else:
                reduced_loss = loss.item()
                reduced_len_loss = len_loss.item()
                reduced_loss_z = loss_z.item()
                reduced_loss_w = loss_w.item()
                reduced_loss_s = loss_s.item()
                reduced_loss_att = loss_att.item() if (loss_att
                                                       is not None) else 0

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                loss_scale = amp._amp_state.loss_scalers[
                    0]._loss_scale if hparams.fp16_run else 0  # get current Loss Scale of first optimizer
                tqdm.write(
                    "{} [Train_loss:{:.4f} Avg:{:.4f} Len:{:.4f} z:{:.4f} w:{:.4f} s:{:.4f} att:{:.4f}] [Grad Norm {:.4f}] "
                    "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR] [{} LS]".format(
                        iteration, reduced_loss, average_loss,
                        reduced_len_loss, reduced_loss_z, reduced_loss_w,
                        reduced_loss_s, reduced_loss_att, grad_norm, duration,
                        (duration / (hparams.batch_size * n_gpus)),
                        learning_rate, round(loss_scale)))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)
                start_time = time.time()

            #from time import sleep
            #sleep(2.5)

            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")

            if not is_overflow and ((iteration %
                                     (hparams.iters_per_checkpoint / 1) == 0)
                                    or (os.path.exists(save_file_check_path))):
                # save model checkpoint like normal
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    hparams, best_validation_loss,
                                    average_loss, speaker_lookup,
                                    checkpoint_path)

            if not is_overflow and (
                (iteration % int(validation_interval) == 0) or
                (os.path.exists(save_file_check_path)) or
                (iteration < 1000 and (iteration % 250 == 0))):
                if rank == 0 and os.path.exists(save_file_check_path):
                    os.remove(save_file_check_path)
                # perform validation and save "best_model" depending on validation loss
                val_loss = validate(model, criterion, valset, iteration,
                                    hparams.val_batch_size, n_gpus, collate_fn,
                                    logger, hparams.distributed_run,
                                    rank)  #validate (0.8 forcing)
                if use_scheduler:
                    scheduler.step(val_loss)
                if (val_loss < best_validation_loss):
                    best_validation_loss = val_loss
                    if rank == 0:
                        checkpoint_path = os.path.join(output_directory,
                                                       "best_model")
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, hparams,
                                        best_validation_loss, average_loss,
                                        speaker_lookup, checkpoint_path)

            iteration += 1
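
Example #4 pulls warmup_start, warmup_end, warmup_start_lr, A_, B_, C_ and decay_start in from run_every_epoch.py, so the schedule itself is easy to miss: a linear warmup to A_ + C_, a plateau, then exponential decay toward C_. A standalone sketch of the same curve follows; all parameter defaults here are hypothetical.

import math

def custom_lr(iteration, warmup_start=0, warmup_end=1000, warmup_start_lr=0.5e-5,
              decay_start=15000, A_=5e-4, B_=8000, C_=0.0):
    """Mirror of the piecewise schedule in Example #4 (hypothetical defaults)."""
    if iteration < warmup_start:
        return warmup_start_lr
    if iteration < warmup_end:
        # linear ramp from warmup_start_lr up to A_ + C_
        return (iteration - warmup_start) * ((A_ + C_) - warmup_start_lr) \
               / (warmup_end - warmup_start) + warmup_start_lr
    if iteration < decay_start:
        return A_ + C_
    # exponential decay toward C_
    return A_ * math.exp(-(iteration - decay_start) / B_) + C_
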
Example #5
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    print("Loading models...")
    model = load_model(hparams)

    print("Initializing optimizer...")
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    if hparams.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

    criterion = Tacotron2Loss()

    print("Initializing logger...")
    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    print("Initializing dataloader...")
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    print("Loading checkpoints...")
    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate

            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    if hparams.distributed_run or torch.cuda.device_count() > 1:
        batch_parser = model.module.parse_batch
    else:
        batch_parser = model.parse_batch
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = batch_parser(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            reduced_loss = reduce_tensor(loss.data, n_gpus)[0] \
                if hparams.distributed_run else loss.data[0]
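            # NOTE: loss.data[0] above and clip_grad_norm (no trailing underscore,
            # below) are pre-0.4 PyTorch idioms; current PyTorch would use
            # loss.item() and torch.nn.utils.clip_grad_norm_ instead.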

            if hparams.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if hparams.fp16_run else False

            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))

                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not overflow and (iteration % hparams.iters_per_checkpoint
                                 == 0):
                reduced_val_loss = validate(model, criterion, valset,
                                            iteration, hparams.batch_size,
                                            n_gpus, collate_fn, logger,
                                            hparams.distributed_run, rank)

                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, reduced_val_loss))

                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
                    logger.log_validation(reduced_val_loss, model, x, y,
                                          y_pred, iteration, hparams)

            iteration += 1
Example #6
    from model import Tacotron2
    from loss_function import Tacotron2Loss
    hparams = create_hparams()
    text_loader = TextMelLoader(hparams.training_lst, hparams)
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    text, mel = text_loader[0]  # mel.shape (80 * frame_num)
    plt.matshow(mel, origin='lower')
    plt.colorbar()
    plt.savefig('mel_demo.png')

    train_loader = torch.utils.data.DataLoader(text_loader,
                                               num_workers=1,
                                               shuffle=False,
                                               batch_size=3,
                                               pin_memory=False,
                                               drop_last=True,
                                               collate_fn=collate_fn)
    print(len(train_loader))
    tacotron = Tacotron2(hparams)
    criterion = Tacotron2Loss()
    for batch in train_loader:
        text_padded, text_alignment_padded, input_lengths, mel_padded, alignments, alignments_weights_padded,\
            output_lengths = batch
        max_len = torch.max(input_lengths.data).item()
        x = (text_padded, input_lengths, mel_padded, max_len, output_lengths)
        y = (mel_padded, alignments, alignments_weights_padded,
             text_alignment_padded)
        y_pred = tacotron(x)
        print(criterion(y_pred, y))
        break
Example #7
def train(experiment,
          output_directory,
          log_directory,
          checkpoint_path,
          warm_start,
          n_gpus,
          rank,
          group_name,
          hparams,
          max_steps=150000):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hparams object containing configuration.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # create model - does not load weights yet
    model = load_model(hparams)

    global_mean_path = os.path.join(experiment.paths["acoustic_features"],
                                    "global_mean.npy")
    train_loader, trainset, valset, collate_fn = prepare_dataloaders(
        experiment, hparams, model.requires_durations)
    if hparams.drop_frame_rate > 0.:
        global_mean = calculate_global_mean(train_loader, global_mean_path)
        hparams.global_mean = global_mean

    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    if hparams.model_type == "forwardtacotron":
        print("Using ForwardTacotronLoss")
        criterion = ForwardTacotronLoss()
    else:
        print("Using TacotronLoss")
        criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank, hparams.model_type)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    #for epoch in range(epoch_offset, hparams.epochs):
    epoch = epoch_offset
    while iteration < max_steps:
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            mel_lens = x[4]
            if model.requires_durations:
                dur = x[7]
            else:
                dur = None
            y_pred = model(x)

            loss, loginfo = criterion(y_pred, y, mel_lens, dur)
            if model.mi is not None:
                # transpose to [b, T, dim]
                decoder_outputs = y_pred[0].transpose(2, 1)
                ctc_text, ctc_text_lengths, aco_lengths = x[-2], x[-1], x[4]
                taco_loss = loss
                mi_loss = model.mi(decoder_outputs, ctc_text, aco_lengths,
                                   ctc_text_lengths, dur)
                if hparams.use_gaf:
                    if i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0:
                        safe_loss = 0. * sum(
                            [x.sum() for x in model.parameters()])
                        gaf = gradient_adaptive_factor.calc_grad_adapt_factor(
                            taco_loss + safe_loss, mi_loss + safe_loss,
                            model.parameters(), optimizer)
                        gaf = min(gaf, hparams.max_gaf)
                else:
                    gaf = 1.0
                loss = loss + gaf * mi_loss
            else:
                taco_loss = loss
                mi_loss = torch.tensor([-1.0])
                gaf = -1.0
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                taco_loss = reduce_tensor(taco_loss.data, n_gpus).item()
                mi_loss = reduce_tensor(mi_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                taco_loss = taco_loss.item()
                mi_loss = mi_loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.4f} mi_loss {:.4f} Grad Norm {:.4f} "
                      "gaf {:.4f} {:.2f}s/it".format(iteration, taco_loss,
                                                     mi_loss, grad_norm, gaf,
                                                     duration))
                logger.log_training(loginfo, reduced_loss, taco_loss, mi_loss,
                                    grad_norm, gaf, learning_rate, duration,
                                    iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    best_checkpoint_path = os.path.join(
                        output_directory, "checkpoint_best".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    best_checkpoint_path)

            iteration += 1
        epoch += 1

    # generate GTA features and leave
    train_loader_tmp = DataLoader(trainset,
                                  num_workers=0,
                                  shuffle=False,
                                  batch_size=hparams.batch_size,
                                  pin_memory=False,
                                  drop_last=False,
                                  collate_fn=collate_fn)
    val_loader = DataLoader(valset,
                            num_workers=0,
                            shuffle=False,
                            batch_size=hparams.batch_size,
                            pin_memory=False,
                            collate_fn=collate_fn,
                            drop_last=False)
    create_gta_features(experiment, model, train_loader_tmp, val_loader)
Example #8
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    #train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams)
    train_loader, train_sampler, val_loader, val_sampler = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            dplist = batch['support']['datapath']
            logstr = '||STEP {}, rank {} ||'.format(i, rank)
            logstr += 'SUPPORTS: ' + '\n'.join(dplist) + '\n'
            dplist = batch['query']['datapath']
            logstr += 'QUERIES: ' + '\n'.join(dplist) + '\n'
            with open('logs/rk{}.logs'.format(rank), 'at') as f:
                f.writelines(logstr + '\n\n')

            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                #                validate(model, criterion, valset, iteration,
                #                        hparams.batch_size, n_gpus, collate_fn, logger,
                #                        hparams.distributed_run, rank)
                validate(model, val_sampler, val_loader, criterion, iteration,
                         n_gpus, logger, hparams.distributed_run, rank)

                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
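
Example #8 (and Example #10 below) anneal the learning rate by halving it every hparams.learning_rate_anneal iterations down to a floor. The same schedule in closed form, as a small sketch; the parameter values are hypothetical stand-ins for the hparams used above.

def annealed_lr(iteration, base_lr=1e-3, lr_min=1e-5, anneal_every=50000):
    """Closed form of the step-halving anneal used in Examples #8 and #10."""
    halvings = iteration // anneal_every          # how many halvings so far
    return max(lr_min, base_lr * (0.5 ** halvings))

# e.g. annealed_lr(0) == 1e-3, annealed_lr(50000) == 5e-4, annealed_lr(100000) == 2.5e-4
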
Example #9
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams, ax_max_run_timer, parameters):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    model = load_model(hparams)
    model.train()
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2',
            min_loss_scale=1.0)  #, loss_scale=256.0)

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    log_dir_counter = 0
    starting_log_directory = log_directory
    while os.path.exists(log_directory):
        log_dir_counter += 1
        log_directory = starting_log_directory + "_" + str(log_dir_counter)
    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    # Load checkpoint if one exists
    best_validation_loss = 0.6  # threshold for saving "best_model"; load_checkpoint will update it to the last best value.
    val_avg_prob = 0.0
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))
    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=0.562341325,
                                      patience=15)

    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
                            hparams.batch_size, n_gpus, collate_fn, logger,
                            hparams.distributed_run, rank)
        raise Exception("Finished Validation")

    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    decay_start = parameters["decay_start"]
    A_ = parameters["lr_A"]
    B_ = parameters["lr_B"]
    C_ = 0
    min_learning_rate = 1e-6
    epochs_between_updates = parameters["epochs_between_updates"]
    p_teacher_forcing = 1.00
    teacher_force_till = 30
    rolling_loss = StreamingMovingAverage(int(len(train_loader)))
    ax_start_time = time.time()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs),
                      initial=epoch_offset,
                      total=hparams.epochs,
                      desc="Epoch:",
                      position=1,
                      unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))

        # run external code every epoch, allows the run to be adjusted without restarts
        try:
            with open("run_every_epoch.py") as f:
                internal_text = str(f.read())
                if len(internal_text) > 0:
                    print(internal_text)
                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                    ldict = {}
                    exec(internal_text, globals(), ldict)
                    C_ = ldict['C_']
                    min_learning_rate = ldict['min_learning_rate']
                    p_teacher_forcing = ldict['p_teacher_forcing']
                    teacher_force_till = ldict['teacher_force_till']
                    print(
                        "Custom code excecuted\nPlease remove code if it was intended to be ran once."
                    )
                else:
                    print("No Custom code found, continuing without changes.")
        except Exception as ex:
            print(f"Custom code FAILED to run!\n{ex}")
        print("decay_start is ", decay_start)
        print("A_ is ", A_)
        print("B_ is ", B_)
        print("C_ is ", C_)
        print("min_learning_rate is ", min_learning_rate)
        print("epochs_between_updates is ", epochs_between_updates)
        print("p_teacher_forcing is ", p_teacher_forcing)
        print("teacher_force_till is ", teacher_force_till)
        if epoch % epochs_between_updates == 0 or epoch_offset == epoch:
            #if None:
            tqdm.write("Old learning rate [{:.6f}]".format(learning_rate))
            if iteration < decay_start:
                learning_rate = A_
            else:
                iteration_adjusted = iteration - decay_start
                learning_rate = (A_ * (e**(-iteration_adjusted / B_))) + C_
            learning_rate = max(min_learning_rate,
                                learning_rate)  # clamp to the minimum learning rate
            #if epoch_offset == epoch: # hold learning rate low during first pass to let optimizer rebuild
            #    learning_rate = 1e-5
            tqdm.write(
                "Changing Learning Rate to [{:.6f}]".format(learning_rate))
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        if hparams.distributed_run:  # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader),
                             desc="Iter:  ",
                             smoothing=0,
                             total=len(train_loader),
                             position=0,
                             unit="iter"):
            model.zero_grad()
            x, y = model.parse_batch(batch)  # move batch to GPU (async)
            y_pred = model(x,
                           teacher_force_till=teacher_force_till,
                           p_teacher_forcing=p_teacher_forcing)

            loss = criterion(y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            # read back the actual learning rate from the first param group
            learning_rate = float(optimizer.param_groups[0]['lr'])

            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                tqdm.write(
                    "{} [Train_loss {:.4f} Avg {:.4f}] [Grad Norm {:.4f}] "
                    "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format(
                        iteration, reduced_loss, average_loss, grad_norm,
                        duration, (duration / (hparams.batch_size * n_gpus)),
                        learning_rate))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)
                start_time = time.time()
            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")

                if rank == 0:
                    if (os.path.exists(save_file_check_path)):
                        os.remove(save_file_check_path)

            if (time.time() - ax_start_time) > ax_max_run_timer:
                break
            iteration += 1
            # end of iteration loop
        # end of epoch loop
        # perform validation and save "ax_model"
        val_loss, val_avg_prob = validate(model, criterion, valset, iteration,
                                          hparams.batch_size, n_gpus,
                                          collate_fn, logger,
                                          hparams.distributed_run, rank)
        if use_scheduler:
            scheduler.step(val_loss)
        if rank == 0:
            checkpoint_path = os.path.join(output_directory, "ax_model")
            save_checkpoint(model, optimizer, learning_rate, iteration,
                            best_validation_loss, checkpoint_path)

        # let's pretend this code is actually able to finish
        return val_avg_prob
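
Examples #4 and #9 use a StreamingMovingAverage helper for the rolling loss that is not shown on this page. A minimal stand-in with the same process() interface might look like the following; the real implementation may differ.

from collections import deque

class StreamingMovingAverage:
    """Minimal stand-in for the rolling-loss helper used above (assumed interface)."""

    def __init__(self, window_size):
        self.values = deque(maxlen=window_size)

    def process(self, value):
        # append the newest loss and return the mean over the current window
        self.values.append(value)
        return sum(self.values) / len(self.values)
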
Example #10
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams, args):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): hyperparameters object, built from comma separated "name=value" pairs.
    """
    tstart = time.time()
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            if checkpoint_path.startswith('pid'):
                checkpoint = os.path.basename(checkpoint_path)
                checkpoint_path = download_checkpoints(args.pid, checkpoint,
                                                       output_directory)
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    unsaved_data = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        if args.max_duration and time.time() - tstart > args.max_duration:
            if unsaved_data:
                checkpoint_path = os.path.join(
                    output_directory, "checkpoint_{}".format(iteration))
                save_checkpoint(model, optimizer, learning_rate, iteration,
                                checkpoint_path)
                unsaved_data = False
                if args.pid:
                    try:
                        log_files = glob.glob(
                            os.path.join(output_directory, log_directory, '*'))
                        upload_to_drive([checkpoint_path] + log_files,
                                        args.pid)
                    except Exception as e:
                        print('error while uploading to drive\n%s' % str(e))
            break
        print("Epoch: {}".format(epoch))
        unsaved_data = True
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
                    unsaved_data = False
                    if args.pid:
                        try:
                            log_files = glob.glob(
                                os.path.join(output_directory, log_directory,
                                             '*'))
                            upload_to_drive([checkpoint_path] + log_files,
                                            args.pid)
                        except Exception as e:
                            print('error while uploading to drive\n%s' %
                                  str(e))

            iteration += 1
Example #11
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hyper_params, train_loader, valset, collate_fn):
    """Training and validation method with logging results to tensorboard and stdout

    :param output_directory (string): directory to save checkpoints
    :param log_directory (string): directory to save tensorboard logs
    :param checkpoint_path (string): checkpoint path
    :param n_gpus (int): number of gpus
    :param rank (int): rank of current gpu
    :param hyper_params (object dictionary): dictionary with all hyper parameters
    """

    # Check whether is a distributed running
    if hyper_params['distributed_run']:
        init_distributed(hyper_params, n_gpus, rank, group_name)

    # set the same fixed seed to reproduce the same results every time we train
    torch.manual_seed(hyper_params['seed'])
    torch.cuda.manual_seed(hyper_params['seed'])

    model = load_model(hyper_params)
    learning_rate = hyper_params['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hyper_params['weight_decay'])

    if hyper_params['fp16_run']:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hyper_params['dynamic_loss_scaling'])

    # Define the criterion of the loss function. The objective.
    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)
    # logger = ''

    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            # Warm start: load only the saved model weights instead of starting from scratch
            model = warm_start_model(checkpoint_path, model)
        else:
            # CHECK THIS OUT!!!
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hyper_params['use_saved_learning_rate']:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    # Set this to make all modules and regularization aware this is the training stage:
    model.train()

    # MAIN LOOP
    for epoch in range(epoch_offset, hyper_params['epochs']):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            # CHECK THIS OUT!!!
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            input_data, output_target = model.parse_batch(batch)
            output_predicted = model(input_data)

            loss = criterion(output_predicted, output_target)

            if hyper_params['distributed_run']:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hyper_params['fp16_run']:
                optimizer.backward(
                    loss)  # transformed optimizer into fp16 type
                grad_norm = optimizer.clip_fp32_grads(
                    hyper_params['grad_clip_thresh'])
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hyper_params['grad_clip_thresh'])

            # Performs a single optimization step (parameter update)
            optimizer.step()
            # This boolean controls overflow when running in fp16 optimizer
            overflow = optimizer.overflow if hyper_params['fp16_run'] else False

            # Skip logging when the fp16 optimizer overflowed or the reduced loss is NaN.
            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grand Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                # logs training information of the current iteration
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            # Every iters_per_checkpoint steps there is a validation of the model and its updated parameters
            if not overflow and (iteration %
                                 hyper_params['iters_per_checkpoint'] == 0):
                validate(model, criterion, valset, iteration,
                         hyper_params['batch_size'], n_gpus, collate_fn,
                         logger, hyper_params['distributed_run'], rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
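
The save_checkpoint and load_checkpoint helpers called above are also not shown. A rough sketch with matching signatures, assuming the checkpoint is a plain dict written with torch.save (the exact key names are an assumption), could be:

import torch

def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    # Persist everything needed to resume training from this exact point.
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                'iteration': iteration}, filepath)

def load_checkpoint(checkpoint_path, model, optimizer):
    # Restore model and optimizer state; return the saved LR and iteration.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['learning_rate'], checkpoint['iteration']
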
Beispiel #12
0
    model.eval() # test if this is needed anymore

    learning_rate = hparams.learning_rate
    if hparams.Apex_optimizer:  # the Apex optimizer is slightly faster with slightly more VRAM usage in my testing; helps in both fp32 and fp16.
        optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)
    
    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    
    criterion = Tacotron2Loss(hparams)
    
    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)
    
    # Load checkpoint if one exists
    best_validation_loss = 0.8  # threshold for saving "best_model"; load_checkpoint will update it to the last best value.
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model, iteration = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration = warm_start_force_model(
                checkpoint_path, model)
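
This fragment initializes best_validation_loss but is cut off before it gets used. A hypothetical helper in the same spirit, reusing the save_checkpoint API from these examples and assuming validate returns an averaged validation loss, might look like:

import os

def maybe_save_best_model(val_loss, best_validation_loss, model, optimizer,
                          learning_rate, iteration, output_directory, rank):
    # Hypothetical helper: overwrite "best_model" whenever validation improves,
    # mirroring how best_validation_loss is tracked above.
    if rank == 0 and val_loss < best_validation_loss:
        best_validation_loss = val_loss
        save_checkpoint(model, optimizer, learning_rate, iteration,
                        os.path.join(output_directory, "best_model"))
    return best_validation_loss
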
Beispiel #13
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)
    # else:
    #     torch.cuda.set_device('cuda:1')

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    waveglow_path = 'waveglow_256channels_universal_v5.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().float()
    # waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()

    # ---------------------- MELLOTRON CODE BLOCK --------------------------
    arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    audio_paths = 'data/examples_filelist.txt'
    dataloader = TextMelLoader(audio_paths, hparams)
    datacollate = TextMelCollate(hparams.n_frames_per_step)
    file_idx = 0
    audio_path, text, sid = dataloader.audiopaths_and_text[file_idx]

    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)

    def load_mel(path):
        audio, sampling_rate = librosa.core.load(path,
                                                 sr=hparams.sampling_rate)
        audio = torch.from_numpy(audio)
        if sampling_rate != hparams.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, stft.sampling_rate))
        audio_norm = audio.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = melspec.cuda()
        return melspec

    # get audio path, encoded text, pitch contour and mel for gst
    text_encoded = torch.LongTensor(
        text_to_sequence(text, hparams.text_cleaners,
                         arpabet_dict))[None, :].cuda()
    mel = load_mel(audio_path)
    print(audio_path, text)
    inference_batch = datacollate([dataloader[file_idx]])

    # ---------------------- MELLOTRON CODE BLOCK (END) --------------------------

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

                    # if not is_overflow and (iteration % 2 == 0):
                    log_audio(model, iteration, logger, waveglow,
                              inference_batch, text_encoded, mel)

            iteration += 1
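
log_audio is not included in this snippet. A rough sketch matching the call signature above, assuming the logger subclasses the TensorBoard SummaryWriter (so add_audio is available), could vocode the reference mel with WaveGlow and log the result; a fuller version would also run model.inference on text_encoded:

import torch

def log_audio(model, iteration, logger, waveglow, inference_batch,
              text_encoded, mel, sampling_rate=22050):
    # Hypothetical sketch: decode a mel spectrogram to a waveform with WaveGlow
    # and write it to TensorBoard so training progress can be listened to.
    model.eval()
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.666)
    logger.add_audio('training/audio', audio[0].float().cpu(), iteration,
                     sample_rate=sampling_rate)
    model.train()
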
Beispiel #14
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    if hparams.reverse:
        criterion = TacotronAsrLoss(hparams)
    else:
        criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0

    def load_ckpt(checkpoint_path, model, optimizer):
        model, optimizer, _learning_rate, iteration = load_checkpoint(
            checkpoint_path, model, optimizer)
        if hparams.use_saved_learning_rate:
            learning_rate = _learning_rate
        else:
            learning_rate = hparams.learning_rate
        iteration += 1  # next iteration is iteration + 1
        epoch_offset = max(0, int(iteration / len(train_loader)))
        return model, optimizer, learning_rate, iteration, epoch_offset

    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, learning_rate, iteration, epoch_offset = load_ckpt(checkpoint_path, model, optimizer)
    else:
        ckpt_paths = glob.glob(os.path.join(output_directory, 'checkpoint_*'))
        if len(ckpt_paths) > 0:
            last_ckpt_path = sorted(ckpt_paths, key=lambda x: int(x.split("_")[-1]))[-1]
            model, optimizer, learning_rate, iteration, epoch_offset = load_ckpt(last_ckpt_path, model, optimizer)

    # print(">>>>", model.wavenet.first_conv.weight.data[0])
    model.train()
    if hparams.save_mels:
        model.eval()
    print(model)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
    print('Trainable Parameters: %.3fM' % parameters)

    # if hparams.full_song:
    #     splits_fn = os.path.join(output_directory, 'splits_info.json')
    #     if not os.path.exists(splits_fn):
    #         splits_info = {}
    #         for idx in np.load(hparams.ds_name + '.npz', allow_pickle=True).keys():
    #             splits_info[idx] = [[0, 0]]
    #         with open(splits_fn, 'w') as f:
    #             json.dump(splits_info, f)

    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    all_alignments = {}
    all_mels = {}
    all_linears = {}
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        t = tqdm(enumerate(train_loader))
        all_reduced_loss = 0
        for i, batch in t:
            current_lr = hparams.learning_rate
            if hparams.lr_schedule is not None:
                lr_schedule_f = getattr(lrschedule, hparams.lr_schedule)
                current_lr = lr_schedule_f(
                    hparams.learning_rate, iteration, **hparams.lr_schedule_kwargs)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr

            if hparams.test_mode or (not is_overflow and (iteration % hparams.iters_per_checkpoint == 0)):
                if hparams.do_validation:
                    validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, hparams, output_directory)
                if hparams.test_mode:
                    exit()
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path, output_directory)

            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x, iteration)
            losses = criterion(y_pred, y, x)
            loss = sum(losses.values())
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if not hparams.save_mels:
                if hparams.fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)
            is_overflow = math.isnan(grad_norm)

            if not is_overflow:
                optimizer.step()
            else:
                optimizer.zero_grad()
                print(loss, "grad overflow!!")

            input_lengths, output_lengths, uttids, mel_outputs, linear_outputs, alignments = \
                x[1], x[4], x[7], y_pred[1], y_pred[2], y_pred[3]
            if rank == 0:
                # if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                all_reduced_loss += reduced_loss

                t.set_description("iter:{},loss:{:.6f},GN:{:.6f},{:.2f}s/it,lr:{:.6f},"
                                  "details:{},shape:{}".format(
                    iteration, all_reduced_loss / (i + 1), grad_norm, duration, current_lr,
                    "".join(["[{}]:{:.4f}".format(k, v.item()) for k, v in losses.items()]),
                    list(mel_outputs.data.shape)))

                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration, iteration)
            iteration += 1

            # save alignments
            input_lengths = input_lengths.data.cpu().numpy()
            output_lengths = output_lengths.data.cpu().numpy()
            uttids = uttids.data.cpu().numpy()
            if hparams.save_attn:
                alignments = alignments.data.cpu().numpy()
                for uttid, alignment, input_length, output_length \
                        in zip(uttids, alignments, input_lengths, output_lengths):
                    if hparams.reverse:
                        all_alignments[str(uttid)] = alignment[:input_length, :output_length]
                    else:
                        all_alignments[str(uttid)] = alignment[:output_length, :input_length]

            if hparams.save_mels:
                mel_outputs = mel_outputs.data.cpu().numpy()
                linear_outputs = linear_outputs.data.cpu().numpy()
                for uttid, mel_output, linear_output, input_length, output_length \
                        in zip(uttids, mel_outputs, linear_outputs, input_lengths, output_lengths):
                    all_mels[str(uttid)] = mel_output[:, :output_length]
                    all_linears[str(uttid)] = linear_output[:, :output_length]

        if hparams.save_attn:
            np.savez(os.path.join(output_directory, "all_alignments"), **all_alignments)
            exit()
        if hparams.save_mels:
            np.savez(os.path.join(output_directory, "all_mels"), **all_mels)
            np.savez(os.path.join(output_directory, "all_linears"), **all_linears)
            exit()
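
The getattr(lrschedule, hparams.lr_schedule) lookup above expects a module of schedule functions with the signature f(init_lr, global_step, **kwargs). A minimal sketch of two such schedules (names and defaults are assumptions) might be:

def step_learning_rate_decay(init_lr, global_step,
                             anneal_rate=0.98, anneal_interval=30000):
    # Multiply the learning rate by anneal_rate every anneal_interval steps.
    return init_lr * anneal_rate ** (global_step // anneal_interval)

def noam_learning_rate_decay(init_lr, global_step, warmup_steps=4000):
    # Linear warmup followed by inverse-square-root decay ("Noam" schedule).
    step = max(global_step, 1)
    return init_lr * warmup_steps ** 0.5 * min(
        step * warmup_steps ** -1.5, step ** -0.5)
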
Beispiel #15
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    if hparams.drop_frame_rate > 0.:
        global_mean = calculate_global_mean(train_loader,
                                            hparams.global_mean_npy)
        hparams.global_mean = global_mean

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    if hparams.use_guided_attn_loss:
        criterion_attn = GuidedAttentionLoss(
            sigma=hparams.guided_attn_loss_sigma,
            alpha=hparams.guided_attn_loss_lambda,
        )

    logger = prepare_directories_and_logger(hparams, output_directory,
                                            log_directory, rank)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()

            if iteration < 50000:
                learning_rate = 1e-3
            elif iteration >= 50000 and iteration < 100000:
                learning_rate = 5e-4
            elif iteration >= 100000 and iteration < 150000:
                learning_rate = 3e-4
            elif iteration >= 150000 and iteration < 200000:
                learning_rate = 1e-4
            elif iteration >= 200000 and iteration < 250000:
                learning_rate = 5e-5
            elif iteration >= 250000 and iteration < 300000:
                learning_rate = 3e-5
            else:
                learning_rate = 1e-5

            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.use_guided_attn_loss:
                alignments, r_len_pad, ilens, olens = y_pred[-1], x[2], x[
                    1], x[5]
                attn_loss = criterion_attn(alignments, ilens,
                                           (olens + r_len_pad) //
                                           hparams.n_frames_per_step)
                loss = loss + attn_loss
            else:
                # keep attn_loss defined so the reductions and logging below work
                attn_loss = torch.tensor(0.)
            if model.mi is not None:
                # transpose to [b, T, dim]
                decoder_outputs = y_pred[0].transpose(2, 1)
                ctc_text, ctc_text_lengths, aco_lengths = x[-2], x[-1], x[5]
                taco_loss = loss
                mi_loss = model.mi(decoder_outputs, ctc_text, aco_lengths,
                                   ctc_text_lengths)
                if hparams.use_gaf:
                    if i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0:
                        safe_loss = 0. * sum(
                            [x.sum() for x in model.parameters()])
                        gaf = gradient_adaptive_factor.calc_grad_adapt_factor(
                            taco_loss + safe_loss, mi_loss + safe_loss,
                            model.parameters(), optimizer)
                        gaf = min(gaf, hparams.max_gaf)
                else:
                    gaf = 1.0
                loss = loss + gaf * mi_loss
            else:
                taco_loss = loss
                mi_loss = torch.tensor([-1.0])
                gaf = -1.0
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                taco_loss = reduce_tensor(taco_loss.data, n_gpus).item()
                mi_loss = reduce_tensor(mi_loss.data, n_gpus).item()
                attn_loss = reduce_tensor(attn_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                taco_loss = taco_loss.item()
                mi_loss = mi_loss.item()
                attn_loss = attn_loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                logger.log_training(reduced_loss, taco_loss, attn_loss,
                                    mi_loss, grad_norm, gaf, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_validate
                                    == 0):
                print("Train loss {} {:.4f} mi_loss {:.4f} Grad Norm {:.4f} "
                      "gaf {:.4f} {:.2f}s/it".format(iteration, taco_loss,
                                                     mi_loss, grad_norm, gaf,
                                                     duration))
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0 and (iteration % hparams.iters_per_checkpoint
                                  == 0):
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}_{}".format(
                            iteration,
                            output_directory.split('/')[-1].replace(
                                'outdir_', '')))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
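
GuidedAttentionLoss is constructed with sigma and alpha above but not defined in these snippets. A minimal sketch of such a penalty, following Tachibana et al. (2018), which pushes attention weights toward the text/audio diagonal, could look like:

import torch

class GuidedAttentionLoss(torch.nn.Module):
    def __init__(self, sigma=0.4, alpha=1.0):
        super().__init__()
        self.sigma = sigma
        self.alpha = alpha

    def forward(self, att_ws, ilens, olens):
        # att_ws: (B, T_out, T_in) attention matrices; ilens/olens: true lengths.
        losses = []
        for att_w, ilen, olen in zip(att_ws, ilens, olens):
            ilen, olen = int(ilen), int(olen)
            grid_t, grid_n = torch.meshgrid(
                torch.arange(olen, device=att_w.device),
                torch.arange(ilen, device=att_w.device))
            # Soft diagonal mask: ~0 on the text/audio diagonal, ->1 far from it.
            w = 1.0 - torch.exp(-(grid_n / ilen - grid_t / olen) ** 2
                                / (2 * self.sigma ** 2))
            losses.append((att_w[:olen, :ilen] * w).mean())
        return self.alpha * torch.stack(losses).mean()
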
Beispiel #16
0
def main(args):
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = nn.DataParallel(BERT_Tacotron2(hp)).to(device)
    # model = Tacotron2(hp).to(device)
    print("Model Have Been Defined")
    num_param = sum(param.numel() for param in model.parameters())
    print('Number of Tacotron Parameters:', num_param)

    # Get dataset
    dataset = BERTTacotron2Dataset()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=hp.learning_rate,
                                 weight_decay=hp.weight_decay)

    # Criterion
    criterion = Tacotron2Loss()

    # Load checkpoint if exists
    try:
        checkpoint = torch.load(
            os.path.join(hp.checkpoint_path,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step %d---\n" % args.restore_step)

    except Exception:
        print("\n---Start New Training---\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)

    # Init logger
    if not os.path.exists(hp.logger_path):
        os.mkdir(hp.logger_path)

    # Define Some Information
    Time = np.array([])
    Start = time.perf_counter()

    # Training
    model = model.train()

    for epoch in range(hp.epochs):
        # Get training loader
        training_loader = DataLoader(dataset,
                                     batch_size=hp.batch_size**2,
                                     shuffle=True,
                                     collate_fn=collate_fn,
                                     drop_last=True,
                                     num_workers=0)
        total_step = hp.epochs * len(training_loader) * hp.batch_size

        for i, batchs in enumerate(training_loader):
            for j, data_of_batch in enumerate(batchs):
                start_time = time.perf_counter()

                current_step = i * hp.batch_size + j + args.restore_step + \
                    epoch * len(training_loader)*hp.batch_size + 1

                # Init
                optimizer.zero_grad()

                # Get Data
                character = torch.from_numpy(
                    data_of_batch["text"]).long().to(device)
                mel_target = torch.from_numpy(
                    data_of_batch["mel_target"]).float().to(
                        device).contiguous().transpose(1, 2)
                stop_target = torch.from_numpy(
                    data_of_batch["stop_token"]).float().to(device)
                embeddings = data_of_batch["bert_embeddings"].float().to(
                    device)
                input_lengths = torch.from_numpy(
                    data_of_batch["length_text"]).long().to(device)
                output_lengths = torch.from_numpy(
                    data_of_batch["length_mel"]).long().to(device)

                # Forward
                batch = character, input_lengths, mel_target, stop_target, output_lengths, embeddings

                x, y = model.module.parse_batch(batch)
                y_pred = model(x)

                # Cal Loss
                mel_loss, mel_postnet_loss, stop_pred_loss = criterion(
                    y_pred, y)
                total_loss = mel_loss + mel_postnet_loss + stop_pred_loss

                # Logger
                t_l = total_loss.item()
                m_l = mel_loss.item()
                m_p_l = mel_postnet_loss.item()
                s_l = stop_pred_loss.item()

                with open(os.path.join("logger", "total_loss.txt"),
                          "a") as f_total_loss:
                    f_total_loss.write(str(t_l) + "\n")

                with open(os.path.join("logger", "mel_loss.txt"),
                          "a") as f_mel_loss:
                    f_mel_loss.write(str(m_l) + "\n")

                with open(os.path.join("logger", "mel_postnet_loss.txt"),
                          "a") as f_mel_postnet_loss:
                    f_mel_postnet_loss.write(str(m_p_l) + "\n")

                with open(os.path.join("logger", "stop_pred_loss.txt"),
                          "a") as f_s_loss:
                    f_s_loss.write(str(s_l) + "\n")

                # Backward
                total_loss.backward()

                # Clipping gradients to avoid gradient explosion
                nn.utils.clip_grad_norm_(model.parameters(), 1.)

                # Update weights
                optimizer.step()
                adjust_learning_rate(optimizer, current_step)

                # Print
                if current_step % hp.log_step == 0:
                    Now = time.perf_counter()

                    str1 = "Epoch [{}/{}], Step [{}/{}], Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f};".format(
                        epoch + 1, hp.epochs, current_step, total_step,
                        mel_loss.item(), mel_postnet_loss.item())
                    str2 = "Stop Predicted Loss: {:.4f}, Total Loss: {:.4f}.".format(
                        stop_pred_loss.item(), total_loss.item())
                    str3 = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format(
                        (Now - Start),
                        (total_step - current_step) * np.mean(Time))

                    print("\n" + str1)
                    print(str2)
                    print(str3)

                    with open(os.path.join("logger", "logger.txt"),
                              "a") as f_logger:
                        f_logger.write(str1 + "\n")
                        f_logger.write(str2 + "\n")
                        f_logger.write(str3 + "\n")
                        f_logger.write("\n")

                if current_step % hp.save_step == 0:
                    torch.save(
                        {
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict()
                        },
                        os.path.join(hp.checkpoint_path,
                                     'checkpoint_%d.pth.tar' % current_step))
                    print("save model at step %d ..." % current_step)

                end_time = time.perf_counter()
                Time = np.append(Time, end_time - start_time)
                if len(Time) == hp.clear_Time:
                    temp_value = np.mean(Time)
                    Time = np.delete(Time, [i for i in range(len(Time))],
                                     axis=None)
                    Time = np.append(Time, temp_value)
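
adjust_learning_rate(optimizer, current_step) is called after every update above but not shown. A hypothetical step-decay version (the thresholds and factors are assumptions) might be:

def adjust_learning_rate(optimizer, step, init_lr=1e-3):
    # Hypothetical schedule: keep the initial LR early on, then step it down
    # as training progresses, writing the value into every parameter group.
    if step < 100000:
        lr = init_lr
    elif step < 200000:
        lr = init_lr * 0.5
    else:
        lr = init_lr * 0.1
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr
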
Beispiel #17
0
def main(args):
    # Get device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Define model
    model = Tacotron2(hp).to(device)
    model_SpeakerEncoder = SpeakerEncoder.get_model().to(device)
    # model = Tacotron2(hp).to(device)
    print("All Models Have Been Defined")

    # Get dataset
    dataset = Tacotron2DataLoader()

    # Optimizer
    optimizer = torch.optim.Adam(
        model.parameters(), lr=hp.learning_rate, weight_decay=hp.weight_decay)

    # Criterion
    criterion = Tacotron2Loss()

    # Get training loader
    print("Get Training Loader")
    training_loader = DataLoader(dataset, batch_size=hp.batch_size, shuffle=True,
                                 collate_fn=collate_fn, drop_last=True, num_workers=cpu_count())

    # Load checkpoint if exists
    try:
        checkpoint = torch.load(os.path.join(
            hp.checkpoint_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step %d---\n" % args.restore_step)

    except Exception:
        print("\n---Start New Training---\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)

    # Define Some Information
    total_step = hp.epochs * len(training_loader)
    Time = np.array([])
    Start = time.perf_counter()

    # Training
    model = model.train()

    for epoch in range(hp.epochs):
        for i, batch in enumerate(training_loader):
            start_time = time.perf_counter()

            # Count step
            current_step = i + args.restore_step + \
                epoch * len(training_loader) + 1

            # Init
            optimizer.zero_grad()

            # Load Data
            text_padded, input_lengths, mel_padded, gate_padded, output_lengths, mel_for_SE = batch

            # Get Speaker Embedding
            # print(np.shape(mel_for_SE))
            mel_for_SE = torch.from_numpy(mel_for_SE).float().to(device)
            # print(mel_for_SE.size())
            with torch.no_grad():
                SpeakerEmbedding = model_SpeakerEncoder(mel_for_SE)
            # print(SpeakerEmbedding.size())
            # print(SpeakerEmbedding)
            # print(SpeakerEmbedding.grad)

            if cuda_available:
                text_padded = torch.from_numpy(text_padded).type(
                    torch.cuda.LongTensor).to(device)
            else:
                text_padded = torch.from_numpy(text_padded).type(
                    torch.LongTensor).to(device)
            mel_padded = torch.from_numpy(mel_padded).to(device)

            gate_padded = torch.from_numpy(gate_padded).to(device)

            input_lengths = torch.from_numpy(input_lengths).to(device)
            output_lengths = torch.from_numpy(output_lengths).to(device)

            # print("mel", mel_padded.size())
            # print("text", text_padded.size())
            # print("gate", gate_padded.size())

            batch = text_padded, input_lengths, mel_padded, gate_padded, output_lengths

            x, y = model.parse_batch(batch)
            y_pred = model(x, SpeakerEmbedding)

            # Loss
            loss, mel_loss, gate_loss = criterion(y_pred, y)

            # Backward
            loss.backward()

            # Clipping gradients to avoid gradient explosion
            nn.utils.clip_grad_norm_(model.parameters(), hp.grad_clip_thresh)

            # Update weights
            optimizer.step()

            if current_step % hp.log_step == 0:
                Now = time.perf_counter()
                str_loss = "Epoch [{}/{}], Step [{}/{}], Mel Loss: {:.4f}, Gate Loss: {:.4f}, Total Loss: {:.4f}.".format(
                    epoch + 1, hp.epochs, current_step, total_step, mel_loss.item(), gate_loss.item(), loss.item())
                str_time = "Time Used: {:.3f}s, Estimated Time Remaining: {:.3f}s.".format(
                    (Now - Start), (total_step - current_step) * np.mean(Time))

                print(str_loss)
                print(str_time)
                with open("logger.txt", "a")as f_logger:
                    f_logger.write(str_loss + "\n")
                    f_logger.write(str_time + "\n")
                    f_logger.write("\n")

            if current_step % hp.save_step == 0:
                torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict(
                )}, os.path.join(hp.checkpoint_path, 'checkpoint_%d.pth.tar' % current_step))
                print("\nsave model at step %d ...\n" % current_step)

            end_time = time.perf_counter()
            Time = np.append(Time, end_time - start_time)
            if len(Time) == hp.clear_Time:
                temp_value = np.mean(Time)
                Time = np.delete(
                    Time, [i for i in range(len(Time))], axis=None)
                Time = np.append(Time, temp_value)
Beispiel #18
0
def train(n_gpus, rank, group_name):
    if n_gpus > 1:
        if rank == 0: print('Synchronizing distributed flow...')
        init_distributed(rank, n_gpus, group_name, config['dist_config'])

    torch.manual_seed(config['seed'])
    torch.cuda.manual_seed(config['seed'])

    if rank == 0: print('Initializing model, optimizer and loss...')
    model = Tacotron2(config).cuda()
    criterion = Tacotron2Loss()
    learning_rate = config['learning_rate']
    optimizer = torch.optim.Adam(params=model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=config['weight_decay'])
    if config['fp16_run']:
        if rank == 0: print('Using FP16...')
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    if rank == 0: print('Preparing dirs, data loaders and logger...')
    logger = prepare_directories_and_logger(config['output_directory'],
                                            config['log_directory'], rank)
    train_loader, valset, collate_fn = prepare_dataloaders(
        config['training_files'], config['validation_files'],
        config['n_frames_per_step'], n_gpus)

    iteration = 0
    epoch_offset = 0
    if config['warm_up_checkpoint'] is not None:
        if rank == 0:
            print('Loading checkpoint from {}...'.format(
                config['warm_up_checkpoint']))

        model = load_checkpoint(config['warm_up_checkpoint'], model, optimizer)

        iteration += 1  # next iteration is iteration + 1
        epoch_offset = max(0, int(iteration / len(train_loader)))

    model.compress_factorize(config=config['compress_config'])
    model.train()

    # Main training loop
    for epoch in range(epoch_offset, config['epochs']):
        print("Epoch: {}".format(epoch))
        for _, batch in enumerate(train_loader):
            start = time.perf_counter()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if config['fp16_run']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if iteration % config['iters_per_grad_acc'] == 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config['grad_clip_thresh'])

                optimizer.step()
                model.zero_grad()

                if rank == 0:
                    duration = time.perf_counter() - start
                    print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".
                          format(iteration, reduced_loss, grad_norm, duration))
                    logger.log_training(reduced_loss, grad_norm, learning_rate,
                                        duration, iteration)

            if iteration % config['iters_per_validation'] == 0:
                validate(model, criterion, valset, iteration,
                         config['batch_size'], n_gpus, collate_fn, logger,
                         rank)

            if iteration % config['iters_per_checkpoint'] == 0:
                if rank == 0:
                    checkpoint_path = os.path.join(
                        config['output_directory'],
                        "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, iteration,
                                    checkpoint_path)

            iteration += 1
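
This example steps the optimizer only every config['iters_per_grad_acc'] iterations, letting gradients from the intermediate batches accumulate into a larger effective batch. A common refinement, sketched below as an assumption rather than what this code does, divides each micro-batch loss by the accumulation factor so the accumulated gradients average instead of sum:

import torch

def train_with_grad_accumulation(model, optimizer, criterion, batches,
                                 accum_steps, grad_clip_thresh):
    # Accumulate gradients over accum_steps micro-batches, then clip and step
    # once, emulating a single large-batch update.
    optimizer.zero_grad()
    for step, batch in enumerate(batches):
        x, y = model.parse_batch(batch)
        loss = criterion(model(x), y) / accum_steps  # scale so gradients average
        loss.backward()
        if (step + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip_thresh)
            optimizer.step()
            optimizer.zero_grad()
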
Beispiel #19
0
def train_tts(output_directory, log_directory, checkpoint_path, warm_start,
              n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(
        hparams.seed)  ## set the (CPU) RNG seed so that every re-run of the program produces the same random numbers
    torch.cuda.manual_seed(
        hparams.seed
    )  ## set the RNG seed for the current GPU; torch.cuda.manual_seed_all(seed) would seed all GPUs
    ## Fixing the seed pins down the randomly initialized weights, so every training run from scratch starts from the same (random but reproducible) initial values.

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
    #                              weight_decay=hparams.weight_decay)

    for name, param in model.named_parameters():
        # frozen except tts
        # if name.split('.')[0] == 'poly_phoneme_classifier':
        #     param.requires_grad = False

        # frozen poly module except tone sandhi & tts
        # if name.split('.')[0] == 'poly_phoneme_classifier':
        #     if name.split('.')[1] != 'linear_pre' and name.split('.')[1] != 'conv_layers' and name.split('.')[1] != 'linear_aft':
        #         param.requires_grad = False

        # frozen except structure CNN & tonesandhi & tts
        if name.split('.')[0] == 'poly_phoneme_classifier':
            if name.split('.')[1] == 'g2ptransformermask':
                if name.split('.')[2] != 'structure_cnn_tts':
                    param.requires_grad = False
            elif name.split('.')[1] != 'linear_pre' and name.split('.')[
                    1] != 'conv_layers' and name.split('.')[1] != 'linear_aft':
                param.requires_grad = False
            # else:
            #    param.requires_grad = False

    training_parameters_list = [
        p for p in model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(training_parameters_list,
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    ## Apex is an NVIDIA library for mixed-precision training on top of PyTorch; a few lines of code enable different levels of mixed-precision acceleration and can roughly halve training time.
    ## fp16: half-precision floating point, a binary floating-point format stored in 2 bytes (16 bits).
    ## fp16 pros: lower GPU memory usage, faster training and inference, and wide Tensor Core support. Con: quantization error.

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()  ## read the performance counter to time this iteration
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            # print('CHECK batch:', batch)

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            mask_padded = x[3]
            loss, mel_loss, gate_loss, select_loss = criterion(
                y_pred, y, mask_padded
            )  ## Tacotron2Loss(model_output, targets, mask_padded)
            ## the criterion returns the total loss together with its individual terms

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_val_mel_loss = reduce_tensor(mel_loss.data,
                                                     n_gpus).item()
                reduced_val_gate_loss = reduce_tensor(gate_loss.data,
                                                      n_gpus).item()
                reduced_val_select_loss = reduce_tensor(
                    select_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_val_mel_loss = mel_loss.item()
                reduced_val_gate_loss = gate_loss.item()
                reduced_val_select_loss = select_loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # print('CHECK  structure_cnn.convs.0.weight IS CHANGE:', model.structure_cnn.convolutions[0][0].conv.weight)

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()
            ## A typical PyTorch training step calls optimizer.zero_grad(), loss.backward() and optimizer.step() in sequence:
            ## zero the gradients, backpropagate to compute each parameter's gradient, then apply one gradient-descent update.

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start  ## only the difference between two perf_counter() readings is meaningful; here it measures the time per iteration
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, reduced_val_mel_loss,
                                    reduced_val_gate_loss,
                                    reduced_val_select_loss, grad_norm,
                                    learning_rate, duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Beispiel #20
0
def train(args, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    args.output_directory (string): directory to save checkpoints
    args.log_directory (string): directory to save tensorboard logs
    args.checkpoint_path (string): checkpoint path
    args.n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    # setup distributed
    hparams.n_gpus = args.n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, args.n_gpus, rank, group_name)
    
    # reproducibility settings
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    
    # initialize blank model
    print('Initializing Tacotron2...')
    model = load_model(hparams)
    print('Done')
    global model_args
    model_args = get_args(model.forward)
    model.eval()
    learning_rate = hparams.learning_rate
    
    # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n")
    
    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(layer.startswith(module) for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")
    
    if len(hparams.unfrozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(layer.startswith(module) for module in hparams.unfrozen_modules):
                params.requires_grad = True
                print(f"Layer: {layer} has been unfrozen")
    
    # define optimizer (any params without requires_grad are ignored)
    #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay)
    optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay)
    
    if True and rank == 0:
        pytorch_total_params = sum(p.numel() for p in model.parameters())
        print("{:,} total parameters in model".format(pytorch_total_params))
        pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("{:,} trainable parameters.".format(pytorch_total_params))
    
    print("Initializing AMP Model / Optimzier")
    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level=f'O{hparams.fp16_run_optlvl}')
    
    print("Initializing Gradient AllReduce model wrapper.")
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    
    print("Initializing Tacotron2 Loss func.")
    criterion = Tacotron2Loss(hparams)
    
    print("Initializing Tacotron2 Logger.")
    logger = prepare_directories_and_logger(hparams, args)
    
    # Load checkpoint if one exists
    best_validation_loss = 1e3  # used to see when "best_val_model" should be saved
    best_inf_attsc       = -99  # used to see when "best_inf_attsc" should be saved
    
    n_restarts = 0
    checkpoint_iter = 0
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None
    original_filelist = None
    
    global file_losses
    file_losses = {}
    global file_losses_smoothness
    file_losses_smoothness = 0.6
    
    global best_val_loss_dict
    best_val_loss_dict = None
    global best_loss_dict
    best_loss_dict = None
    global expavg_loss_dict
    expavg_loss_dict = None
    expavg_loss_dict_iters = 0  # number of iterations expavg_loss_dict has been fitted for
    loss_dict_smoothness = 0.95 # smoothing factor
    
    if args.checkpoint_path is not None:
        if args.warm_start:
            model, iteration, saved_lookup = warm_start_model(
                args.checkpoint_path, model, hparams.ignore_layers)
        elif args.warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(
                args.checkpoint_path, model)
        else:
            _ = load_checkpoint(args.checkpoint_path, model, optimizer, best_val_loss_dict, best_loss_dict)
            model, optimizer, _learning_rate, iteration, best_validation_loss, best_inf_attsc, saved_lookup, best_val_loss_dict, best_loss_dict = _
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        checkpoint_iter = iteration
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')
    
    # define datasets/dataloaders
    dataloader_args = [*get_args(criterion.forward), *model_args]
    if rank == 0:
        dataloader_args.extend(get_args(logger.log_training))
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, dataloader_args, args, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids
    
    # load and/or generate global_mean
    if hparams.drop_frame_rate > 0.:
        if rank != 0: # if global_mean not yet calculated, wait for main thread to do it
            while not os.path.exists(hparams.global_mean_npy): time.sleep(1)
        global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
        model.global_mean = global_mean
    
    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10)
    
    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
            hparams.batch_size, args.n_gpus, collate_fn, logger,
            hparams.distributed_run, rank)
        raise Exception("Finished Validation")
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    
    just_did_val = True
    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    training = True
    while training:
        try:
            for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"):
                tqdm.write("Epoch:{}".format(epoch))
                
                train_loader.dataset.shuffle_dataset()# Shuffle Dataset
                dataset_len = len(train_loader)
                
                start_time = time.time()
                # start iterating through the epoch
                for i, batch in tqdm(enumerate(train_loader), desc="Iter:  ", smoothing=0, total=len(train_loader), position=0, unit="iter"):
                    # run external code at the start of each epoch and every param_interval iters; allows the run to be adjusted without restarts
                    if (i==0 or iteration % param_interval == 0):
                        ldict = {}  # ensure ldict exists even if 'run_every_epoch.py' cannot be read
                        try:
                            with open("run_every_epoch.py", encoding='utf-8') as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {'iteration': iteration, 'checkpoint_iter': checkpoint_iter, 'n_restarts': n_restarts}
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print("[info] tried to execute 'run_every_epoch.py' but it is empty")
                        except Exception as ex:
                            print(f"[warning] 'run_every_epoch.py' FAILED to execute!\nException:\n{ex}")
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    n_restarts = n_restarts_override if (n_restarts_override is not None) else n_restarts or 0
                    # Learning Rate Schedule
                    if custom_lr:
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if decrease_lr_on_restart:
                            learning_rate = learning_rate/(2**(n_restarts/3))
                        if just_did_val:
                            learning_rate = 0.0
                            just_did_val=False
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = learning_rate
                    
                    # /run external code every epoch; allows the run to be adjusted without restarts/
                    model.zero_grad()
                    y = model.parse_batch(batch) # move batch to GPU (async)
                    y_pred = force(model, valid_kwargs=model_args, **{**y, "teacher_force_till": teacher_force_till, "p_teacher_forcing": p_teacher_forcing, "drop_frame_rate": drop_frame_rate})
                    
                    loss_scalars = {
                         "spec_MSE_weight": spec_MSE_weight,
                        "spec_MFSE_weight": spec_MFSE_weight,
                      "postnet_MSE_weight": postnet_MSE_weight,
                     "postnet_MFSE_weight": postnet_MFSE_weight,
                        "gate_loss_weight": gate_loss_weight,
                        "sylps_kld_weight": sylps_kld_weight,
                        "sylps_MSE_weight": sylps_MSE_weight,
                        "sylps_MAE_weight": sylps_MAE_weight,
                         "diag_att_weight": diag_att_weight,
                    }
                    loss_dict, file_losses_batch = criterion(y_pred, y, loss_scalars)
                    
                    file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness)
                    loss = loss_dict['loss']
                    
                    if hparams.distributed_run:
                        reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()}
                    else:
                        reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()}
                    
                    reduced_loss = reduced_loss_dict['loss']
                    
                    if hparams.fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    
                    if grad_clip_thresh:
                        if hparams.fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), grad_clip_thresh)
                            is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                model.parameters(), grad_clip_thresh)
                    else:
                        grad_norm = 0.0
                    
                    optimizer.step()
                    
                    # get current Loss Scale of first optimizer
                    loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768.
                    
                    # restart if training/model has collapsed
                    if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)) or (loss_scale < 1/4):
                        raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
                    
                    if expavg_loss_dict is None:
                        expavg_loss_dict = reduced_loss_dict
                    else:
                        expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys()}
                        expavg_loss_dict_iters += 1
                    
                    if expavg_loss_dict_iters > 100:
                        if best_loss_dict is None:
                            best_loss_dict = expavg_loss_dict
                        else:
                            best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys()}
                    
                    if rank == 0:
                        duration = time.time() - start_time
                        if not is_overflow:
                            average_loss = rolling_loss.process(reduced_loss)
                            tqdm.write(
                                f"{iteration} [Train_loss:{reduced_loss:.4f} Avg:{average_loss:.4f}] "
                                f"[Grad Norm {grad_norm:.4f}] [{duration:.2f}s/it] "
                                f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] "
                                f"[{learning_rate:.7f} LR] [{loss_scale:.0f} LS]")
                            logger.log_training(reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, drop_frame_rate)
                        else:
                            tqdm.write("Gradient Overflow, Skipping Step")
                        start_time = time.time()
                    
                    if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):
                        # save model checkpoint like normal
                        if rank == 0:
                            checkpoint_path = os.path.join(args.output_directory, "checkpoint_{}".format(iteration))
                            save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, best_inf_attsc, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path)
                    
                    if iteration%dump_filelosses_interval==0:
                        print("Updating File_losses dict!")
                        file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank)
                    
                    if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0)):
                        if rank == 0 and os.path.exists(save_file_check_path):
                            os.remove(save_file_check_path)
                        # perform validation and save "best_val_model" depending on validation loss
                        val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, valset, best_val_loss_dict, iteration, collate_fn, logger, val_teacher_force_till, val_p_teacher_forcing, teacher_force=0)# validate/teacher_force
                        file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank)
                        valatt_loss, *_ = validate(hparams, args, file_losses, model, criterion, valset, best_val_loss_dict, iteration, collate_fn, logger, 0, 0.0, teacher_force=2)# infer
                        if use_scheduler:
                            scheduler.step(val_loss)
                        if (val_loss < best_validation_loss):
                            best_validation_loss = val_loss
                            if rank == 0 and hparams.save_best_val_model:
                                checkpoint_path = os.path.join(args.output_directory, "best_val_model")
                                save_checkpoint(
                                    model, optimizer, learning_rate, iteration, hparams, best_validation_loss, max(best_inf_attsc, val_loss),
                                    average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path)
                        if (valatt_loss > best_inf_attsc):
                            best_inf_attsc = valatt_loss
                            if rank == 0 and hparams.save_best_inf_attsc:
                                checkpoint_path = os.path.join(args.output_directory, "best_inf_attsc")
                                save_checkpoint(
                                    model, optimizer, learning_rate, iteration, hparams, best_validation_loss, best_inf_attsc,
                                    average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, checkpoint_path)
                        just_did_val = True
                    
                    iteration += 1
                    # end of iteration loop
                
                # update filelist of training dataloader
                if (iteration > hparams.min_avg_max_att_start) and (iteration-checkpoint_iter >= dataset_len):
                    print("Updating File_losses dict!")
                    file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), args.n_gpus, rank)
                    print("Done!")
                    
                    print("Updating dataloader filtered paths!")
                    bad_file_paths = [k for k in list(file_losses.keys()) if
                        file_losses[k]['avg_max_attention'] < hparams.min_avg_max_att or# if attention strength is too weak
                        file_losses[k]['att_diagonality']   > hparams.max_diagonality or# or diagonality is too high
                        file_losses[k]['spec_MSE']          > hparams.max_spec_mse]     # or audio quality is too low
                                                                                        # then add to bad files list
                    bad_file_paths = set(bad_file_paths)                                # and remove from dataset
                    filtered_filelist = [x for x in train_loader.dataset.filelist if not (x[0] in bad_file_paths)]
                    train_loader.dataset.update_filelist(filtered_filelist)
                    print(f"Done! {len(bad_file_paths)} Files removed from dataset. {len(filtered_filelist)} Files remain.")
                    del filtered_filelist, bad_file_paths
                    if iteration > hparams.speaker_mse_sampling_start:
                        print("Updating dataset with speaker MSE Sampler!")
                        if original_filelist is None:
                            original_filelist = train_loader.dataset.filelist
                        train_loader.dataset.update_filelist(get_mse_sampled_filelist(
                                                             original_filelist, file_losses, hparams.speaker_mse_exponent, seed=iteration))
                        print("Done!")
                
                # end of epoch loop
            training = False # exit the While loop
        
        #except Exception as ex: # optionally catch any Exception and continue from checkpoint
        except LossExplosion as ex: # print the exception and continue from the best checkpoint (restarting this way takes only a few seconds)
            print(ex) # print the LossExplosion message
            checkpoint_path = os.path.join(args.output_directory, "best_val_model")
            assert os.path.exists(checkpoint_path), "best_val_model checkpoint must exist for automatic restarts"
            
            if hparams.fp16_run:
                amp._amp_state.loss_scalers[0]._loss_scale = 32768
            
            # clearing VRAM for load checkpoint
            model.zero_grad()
            x=y=y_pred=loss=len_loss=loss_z=loss_w=loss_s=loss_att=dur_loss_z=dur_loss_w=dur_loss_s=None
            torch.cuda.empty_cache()
            
            model.eval()
            _ = load_checkpoint(checkpoint_path, model, optimizer, best_val_loss_dict, best_loss_dict)
            model, optimizer, _learning_rate, iteration, best_validation_loss, best_inf_attsc, saved_lookup, best_val_loss_dict, best_loss_dict = _
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            checkpoint_iter = iteration
            iteration += 1
            n_restarts += 1
        except KeyboardInterrupt as ex:
            print(ex)
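The training loop above hot-reloads its hyperparameters by exec-ing `run_every_epoch.py` at the start of every epoch and every `param_interval` iterations, then pulling the resulting names back in via `globals().update(ldict)`. A minimal sketch of such a file follows; every name matches one the loop reads, but all values are illustrative assumptions rather than the settings of any real run.

# run_every_epoch.py -- illustrative sketch; all values here are placeholder assumptions
param_interval      = 100    # how often (in iterations) this file is re-executed
show_live_params    = False  # print this file's contents after executing it
n_restarts_override = None   # force a specific restart count, or None to keep the tracked value

# learning-rate schedule: linear warmup, then lr = A_*e**(-t/B_) + C_ after decay_start
custom_lr       = True
warmup_start, warmup_end = 0, 1000
warmup_start_lr = 0.5e-3
decay_start     = 40000
A_, B_, C_      = 1e-3, 30000., 0.
decrease_lr_on_restart = False

# teacher forcing / data augmentation
teacher_force_till     = 20
p_teacher_forcing      = 0.95
drop_frame_rate        = 0.25
val_teacher_force_till = 30
val_p_teacher_forcing  = 0.80

# loss weights collected into loss_scalars
spec_MSE_weight, spec_MFSE_weight       = 0.0, 1.0
postnet_MSE_weight, postnet_MFSE_weight = 0.0, 1.0
gate_loss_weight = 1.0
sylps_kld_weight, sylps_MSE_weight, sylps_MAE_weight = 0.002, 0.01, 0.0
diag_att_weight  = 0.05

# safety, checkpointing and validation
grad_clip_thresh         = 1.0
LossExplosionThreshold   = 1e3
checkpoint_interval      = 5000
validation_interval      = 1000
dump_filelosses_interval = 5000
save_file_check_path     = "save"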
Beispiel #21
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    #rank += 4
    if hparams.distributed_run:
        init_distributed(hparams, rank, group_name)
    
    print('checkpoint path: {}'.format(checkpoint_path))
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O1')
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)

            if hparams.fp16_run:
                checkpoint = torch.load(checkpoint_path, map_location='cpu')
                amp.load_state_dict(checkpoint['amp'])  # restore amp loss-scaler state
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    #print('HERE')
    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
            
            model.zero_grad()
            x, y = model.parse_batch(batch)

            #print('X value')
            #from hashlib import sha1
            #np_x = x[0].data.cpu().numpy()
            #foo = sha1(np_x)
            #print(foo.hexdigest())
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, hparams.world_size).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                #validate(model, criterion, valset, iteration,
                #         hparams.batch_size, hparams.world_size, collate_fn, logger,
                #         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path, hparams.fp16_run, amp)
                    wandb.save(checkpoint_path)
            iteration += 1
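Beispiel #21 passes `hparams.fp16_run` and the `amp` module into `save_checkpoint` and later restores the mixed-precision state with `amp.load_state_dict(checkpoint['amp'])`. The helper itself is not shown; a minimal sketch that fits the same call signature could look like the following (the `'amp'` key is what the resume code above expects, the other keys are assumptions):

import torch

def save_checkpoint(model, optimizer, learning_rate, iteration,
                    filepath, fp16_run=False, amp=None):
    # Sketch: bundle the model/optimizer state and, for fp16 runs, the apex amp
    # loss-scaler state so it can be restored later with amp.load_state_dict().
    checkpoint = {'state_dict': model.state_dict(),
                  'optimizer': optimizer.state_dict(),
                  'learning_rate': learning_rate,
                  'iteration': iteration}
    if fp16_run and amp is not None:
        checkpoint['amp'] = amp.state_dict()
    torch.save(checkpoint, filepath)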
Beispiel #22
0
def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    # setup distributed
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)
    
    # reproducibility setup
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    
    # initialize blank model
    model = load_model(hparams)
    model.eval()
    learning_rate = hparams.learning_rate
    
    # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[f"Layer{i} = "+str(x[0])+" "+str(x[1].shape) for i,x in enumerate(list(model.named_parameters()))], sep="\n")
    
    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(layer.startswith(module) for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")
    
    # define optimizer (any params without requires_grad are ignored)
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay)
    #optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)
    
    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')
    
    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)
    
    criterion = Tacotron2Loss(hparams)
    
    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)
    
    # Load checkpoint if one exists
    best_validation_loss = 0.8 # used to decide when "best_model" should be saved; load_checkpoint will update this to the last best value.
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None
    if checkpoint_path is not None:
        if warm_start:
            model, iteration, saved_lookup = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(
                checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')
    
    # define datasets/dataloaders
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids
    
    # load and/or generate global_mean
    if hparams.drop_frame_rate > 0.:
        if rank != 0: # if global_mean not yet calculated, wait for the main process to compute it
            while not os.path.exists(hparams.global_mean_npy): time.sleep(1)
        global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
        model.global_mean = global_mean
    
    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer, factor=0.1**(1/5), patience=10)
    
    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
            hparams.batch_size, n_gpus, collate_fn, logger,
            hparams.distributed_run, rank)
        raise Exception("Finished Validation")
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    
    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))
        
        if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader), desc="Iter:  ", smoothing=0, total=len(train_loader), position=0, unit="iter"):
            # run external code at the start of each epoch and every 1000 iters; allows the run to be adjusted without restarts
            if (iteration % 1000 == 0 or i==0):
                try:
                    with open("run_every_epoch.py") as f:
                        internal_text = str(f.read())
                        if len(internal_text) > 0:
                            print(internal_text)
                            #code = compile(internal_text, "run_every_epoch.py", 'exec')
                            ldict = {'iteration': iteration}
                            exec(internal_text, globals(), ldict)
                            print("Custom code excecuted\nPlease remove code if it was intended to be ran once.")
                        else:
                            print("No Custom code found, continuing without changes.")
                except Exception as ex:
                    print(f"Custom code FAILED to run!\n{ex}")
                globals().update(ldict)
                locals().update(ldict)
                print("decay_start is ",decay_start)
                print("A_ is ",A_)
                print("B_ is ",B_)
                print("C_ is ",C_)
                print("min_learning_rate is ",min_learning_rate)
                print("epochs_between_updates is ",epochs_between_updates)
                print("drop_frame_rate is ",drop_frame_rate)
                print("p_teacher_forcing is ",p_teacher_forcing)
                print("teacher_force_till is ",teacher_force_till)
                print("val_p_teacher_forcing is ",val_p_teacher_forcing)
                print("val_teacher_force_till is ",val_teacher_force_till)
                print("grad_clip_thresh is ",grad_clip_thresh)
                if epoch % epochs_between_updates == 0 or epoch_offset == epoch:
                #if None:
                    tqdm.write("Old learning rate [{:.6f}]".format(learning_rate))
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                    learning_rate = max(min_learning_rate, learning_rate) # clamp learning_rate to the min_learning_rate floor
                    tqdm.write("Changing Learning Rate to [{:.6f}]".format(learning_rate))
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            # /run external code every epoch; allows the run to be adjusted without restarts/
            
            model.zero_grad()
            x, y = model.parse_batch(batch) # move batch to GPU (async)
            y_pred = model(x, teacher_force_till=teacher_force_till, p_teacher_forcing=p_teacher_forcing, drop_frame_rate=drop_frame_rate)
            
            loss, gate_loss = criterion(y_pred, y)
            
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(gate_loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = gate_loss.item()
            
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            
            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), grad_clip_thresh)
                is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), grad_clip_thresh)
            
            optimizer.step()
            
            learning_rate = float(optimizer.param_groups[0]['lr']) # read the current LR back from the first param group
            
            if iteration < decay_start:
                learning_rate = A_ + C_
            else:
                iteration_adjusted = iteration - decay_start
                learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
            learning_rate = max(min_learning_rate, learning_rate) # clamp learning_rate to the min_learning_rate floor
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate
            
            if not is_overflow and rank == 0:
                duration = time.time() - start_time
                average_loss = rolling_loss.process(reduced_loss)
                tqdm.write("{} [Train_loss {:.4f} Avg {:.4f}] [Gate_loss {:.4f}] [Grad Norm {:.4f}] "
                      "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR]".format(
                    iteration, reduced_loss, average_loss, reduced_gate_loss, grad_norm, duration, (duration/(hparams.batch_size*n_gpus)), learning_rate))
                if iteration % 20 == 0:
                    diagonality, avg_prob = alignment_metric(x, y_pred)
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing, diagonality=diagonality, avg_prob=avg_prob)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration, iteration, teacher_force_till, p_teacher_forcing)
                start_time = time.time()
            if is_overflow and rank == 0:
                tqdm.write("Gradient Overflow, Skipping Step")
            
            if not is_overflow and ((iteration % (hparams.iters_per_checkpoint/1) == 0) or (os.path.exists(save_file_check_path))):
                # save model checkpoint like normal
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path)
            
            if not is_overflow and ((iteration % int((hparams.iters_per_validation)/1) == 0) or (os.path.exists(save_file_check_path)) or (iteration < 1000 and (iteration % 250 == 0))):
                if rank == 0 and os.path.exists(save_file_check_path):
                    os.remove(save_file_check_path)
                # perform validation and save "best_model" depending on validation loss
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1) #teacher_force
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=2) #infer
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.val_batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=0) #validate (0.8 forcing)
                if use_scheduler:
                    scheduler.step(val_loss)
                if (val_loss < best_validation_loss):
                    best_validation_loss = val_loss
                    if rank == 0:
                        checkpoint_path = os.path.join(output_directory, "best_model")
                        save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_lookup, checkpoint_path)
            
            iteration += 1
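Beispiel #22 derives its learning rate from the same exponential decay in two places: flat at `A_ + C_` until `decay_start`, then `A_*e**(-(iteration-decay_start)/B_) + C_`, clamped to `min_learning_rate`. Factored into a helper (the function name is an assumption; the formula is taken directly from the code above), the schedule reads:

from math import e

def decayed_learning_rate(iteration, decay_start, A_, B_, C_, min_learning_rate):
    # Flat at A_+C_ before decay_start, exponential decay towards C_ afterwards,
    # clamped to the min_learning_rate floor.
    if iteration < decay_start:
        lr = A_ + C_
    else:
        lr = A_ * (e ** (-(iteration - decay_start) / B_)) + C_
    return max(min_learning_rate, lr)

# usage sketch: apply the schedule to every param group once per iteration
# for param_group in optimizer.param_groups:
#     param_group['lr'] = decayed_learning_rate(iteration, decay_start, A_, B_, C_, min_learning_rate)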
Beispiel #23
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    skipped = 0
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            try:
                y_pred = model(x)
            except ValueError:
                skipped += 1
                print(
                    'Skipped an iteration due to value error, you have now skipped {} iterations'
                    .format(skipped))
                iteration += 1
                continue

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0) and iteration > 0:
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
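When `hparams.distributed_run` is set, every example above averages the loss across GPUs with `reduce_tensor(loss.data, n_gpus)` (or `hparams.world_size`). The helper is not included in these snippets; the usual pattern, and an assumption about what these repos use, is an all-reduce sum followed by a division by the number of ranks:

import torch.distributed as dist

def reduce_tensor(tensor, n_gpus):
    # Sketch: average a tensor across all ranks so each process logs the same loss.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt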
Beispiel #24
0
def train(output_directory, log_directory, checkpoint_path, warm_start,
          warm_start_force, n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    # setup distributed
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    # reproducibility setup
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    # initialize blank model
    print('Initializing UnTTS...')
    model = load_model(hparams)
    print('Done')
    model.eval()
    learning_rate = hparams.learning_rate

    # (optional) show the names of each layer in model, mainly makes it easier to copy/paste what you want to adjust
    if hparams.print_layer_names_during_startup:
        print(*[
            f"Layer{i} = " + str(x[0]) + " " + str(x[1].shape)
            for i, x in enumerate(list(model.named_parameters()))
        ],
              sep="\n")

    # (optional) Freeze layers by disabling grads
    if len(hparams.frozen_modules):
        for layer, params in list(model.named_parameters()):
            if any(
                    layer.startswith(module)
                    for module in hparams.frozen_modules):
                params.requires_grad = False
                print(f"Layer: {layer} has been frozen")

    # define optimizer (any params without requires_grad are ignored)
    #optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=hparams.weight_decay)
    optimizer = apexopt.FusedAdam(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=learning_rate,
                                  weight_decay=hparams.weight_decay)

    if True and rank == 0:
        pytorch_total_params = sum(p.numel() for p in model.parameters())
        print("{:,} total parameters in model".format(pytorch_total_params))
        pytorch_total_params = sum(p.numel() for p in model.parameters()
                                   if p.requires_grad)
        print("{:,} trainable parameters.".format(pytorch_total_params))

    if hparams.fp16_run:
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=f'O{hparams.fp16_run_optlvl}')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(hparams, output_directory,
                                            log_directory, rank)

    # Load checkpoint if one exists
    best_validation_loss = 1e3  # used to see when "best_model" should be saved

    n_restarts = 0
    checkpoint_iter = 0
    iteration = 0
    epoch_offset = 0
    _learning_rate = 1e-3
    saved_lookup = None

    global best_val_loss_dict
    best_val_loss_dict = None
    global best_loss_dict
    best_loss_dict = None
    global expavg_loss_dict
    expavg_loss_dict = None
    expavg_loss_dict_iters = 0  # initial iters expavg_loss_dict has been fitted
    loss_dict_smoothness = 0.95  # smoothing factor

    if checkpoint_path is not None:
        if warm_start:
            model, iteration, saved_lookup = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration, saved_lookup = warm_start_force_model(
                checkpoint_path, model)
        else:
            _ = load_checkpoint(checkpoint_path, model, optimizer,
                                best_val_loss_dict, best_loss_dict)
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup, best_val_loss_dict, best_loss_dict = _
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
        checkpoint_iter = iteration
        iteration += 1  # next iteration is iteration + 1
        print('Model Loaded')

    # define datasets/dataloaders
    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(
        hparams, saved_lookup)
    epoch_offset = max(0, int(iteration / len(train_loader)))
    speaker_lookup = trainset.speaker_ids

    # load and/or generate global_mean
    if hparams.drop_frame_rate > 0.:
        if rank != 0:  # if global_mean not yet calculated, wait for the main process to compute it
            while not os.path.exists(hparams.global_mean_npy):
                time.sleep(1)
        global_mean = calculate_global_mean(train_loader,
                                            hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
        model.global_mean = global_mean

    # define scheduler
    use_scheduler = 0
    if use_scheduler:
        scheduler = ReduceLROnPlateau(optimizer,
                                      factor=0.1**(1 / 5),
                                      patience=10)

    model.train()
    is_overflow = False
    validate_then_terminate = 0
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
                            hparams.batch_size, n_gpus, collate_fn, logger,
                            hparams.distributed_run, rank)
        raise Exception("Finished Validation")

    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    training = True
    while training:
        try:
            for epoch in tqdm(range(epoch_offset, hparams.epochs),
                              initial=epoch_offset,
                              total=hparams.epochs,
                              desc="Epoch:",
                              position=1,
                              unit="epoch"):
                tqdm.write("Epoch:{}".format(epoch))

                if hparams.distributed_run:  # shuffles the train_loader when doing multi-gpu training
                    train_sampler.set_epoch(epoch)
                start_time = time.time()
                # start iterating through the epoch
                for i, batch in tqdm(enumerate(train_loader),
                                     desc="Iter:  ",
                                     smoothing=0,
                                     total=len(train_loader),
                                     position=0,
                                     unit="iter"):
                    # run external code at the start of each epoch and every param_interval iters; allows the run to be adjusted without restarts
                    if (i == 0 or iteration % param_interval == 0):
                        try:
                            with open("run_every_epoch.py") as f:
                                internal_text = str(f.read())
                                if len(internal_text) > 0:
                                    #code = compile(internal_text, "run_every_epoch.py", 'exec')
                                    ldict = {
                                        'iteration': iteration,
                                        'checkpoint_iter': checkpoint_iter,
                                        'n_restarts': n_restarts
                                    }
                                    exec(internal_text, globals(), ldict)
                                else:
                                    print(
                                        "[info] tried to execute 'run_every_epoch.py' but it is empty"
                                    )
                        except Exception as ex:
                            print(
                                f"[warning] 'run_every_epoch.py' FAILED to execute!\nException:\n{ex}"
                            )
                        globals().update(ldict)
                        locals().update(ldict)
                        if show_live_params:
                            print(internal_text)
                    n_restarts = n_restarts_override if (
                        n_restarts_override is not None) else n_restarts or 0
                    # Learning Rate Schedule
                    if custom_lr:
                        if iteration < warmup_start:
                            learning_rate = warmup_start_lr
                        elif iteration < warmup_end:
                            learning_rate = (iteration - warmup_start) * (
                                (A_ + C_) - warmup_start_lr
                            ) / (
                                warmup_end - warmup_start
                            ) + warmup_start_lr  # learning rate increases from warmup_start_lr to A_ linearly over (warmup_end-warmup_start) iterations.
                        else:
                            if iteration < decay_start:
                                learning_rate = A_ + C_
                            else:
                                iteration_adjusted = iteration - decay_start
                                learning_rate = (
                                    A_ * (e**(-iteration_adjusted / B_))) + C_
                        assert learning_rate > -1e-8, "Negative Learning Rate."
                        if decrease_lr_on_restart:
                            learning_rate = learning_rate / (2**(n_restarts /
                                                                 3))
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = learning_rate
                    # /run external code every epoch; allows the run to be adjusted without restarts/
                    model.zero_grad()
                    x, y = model.parse_batch(
                        batch)  # move batch to GPU (async)
                    y_pred = model(x)

                    loss_scalars = {
                        "MelGlow_ls": MelGlow_ls,
                        "DurGlow_ls": DurGlow_ls,
                        "VarGlow_ls": VarGlow_ls,
                        "Sylps_ls": Sylps_ls,
                    }
                    loss_dict = criterion(y_pred, y, loss_scalars)
                    loss = loss_dict['loss']

                    if hparams.distributed_run:
                        reduced_loss_dict = {
                            k: reduce_tensor(v.data, n_gpus).item()
                            if v is not None else 0.
                            for k, v in loss_dict.items()
                        }
                    else:
                        reduced_loss_dict = {
                            k: v.item() if v is not None else 0.
                            for k, v in loss_dict.items()
                        }
                    reduced_loss = reduced_loss_dict['loss']

                    if hparams.fp16_run:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                    if grad_clip_thresh:
                        if hparams.fp16_run:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                amp.master_params(optimizer), grad_clip_thresh)
                            is_overflow = math.isinf(grad_norm) or math.isnan(
                                grad_norm)
                        else:
                            grad_norm = torch.nn.utils.clip_grad_norm_(
                                model.parameters(), grad_clip_thresh)
                    else:
                        grad_norm = 0.0

                    optimizer.step()

                    # get current Loss Scale of first optimizer
                    loss_scale = amp._amp_state.loss_scalers[
                        0]._loss_scale if hparams.fp16_run else 32768.

                    # restart if training/model has collapsed
                    if (iteration > 1e3 and
                        (reduced_loss > LossExplosionThreshold)) or (
                            math.isnan(reduced_loss)) or (loss_scale < 1 / 4):
                        raise LossExplosion(
                            f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n"
                        )

                    if expavg_loss_dict is None:
                        expavg_loss_dict = reduced_loss_dict
                    else:
                        expavg_loss_dict = {
                            k: (reduced_loss_dict[k] *
                                (1 - loss_dict_smoothness)) +
                            (expavg_loss_dict[k] * loss_dict_smoothness)
                            for k in expavg_loss_dict.keys()
                        }
                        expavg_loss_dict_iters += 1

                    if expavg_loss_dict_iters > 100:
                        if best_loss_dict is None:
                            best_loss_dict = expavg_loss_dict
                        else:
                            best_loss_dict = {
                                k: min(best_loss_dict[k], expavg_loss_dict[k])
                                for k in best_loss_dict.keys()
                            }

                    if rank == 0:
                        duration = time.time() - start_time
                        if not is_overflow:
                            average_loss = rolling_loss.process(reduced_loss)
                            tqdm.write(
                                "{} [Train_loss:{:.4f} Avg:{:.4f}] [Grad Norm {:.4f}] "
                                "[{:.2f}s/it] [{:.3f}s/file] [{:.7f} LR] [{} LS]"
                                .format(iteration, reduced_loss, average_loss,
                                        grad_norm, duration,
                                        (duration /
                                         (hparams.batch_size * n_gpus)),
                                        learning_rate, round(loss_scale)))
                            logger.log_training(reduced_loss_dict,
                                                expavg_loss_dict,
                                                best_loss_dict, grad_norm,
                                                learning_rate, duration,
                                                iteration)
                        else:
                            tqdm.write("Gradient Overflow, Skipping Step")
                        start_time = time.time()

                    if not is_overflow and (
                        (iteration %
                         (hparams.iters_per_checkpoint / 1) == 0) or
                        (os.path.exists(save_file_check_path))):
                        # save model checkpoint like normal
                        if rank == 0:
                            checkpoint_path = os.path.join(
                                output_directory,
                                "checkpoint_{}".format(iteration))
                            save_checkpoint(model, optimizer, learning_rate,
                                            iteration, hparams,
                                            best_validation_loss, average_loss,
                                            best_val_loss_dict, best_loss_dict,
                                            speaker_lookup, checkpoint_path)

                    if not is_overflow and (
                        (iteration % int(validation_interval) == 0) or
                        (os.path.exists(save_file_check_path)) or
                        (iteration < 1000 and (iteration % 250 == 0))):
                        if rank == 0 and os.path.exists(save_file_check_path):
                            os.remove(save_file_check_path)
                        # perform validation and save "best_model" depending on validation loss
                        val_loss, best_val_loss_dict = validate(
                            model, criterion, valset, loss_scalars,
                            best_val_loss_dict, iteration,
                            hparams.val_batch_size, n_gpus, collate_fn, logger,
                            hparams.distributed_run,
                            rank)  #validate (0.8 forcing)
                        if use_scheduler:
                            scheduler.step(val_loss)
                        if (val_loss < best_validation_loss):
                            best_validation_loss = val_loss
                            if rank == 0:
                                checkpoint_path = os.path.join(
                                    output_directory, "best_model")
                                save_checkpoint(model, optimizer,
                                                learning_rate, iteration,
                                                hparams, best_validation_loss,
                                                average_loss,
                                                best_val_loss_dict,
                                                best_loss_dict, speaker_lookup,
                                                checkpoint_path)

                    iteration += 1
                    # end of iteration loop
                # end of epoch loop
            training = False  # exit the While loop

        #except Exception as ex: # optionally catch any Exception and continue from checkpoint
        except LossExplosion as ex:  # print the exception and continue from the best checkpoint (restarting this way takes only a few seconds)
            print(ex)  # print the LossExplosion message
            checkpoint_path = os.path.join(output_directory, "best_model")
            assert os.path.exists(
                checkpoint_path
            ), "best_model checkpoint must exist for automatic restarts"

            if hparams.fp16_run:
                amp._amp_state.loss_scalers[0]._loss_scale = 32768

            # clearing VRAM for load checkpoint
            model.zero_grad()
            x = y = y_pred = loss = len_loss = loss_z = loss_w = loss_s = loss_att = dur_loss_z = dur_loss_w = dur_loss_s = None
            torch.cuda.empty_cache()

            model.eval()
            _ = load_checkpoint(checkpoint_path, model, optimizer,
                                best_val_loss_dict, best_loss_dict)
            model, optimizer, _learning_rate, iteration, best_validation_loss, saved_lookup, best_val_loss_dict, best_loss_dict = _
            learning_rate = optimizer.param_groups[0]['lr']
            epoch_offset = max(0, int(iteration / len(train_loader)))
            model.train()
            checkpoint_iter = iteration
            iteration += 1
            n_restarts += 1
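
# The restart path above catches a LossExplosion exception that is not defined in this
# snippet. A minimal sketch of what such a guard could look like; the class name matches
# the except clause, but the helper and its threshold are assumptions for illustration:
import math

class LossExplosion(Exception):
    """Raised when the training loss blows up so the outer loop can reload best_model."""
    pass

def check_loss_exploded(reduced_loss, threshold=1e3):
    # Hypothetical helper: treat NaN/inf or an absurdly large loss as an explosion.
    if not math.isfinite(reduced_loss) or reduced_loss > threshold:
        raise LossExplosion(f"Loss exploded (value: {reduced_loss}); restarting from best_model.")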
Beispiel #25
0
def train(input_directory, output_directory, log_directory, checkpoint_path,
          warm_start, n_gpus, rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    # torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    # Record training metadata.
    meta_folder = os.path.join(output_directory, 'metadata')
    os.makedirs(meta_folder, exist_ok=True)

    trpath = os.path.join(meta_folder, "train.txt")
    vapath = os.path.join(meta_folder, "validation.txt")
    with open(trpath, 'wt',
              encoding='utf8') as fout_tr, open(vapath, 'wt',
                                                encoding='utf8') as fout_va:
        lines = open(input_directory, encoding='utf8').readlines()
        val_ids = set(
            np.random.choice(list(range(len(lines))),
                             hparams.batch_size * 2,
                             replace=False))
        for num, line in enumerate(lines):
            parts = line.strip().split('\t')
            abspath = os.path.join(
                os.path.dirname(os.path.abspath(input_directory)),
                parts[0]).replace('\\', '/')
            text = parts[1]
            if len(parts) >= 3:
                speaker = parts[2]
            else:
                speaker = '0'
            out = f'{abspath}\t{text}\t{speaker}\n'
            if num in val_ids:
                fout_va.write(out)
            else:
                fout_tr.write(out)

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        meta_folder, hparams)

    path = os.path.join(meta_folder, "speakers.json")
    obj = dict(valset.speaker_ids)
    json_dump(obj, path)

    path = os.path.join(meta_folder, "hparams.json")
    obj = {k: v for k, v in hparams.items()}
    json_dump(obj, path)

    path = os.path.join(meta_folder, "symbols.json")
    from text.symbols import symbols
    obj = {w: i for i, w in enumerate(symbols)}
    json_dump(obj, path)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(
                tqdm(train_loader, desc=f"Epoch-{epoch}", ncols=100)):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()
            duration = time.perf_counter() - start
            if not is_overflow and rank == 0:
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                validate(model,
                         criterion,
                         valset,
                         iteration,
                         hparams.batch_size,
                         n_gpus,
                         collate_fn,
                         logger,
                         hparams.distributed_run,
                         rank,
                         outdir=Path(output_directory),
                         hparams=hparams)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory,
                        "checkpoint-{:06d}.pt".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
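
# Beispiel #25 writes speakers.json, hparams.json and symbols.json through a json_dump
# helper that is not shown in this listing. A minimal sketch, assuming it is a thin
# wrapper around the standard json module (the exact signature is an assumption):
import json

def json_dump(obj, path):
    # Serialize a plain dict as pretty-printed UTF-8 JSON.
    with open(path, 'w', encoding='utf8') as fout:
        json.dump(obj, fout, ensure_ascii=False, indent=2)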
Beispiel #26
0
def load_checkpoint(checkpoint_path, model, optimizer):
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    
    state_dict = {k.replace("encoder_speaker_embedding.weight", "encoder.encoder_speaker_embedding.weight"): v for k, v in checkpoint_dict['state_dict'].items()}  # rename keys from older checkpoints; reuse checkpoint_dict instead of re-loading the file
    model.load_state_dict(state_dict) # tmp for updating old models
    
    #model.load_state_dict(checkpoint_dict['state_dict']) # original
    
    if 'optimizer' in checkpoint_dict.keys(): optimizer.load_state_dict(checkpoint_dict['optimizer'])
    if 'amp' in checkpoint_dict.keys(): amp.load_state_dict(checkpoint_dict['amp'])
    if 'learning_rate' in checkpoint_dict.keys(): learning_rate = checkpoint_dict['learning_rate']
    #if 'hparams' in checkpoint_dict.keys(): hparams = checkpoint_dict['hparams']
    if 'best_validation_loss' in checkpoint_dict.keys(): best_validation_loss = checkpoint_dict['best_validation_loss']
    if 'average_loss' in checkpoint_dict.keys(): average_loss = checkpoint_dict['average_loss']
    if (start_from_checkpoints_from_zero):
        iteration = 0
    else:
        iteration = checkpoint_dict['iteration']
    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration, best_validation_loss


def save_checkpoint(model, optimizer, learning_rate, iteration, hparams, best_validation_loss, average_loss, speaker_id_lookup, filepath):
    from utils import load_filepaths_and_text
    tqdm.write("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    
    # get speaker names to ID
    speakerlist = load_filepaths_and_text(hparams.speakerlist)
    speaker_name_lookup = {x[2]: speaker_id_lookup[x[3]] for x in speakerlist}
    
    torch.save({'iteration': iteration,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate,
                #'amp': amp.state_dict(),
                'hparams': hparams,
                'speaker_id_lookup': speaker_id_lookup,
                'speaker_name_lookup': speaker_name_lookup,
                'best_validation_loss': best_validation_loss,
                'average_loss': average_loss}, filepath)
    tqdm.write("Saving Complete")


def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, val_teacher_force_till, val_p_teacher_forcing, teacher_force=1):
    """Handles all the validation scoring and printing"""
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, drop_last=True, collate_fn=collate_fn)
        if teacher_force == 1:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 1.0
        elif teacher_force == 2:
            val_teacher_force_till = 0
            val_p_teacher_forcing = 0.0
        val_loss = 0.0
        diagonality = torch.zeros(1)
        avg_prob = torch.zeros(1)
        for i, batch in tqdm(enumerate(val_loader), desc="Validation", total=len(val_loader), smoothing=0): # i = index, batch = stuff in array[i]
            x, y = model.parse_batch(batch)
            y_pred = model(x, teacher_force_till=val_teacher_force_till, p_teacher_forcing=val_p_teacher_forcing)
            rate, prob = alignment_metric(x, y_pred)
            diagonality += rate
            avg_prob += prob
            loss, gate_loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            # end forloop
        val_loss = val_loss / (i + 1)
        diagonality = (diagonality / (i + 1)).item()
        avg_prob = (avg_prob / (i + 1)).item()
        # end torch.no_grad()
    model.train()
    if rank == 0:
        tqdm.write("Validation loss {}: {:9f}  Average Max Attention: {:9f}".format(iteration, val_loss, avg_prob))
        #logger.log_validation(val_loss, model, y, y_pred, iteration)
        if True:#iteration != 0:
            if teacher_force == 1:
                logger.log_teacher_forced_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            elif teacher_force == 2:
                logger.log_infer(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
            else:
                logger.log_validation(val_loss, model, y, y_pred, iteration, val_teacher_force_till, val_p_teacher_forcing, diagonality, avg_prob)
    return val_loss
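
# validate() above reads a per-batch "diagonality" and average max attention probability
# from alignment_metric(x, y_pred), which is not included here. A rough sketch of one way
# to derive similar statistics directly from the alignment matrix; the function name,
# tensor layout and the diagonality proxy are assumptions, not the repo's implementation:
import torch

def alignment_metric_sketch(alignments):
    # alignments: [B, mel_T, txt_T] decoder attention weights.
    avg_prob = alignments.max(dim=2)[0].mean()             # average confidence of each decoder step
    att_pos = alignments.argmax(dim=2).float()             # attended text index per decoder step
    diagonality = (att_pos[:, 1:] - att_pos[:, :-1]).abs().mean()  # average step size along the text axis
    return diagonality, avg_prob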


def calculate_global_mean(data_loader, global_mean_npy, hparams):
    if global_mean_npy and os.path.exists(global_mean_npy):
        global_mean = np.load(global_mean_npy)
        return to_gpu(torch.tensor(global_mean).half()) if hparams.fp16_run else to_gpu(torch.tensor(global_mean).float())
    sums = []
    frames = []
    print('calculating global mean...')
    for i, batch in tqdm(enumerate(data_loader), total=len(data_loader), smoothing=0.001):
        text_padded, input_lengths, mel_padded, gate_padded,\
            output_lengths, speaker_ids, torchmoji_hidden, preserve_decoder_states = batch
        # padded values are 0.
        sums.append(mel_padded.double().sum(dim=(0, 2)))
        frames.append(output_lengths.double().sum())
    global_mean = sum(sums) / sum(frames)
    global_mean = to_gpu(global_mean.half()) if hparams.fp16_run else to_gpu(global_mean.float())
    if global_mean_npy:
        np.save(global_mean_npy, global_mean.cpu().numpy())
    return global_mean
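
# calculate_global_mean() above supports hparams.drop_frame_rate > 0; the resulting global
# mel mean is typically used to overwrite a random fraction of ground-truth mel frames
# before teacher forcing. A minimal sketch of that idea (the function name and tensor
# layout are assumptions, not the original code):
import torch

def drop_frames_sketch(mel_padded, global_mean, drop_frame_rate):
    # mel_padded: [B, n_mel, T]; global_mean: [n_mel]; replace ~drop_frame_rate of frames.
    B, n_mel, T = mel_padded.shape
    drop_mask = (torch.rand(B, T, device=mel_padded.device) < drop_frame_rate).unsqueeze(1)  # [B, 1, T]
    return torch.where(drop_mask, global_mean.view(1, n_mel, 1).to(mel_padded), mel_padded)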


def train(output_directory, log_directory, checkpoint_path, warm_start, warm_start_force, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    hparams.n_gpus = n_gpus
    hparams.rank = rank
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    train_loader, valset, collate_fn, train_sampler, trainset = prepare_dataloaders(hparams)
    speaker_lookup = trainset.speaker_ids
    
    if hparams.drop_frame_rate > 0.:
        if rank != 0: # if global_mean is not yet calculated, wait for the main thread to do it
            while not os.path.exists(hparams.global_mean_npy): time.sleep(1)
        global_mean = calculate_global_mean(train_loader, hparams.global_mean_npy, hparams)
        hparams.global_mean = global_mean
    
    model = load_model(hparams)

    model.eval() # test if this is needed anymore

    learning_rate = hparams.learning_rate
    if hparams.Apex_optimizer: # apex optimizer is slightly faster with slightly more vram usage in my testing. Helps in both fp32 and fp16.
        optimizer = apexopt.FusedAdam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    # Load checkpoint if one exists
    best_validation_loss = 0.8 # used to see when "best_model" should be saved, default = 0.8, load_checkpoint will update to last best value.
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model, iteration = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        elif warm_start_force:
            model, iteration = warm_start_force_model(
                checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration, best_validation_loss = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))
        print('Model Loaded')
	
    ## LEARNING RATE SCHEDULER
    if True:
        from torch.optim.lr_scheduler import ReduceLROnPlateau
        min_lr = 1e-5
        factor = 0.1**(1/5) # amount to scale the LR by on Validation Loss plateau
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor=factor, patience=20, cooldown=2, min_lr=min_lr, verbose=True)
        print("ReduceLROnPlateau used as (optional) Learning Rate Scheduler.")
    else: scheduler=False

    model.train()
    is_overflow = False
	
    validate_then_terminate = 0 # I use this for testing old models with new metrics
    if validate_then_terminate:
        val_loss = validate(model, criterion, valset, iteration,
            hparams.batch_size, n_gpus, collate_fn, logger,
            hparams.distributed_run, rank)
        raise Exception("Finished Validation")
    
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate
    
    rolling_loss = StreamingMovingAverage(min(int(len(train_loader)), 200))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs), initial=epoch_offset, total=hparams.epochs, desc="Epoch:", position=1, unit="epoch"):
        tqdm.write("Epoch:{}".format(epoch))
        
        if hparams.distributed_run: # shuffles the train_loader when doing multi-gpu training
            train_sampler.set_epoch(epoch)
        start_time = time.time()
        # start iterating through the epoch
        for i, batch in tqdm(enumerate(train_loader), desc="Iter:  ", smoothing=0, total=len(train_loader), position=0, unit="iter"):
            # run external code every iter; allows the run to be adjusted without restarts
            if (i==0 or iteration % param_interval == 0):
                try:
                    with open("run_every_epoch.py") as f:
                        internal_text = str(f.read())
                        if len(internal_text) > 0:
                            #code = compile(internal_text, "run_every_epoch.py", 'exec')
                            ldict = {'iteration': iteration}
                            exec(internal_text, globals(), ldict)
                        else:
                            print("No Custom code found, continuing without changes.")
                except Exception as ex:
                    print(f"Custom code FAILED to run!\n{ex}")
                globals().update(ldict)
                locals().update(ldict)
                if show_live_params:
                    print(internal_text)
            if not iteration % 50: # check the actual learning rate every 50 iters (the learning_rate variable can go out-of-sync with the real LR)
                learning_rate = optimizer.param_groups[0]['lr']
            # Learning Rate Schedule
            if custom_lr:
                old_lr = learning_rate
                if iteration < warmup_start:
                    learning_rate = warmup_start_lr
                elif iteration < warmup_end:
                    learning_rate = (iteration-warmup_start)*((A_+C_)-warmup_start_lr)/(warmup_end-warmup_start) + warmup_start_lr # learning rate increases linearly from warmup_start_lr to A_+C_ over (warmup_end-warmup_start) iterations.
                else:
                    if iteration < decay_start:
                        learning_rate = A_ + C_
                    else:
                        iteration_adjusted = iteration - decay_start
                        learning_rate = (A_*(e**(-iteration_adjusted/B_))) + C_
                assert learning_rate > -1e-8, "Negative Learning Rate."
                if old_lr != learning_rate:
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = learning_rate
            else:
                scheduler.patience = scheduler_patience
                scheduler.cooldown = scheduler_cooldown
                if override_scheduler_last_lr:
                    scheduler._last_lr = override_scheduler_last_lr
                    print("Scheduler last_lr overridden. scheduler._last_lr =", scheduler._last_lr)
                if override_scheduler_best:
                    scheduler.best = override_scheduler_best
                    print("Scheduler best metric overridden. scheduler.best =", override_scheduler_best)
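
# The loop above exec()s a "run_every_epoch.py" file every param_interval iterations so
# hyperparameters can be adjusted without restarting training. The file itself is not part
# of this listing; the sketch below shows the kind of names the loop expects to find in it
# (every value here is a placeholder assumption, not the author's settings):
#
# --- run_every_epoch.py (example contents) ---
from math import e                   # used by the A_*e**(-t/B_) decay term
param_interval = 100                 # how often (in iterations) to re-read this file
show_live_params = False             # print the file contents after loading it
custom_lr = True                     # True: manual warmup/decay schedule; False: ReduceLROnPlateau
warmup_start, warmup_end = 0, 1000   # linear LR warmup window (iterations)
warmup_start_lr = 1e-5
A_, B_, C_ = 1e-3, 40000, 1e-5       # LR decays from A_+C_ toward C_ with time constant B_
decay_start = 40000                  # iteration at which the exponential decay begins
scheduler_patience, scheduler_cooldown = 20, 2
override_scheduler_last_lr = False
override_scheduler_best = False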
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = initiate_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    single_train_loader, single_valset, single_collate_fn, single_train_sampler = prepare_single_dataloaders(
        hparams, output_directory)
    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(
        hparams, output_directory)
    single_train_loader.dataset.speaker_ids = train_loader.dataset.speaker_ids
    single_valset.speaker_ids = train_loader.dataset.speaker_ids
    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            # model = torch.nn.DataParallel(model)
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(single_train_loader)))

    model = torch.nn.DataParallel(model)
    model.train()
    is_overflow = False
    # init training loop with single speaker
    for epoch in range(epoch_offset, 30):
        print("Epoch: {}".format(epoch))
        if single_train_sampler is not None:
            single_train_sampler.set_epoch(epoch)
        for i, batch in enumerate(single_train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = parse_batch(batch)
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model(
                x)
            y_pred = parse_output(
                [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
                length)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, single_valset, iteration,
                         hparams.batch_size, n_gpus, single_collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model.module, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1

    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(30, hparams.epochs):
        print("Epoch: {}".format(epoch))
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            if iteration > 0 and iteration % hparams.learning_rate_anneal == 0:
                learning_rate = max(hparams.learning_rate_min,
                                    learning_rate * 0.5)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = parse_batch(batch)
            mel_outputs, mel_outputs_postnet, gate_outputs, alignments, length = model(
                x)
            y_pred = parse_output(
                [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
                length)
            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model.module, optimizer, learning_rate,
                                    iteration, checkpoint_path)

            iteration += 1
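
# Both training phases above halve the learning rate every hparams.learning_rate_anneal
# iterations, with hparams.learning_rate_min as a floor. The same schedule expressed as a
# closed-form helper (a sketch for clarity, not part of the original code):
def annealed_lr(base_lr, min_lr, anneal_every, iteration):
    # e.g. base_lr=1e-3, min_lr=1e-5, anneal_every=50000 gives 1e-3 up to iter 50000,
    # then 5e-4, 2.5e-4, ... never dropping below 1e-5.
    return max(min_lr, base_lr * 0.5 ** (iteration // anneal_every))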
Beispiel #28
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams, run_name, prj_name, resume):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()
    criterion_dom = torch.nn.CrossEntropyLoss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank, run_name, prj_name, resume)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    for param_group in optimizer.param_groups:
        param_group['initial_lr'] = learning_rate
    scheduler = torch.optim.lr_scheduler.ExponentialLR(
        optimizer,
        0.5**(1 / (125000 * (64 / hparams.batch_size))),
        last_epoch=-1)
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            batches_per_epoch = len(train_loader)
            float_epoch = iteration / batches_per_epoch
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = scheduler.get_lr()[0]

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)

            if prj_name == "forward_attention_loss":
                input_lengths = x[1]
                alignments = y_pred[3]
                mean_far, _ = forward_attention_ratio(alignments,
                                                      input_lengths)
                if mean_far > 0.95:
                    fa_loss = forward_attention_loss(alignments, input_lengths)
                    loss += fa_loss
                    float_fa_loss = fa_loss.item()
                else:
                    float_fa_loss = None
            else:
                float_fa_loss = None

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            learning_rate = scheduler.get_lr()[0]
            print("learning_rate:", learning_rate)
            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, x, y_pred, iteration,
                                    float_epoch, float_fa_loss)

            if not is_overflow and (
                (iteration % hparams.iters_per_checkpoint == 0) or
                (i + 1 == batches_per_epoch)):
                validate(model, criterion, valset, iteration, float_epoch,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank, hparams.sampling_rate)
                if rank == 0 and (iteration % hparams.iters_per_checkpoint
                                  == 0):
                    checkpoint_path = os.path.join(
                        os.path.join(output_directory, prj_name, run_name),
                        "checkpoint_{}-epoch_{:.4}".format(
                            iteration, float_epoch))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)
                if rank == 0 and (i + 1 == batches_per_epoch):
                    checkpoint_path = os.path.join(
                        os.path.join(output_directory, prj_name, run_name),
                        "checkpoint_{}-epoch_{:.4}_end-epoch_{}".format(
                            iteration, float_epoch, epoch))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            if iteration > round(50000 * (64 / hparams.batch_size)):
                scheduler.step()

            iteration += 1
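
# Beispiel #28 steps an ExponentialLR once per iteration (after an initial hold of
# 50000 * (64 / batch_size) iterations). Its gamma, 0.5 ** (1 / (125000 * (64 / batch_size))),
# is chosen so the learning rate halves every 125000 * (64 / batch_size) scheduler steps,
# i.e. roughly every 125k iterations at batch size 64. A quick check of that property:
batch_size = 64
halving_steps = 125000 * (64 / batch_size)
gamma = 0.5 ** (1 / halving_steps)
assert abs(gamma ** halving_steps - 0.5) < 1e-6   # LR multiplier after one halving period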
Beispiel #29
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout
    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """

    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    torch.nn.functional.sigmoid
    model = load_model(hparams)

    learning_rate = hparams.learning_rate
    #lr = args.lr * (0.1 ** (epoch // 30))
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)
    # optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, dampening=0, weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        optimizer = FP16_Optimizer(
            optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss(hparams)

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1

            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    step = 0
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss, recon_loss, S_kl_loss, R_kl_loss, speaker_loss, augment_loss, alignment_loss = criterion(
                y_pred, y, iteration)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                optimizer.backward(loss)
                grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            overflow = optimizer.overflow if hparams.fp16_run else False

            if not overflow and not math.isnan(reduced_loss) and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate, duration, recon_loss, S_kl_loss, R_kl_loss, \
                                    speaker_loss, augment_loss, alignment_loss, iteration)

            if not overflow and (iteration % hparams.iters_per_checkpoint
                                 == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Beispiel #30
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """

    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    if hparams.use_vae:
        criterion = Tacotron2Loss_VAE(hparams)
    else:
        criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank, hparams.use_vae)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    valset_csv = os.path.join(output_directory, log_directory, 'valset.csv')
    # list2csv(flatten_list(valset.audiopaths_and_text), valset_csv, delimiter='|')
    list2csv(valset.audiopaths_and_text, valset_csv, delimiter='|')

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration, epoch, step = \
                load_checkpoint(checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            if epoch == 0:
                iteration += 1  # next iteration is iteration + 1
                epoch_offset = max(0, int(iteration / len(train_loader)))
            else:
                epoch_offset = epoch
            print('epoch offset: {}'.format(epoch_offset))
            train_loader = prepare_dataloaders(hparams, epoch_offset, valset,
                collate_fn['train'])[0]
        print('completing loading model ...')

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    track_csv = os.path.join(output_directory, log_directory, 'track.csv')
    track_header = ['padding-rate-txt', 'max-len-txt', 'top-len-txt',
        'padding-rate-mel', 'max-len-mel', 'top-len-mel', 'batch-size',
        'batch-length', 'batch-area', 'mem-use', 'mem-all', 'mem-cached',
        'duration', 'iteration', 'epoch', 'step']
    if os.path.isfile(track_csv) and checkpoint_path is not None:
        print('loading existing {} ...'.format(track_csv))
        track = csv2dict(track_csv, header=track_header)
    else:
        track = {k:[] for k in track_header}

    print('start training in epoch {} ~ {} ...'.format(epoch_offset, hparams.epochs))
    nbatches = len(train_loader)
    for epoch in range(epoch_offset, hparams.epochs):
        #if epoch >= 10: break
        print("Epoch: {}, #batches: {}".format(epoch, nbatches))
        batch_sizes, batch_lengths = [0] * nbatches, [0] * nbatches
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            if hparams.use_vae:
                loss, recon_loss, kl, kl_weight = criterion(y_pred, y, iteration)
            else:
                loss = criterion(y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                batch_sizes[i], batch_lengths[i] = batch[0].size(0), batch[2].size(2)
                batch_capacity = batch_sizes[i] * batch_lengths[i]
                mem_all = torch.cuda.memory_allocated() / (1024**2)
                mem_cached = torch.cuda.memory_cached() / (1024**2)
                mem_use = mem_all + mem_cached
                print("{} ({}:{}/{}): ".format(iteration, epoch, i, nbatches), end='')
                print("Batch {} ({}X{}) ".format(batch_capacity, batch_sizes[i],
                    batch_lengths[i]), end='')
                print("Mem {:.1f} ({:.1f}+{:.1f}) ".format(mem_use, mem_all,
                    mem_cached), end='')
                print("Train loss {:.3f} Grad Norm {:.3f} {:.2f}s/it".format(
                    reduced_loss, grad_norm, duration))
                input_lengths, gate_padded = batch[1], batch[4]
                metadata = (duration, iteration, epoch, i)
                track_seq(track, input_lengths, gate_padded, metadata)
                padding_rate_txt = track['padding-rate-txt'][-1]
                max_len_txt = track['max-len-txt'][-1]
                padding_rate_mel = track['padding-rate-mel'][-1]
                max_len_mel = track['max-len-mel'][-1]
                if hparams.use_vae:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration, recon_loss, kl, kl_weight)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration)

            check_by_iter = (hparams.check_by == 'iter') and \
                            (iteration % hparams.iters_per_checkpoint == 0)
            check_by_epoch = (hparams.check_by == 'epoch') and i == 0 and \
                             (epoch % hparams.epochs_per_checkpoint == 0)
            if not is_overflow and (check_by_iter or check_by_epoch):
                dict2col(track, track_csv, verbose=True)
                val_loss, (mus, emotions) = validate(model, criterion, valset,
                     iteration, hparams.batch_size, n_gpus, collate_fn['val'], logger,
                     hparams.distributed_run, rank, hparams.use_vae, pre_batching=False)
                if rank == 0:
                    checkpoint_path = os.path.join(output_directory,
                        "checkpoint_{}-{}-{}_{:.3f}".format(iteration, epoch, i, val_loss))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                         epoch, i, checkpoint_path)
                    if hparams.use_vae:
                        image_scatter_path = os.path.join(output_directory,
                             "checkpoint_{0}_scatter_val.png".format(iteration))
                        image_tsne_path = os.path.join(output_directory,
                             "checkpoint_{0}_tsne_val.png".format(iteration))
                        imageio.imwrite(image_scatter_path, plot_scatter(mus, emotions))
                        imageio.imwrite(image_tsne_path, plot_tsne(mus, emotions))

            iteration += 1

        if hparams.prep_trainset_per_epoch:
            train_loader = prepare_dataloaders(hparams, epoch+1, valset,
                collate_fn['train'])[0]
            nbatches = len(train_loader)