Example #1
0
def train(n_gpus,
          rank,
          output_directory,
          epochs,
          optim_algo,
          learning_rate,
          weight_decay,
          sigma,
          iters_per_checkpoint,
          batch_size,
          seed,
          checkpoint_path,
          ignore_layers,
          include_layers,
          finetune_layers,
          warmstart_checkpoint_path,
          with_tensorboard,
          grad_clip_val,
          fp16_run,
          tensorboard_path=None):
    fp16_run = bool(fp16_run)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             bool(model_config['use_gate_layer']))
    model = Flowtron(**model_config).cuda()

    if len(finetune_layers):
        for name, param in model.named_parameters():
            if name in finetune_layers:
                param.requires_grad = True
            else:
                param.requires_grad = False

    print("Initializing %s optimizer" % (optim_algo))
    if optim_algo == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
    elif optim_algo == 'RAdam':
        optimizer = RAdam(model.parameters(),
                          lr=learning_rate,
                          weight_decay=weight_decay)
    else:
        print("Unrecognized optimizer %s!" % (optim_algo))
        exit(1)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    scaler = amp.GradScaler(enabled=fp16_run)

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("Output directory", output_directory)

    if with_tensorboard and rank == 0:
        tboard_out_path = tensorboard_path
        if tensorboard_path is None:
            tboard_out_path = os.path.join(output_directory, "logs/run1")
        print("Setting up Tensorboard log in %s" % (tboard_out_path))
        logger = FlowtronLogger(tboard_out_path)

    # force set the learning rate to what is specified
    for param_group in optimizer.param_groups:
        param_group['lr'] = learning_rate

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))

    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target, attn_prior = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
            ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
            ), gate_target.cuda()
            attn_prior = attn_prior.cuda() if valset.use_attn_prior else None
            with amp.autocast(enabled=fp16_run):
                z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                    mel, speaker_vecs, text, in_lens, out_lens, attn_prior)

                loss_nll, loss_gate = criterion(
                    (z, log_s_list, gate_pred, mean, log_var, prob),
                    gate_target, out_lens)
                loss = loss_nll + loss_gate

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
                reduced_gate_loss = reduce_tensor(loss_gate.data,
                                                  n_gpus).item()
                reduced_nll_loss = reduce_tensor(loss_nll.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
                reduced_gate_loss = loss_gate.item()
                reduced_nll_loss = loss_nll.item()

            scaler.scale(loss).backward()
            if grad_clip_val > 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               grad_clip_val)

            scaler.step(optimizer)
            scaler.update()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('training_loss_gate', reduced_gate_loss,
                                  iteration)
                logger.add_scalar('training_loss_nll', reduced_nll_loss,
                                  iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if iteration % iters_per_checkpoint == 0:
                val_loss, val_loss_nll, val_loss_gate, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, val_loss_nll,
                                              val_loss_gate, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
Example #2
0
def train(n_gpus, rank, output_directory, epochs, learning_rate, weight_decay,
          sigma, iters_per_checkpoint, batch_size, seed, checkpoint_path,
          ignore_layers, include_layers, warmstart_checkpoint_path,
          with_tensorboard, fp16_run):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    if n_gpus > 1:
        init_distributed(rank, n_gpus, **dist_config)

    criterion = FlowtronLoss(sigma, bool(model_config['n_components']),
                             model_config['use_gate_layer'])
    model = Flowtron(**model_config).cuda()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=weight_decay)

    # Load checkpoint if one exists
    iteration = 0
    if warmstart_checkpoint_path != "":
        model = warmstart(warmstart_checkpoint_path, model)

    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer, ignore_layers)
        iteration += 1  # next iteration is iteration + 1

    if n_gpus > 1:
        model = apply_gradient_allreduce(model)
    print(model)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    train_loader, valset, collate_fn = prepare_dataloaders(
        data_config, n_gpus, batch_size)

    # Get shared output_directory ready
    if rank == 0 and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
    print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        logger = FlowtronLogger(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for batch in train_loader:
            model.zero_grad()

            mel, speaker_vecs, text, in_lens, out_lens, gate_target = batch
            mel, speaker_vecs, text = mel.cuda(), speaker_vecs.cuda(
            ), text.cuda()
            in_lens, out_lens, gate_target = in_lens.cuda(), out_lens.cuda(
            ), gate_target.cuda()

            z, log_s_list, gate_pred, attn, mean, log_var, prob = model(
                mel, speaker_vecs, text, in_lens, out_lens)
            loss = criterion((z, log_s_list, gate_pred, mean, log_var, prob),
                             gate_target, out_lens)

            if n_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if rank == 0:
                print("{}:\t{:.9f}".format(iteration, reduced_loss),
                      flush=True)

            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss, iteration)
                logger.add_scalar('learning_rate', learning_rate, iteration)

            if (iteration % iters_per_checkpoint == 0):
                val_loss, attns, gate_pred, gate_target = compute_validation_loss(
                    model, criterion, valset, collate_fn, batch_size, n_gpus)
                if rank == 0:
                    print("Validation loss {}: {:9f}  ".format(
                        iteration, val_loss))
                    if with_tensorboard:
                        logger.log_validation(val_loss, attns, gate_pred,
                                              gate_target, iteration)

                    checkpoint_path = "{}/model_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1