def load_model(cfg):
    """Build a TCTRN Tacotron model from *cfg* and apply run-mode tweaks.

    With FP16 enabled, the attention score mask value is lowered to the
    float16 minimum so masked positions remain effectively -inf in half
    precision; distributed runs get gradient all-reduce hooks attached.
    """
    model = TCTRN_Tacotron(cfg)
    experiment_cfg = cfg["EXPERIMENT"]
    if experiment_cfg["FP_16RUN"]:
        model.decoder.attention_layer.score_mask_value = finfo("float16").min
    if experiment_cfg["DISTRIBUTED_RUN"]:
        model = apply_gradient_allreduce(model)
    return model
def load_model(hparams, distributed_run=False):
    """Build a Tacotron2 model from *hparams* and apply run-mode tweaks.

    GPU-only adjustments: with FP16 enabled the attention score mask
    value drops to the float16 minimum (so masking survives half
    precision), and distributed runs get gradient all-reduce attached.
    """
    model = Tacotron2(hparams)
    on_gpu = hparams.device != "cpu"
    if hparams.fp16_run and on_gpu:
        model.decoder.attention_layer.score_mask_value = finfo("float16").min
    if distributed_run and on_gpu:
        model = apply_gradient_allreduce(model)
    return model
def train(hparams, distributed_run=False, rank=0, n_gpus=None):
    """Train the model (apex mixed-precision variant), logging results to
    TensorBoard and stdout.

    Args:
        hparams: hyper-parameter namespace (seed, fp16_run, checkpoint,
            optimizer/scheduler settings, output/log dirs, ...).
        distributed_run: if True, run with gradient all-reduce across GPUs.
        rank: rank of the current process; only rank 0 prints, logs and
            saves checkpoints.
        n_gpus: number of GPUs; required when ``distributed_run`` is True.
    """
    if distributed_run:
        assert n_gpus is not None
    # Seed both CPU and CUDA RNGs for reproducibility.
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    model = load_model(hparams, distributed_run)
    optimizer = build_optimizer(model, hparams)
    lr_scheduler = build_scheduler(optimizer, hparams)
    criterion = OverallLoss(hparams)
    if hparams.fp16_run:
        # NVIDIA apex mixed precision; O2 keeps fp32 master weights.
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    if distributed_run:
        model = apply_gradient_allreduce(model)
    logger = prepare_directories_and_logger(hparams.output_dir,
                                            hparams.log_dir, rank)
    # Snapshot the hyper-parameters next to the checkpoints for provenance.
    copyfile(hparams.path, os.path.join(hparams.output_dir, 'hparams.yaml'))
    train_loader, valset, collate_fn = prepare_dataloaders(
        hparams, distributed_run)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if hparams.checkpoint is not None:
        if hparams.warm_start:
            # Warm start: restore weights only, skipping ignored layers;
            # iteration/epoch counters start from zero.
            model = warm_start_model(hparams.checkpoint, model,
                                     hparams.ignore_layers)
        else:
            # Full restore. NOTE(review): the returned mmi_criterion is
            # never used afterwards — confirm this is intentional.
            model, optimizer, lr_scheduler, mmi_criterion, iteration = load_checkpoint(
                hparams.checkpoint, model, optimizer, lr_scheduler, criterion,
                hparams.restore_scheduler_state)
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    # NOTE(review): is_overflow is only ever set in the fp16 branch below,
    # so in fp32 runs it stays False for the whole training.
    is_overflow = False
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            torch.cuda.empty_cache()
            start = time.perf_counter()
            model.zero_grad()
            inputs, alignments, inputs_ctc = model.parse_batch(batch)
            outputs, decoder_outputs = model(inputs)
            losses = criterion(outputs, inputs, alignments=alignments,
                               inputs_ctc=inputs_ctc,
                               decoder_outputs=decoder_outputs)
            # Every UPDATE_GAF_EVERY_N_STEP batches, rescale the MI loss by
            # a gradient adaptive factor so it does not dominate the
            # overall gradient.
            if hparams.use_mmi and hparams.use_gaf and i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0:
                mi_loss = losses["mi/loss"]
                overall_loss = losses["overall/loss"]
                gaf = calc_gaf(model, optimizer, overall_loss, mi_loss,
                               hparams.max_gaf)
                losses["mi/loss"] = gaf * mi_loss
                losses["overall/loss"] = overall_loss - mi_loss * (1 - gaf)
            # Reduce each loss across workers for logging (no-op when not
            # distributed).
            reduced_losses = {
                key: reduce_loss(value, distributed_run, n_gpus)
                for key, value in losses.items()
            }
            loss = losses["overall/loss"]
            if hparams.fp16_run:
                # Loss scaling via apex; clip the fp32 master gradients.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                # A NaN norm signals fp16 overflow; logging/checkpointing
                # for this step is skipped below.
                is_overflow = math.isnan(grad_norm)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)
            optimizer.step()
            if not is_overflow and rank == 0:
                learning_rate = lr_scheduler.get_last_lr()[0]
                duration = time.perf_counter() - start
                print(
                    "Iteration {}: overall loss {:.6f} Grad Norm {:.6f} {:.2f}s/it LR {:.3E}"
                    .format(iteration, reduced_losses["overall/loss"],
                            grad_norm, duration, learning_rate))
                logger.log_training(reduced_losses, grad_norm, learning_rate,
                                    duration, iteration)
            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                # Validation runs on every rank; only rank 0 saves.
                val_loss = validate(model, criterion, valset, iteration,
                                    hparams.batch_size, collate_fn, logger,
                                    distributed_run, rank, n_gpus)
                if rank == 0:
                    checkpoint = os.path.join(
                        hparams.output_dir,
                        "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, lr_scheduler, criterion,
                                    iteration, hparams, checkpoint)
            iteration += 1
            # Cyclic schedulers step once per iteration.
            if hparams.lr_scheduler == SchedulerTypes.cyclic:
                lr_scheduler.step()
        # Non-cyclic schedulers step once per epoch.
        if not hparams.lr_scheduler == SchedulerTypes.cyclic:
            # TODO: for plateau, the validation error should really be
            # recomputed at the end of each epoch (translated from Russian).
            # NOTE(review): val_loss may be unbound here if no validation
            # ran this epoch (e.g. an fp16 overflow on the first batch) —
            # this would raise NameError for the plateau scheduler; verify.
            scheduler_args = () if hparams.lr_scheduler != SchedulerTypes.plateau else (val_loss, )
            lr_scheduler.step(*scheduler_args)
def train(hparams, distributed_run=False, rank=0, n_gpus=None):
    """Train the model (native torch AMP variant), logging results to
    TensorBoard and stdout.

    Args:
        hparams: hyper-parameter namespace (seed, fp16_run, checkpoint,
            optimizer/scheduler settings, output/log dirs, ...).
        distributed_run: if True, run with gradient all-reduce across GPUs.
        rank: rank of the current process; only rank 0 prints, logs and
            saves checkpoints.
        n_gpus: number of GPUs; required when ``distributed_run`` is True.
    """
    if distributed_run:
        assert n_gpus is not None
    # Seed both CPU and CUDA RNGs for reproducibility.
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    model = load_model(hparams, distributed_run)
    criterion = OverallLoss(hparams)
    # The MMI criterion has trainable parameters of its own, so the
    # optimizer must see them alongside the model's.
    if criterion.mmi_criterion is not None:
        parameters = chain(model.parameters(),
                           criterion.mmi_criterion.parameters())
    else:
        parameters = model.parameters()
    optimizer = build_optimizer(parameters, hparams)
    lr_scheduler = build_scheduler(optimizer, hparams)
    if distributed_run:
        model = apply_gradient_allreduce(model)
    # Gradient scaler is a no-op when fp16_run is False.
    # (presumably `amp` here is torch.cuda.amp — confirm the file's import)
    scaler = amp.GradScaler(enabled=hparams.fp16_run)
    logger = prepare_directories_and_logger(hparams.output_dir,
                                            hparams.log_dir, rank)
    # Snapshot the hyper-parameters next to the checkpoints for provenance.
    copyfile(hparams.path, os.path.join(hparams.output_dir, 'hparams.yaml'))
    train_loader, valset, collate_fn = prepare_dataloaders(
        hparams, distributed_run)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if hparams.checkpoint is not None:
        if hparams.warm_start:
            # Warm start: restore weights only, skipping ignored and
            # shape-mismatched layers; counters start from zero.
            model = warm_start_model(hparams.checkpoint, model,
                                     hparams.ignore_layers,
                                     hparams.ignore_mismatched_layers)
        else:
            # Full restore. NOTE(review): the returned mmi_criterion is
            # never used afterwards — confirm this is intentional.
            model, optimizer, lr_scheduler, mmi_criterion, iteration = load_checkpoint(
                hparams.checkpoint, model, optimizer, lr_scheduler, criterion,
                hparams.restore_scheduler_state)
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            model.zero_grad()
            inputs, alignments, inputs_ctc = model.parse_batch(batch)
            # Forward pass and loss under autocast (mixed precision when
            # fp16_run is enabled).
            with amp.autocast(enabled=hparams.fp16_run):
                outputs, decoder_outputs = model(inputs)
                losses = criterion(outputs, inputs, alignments=alignments,
                                   inputs_ctc=inputs_ctc,
                                   decoder_outputs=decoder_outputs)
            # Every UPDATE_GAF_EVERY_N_STEP batches, rescale the MI loss by
            # a gradient adaptive factor so it does not dominate the
            # overall gradient.
            if hparams.use_mmi and hparams.use_gaf and i % gradient_adaptive_factor.UPDATE_GAF_EVERY_N_STEP == 0:
                mi_loss = losses["mi/loss"]
                overall_loss = losses["overall/loss"]
                gaf = calc_gaf(model, optimizer, overall_loss, mi_loss,
                               hparams.max_gaf)
                losses["mi/loss"] = gaf * mi_loss
                losses["overall/loss"] = overall_loss - mi_loss * (1 - gaf)
            # Reduce each loss across workers for logging (no-op when not
            # distributed).
            reduced_losses = {
                key: reduce_loss(value, distributed_run, n_gpus)
                for key, value in losses.items()
            }
            loss = losses["overall/loss"]
            # Standard GradScaler sequence: scale->backward, unscale so the
            # clip threshold applies to true gradients, then step+update.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(
                model.parameters(), hparams.grad_clip_thresh)
            scaler.step(optimizer)
            scaler.update()
            if rank == 0:
                learning_rate = lr_scheduler.get_last_lr()[0]
                duration = time.perf_counter() - start
                print(
                    "Iteration {} ({} epoch): overall loss {:.6f} Grad Norm {:.6f} {:.2f}s/it LR {:.3E}"
                    .format(iteration, epoch, reduced_losses["overall/loss"],
                            grad_norm, duration, learning_rate))
                # Don't log a NaN/Inf gradient norm (overflow steps).
                grad_norm = None if torch.isnan(grad_norm) or torch.isinf(
                    grad_norm) else grad_norm
                logger.log_training(reduced_losses, grad_norm, learning_rate,
                                    duration, iteration)
            if iteration % hparams.iters_per_checkpoint == 0:
                # Validation runs on every rank; only rank 0 saves.
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, collate_fn, logger,
                         distributed_run, rank, n_gpus)
                if rank == 0:
                    checkpoint = os.path.join(
                        hparams.output_dir,
                        "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, lr_scheduler, criterion,
                                    iteration, hparams, checkpoint)
            iteration += 1
            # Cyclic schedulers step once per iteration.
            if hparams.lr_scheduler == SchedulerTypes.cyclic:
                lr_scheduler.step()
        # Non-cyclic schedulers step once per epoch; plateau gets a fresh
        # end-of-epoch validation loss.
        if not hparams.lr_scheduler == SchedulerTypes.cyclic:
            if hparams.lr_scheduler == SchedulerTypes.plateau:
                lr_scheduler.step(
                    validate(model, criterion, valset, iteration,
                             hparams.batch_size, collate_fn, logger,
                             distributed_run, rank, n_gpus))
            else:
                lr_scheduler.step()
def train_tacotron(output_directory, log_directory, checkpoint_path,
                   warm_start, n_gpus, rank, group_name, cfg):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    warm_start (bool): restore model weights only, skipping ignored layers
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): distributed process-group name
    cfg (object): comma separated list of "name=value" pairs.
    """
    # experimental parameters
    distributed_run = cfg["EXPERIMENT"]["DISTRIBUTED_RUN"]
    fp_16run = cfg["EXPERIMENT"]["FP_16RUN"]
    if distributed_run:
        init_distributed_run(cfg, n_gpus, rank, group_name)
    # Seed both CPU and CUDA RNGs for reproducibility.
    manual_seed = cfg["EXPERIMENT"]["SEED"]
    torch.manual_seed(manual_seed)
    torch.cuda.manual_seed(manual_seed)
    model = load_model(cfg)
    learning_rate = cfg["OPT"]["LEARNING_RATE"]
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=cfg["OPT"]["WEIGHT_DECAY"])
    if fp_16run:
        # TODO can remove fp16 run
        # NVIDIA apex mixed precision; O2 keeps fp32 master weights.
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level="O2"
        )
    if distributed_run:
        model = apply_gradient_allreduce(model)
    criterion = TCTRN_TacotronLoss()
    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)
    train_loader, valset, collate_fn = prepare_dataloaders(cfg)

    # load ckpt
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     cfg["EXPERIMENT"]["IGNORE_LAYERS"])
        else:
            # NOTE(review): `laod_checkpoint` looks like a typo of
            # `load_checkpoint` — verify which name the project defines.
            model, optimizer, _learning_rate, iteration = laod_checkpoint(
                checkpoint_path, model, optimizer)
            if cfg["EXPERIMENT"]["USE_SAVED_LEARNING_RATE"]:
                learning_rate = _learning_rate
            iteration += 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    # NOTE(review): is_overflow is only ever set in the fp16 branch below,
    # so in fp32 runs it stays False for the whole training.
    is_overflow = False
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, cfg["EXPERIMENT"]["EPOCH"]):
        print(f"Epoch: {epoch}")
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            # TODO
            # Re-apply the (possibly checkpoint-restored) learning rate on
            # every step.
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rate
            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            # Average the loss across workers for logging (plain .item()
            # when not distributed).
            if distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus)
            else:
                reduced_loss = loss.item()
            if fp_16run:
                # Loss scaling via apex.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if fp_16run:
                # Clip the fp32 master gradients; a NaN norm signals fp16
                # overflow, and logging/checkpointing is skipped below.
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer),
                    cfg["OPT"]["GRAD_CLIP_THRESH"])
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), cfg["OPT"]["GRAD_CLIP_THRESH"])
            optimizer.step()
            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration,
                    iteration)
            if not is_overflow and (iteration % cfg["EXPERIMENT"]["ITERS_PER_CHECKPOINT"] == 0):
                # Validation runs on every rank; only rank 0 saves.
                validate(model, criterion, valset, iteration,
                         cfg["OPT"]["BATCH_SIZE"], n_gpus, collate_fn, logger,
                         distributed_run, rank)
                if rank == 0:
                    # NOTE(review): this rebinds the `checkpoint_path`
                    # argument — harmless here (only checked before the
                    # loop), but worth renaming for clarity.
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path)
            iteration += 1