def train_model(rank, world_size, args):
    if 1 < args.n_gpu:
        init_process_group(rank, world_size)
    # rank 0 (or the single process when world_size == 0) acts as master for logging/saving
    master = (world_size == 0 or rank % world_size == 0)
    if master:
        wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    if 1 < args.n_gpu:
        model.to(config.device)
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    else:
        model.to(config.device)
    if master:
        wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    # evaluate on the dev split, not the training file
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master:
            wandb.log({"loss": loss, "accuracy": score})

        # only the master process saves, and only when the score improves
        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, score={best_score:.3f}")

    if 1 < args.n_gpu:
        destroy_process_group()
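# Launch sketch (not part of the original source): train_model expects
# (rank, world_size, args), so a multi-GPU run can be started with
# torch.multiprocessing.spawn, which passes the process rank as the first
# positional argument. The argparse defaults below are hypothetical; only the
# attribute names (n_gpu, vocab, config, save, epoch, weight_decay,
# learning_rate, adam_epsilon, warmup_steps) come from train_model itself.
def launch_training():
    import argparse
    import torch
    import torch.multiprocessing as mp

    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab", type=str, default="kowiki.model")
    parser.add_argument("--config", type=str, default="config.json")
    parser.add_argument("--save", type=str, default="save_best.pth")
    parser.add_argument("--epoch", type=int, default=20)
    parser.add_argument("--weight_decay", type=float, default=0.0)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--adam_epsilon", type=float, default=1e-8)
    parser.add_argument("--warmup_steps", type=int, default=0)
    args = parser.parse_args()
    args.n_gpu = torch.cuda.device_count()

    if 1 < args.n_gpu:
        # one process per GPU; each process runs train_model(rank, world_size, args)
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu, join=True)
    else:
        # single-process path; world_size == 0 makes this process the master
        train_model(0, 0, args)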
def train(cfg_path, device='cuda'):
    if cfg_path is not None:
        cfg.merge_from_file(cfg_path)
    cfg.freeze()

    if not os.path.isdir(cfg.LOG_DIR):
        os.makedirs(cfg.LOG_DIR)
    if not os.path.isdir(cfg.SAVE_DIR):
        os.makedirs(cfg.SAVE_DIR)

    model = UNet(cfg.NUM_CHANNELS, cfg.NUM_CLASSES)
    model.to(device)

    train_data_loader = build_data_loader(cfg, 'train')
    val_data_loader = build_data_loader(cfg, 'val') if cfg.VAL else None

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    criterion = get_loss_func(cfg)

    writer = SummaryWriter(cfg.LOG_DIR)
    iter_counter = 0
    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    min_val_loss = 1e10

    print('Training Start')
    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        print('Epoch {}/{}'.format(epoch + 1, cfg.SOLVER.MAX_EPOCH))
        if lr_scheduler is not None:
            lr_scheduler.step(epoch)

        model.train()
        for data in train_data_loader:
            iter_counter += 1
            imgs, annots = data
            imgs = imgs.to(device)
            annots = annots.to(device)

            y = model(imgs)
            optimizer.zero_grad()
            loss = criterion(y, annots)
            loss.backward()
            optimizer.step()

            loss_meter.update(loss.item())
            # log the running training loss (and learning rate) every 10 iterations
            if iter_counter % 10 == 0:
                writer.add_scalars('loss', {'train': loss_meter.avg}, iter_counter)
                loss_meter.reset()
                if lr_scheduler is not None:
                    writer.add_scalar('learning rate', optimizer.param_groups[0]['lr'], iter_counter)

        save_as_checkpoint(model, optimizer, os.path.join(cfg.SAVE_DIR, 'checkpoint.pth'), epoch, iter_counter)

        # Skip validation when cfg.VAL is False
        if val_data_loader is None:
            continue

        # Validation: eval mode, no gradients, and reset the meter once per epoch
        # so the average covers every validation batch.
        model.eval()
        val_loss_meter.reset()
        with torch.no_grad():
            for data in val_data_loader:
                imgs, annots = data
                imgs = imgs.to(device)
                annots = annots.to(device)
                y = model(imgs)
                loss = criterion(y, annots)
                val_loss_meter.update(loss.item())

        writer.add_scalars('loss', {'val': val_loss_meter.avg}, iter_counter)
        # save model if validation loss is minimum
        if val_loss_meter.avg < min_val_loss:
            min_val_loss = val_loss_meter.avg
            torch.save(model.state_dict(), os.path.join(cfg.SAVE_DIR, 'min_val_loss.pth'))
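# train() above depends on an AverageMeter helper that is not defined in this
# section; the class below is a minimal sketch of the assumed interface
# (update / avg / reset), not the original implementation.
class AverageMeter:
    """Tracks a running sum and count and exposes the mean of logged values."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count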