Example no. 1
def train_model(rank, world_size, args):
    if args.n_gpu > 1:
        init_process_group(rank, world_size)
    # rank 0 acts as master (world_size == 0 covers the single-process case)
    master = (world_size == 0 or rank % world_size == 0)
    if master: wandb.init(project="transformer-evolution")

    vocab = load_vocab(args.vocab)

    config = cfg.Config.load(args.config)
    config.n_enc_vocab, config.n_dec_vocab = len(vocab), len(vocab)
    config.device = torch.device(f"cuda:{rank}" if torch.cuda.is_available() else "cpu")
    print(config)

    best_epoch, best_loss, best_score = 0, 0, 0
    model = transformer.QA(config)
    if os.path.isfile(args.save):
        best_epoch, best_loss, best_score = model.load(args.save)
        print(f"rank: {rank} load state dict from: {args.save}")
    model.to(config.device)
    if args.n_gpu > 1:
        model = DistributedDataParallel(model, device_ids=[rank], find_unused_parameters=True)
    if master: wandb.watch(model)

    criterion = torch.nn.CrossEntropyLoss()

    train_loader, train_sampler = data.build_data_loader(vocab, "KorQuAD_v1.0_train.json", args, shuffle=True)
    test_loader, _ = data.build_data_loader(vocab, "KorQuAD_v1.0_dev.json", args, shuffle=False)  # evaluate on the dev split, not the training file

    t_total = len(train_loader) * args.epoch
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # NOTE: AdamW and get_linear_schedule_with_warmup are assumed to come from the
    # Hugging Face `transformers` optimization module imported as `optim`
    # (torch.optim provides no get_linear_schedule_with_warmup)
    optimizer = optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = optim.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    offset = best_epoch
    for step in trange(args.epoch, desc="Epoch"):
        if train_sampler:
            train_sampler.set_epoch(step)
        epoch = step + offset

        loss = train_epoch(config, rank, epoch, model, criterion, optimizer, scheduler, train_loader)
        score = eval_epoch(config, rank, model, test_loader)
        if master: wandb.log({"loss": loss, "accuracy": score})

        if master and best_score < score:
            best_epoch, best_loss, best_score = epoch, loss, score
            if isinstance(model, DistributedDataParallel):
                model.module.save(best_epoch, best_loss, best_score, args.save)
            else:
                model.save(best_epoch, best_loss, best_score, args.save)
            print(f">>>> rank: {rank} save model to {args.save}, epoch={best_epoch}, loss={best_loss:.3f}, socre={best_score:.3f}")

    if args.n_gpu > 1:
        destroy_process_group()
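
Example no. 1 assumes a few helpers that are not shown: init_process_group / destroy_process_group and a per-rank launcher. A minimal sketch of how they could look, assuming the standard torch.distributed NCCL backend and torch.multiprocessing.spawn as the launcher; the master address/port values are illustrative, and the argparse namespace is only assumed to carry n_gpu as in the example.

# Minimal sketch (assumptions): wrappers around torch.distributed for the
# init_process_group / destroy_process_group helpers used in Example no. 1,
# plus a spawn-based launcher. MASTER_ADDR / MASTER_PORT are illustrative.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def init_process_group(rank, world_size):
    # each process binds to one GPU and joins the NCCL process group
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


def destroy_process_group():
    dist.destroy_process_group()


def launch(args):
    # one process per GPU when n_gpu > 1, otherwise a single local process;
    # train_model is the function from Example no. 1 (called as fn(rank, world_size, args))
    if args.n_gpu > 1:
        mp.spawn(train_model, args=(args.n_gpu, args), nprocs=args.n_gpu)
    else:
        train_model(0, args.n_gpu, args)
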
Example no. 2
def train(cfg_path, device='cuda'):
    if cfg_path is not None:
        cfg.merge_from_file(cfg_path)
    cfg.freeze()

    os.makedirs(cfg.LOG_DIR, exist_ok=True)
    os.makedirs(cfg.SAVE_DIR, exist_ok=True)

    model = UNet(cfg.NUM_CHANNELS, cfg.NUM_CLASSES)
    model.to(device)

    train_data_loader = build_data_loader(cfg, 'train')
    if cfg.VAL:
        val_data_loader = build_data_loader(cfg, 'val')
    else:
        val_data_loader = None

    optimizer = build_optimizer(cfg, model)
    lr_scheduler = build_lr_scheduler(cfg, optimizer)
    criterion = get_loss_func(cfg)
    writer = SummaryWriter(cfg.LOG_DIR)

    iter_counter = 0
    loss_meter = AverageMeter()
    val_loss_meter = AverageMeter()
    min_val_loss = 1e10

    print('Training Start')
    for epoch in range(cfg.SOLVER.MAX_EPOCH):
        print('Epoch {}/{}'.format(epoch + 1, cfg.SOLVER.MAX_EPOCH))
        # the LR scheduler is stepped once per epoch after the training loop below;
        # scheduler.step(epoch) with an explicit epoch argument is deprecated, and the
        # scheduler should run after optimizer.step()
        for data in train_data_loader:
            iter_counter += 1

            imgs, annots = data
            imgs = imgs.to(device)
            annots = annots.to(device)

            y = model(imgs)
            optimizer.zero_grad()
            loss = criterion(y, annots)
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())

            if iter_counter % 10 == 0:
                writer.add_scalars('loss', {'train': loss_meter.avg},
                                   iter_counter)
                loss_meter.reset()
            if lr_scheduler is not None:
                writer.add_scalar('learning rate',
                                  optimizer.param_groups[0]['lr'],
                                  iter_counter)
            save_as_checkpoint(model, optimizer,
                               os.path.join(cfg.SAVE_DIR, 'checkpoint.pth'),
                               epoch, iter_counter)

        # step the LR scheduler once per epoch, after the optimizer updates
        if lr_scheduler is not None:
            lr_scheduler.step()

        # Skip validation when cfg.VAL is False
        if val_data_loader is None:
            continue

        # run validation in eval mode; reset the meter once per epoch so the
        # average covers every validation batch, not just the last one
        val_loss_meter.reset()
        model.eval()
        with torch.no_grad():
            for data in val_data_loader:
                imgs, annots = data
                imgs = imgs.to(device)
                annots = annots.to(device)

                y = model(imgs)
                loss = criterion(y, annots)
                val_loss_meter.update(loss.item())
        model.train()
        # log the validation loss every epoch; save the model only when it improves
        writer.add_scalars('loss', {'val': val_loss_meter.avg}, iter_counter)
        if val_loss_meter.avg < min_val_loss:
            min_val_loss = val_loss_meter.avg
            # save model if validation loss is minimum
            torch.save(model.state_dict(),
                       os.path.join(cfg.SAVE_DIR, 'min_val_loss.pth'))
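
Example no. 2 uses an AverageMeter for running loss averages and a save_as_checkpoint helper, neither of which is shown. A minimal sketch consistent with how they are called above; the checkpoint dictionary keys are assumptions, not part of the original.

# Minimal sketches (assumptions): the AverageMeter and save_as_checkpoint
# helpers used in Example no. 2, reconstructed from their call sites above.
import torch


class AverageMeter:
    """Tracks a running sum and count so .avg gives the mean since the last reset."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count


def save_as_checkpoint(model, optimizer, path, epoch, iteration):
    # store everything needed to resume training from this point
    torch.save({
        'epoch': epoch,
        'iteration': iteration,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)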