Example 1
def train2(net, train_loader, test_loader):

    loss_fn = nn.CrossEntropyLoss()
    net2 = BYOL_Classification(net, 10)

    net2.eval()
    net2.cuda()
    for pq in net.parameters():
        pq.requires_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  net2.parameters()),
                           lr=1e-3)
    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                             T_max=80))

    train_start = time.time()
    for epoch in range(1, 100 + 1):

        train_loss = 0
        net2.train()

        epoch_start = time.time()
        for idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            data = data.cuda()
            target = target.cuda()
            data = net2(data)[1]
            loss = loss_fn(data, target)

            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        train_loss /= (idx + 1)
        scheduler.step()

        epoch_time = time.time() - epoch_start
        if epoch % 10 == 0:
            net2.eval()
            total = 0.0
            correct = 0.0
            with torch.no_grad():
                for test_data in test_loader:
                    images, labels = test_data
                    images = images.cuda()
                    labels = labels.cuda()
                    outputs = net2(images)[1]
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

            print("Epoch\t", epoch, "\tTest accuracy\t", correct / total * 100)

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
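
The scheduling pattern in this example (and in most of the ones below) reduces to a short, self-contained sketch: warm the learning rate up for a fixed number of epochs, then hand control to an after_scheduler. The following is only an illustrative sketch, assuming the same warmup_scheduler package imported above and a dummy linear model in place of the real network; the 20-epoch warmup and 80-epoch cosine tail mirror this example.

import torch
from torch import nn, optim
from warmup_scheduler import GradualWarmupScheduler

model = nn.Linear(10, 2)  # stand-in for the real classifier
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Linear warmup for 20 epochs, then cosine annealing for the remaining 80.
cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80)
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=1,  # warm up to the base lr
                                   total_epoch=20,
                                   after_scheduler=cosine)

for epoch in range(1, 100 + 1):
    # ... one training epoch: forward pass, loss.backward(), optimizer.step() ...
    scheduler.step()  # step once per epoch, after the optimizer steps
    print(epoch, optimizer.param_groups[0]['lr'])
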
Example 2
def train(args, config, loader, device):
    logging.info('Start training...')
    model = getattr(net, config.model.name)(**config.model.args,
                                            **config.embedder)
    model = model.to(device)

    criterion = getattr(nn, config.loss.name)(**config.loss.args).to(device)
    optimizer = getattr(torch.optim,
                        config.optimizer.name)(model.parameters(),
                                               **config.optimizer.args)
    if hasattr(config, 'lr_scheduler'):
        if hasattr(config.lr_scheduler, 'name'):
            scheduler = getattr(torch.optim.lr_scheduler,
                                config.lr_scheduler.name)(
                                    optimizer, **config.lr_scheduler.args)
        else:
            scheduler = None
        if hasattr(config.lr_scheduler, 'warm_up'):
            scheduler_warm_up = GradualWarmupScheduler(
                optimizer,
                multiplier=config.lr_scheduler.warm_up.multiplier,
                total_epoch=config.lr_scheduler.warm_up.epoch,
                after_scheduler=scheduler)

    loss = Box({'train': 0.0, 'val': 0.0})
    metrics = Box({'train': [Accuracy()], 'val': [Accuracy()]})

    for epoch in range(config.train.n_epoch):
        if hasattr(config, 'lr_scheduler'):
            if hasattr(config.lr_scheduler, 'warm_up'):
                scheduler_warm_up.step()
            else:
                scheduler.step()

        loss.train, metrics.train = run_epoch(
            model,
            optimizer,
            criterion,
            loader.train,
            train=True,
            metrics=metrics.train,
            max_norm=config.max_norm if hasattr(config, 'max_norm') else -1)
        loss.val, metrics.val = run_epoch(model,
                                          optimizer,
                                          criterion,
                                          loader.val,
                                          train=False,
                                          metrics=metrics.val)

        saved_path = os.path.join(args.model_folder, 'checkpoints',
                                  f'epoch_{epoch}.pt')
        save_model(saved_path, epoch, model, optimizer)
        log_metrics(epoch, args.model_folder, loss, metrics)
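
The config-driven construction above can be exercised with a tiny stand-in config. The following is a hedged sketch, assuming a Box-style config (python-box, already used for the loss/metrics dicts above) with the same attribute layout; the model here is a plain linear layer rather than getattr(net, config.model.name)(...).

import torch
from torch import nn
from box import Box

config = Box({
    'loss': {'name': 'CrossEntropyLoss', 'args': {}},
    'optimizer': {'name': 'Adam', 'args': {'lr': 1e-3}},
    'lr_scheduler': {'name': 'StepLR', 'args': {'step_size': 30, 'gamma': 0.1}},
})

model = nn.Linear(8, 3)  # stand-in for getattr(net, config.model.name)(...)
criterion = getattr(nn, config.loss.name)(**config.loss.args)
optimizer = getattr(torch.optim, config.optimizer.name)(model.parameters(),
                                                        **config.optimizer.args)
scheduler = getattr(torch.optim.lr_scheduler,
                    config.lr_scheduler.name)(optimizer, **config.lr_scheduler.args)
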
Example 3
def train(net, loader):
    optimizer = SGD_with_lars(net.parameters(),
                              lr=0.1,
                              momentum=0.9,
                              weight_decay=1e-6)

    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                             T_max=180))

    train_start = time.time()

    for epoch in range(1, 100 + 1):
        print('hi')
        train_loss = 0
        net.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()

            dat1 = data[0].cuda()
            dat2 = data[1].cuda()
            loss = net(dat1, dat2)

            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        train_loss /= (idx + 1)
        scheduler.step()

        epoch_time = time.time() - epoch_start
        print(
            "Epoch\t",
            epoch,
            "\tLoss\t",
            train_loss,
            "\tTime\t",
            epoch_time,
        )

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
Example 4
def train():
    cfg = opt.cfg
    data = opt.data
    img_size = opt.img_size
    epochs = 1 if opt.prebias else int(
        hyp['epochs'])  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = int(hyp['batch_size'])
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights

    if 'pw' not in opt.arc:  # remove BCELoss positive weights
        hyp['cls_pw'] = 1.
        hyp['obj_pw'] = 1.

    # Initialize
    init_seeds()
    if opt.multi_scale:
        img_sz_min = round(img_size / 32 / 1.5) + 1
        img_sz_max = round(img_size / 32 * 1.3) - 1
        img_size = img_sz_max * 32  # initiate with maximum multi_scale size
        print('Using multi-scale %g - %g' % (img_sz_min * 32, img_size))

    # Configure run
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = int(data_dict['classes'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = Darknet(cfg, hyp, arc=opt.arc).to(device)

    # Optimizer
    pg0, pg1 = [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if 'Conv2d.weight' in k:
            pg1 += [v]  # parameter group 1 (apply weight_decay)
        else:
            pg0 += [v]  # parameter group 0

    if opt.adam:
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    del pg0, pg1

    cutoff = -1  # backbone reaches to cutoff layer
    start_epoch = 0
    best_fitness = float('inf')

    attempt_download(weights)
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are 'last.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        # if opt.transfer:
        chkpt['model'] = {
            k: v
            for k, v in chkpt['model'].items()
            if model.state_dict()[k].numel() == v.numel()
        }
        model.load_state_dict(chkpt['model'], strict=False)
        # else:
        #    model.load_state_dict(chkpt['model'])

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        if opt.resume:
            start_epoch = chkpt['epoch'] + 1

        del chkpt

    # elif len(weights) > 0:  # darknet format
    #     # possible weights are 'yolov3.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
    #     cutoff = load_darknet_weights(model, weights)
    if opt.transfer or opt.prebias:  # transfer learning edge (yolo) layers
        nf = [
            int(model.module_defs[x - 1]['filters']) for x in model.yolo_layers
        ]  # yolo layer size (i.e. 255)

        if opt.prebias:
            for p in optimizer.param_groups:
                # lower param count allows more aggressive training settings: i.e. SGD ~0.1 lr0, ~0.9 momentum
                p['lr'] = 0.1  # learning rate
                if p.get('momentum') is not None:  # for SGD but not Adam
                    p['momentum'] = 0.9

        for p in model.parameters():
            if opt.prebias and p.numel() == nf:  # train (yolo biases)
                p.requires_grad = True
            elif opt.transfer and p.shape[0] == nf:  # train (yolo biases+weights)
                p.requires_grad = True
            else:  # freeze layer
                p.requires_grad = False

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    # lf = lambda x: 1 - x / epochs  # linear ramp to zero
    # lf = lambda x: 10 ** (hyp['lrf'] * x / epochs)  # exp ramp
    # lf = lambda x: 1 - 10 ** (hyp['lrf'] * (1 - x / epochs))  # inverse exp ramp
    # scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=range(59, 70, 1), gamma=0.8)  # gradual fall to 0.1*lr0
    scheduler = lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[round(epochs * x) for x in [0.8, 0.9]],
        gamma=0.1)
    # cosine annealing with warm restarts
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max = 0.1*epochs, eta_min=0, last_epoch=-1)
    # cosine annealing
    # scheduler = lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    # warmup wrapper, works with any after_scheduler
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=hyp['multiplier'],
                                       total_epoch=hyp['warm_epoch'],
                                       after_scheduler=scheduler)
    scheduler.last_epoch = start_epoch - 1

    # # # Plot lr schedule (do not leave this enabled, or the lr adjustment is already consumed)
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, label='LR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Initialize distributed training
    if torch.cuda.device_count() > 1:
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method='tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(model)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=False,
        #   augment=True,
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        image_weights=opt.img_weights,
        cache_labels=epochs > 10,
        cache_images=opt.cache_images and not opt.prebias,
    )

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)
    # Test Dataloader
    if not opt.prebias:
        testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
            test_path,
            opt.img_size,
            batch_size * 2,
            hyp=hyp,
            rect=opt.rect,
            cache_labels=True,
            cache_images=opt.cache_images),
                                                 batch_size=batch_size * 2,
                                                 num_workers=nw,
                                                 pin_memory=True,
                                                 collate_fn=dataset.collate_fn)

    # Start training
    model.nc = nc  # attach number of classes to model
    model.arc = opt.arc  # attach yolo architecture
    model.hyp = hyp  # attach hyperparameters to model
    # model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # attach class weights
    model_info(model, report='summary')  # 'full' or 'summary'
    nb = len(dataloader)
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification', 'val Regression'
    t0 = time.time()
    print('Using %g dataloader workers' % nw)
    print('Starting %s for %g epochs...' %
          ('prebias' if opt.prebias else 'training', epochs))

    for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
        model.train()
        model.epoch = epoch
        # print(('\n' + '%10s' * 9) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'reg', 'total', 'targets', 'img_size'))
        print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'obj', 'cls', 'reg',
                                     'total', 'targets', 'img_size'))

        # Freeze backbone at epoch 0, unfreeze at epoch 1 (optional)
        freeze_backbone = False
        if freeze_backbone and epoch < 2:
            for name, p in model.named_parameters():
                if int(name.split('.')[1]) < cutoff:  # if layer < 75
                    p.requires_grad = False if epoch == 0 else True

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 - maps) ** 2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(4).to(device)  # mean losses
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        # Note: these targets have already been resized (to 416) and augmented, so they cannot be mapped directly back to the original image.
        for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device)  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 10 == 0:  #  adjust (67% - 150%) every 10 batches
                    img_size = random.randrange(img_sz_min,
                                                img_sz_max + 1) * 32
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [
                        math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]
                    ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Plot images with bounding boxes
            if ni == 0:
                fname = 'train_batch%g.jpg' % i
                plot_images(imgs=imgs,
                            targets=targets,
                            paths=paths,
                            fname=fname)
                if tb_writer:
                    tb_writer.add_image(fname,
                                        cv2.imread(fname)[:, :, ::-1],
                                        dataformats='HWC')

            # Hyperparameter burn-in
            # n_burn = nb - 1  # min(nb // 5 + 1, 1000)  # number of burn-in batches
            # if ni <= n_burn:
            #     for m in model.named_modules():
            #         if m[0].endswith('BatchNorm2d'):
            #             m[1].momentum = 1 - i / n_burn * 0.99  # BatchNorm2d momentum falls from 1 - 0.01
            #     g = (i / n_burn) ** 4  # gain rises from 0 - 1
            #     for x in optimizer.param_groups:
            #         x['lr'] = hyp['lr0'] * g
            #         x['weight_decay'] = hyp['weight_decay'] * g

            # Run model
            pred = model(imgs)

            # Compute loss
            loss, loss_items = compute_loss(pred, targets, model, hyp)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            # loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Accumulate gradient for x batches before optimizing
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()

            # Print batch results
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            mem = torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0  # (GB)
            # s = ('%10s' * 2 + '%10.3g' * 7) % (
            #     '%g/%g' % (epoch, epochs - 1), '%.3gG' % mem, *mloss, len(targets), img_size)
            s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1),
                                               '%.3gG' % mem, *mloss,
                                               len(targets), img_size)
            pbar.set_description(s)

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        final_epoch = epoch + 1 == epochs
        if opt.prebias:
            print_model_biases(model)
        else:
            # Calculate mAP (always test final epoch, skip first 10 if opt.nosave)
            if not (opt.notest or (opt.nosave and epoch < 10)) or final_epoch:
                if epoch >= hyp['test_from']:  # too many proposals in the early epochs, skip evaluation there
                    if epoch % hyp['test_interval'] == 0 and epoch != 0:
                        results, maps = test.test(
                            cfg,
                            data,
                            batch_size=1,
                            img_size=opt.img_size,
                            model=model,
                            hyp=hyp,
                            conf_thres=0.001 if final_epoch else 0.1,  # 0.1 for speed
                            save_json=final_epoch and epoch > 0 and 'coco.data' in data,
                            dataloader=testloader)
        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(s + '%10.3g' * 7 % results +
                    '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)

        # Write Tensorboard results
        if tb_writer:
            x = list(mloss) + list(results)
            titles = [
                'GIoU', 'Objectness', 'Classification', 'Train loss',
                'Precision', 'Recall', 'mAP', 'F1', 'val GIoU',
                'val Objectness', 'val Classification'
            ]
            for xi, title in zip(x, titles):
                tb_writer.add_scalar(title, xi, epoch)

        # Update best mAP
        fitness = sum(results[4:])  # total loss
        if fitness < best_fitness:
            best_fitness = fitness

        # Save training results
        save = (not opt.nosave) or (final_epoch and not opt.evolve) or opt.prebias
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch': epoch,
                    'best_fitness': best_fitness,
                    'training_results': f.read(),
                    'model': model.module.state_dict()
                    if type(model) is nn.parallel.DistributedDataParallel
                    else model.state_dict(),
                    'optimizer': None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if best_fitness == fitness:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            if epoch > 0 and epoch % hyp['save_interval'] == 0:
                torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    if len(opt.name):
        os.rename('results.txt', 'results_%s.txt' % opt.name)
    plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1,
                                                    (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()

    return results
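
The resume logic above wraps a MultiStepLR in the warmup scheduler and rewinds last_epoch to the checkpointed epoch. Below is a minimal hedged sketch of that resume pattern, assuming the same warmup_scheduler package; the model, learning rates and warmup settings are illustrative stand-ins, not the values in hyp.

import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from warmup_scheduler import GradualWarmupScheduler

model = nn.Conv2d(3, 16, 3)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

epochs, start_epoch = 100, 30  # e.g. resuming from a checkpoint saved at epoch 29
base = lr_scheduler.MultiStepLR(optimizer,
                                milestones=[round(epochs * x) for x in (0.8, 0.9)],
                                gamma=0.1)
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=2,  # illustrative warmup multiplier
                                   total_epoch=3,
                                   after_scheduler=base)
scheduler.last_epoch = start_epoch - 1  # fast-forward the schedule, as done above

for epoch in range(start_epoch, epochs):
    # ... train one epoch, stepping the optimizer per batch ...
    scheduler.step()
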
Example 5
        model.eval()
        # print(model)
        model = model.to(device)

        criterion = SmoothLabelCritierion(label_smoothing=0.1)
        optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=3e-5)
        if name == '3e-4 -> 1e-4':
            lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=1e-4)
        elif name == '3e-4 -> 3e-5':
            lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=3e-5)
        else:
            lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs - 5, eta_min=0)
        warmup_scheduler = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5, after_scheduler=lr_scheduler)
        optimizer.zero_grad()
        optimizer.step()
        warmup_scheduler.step()

        util.check_dir('../data/models/')
        best_model, loss_dict, top1_acc_dict, top5_acc_dict = train_model(
            data_loaders, data_sizes, name, model, criterion, optimizer, warmup_scheduler,
            num_epochs=num_epochs, device=device)
        # save the best model parameters
        # util.save_model(best_model.cpu(), '../data/models/best_%s.pth' % name)

        res_loss[name] = loss_dict
        res_top1_acc[name] = top1_acc_dict
        res_top5_acc[name] = top5_acc_dict

        print('train %s done' % name)
        print()
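
The three calls before training above (optimizer.zero_grad(), optimizer.step(), warmup_scheduler.step()) are a deliberate priming step: the dummy optimizer.step() avoids PyTorch's "lr_scheduler.step() called before optimizer.step()" warning, and the scheduler step moves the learning rate to its first warmup value; Example 9 references the same workaround as issue #8 of the warmup_scheduler repository. A hedged, self-contained sketch of that sequence, with illustrative values:

import torch
from torch import nn, optim
from warmup_scheduler import GradualWarmupScheduler

model = nn.Linear(4, 2)
optimizer = optim.Adam(model.parameters(), lr=3e-4, weight_decay=3e-5)
cosine = optim.lr_scheduler.CosineAnnealingLR(optimizer, 25, eta_min=1e-4)
warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=5,
                                after_scheduler=cosine)

optimizer.zero_grad()
optimizer.step()  # dummy update: no gradients applied, just silences the warning
warmup.step()     # advance to the first warmup learning rate before epoch 0
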
Example 6
def training(train_data_list, val_data_list, test_files, fold):

    os.makedirs(os.path.join(config.weights, config.model_name) + os.sep +
                str(fold),
                exist_ok=True)
    os.makedirs(config.best_models, exist_ok=True)
    ### ---------- get model ------------------------------------------
    model = FF3DNet(drop=0.5)
    ### ---------- set lr, opt, loss ------------------------------------------
    img_params = list(map(id, model.img_encoder.parameters()))
    rest_params = filter(lambda p: id(p) not in img_params, model.parameters())
    params = [
        {
            'params': rest_params,
            'lr': config.lr
        },
        {
            'params': model.img_encoder.parameters(),
            'lr': config.lr * 3
        },
    ]
    optimizer = torch.optim.SGD(params, momentum=0.9, weight_decay=1e-4)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=config.epochs - 5,
                                               eta_min=config.lr / 100)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=10,
                                              total_epoch=5,
                                              after_scheduler=scheduler)

    criterion = nn.CrossEntropyLoss().to(device)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)

    best_results = [0, np.inf, 0]
    val_metrics = [0, np.inf, 0]
    ### ---------- load dataset ------------------------------------------
    train_gen = MultiModalDataset(train_data_list,
                                  config.train_data,
                                  config.train_vis,
                                  mode="train")
    train_loader = DataLoader(train_gen,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=4)

    # val_data=getfiles("val")
    # val_data.sort()
    val_csv = "/root/userfolder/linan/C/preliminary/val.csv"
    val_data = pd.read_csv(val_csv)
    val_gen = MultiModalDataset(val_data,
                                config.train_data,
                                config.train_vis,
                                augument=False,
                                mode="val")
    val_loader = DataLoader(val_gen,
                            512,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=4)
    test_gen = MultiModalDataset(test_files,
                                 config.test_data,
                                 config.test_vis,
                                 augument=False,
                                 mode="test")
    test_loader = DataLoader(test_gen,
                             512,
                             shuffle=False,
                             pin_memory=True,
                             num_workers=4)

    # --- train, val, test -------------------------
    resume = False

    start = timer()
    print("multi fold val")
    #___________________________________________________________________________________________________________________
    for index in [1, 2, 3]:
        print(index)
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0626_debug_fold_' + str(index) +
            '_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(val_loader, model, fold, checkpoint_loss, 'best_loss', False,
             index)

        checkpoint_acc = torch.load(
            'checkpoints/best_models/0626_debug_fold_' + str(index) +
            '_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(val_loader, model, fold, checkpoint_acc, 'best_acc', False, index)
        #test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc], 'ensemble', True)
    0 / 0  # intentionally raises ZeroDivisionError to stop after the multi-fold evaluation above
    #___________________________________________________________________________________________________________________

    if resume:
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_loss.pth.tar'
        )
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_acc.pth.tar'
        )
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold,
                               [checkpoint_loss, checkpoint_acc], 'ensemble',
                               True)
    else:
        ### ---------- train loop ----------------
        for epoch in range(4, config.epochs):
            scheduler_warmup.step(metrics=val_metrics[0])
            for param_group in optimizer.param_groups:
                log.write(str(param_group['lr']) + '\n')
            train_metrics = train(train_loader, model, criterion, optimizer,
                                  epoch, val_metrics, best_results, start)
            # val_metrics_tta = evaluate(val_loader_tta,model,criterion,epoch,train_metrics,best_results,start)
            val_metrics = evaluate(val_loader, model, criterion, epoch,
                                   train_metrics, best_results, start)
            is_best_acc = val_metrics[0] > best_results[0]
            best_results[0] = max(val_metrics[0], best_results[0])
            is_best_loss = val_metrics[1] < best_results[1]
            best_results[1] = min(val_metrics[1], best_results[1])
            is_best_f1 = val_metrics[2] > best_results[2]
            best_results[2] = max(val_metrics[2], best_results[2])
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "model_name": config.model_name,
                    "state_dict": model.state_dict(),
                    "best_acc": best_results[0],
                    "best_loss": best_results[1],
                    "optimizer": optimizer.state_dict(),
                    "fold": fold,
                    "best_f1": best_results[2],
                }, is_best_acc, is_best_loss, is_best_f1, fold)
            print('\r', end='', flush=True)
            print(val_metrics[0], val_metrics[1], val_metrics[2], "val")
            log.write(
                '%s  %5.1f %6.1f      |   %0.3f   %0.3f   %0.3f     |  %0.3f   %0.3f    %0.3f    |   %s  %s  %s | %s' % ( \
                    "best", epoch, epoch,
                    train_metrics[0], train_metrics[1], train_metrics[2],
                    val_metrics[0], val_metrics[1], val_metrics[2],
                    str(best_results[0])[:8], str(best_results[1])[:8], str(best_results[2])[:8],
                    time_to_str((timer() - start), 'min'))
                )
            log.write("\n")
            time.sleep(0.01)
        # log.write("\n----------------------------------------------- [START %s] %s\n\n" % (
        # datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))
        # log.write(
        #     '                           |------------ Train -------|----------- Valid ---------|----------Best Results---|------------|\n')
        # log.write(
        #     'mode     iter     epoch    |    acc  loss  f1_macro   |    acc  loss  f1_macro    |    acc  loss  f1_macro       | time       |\n')
        # log.write(
        #     '-------------------------------------------------------------------------------------------------------------------------|\n')

        ### ---------- per fold ensemble best loss ckpt and best acc ckpt
        checkpoint_loss = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar' %
            (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_acc.pth.tar' %
            (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold,
                               [checkpoint_loss, checkpoint_acc], 'ensemble',
                               not config.k_fold)

    ### ----------- last kfold ensemble all before k ensemble ckpts
    if config.k_fold and fold == config.num_kf:
        mean_npy = np.zeros([10000, 9])
        for i in range(1, config.num_kf + 1):
            checkpoint = torch.load(
                'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar' %
                (config.model_name, str(i)))
            loss_pred = np.load('preds_9/%s/%s_val_fold%s_%s.npy' %
                                (checkpoint["model_name"],
                                 checkpoint["model_name"], str(i), 'ensemble'))
            mean_npy += loss_pred
        mean_npy = mean_npy / config.num_kf
        np.save(
            'preds_9/%s/%s_val_fold%s_%s.npy' %
            (checkpoint["model_name"], checkpoint["model_name"], 'cv',
             'ensemble'), mean_npy)
        gen_txt(mean_npy, checkpoint, 'cv', 'ensemble')
Example 7
class Fitter:
    
    def __init__(self, model, device, config, folder):
        self.config = config
        self.epoch = 0

        # set up the working directory
        self.base_dir = f'./model/seresnext_512/{folder}'
        if not os.path.exists(self.base_dir):
            os.makedirs(self.base_dir)

        self.log_path = f'{self.base_dir}/log.txt'
        self.best_score = 0
        self.best_loss = 10**5
        self.best_ap = 0
        
        self.model = model
        self.device = device
        self.best_true = np.array([])
        self.best_pred = np.array([])

        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ] 

        # Note: optimizer_grouped_parameters above is built but not passed to the optimizer here.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
        # self.scheduler.step
        #self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1, total_epoch=5, after_scheduler=self.scheduler)
        self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1, total_epoch=6)

#         self.criterion = FocalLoss(logits=True).to(self.device)
        self.criterion = LabelSmoothing().to(self.device)
        self.log(f'Fitter prepared. Device is {self.device}')

    def fit(self, train_loader, validation_loader):
        for e in range(self.config.n_epochs):
            if self.config.verbose:
                lr = self.optimizer.param_groups[0]['lr']
                timestamp = datetime.utcnow().isoformat()
                self.log(f'\n{timestamp}\nLR: {lr}')
            
            if self.epoch <= 6:
                self.scheduler_warmup.step(self.epoch)
                print(self.epoch, self.optimizer.param_groups[0]['lr'])

            t = time.time()
            summary_loss, roc_auc_scores, ap_scores , f1_scores, acc_scores = self.train_one_epoch(train_loader)
            self.log(f'[RESULT]: Train. Epoch: {self.epoch}, summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f},\
            acc:{acc_scores.avg:.5f}, ap: {ap_scores.avg:.5f}, f1_scores: {f1_scores.avg:.5f}, time: {(time.time() - t):.5f}')

            t = time.time()
            f_true, f_pred, summary_loss, roc_auc_scores, ap_scores , f1_scores, acc_scores= self.validation(validation_loader)

            self.log(f'[RESULT]: Val. Epoch: {self.epoch}, summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f},\
            acc:{acc_scores.avg:.5f}, ap: {ap_scores.avg:.5f}, f1_scores: {f1_scores.avg:.5f}, time: {(time.time() - t):.5f}')
            if summary_loss.avg < self.best_loss:
                self.best_loss = summary_loss.avg
                self.save_model(f'{self.base_dir}/best-loss-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-loss-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)
                    
            if roc_auc_scores.avg > self.best_score:
                self.best_score = roc_auc_scores.avg
                self.save_model(f'{self.base_dir}/best-score-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-score-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)
                self.best_true = f_true
                self.best_pred = f_pred
                    
            if ap_scores.avg > self.best_ap:
                self.best_ap = ap_scores.avg
                self.save_model(f'{self.base_dir}/best-ap-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                for path in sorted(glob(f'{self.base_dir}/best-ap-checkpoint-*epoch.bin'))[:-2]:
                    os.remove(path)

            if self.config.validation_scheduler:
                if self.epoch > 6:
                    self.scheduler.step(metrics=summary_loss.avg)
          
            self.epoch += 1
        #if self.epoch == self.config.n_epochs:
        return self.best_true , self.best_pred

    def validation(self, val_loader):
        self.model.eval()
        summary_loss = AverageMeter()
        roc_auc_scores = RocAucMeter()
        ap_scores = APScoreMeter()
        f1_scores = F1Score()
        acc_scores = AccSocre()
        t = time.time()
        for step, (images, targets) in enumerate(val_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(
                        f'Val Step {step}/{len(val_loader)}, ' + \
                        f'summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f}, ap: {ap_scores.avg:.5f} ' + \
                        f'f1_scores: {f1_scores.avg:.5f} ' + \
                        f'acc_scores: {acc_scores.avg:.5f} ' + \
                        f'time: {(time.time() - t):.5f}', end='\r'
                    )
            with torch.no_grad():
                targets = targets.to(self.device).float()
                batch_size = images.shape[0]
                images = images.to(self.device).float()
                outputs = self.model(images)
                loss = self.criterion(outputs, targets)
                roc_auc_scores.update(targets, outputs)
                ap_scores.update(targets, outputs)
                f1_scores.update(targets, outputs)
                acc_scores.update(targets, outputs)
                summary_loss.update(loss.detach().item(), batch_size)
        f_true = roc_auc_scores.get_true()
        f_pred = roc_auc_scores.get_pred()

        return f_true, f_pred, summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores

    def train_one_epoch(self, train_loader):
        self.model.train()
        summary_loss = AverageMeter()
        roc_auc_scores = RocAucMeter()
        ap_scores = APScoreMeter()
        f1_scores = F1Score()
        acc_scores = AccSocre()
        t = time.time()
        
        # print(len(train_loader), "gggggggg")
        for step, (images, targets) in enumerate(train_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(
                        f'Train Step {step}/{len(train_loader)}, ' + \
                        f'summary_loss: {summary_loss.avg:.5f}, roc_auc: {roc_auc_scores.avg:.5f}, ap: {ap_scores.avg:.5f} ' + \
                        f'f1_scores: {f1_scores.avg:.5f} ' + \
                        f'acc_scores: {acc_scores.avg:.5f} ' + \
                        f'time: {(time.time() - t):.5f}', end='\r'
                    )
            
            targets = targets.to(self.device).float()
            images = images.to(self.device).float()
            batch_size = images.shape[0]

            self.optimizer.zero_grad()
            outputs = self.model(images)
            loss = self.criterion(outputs, targets)
            loss.backward()
            
            roc_auc_scores.update(targets, outputs)
            ap_scores.update(targets, outputs)
            summary_loss.update(loss.detach().item(), batch_size)
            f1_scores.update(targets, outputs)
            acc_scores.update(targets, outputs)

            self.optimizer.step()

            if self.config.step_scheduler:
                self.scheduler.step()

        return summary_loss, roc_auc_scores, ap_scores, f1_scores, acc_scores
    
    def save_model(self, path):
        self.model.eval()
        torch.save(self.model.state_dict(),path)

    def save(self, path):
        self.model.eval()
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_score': self.best_score,
            'best_ap': self.best_ap,
            'best_loss': self.best_loss,
            'epoch': self.epoch,
        }, path)

    def load(self, path):
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.best_score = checkpoint['best_score']
        self.best_ap = checkpoint['best_ap']
        self.best_loss = checkpoint['best_loss']
        self.epoch = checkpoint['epoch']
        
    def log(self, message):
        if self.config.verbose:
            print(message)
        with open(self.log_path, 'a+') as logger:
            logger.write(f'{message}\n')
Example 8
def train(args, train_dataset, model):
    tb_writer = SummaryWriter(args.tb_writer_dir)
    result_writer = ResultWriter(args.eval_results_dir)

    if args.weighted_sampling == 1:
        # The three pitch types are unevenly distributed, so sample all three classes at equal rates.
        # In the end this did not help, so weighted sampling was not used.
        ball_type, counts = np.unique(train_dataset.pitch, return_counts=True)
        count_dict = dict(zip(ball_type, counts))
        weights = [1.0 / count_dict[p] for p in train_dataset.pitch]
        sampler = WeightedRandomSampler(weights,
                                        len(train_dataset),
                                        replacement=True)
        logger.info("Do Weighted Sampling")
    else:
        sampler = RandomSampler(train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.train_batch_size,
                                  sampler=sampler)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs

    args.warmup_step = int(args.warmup_percent * t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = [
        "bias",
        "layernorm.weight",
    ]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = optim.Adam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           eps=args.adam_epsilon)
    if args.warmup_step != 0:
        scheduler_cosine = CosineAnnealingLR(optimizer, t_total)
        scheduler = GradualWarmupScheduler(optimizer,
                                           1,
                                           args.warmup_step,
                                           after_scheduler=scheduler_cosine)
    else:
        scheduler = CosineAnnealingLR(optimizer, t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    loss_fct = torch.nn.NLLLoss()

    # Train!
    logger.info("***** Running Baseball Transformer *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Warmup Steps = %d", args.warmup_step)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    best_step = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss, logging_val_loss = 0.0, 0.0, 0.0

    best_pitch_micro_f1, best_pitch_macro_f1 = 0, 0
    best_loss = 1e10

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            (
                pitcher,
                batter,
                state,
                pitch,
                label,
                pitch_memory,
                label_memory,
                memory_mask,
            ) = list(map(lambda x: x.to(args.device), batch))
            model.train()
            pitching_score, memories = model(
                pitcher,
                batter,
                state,
                pitch_memory,
                label_memory,
                memory_mask,
            )

            pitching_score = pitching_score.log_softmax(dim=-1)
            loss = loss_fct(pitching_score, pitch)

            if args.n_gpu > 1:
                loss = loss.mean()

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()

            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                if args.evaluate_during_training:
                    results, f1_results, f1_log, cm = evaluate(
                        args, args.eval_data_file, model)
                    output_eval_file = os.path.join(args.output_dir,
                                                    "eval_results.txt")
                    print_result(output_eval_file, results, f1_log, cm)

                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value,
                                             global_step)
                    logging_val_loss = results["loss"]

                tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                     args.logging_steps, global_step)
                logging_loss = tr_loss

                # Select the best model by macro-F1 rather than loss (there is a trade-off).
                # if best_loss > results["loss"]:
                if best_pitch_macro_f1 < results["pitch_macro_f1"]:
                    best_pitch_micro_f1 = results["pitch_micro_f1"]
                    best_pitch_macro_f1 = results["pitch_macro_f1"]
                    best_loss = results["loss"]
                    results["best_step"] = best_step = global_step

                    output_dir = os.path.join(args.output_dir, "best_model/")
                    os.makedirs(output_dir, exist_ok=True)
                    torch.save(model.state_dict(),
                               os.path.join(output_dir, "pytorch_model.bin"))
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving best model to %s", output_dir)

                    result_path = os.path.join(output_dir, "best_results.txt")
                    print_result(result_path,
                                 results,
                                 f1_log,
                                 cm,
                                 off_logger=True)

                    results.update(dict(f1_results))
                    result_writer.update(args, **results)

                logger.info("  best pitch micro f1 : %s", best_pitch_micro_f1)
                logger.info("  best pitch macro f1 : %s", best_pitch_macro_f1)
                logger.info("  best loss : %s", best_loss)
                logger.info("  best step : %s", best_step)

            if args.save_steps > 0 and global_step % args.save_steps == 0:
                checkpoint_prefix = "checkpoint"
                # Save model checkpoint
                output_dir = os.path.join(
                    args.output_dir, "{}-{}".format(checkpoint_prefix,
                                                    global_step))
                os.makedirs(output_dir, exist_ok=True)
                torch.save(model.state_dict(),
                           os.path.join(output_dir, "pytorch_model.bin"))
                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)

                rotate_checkpoints(args, checkpoint_prefix)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s",
                            output_dir)

    tb_writer.close()

    return global_step, tr_loss / global_step
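
Unlike the epoch-based examples, this one drives the warmup by optimizer steps: total_epoch is set to a warmup step count and scheduler.step() is called after every batch. A minimal hedged sketch of that per-step variant, again assuming the warmup_scheduler package and illustrative step counts:

import torch
from torch import nn, optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from warmup_scheduler import GradualWarmupScheduler

model = nn.Linear(16, 4)
optimizer = optim.Adam(model.parameters(), lr=5e-4, eps=1e-8)

t_total = 10000                    # total optimizer steps
warmup_steps = int(0.1 * t_total)  # e.g. warmup_percent = 0.1

scheduler = GradualWarmupScheduler(optimizer, 1, warmup_steps,
                                   after_scheduler=CosineAnnealingLR(optimizer, t_total))

for step in range(t_total):
    # ... forward pass, loss.backward(), gradient clipping ...
    optimizer.step()
    scheduler.step()  # stepped per batch rather than per epoch
    optimizer.zero_grad()
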
Example 9
def train(name, df, VAL_FOLD=0, resume=False):
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    run = f"{name}_[{dt_string}]"

    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    # model = SRM_Classifer(num_classes=1, encoder_checkpoint='weights/pretrain_[31|03_12|16|32].h5')
    model = SMP_SRM_UPP(classifier_only=True)

    # for name_, param in model.named_parameters():
    #     if 'classifier' in name_:
    #         continue
    #     else:
    #         param.requires_grad = False

    print("Parameters : ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    wandb.save('segmentation/smp_srm.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ########################-- CREATE DATASET and DATALOADER --########################
    train_dataset = DATASET(dataframe=df,
                            mode="train",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize,
                            imgaug_augment=train_imgaug,
                            geo_augment=train_geo_aug)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.train_batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    valid_dataset = DATASET(
        dataframe=df,
        mode="val",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.valid_batch_size,
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True,
                              drop_last=False)

    test_dataset = DATASET(
        dataframe=df,
        mode="test",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    test_loader = DataLoader(test_dataset,
                             batch_size=config.valid_batch_size,
                             shuffle=True,
                             num_workers=4,
                             pin_memory=True,
                             drop_last=False)
    #endregion ######################################################################################

    optimizer = get_optimizer(model, config.optimizer, config.learning_rate,
                              config.weight_decay)
    # after_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer,
    #     patience=config.schedule_patience,
    #     mode="min",
    #     factor=config.schedule_factor,
    # )
    after_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=35, T_mult=2)  # T_0/T_mult belong to the warm-restarts scheduler
    scheduler = GradualWarmupScheduler(optimizer=optimizer,
                                       multiplier=1,
                                       total_epoch=config.warmup + 1,
                                       after_scheduler=after_scheduler)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    # optimizer.zero_grad()
    # optimizer.step()

    criterion = nn.BCEWithLogitsLoss()
    es = EarlyStopping(patience=200, mode="min")

    model = nn.DataParallel(model).to(device)

    # wandb.watch(model, log_freq=50, log='all')

    start_epoch = 0
    if resume:
        checkpoint = torch.load(
            'checkpoint/(using pretrain)COMBO_ALL_FULL_[09|04_12|46|35].pt')
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics = train_epoch(model, train_loader, optimizer, scheduler,
                                    criterion, epoch)
        valid_metrics = valid_epoch(model, valid_loader, criterion, epoch)

        scheduler.step(valid_metrics['valid_loss'])

        print(
            f"TRAIN_ACC = {train_metrics['train_acc_05']}, TRAIN_LOSS = {train_metrics['train_loss']}"
        )
        print(
            f"VALID_ACC = {valid_metrics['valid_acc_05']}, VALID_LOSS = {valid_metrics['valid_loss']}"
        )
        print("Optimizer LR", optimizer.param_groups[0]['lr'])
        print("Scheduler LR", scheduler.get_lr()[0])
        wandb.log({
            'optim_lr': optimizer.param_groups[0]['lr'],
            'schedule_lr': scheduler.get_lr()[0]
        })

        es(
            valid_metrics["valid_loss"],
            model,
            model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"),
        )
        if es.early_stop:
            print("Early stopping")
            break

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    if os.path.exists(os.path.join(OUTPUT_DIR, f"{run}.h5")):
        print(
            model.load_state_dict(
                torch.load(os.path.join(OUTPUT_DIR, f"{run}.h5"))))
        print("LOADED FOR TEST")

    test_metrics = test(model, test_loader, criterion)
    wandb.save(os.path.join(OUTPUT_DIR, f"{run}.h5"))

    return test_metrics
Example No. 10
0
        # print(model)
        model = model.to(device)

        criterion = SmoothLabelCritierion(label_smoothing=0.1)
        # criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                         num_epochs - 5,
                                                         eta_min=1e-4)
        lr_scheduler = GradualWarmupScheduler(optimizer,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=scheduler)
        optimizer.zero_grad()
        optimizer.step()
        lr_scheduler.step()
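        # The dummy zero_grad()/step() pair followed by an initial
        # lr_scheduler.step() is a common workaround for the PyTorch warning
        # about calling lr_scheduler.step() before optimizer.step(); it also
        # advances the warmup so that epoch 0 already trains at the warmup lr.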

        util.check_dir('../data/models/')
        best_model, loss_dict, top1_acc_dict, top5_acc_dict = train_model(
            data_loaders,
            data_sizes,
            name,
            model,
            criterion,
            optimizer,
            lr_scheduler,
            num_epochs=num_epochs,
            device=device)
        # Save the best model parameters
        # util.save_model(best_model.cpu(), '../data/models/best_%s.pth' % name)
Example No. 11
0
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top 1-err {top1.val:.4f} ({top1.avg:.4f})'.format(
                epoch, epochs, i, len(val_loader), batch_time=batch_time, loss=losses,
                top1=top1))

    print('* Epoch: [{0}/{1}]\t Top 1-err {top1.avg:.3f}\t Test Loss {loss.avg:.3f}'.format(
        epoch, epochs, top1=top1, loss=losses))
    return top1.avg, losses.avg


# _, _, val_loss = validate(valid_loader, model, criterion)
val_loss = 5
for epoch in range(0, epochs):

    scheduler_warmup.step(epoch, val_loss)
    # train for one epoch
    train_loss = train(train_loader, model, criterion, optimizer, epoch)

    # evaluate on validation set
    err1, val_loss = validate(valid_loader, model, criterion, epoch)

    # remember best prec@1 and save checkpoint
    is_best = err1 <= best_err1
    best_err1 = min(err1, best_err1)

    print('Current best error (top-1):', best_err1)
    save_checkpoint({
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'best_err1': best_err1,
Example No. 12
0
class scene_transformer(LightningModule):
    def __init__(self, cfg):

        super(scene_transformer, self).__init__()
        self.hparams = cfg
        self.emb_dim = cfg["model"]["emb_dim"]
        self.save_hyperparameters(cfg)
        self.cfg = cfg
        self.cat_emb = nn.Embedding(
            cfg["model"]["cat"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["cat"]["pad_token"],
        )
        self.pos_emb = nn.Embedding(cfg["model"]["max_seq_len"],
                                    cfg["model"]["emb_dim"])

        self.coor_type_emb = nn.Embedding(3, cfg["model"]["emb_dim"])

        self.x_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )
        self.y_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )
        self.z_coor_emb = nn.Embedding(
            cfg["model"]["coor"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["coor"]["pad_token"],
        )

        self.orient_emb = nn.Embedding(
            cfg["model"]["orient"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["orient"]["pad_token"],
        )
        self.dim_emb = nn.Embedding(
            cfg["model"]["dim"]["start_token"] + 1,
            cfg["model"]["emb_dim"],
            padding_idx=cfg["model"]["dim"]["pad_token"],
        )

        self.shape_cond = cfg["model"]["dim"]["shape_cond"]

        if self.shape_cond:
            print("Using shape cond model")
            self.x_emb = nn.Embedding(16, self.emb_dim)
            self.y_emb = nn.Embedding(16, self.emb_dim)
            self.img_encoder = resnet_small(layers=[1, 2, 2],
                                            num_input_channels=1,
                                            dim=self.emb_dim)
            layer = nn.TransformerDecoderLayer
            gen_model = nn.TransformerDecoder
        else:
            layer = nn.TransformerEncoderLayer
            gen_model = nn.TransformerEncoder

        d_layer = layer(
            d_model=self.emb_dim,
            nhead=cfg["model"]["num_heads"],
            dim_feedforward=cfg["model"]["dim_fwd"],
            dropout=cfg["model"]["dropout"],
        )

        self.generator = gen_model(d_layer, cfg["model"]["num_blocks"])

        self.output_dim = nn.Linear(cfg["model"]["emb_dim"],
                                    cfg["model"]["dim"]["start_token"])

        self.decoder_seq_len = cfg["model"]["max_seq_len"]

    def get_shape_memory(self, room_shape):
        """
        Get the transformer encoder memory for the room_shape condition images
        (similar to PolyGen image conditional model)

        room_shape: (bsize, input_channel, 512, 512)

        return: (16*16, bsize, embdim)
        """
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        features = self.img_encoder(room_shape.to(device))
        # dimension of condition image
        img_dim = features.shape[-1]
        # 0,1,2 .. img_dim
        ndx = torch.LongTensor(range(img_dim)).unsqueeze(0).to(device)
        # positional embedding in X and Y axes
        x_emb, y_emb = (
            self.x_emb(ndx).transpose(1, 2).unsqueeze(3),
            self.y_emb(ndx).transpose(1, 2).unsqueeze(2),
        )

        # add positional embedding
        tmp = features + x_emb + y_emb
        features_flat = tmp.reshape(tmp.shape[0], tmp.shape[1], -1)
        memory = features_flat.permute(2, 0, 1)

        return memory

    def forward(
        self,
        cat_seq,
        x_loc_seq,
        y_loc_seq,
        z_loc_seq,
        orient_seq,
        dim_seq,
        room_shape=None,
    ):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        (
            cat_emb,
            pos_emb,
            x_emb,
            y_emb,
            z_emb,
            ori_emb,
            dim_emb,
            coor_type_emb,
        ) = self.get_embedding(cat_seq, x_loc_seq, y_loc_seq, z_loc_seq,
                               orient_seq, dim_seq)  # ,obj_emb

        joint_emb = (cat_emb + pos_emb + x_emb + y_emb + z_emb + ori_emb +
                     dim_emb + coor_type_emb)

        tgt_padding_mask = self.get_padding_mask(dim_seq)[:, :-1].to(device)
        tgt_mask = self.generate_square_subsequent_mask(dim_seq.shape[1] -
                                                        1).to(device)

        tgt = joint_emb.transpose(1, 0)[:-1, :, :]

        if self.shape_cond:
            memory = self.get_shape_memory(room_shape)
            out_embs = self.generator(tgt,
                                      memory,
                                      tgt_mask=tgt_mask,
                                      tgt_key_padding_mask=tgt_padding_mask)
        else:
            out_embs = self.generator(tgt,
                                      mask=tgt_mask,
                                      src_key_padding_mask=tgt_padding_mask)

        out_embs = out_embs.transpose(1, 0)

        out_dim = self.output_dim(out_embs)

        logprobs_dim = F.log_softmax(out_dim, dim=-1)

        return logprobs_dim

    def get_embedding(self, cat_seq, x_loc_seq, y_loc_seq, z_loc_seq,
                      orient_seq, dim_seq):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        cat_emb = self.cat_emb(cat_seq)
        batch_size, seq_len = cat_seq.shape

        x_emb = self.x_coor_emb(x_loc_seq)
        y_emb = self.y_coor_emb(y_loc_seq)
        z_emb = self.z_coor_emb(z_loc_seq)

        ori_emb = self.orient_emb(orient_seq)

        dim_emb = self.dim_emb(dim_seq)

        pos_seq = torch.arange(0, seq_len).to(device)
        pos_emb = self.pos_emb(pos_seq)

        # Map each sequence position to one of three coordinate-type ids,
        # cycling with period 3 (position 0 -> 2, 1 -> 0, 2 -> 1, ...).
        ndx = np.arange(seq_len).reshape((1, -1))
        ndx_ref = np.arange(seq_len).reshape((1, -1))
        ndx[ndx_ref % 3 == 1] = 0
        ndx[ndx_ref % 3 == 2] = 1
        ndx[ndx_ref % 3 == 0] = 2
        ndx = torch.LongTensor(ndx).to(device)
        coor_type_emb = self.coor_type_emb(ndx).repeat(batch_size, 1, 1)

        return cat_emb, pos_emb, x_emb, y_emb, z_emb, ori_emb, dim_emb, coor_type_emb

    def generate_square_subsequent_mask(self, sz):
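        # Standard autoregressive (causal) mask: position i may attend only to
        # positions <= i; future positions are filled with -inf so the attention
        # softmax zeroes them out.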
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = (mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(
            mask == 1, float(0.0)))
        return mask

    def get_padding_mask(self, seq):
        mask = torch.ByteTensor(np.zeros(seq.shape, dtype=np.uint8))
        mask[seq == self.cfg["model"]["dim"]["pad_token"]] = 1

        return mask.bool()

    def configure_optimizers(self):
        self.optim = Adam(
            self.parameters(),
            lr=self.cfg["train"]["lr"],
            weight_decay=self.cfg["train"]["l2"],
        )
        self.sched = CosineAnnealingLR(self.optim,
                                       T_max=self.cfg["train"]["lr_restart"])
        self.warmup = GradualWarmupScheduler(
            self.optim,
            multiplier=1,
            total_epoch=self.cfg["train"]["warmup"],
            after_scheduler=self.sched,
        )
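        # Note: self.warmup wraps the cosine schedule and is stepped manually in
        # optimizer_step() below; only self.sched is returned to Lightning here.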

        return [self.optim], [self.sched]

    def optimizer_step(self,
                       current_epoch,
                       batch_nb,
                       optimizer,
                       optimizer_i,
                       second_order_closure=None):
        optimizer.step()
        self.warmup.step()
        optimizer.zero_grad()

    def general_step(self, batch):
        loss = 0
        cat_seq, x_loc_seq, y_loc_seq, z_loc_seq, orient_seq, dim_seq = (
            batch["cat_seq"],
            batch["x_loc_seq"],
            batch["y_loc_seq"],
            batch["z_loc_seq"],
            batch["orient_seq"],
            batch["dim_seq"],
        )
        room_shape = batch["floor"] if self.shape_cond else None
        logprobs_ori = self.forward(
            cat_seq,
            x_loc_seq,
            y_loc_seq,
            z_loc_seq,
            orient_seq,
            dim_seq,
            room_shape=room_shape,
        )

        loss_ori = F.nll_loss(
            logprobs_ori.transpose(1, 2),
            batch["dim_seq"][:, 1:],
            ignore_index=self.cfg["model"]["dim"]["pad_token"],
        )

        loss = loss_ori

        return loss

    def training_step(self, batch, batch_idx):
        loss = self.general_step(batch)
        lr = get_lr(self.optim)
        log = {"loss": {"train_loss": loss}, "lr": lr}
        return {"loss": loss, "log": log}

    def validation_step(self, batch, batch_idx):
        loss = self.general_step(batch)
        return {"val_loss": loss}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()

        log = {"loss": {"val": avg_loss}}

        return {"val_loss": avg_loss, "log": log}

    def decode_multi_model(
        self,
        out_ndx,
        cat_gen_seq,
        x_gen_seq,
        y_gen_seq,
        z_gen_seq,
        ori_gen_seq,
        dim_gen_seq,
        probabilistic=False,
        nucleus=False,
        room_shape=None,
    ):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        curr_cat_seq = cat_gen_seq + (self.decoder_seq_len -
                                      len(cat_gen_seq)) * [0]
        curr_cat_seq = torch.LongTensor(curr_cat_seq).view(1, -1).to(device)

        curr_x_seq = x_gen_seq + (self.decoder_seq_len - len(x_gen_seq)) * [0]
        curr_x_seq = torch.LongTensor(curr_x_seq).view(1, -1).to(device)

        curr_y_seq = y_gen_seq + (self.decoder_seq_len - len(y_gen_seq)) * [0]
        curr_y_seq = torch.LongTensor(curr_y_seq).view(1, -1).to(device)

        curr_z_seq = z_gen_seq + (self.decoder_seq_len - len(z_gen_seq)) * [0]
        curr_z_seq = torch.LongTensor(curr_z_seq).view(1, -1).to(device)

        curr_orient_seq = ori_gen_seq + (self.decoder_seq_len -
                                         len(ori_gen_seq)) * [0]
        curr_orient_seq = torch.LongTensor(curr_orient_seq).view(1,
                                                                 -1).to(device)

        curr_dim_seq = dim_gen_seq + (self.decoder_seq_len -
                                      len(dim_gen_seq)) * [0]
        curr_dim_seq = torch.LongTensor(curr_dim_seq).view(1, -1).to(device)

        (
            cat_emb,
            pos_emb,
            x_emb,
            y_emb,
            z_emb,
            ori_emb,
            dim_emb,
            coor_type_emb,
        ) = self.get_embedding(
            curr_cat_seq,
            curr_x_seq,
            curr_y_seq,
            curr_z_seq,
            curr_orient_seq,
            curr_dim_seq,
        )

        joint_emb = (cat_emb + pos_emb + x_emb + y_emb + z_emb + ori_emb +
                     dim_emb + coor_type_emb)
        tgt = joint_emb.transpose(1, 0)
        tgt_mask = self.generate_square_subsequent_mask(
            tgt.shape[0]).to(device)
        tgt_padding_mask = self.get_padding_mask(curr_cat_seq).to(device)

        if self.shape_cond:
            room_shape = room_shape.unsqueeze(0)
            memory = self.get_shape_memory(room_shape)
            out_embs = self.generator(tgt,
                                      memory,
                                      tgt_mask=tgt_mask,
                                      tgt_key_padding_mask=tgt_padding_mask)
        else:
            out_embs = self.generator(tgt,
                                      mask=tgt_mask,
                                      src_key_padding_mask=tgt_padding_mask)

        logits_dim = self.output_dim(out_embs)[out_ndx][0]

        if probabilistic and nucleus:
            logits_dim = sample_top_p(logits_dim)

        probs_dim = F.softmax(logits_dim, dim=-1)

        if probabilistic:
            dim_next_token = Categorical(probs=probs_dim).sample()
        else:
            _, dim_next_token = torch.max(probs_dim, dim=0)

        if dim_next_token == self.cfg["model"]["dim"]["stop_token"]:
            dim_next_token = 999

        return dim_next_token
Example No. 13
0
class TrainingLoop():
    def __init__(self,
                 model_kwargs,
                 train_positive_paths,
                 train_negative_paths,
                 train_unlabeled_paths,
                 val_positive_paths,
                 val_negative_paths,
                 val_unlabeled_paths,
                 data_cache_dir: str,
                 notify_callback: Callable[[Dict[str, Any]],
                                           None] = lambda x: None):
        '''The training loop for background splitting models.'''
        self.data_cache_dir = data_cache_dir
        self.notify_callback = notify_callback

        self._setup_model_kwargs(model_kwargs)

        # Setup dataset
        self._setup_dataset(train_positive_paths, train_negative_paths,
                            train_unlabeled_paths, val_positive_paths,
                            val_negative_paths, val_unlabeled_paths)

        # Setup model
        self._setup_model()

        # Setup optimizer

        # Resume if requested
        resume_from = model_kwargs.get('resume_from', None)
        if resume_from:
            resume_training = model_kwargs.get('resume_training', False)
            self.load_checkpoint(resume_from, resume_training=resume_training)

        self.writer = SummaryWriter(log_dir=model_kwargs['log_dir'])

        # Variables for estimating run-time
        self.train_batch_time = EMA(0)
        self.val_batch_time = EMA(0)
        self.train_batches_per_epoch = (len(self.train_dataloader.dataset) /
                                        self.train_dataloader.batch_size)
        self.val_batches_per_epoch = (len(self.val_dataloader.dataset) /
                                      self.val_dataloader.batch_size)
        self.train_batch_idx = 0
        self.val_batch_idx = 0
        self.train_epoch_loss = 0
        self.train_epoch_main_loss = 0
        self.train_epoch_aux_loss = 0

    def _setup_model_kwargs(self, model_kwargs):
        self.model_kwargs = copy.deepcopy(model_kwargs)
        self.num_workers = NUM_WORKERS
        self.val_frequency = model_kwargs.get('val_frequency', 1)
        self.checkpoint_frequency = model_kwargs.get('checkpoint_frequency', 1)
        self.use_cuda = bool(model_kwargs.get('use_cuda', True))
        assert 'model_dir' in model_kwargs
        self.model_dir = model_kwargs['model_dir']
        assert 'aux_labels' in model_kwargs
        self.aux_weight = float(model_kwargs.get('aux_weight', 0.1))
        assert 'log_dir' in model_kwargs

    def _setup_dataset(self, train_positive_paths, train_negative_paths,
                       train_unlabeled_paths, val_positive_paths,
                       val_negative_paths, val_unlabeled_paths):
        assert self.model_kwargs
        aux_labels = self.model_kwargs['aux_labels']
        image_input_size = self.model_kwargs.get('input_size', 224)
        batch_size = int(self.model_kwargs.get('batch_size', 64))
        num_workers = self.num_workers
        restrict_aux_labels = bool(
            self.model_kwargs.get('restrict_aux_labels', True))
        cache_images_on_disk = self.model_kwargs.get('cache_images_on_disk',
                                                     False)

        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(image_input_size),
            transforms.RandomHorizontalFlip(),
            transforms.ConvertImageDtype(torch.float32),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        resize_size = int(image_input_size * 1.15)
        resize_size += int(resize_size % 2)
        val_transform = transforms.Compose([
            transforms.Resize(resize_size),
            transforms.CenterCrop(image_input_size),
            transforms.ConvertImageDtype(torch.float32),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
        self.train_dataloader = DataLoader(AuxiliaryDataset(
            positive_paths=train_positive_paths,
            negative_paths=train_negative_paths,
            unlabeled_paths=train_unlabeled_paths,
            auxiliary_labels=aux_labels,
            restrict_aux_labels=restrict_aux_labels,
            cache_images_on_disk=cache_images_on_disk,
            data_cache_dir=self.data_cache_dir,
            transform=train_transform),
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=num_workers)
        self.val_dataloader = DataLoader(AuxiliaryDataset(
            positive_paths=val_positive_paths,
            negative_paths=val_negative_paths,
            unlabeled_paths=val_unlabeled_paths,
            auxiliary_labels=aux_labels,
            restrict_aux_labels=restrict_aux_labels,
            cache_images_on_disk=cache_images_on_disk,
            data_cache_dir=self.data_cache_dir,
            transform=val_transform),
                                         batch_size=batch_size,
                                         shuffle=False,
                                         num_workers=num_workers)

    def _setup_model(self):
        num_classes = 2
        num_aux_classes = self.train_dataloader.dataset.num_auxiliary_classes
        freeze_backbone = self.model_kwargs.get('freeze_backbone', False)
        self.model_kwargs['num_aux_classes'] = num_aux_classes
        self.model = Model(num_main_classes=num_classes,
                           num_aux_classes=num_aux_classes,
                           freeze_backbone=freeze_backbone)
        if self.model_kwargs.get('aux_labels_type', None) == "imagenet":
            # Initialize auxiliary head to imagenet fc
            self.model.auxiliary_head.weight = self.model.backbone.fc.weight
            self.model.auxiliary_head.bias = self.model.backbone.fc.bias
        if self.use_cuda:
            self.model = self.model.cuda()
        self.model = nn.DataParallel(self.model)
        self.main_loss = nn.CrossEntropyLoss()
        self.auxiliary_loss = nn.CrossEntropyLoss()
        self.start_epoch = 0
        self.end_epoch = self.model_kwargs.get('epochs_to_run', 1)
        self.current_epoch = 0
        self.global_train_batch_idx = 0
        self.global_val_batch_idx = 0

        lr = float(self.model_kwargs.get('initial_lr', 0.01))
        endlr = float(self.model_kwargs.get('endlr', 0.0))
        optim_params = dict(
            lr=lr,
            momentum=float(self.model_kwargs.get('momentum', 0.9)),
            weight_decay=float(self.model_kwargs.get('weight_decay', 0.0001)),
        )
        self.optimizer = optim.SGD(self.model.parameters(), **optim_params)
        max_epochs = int(self.model_kwargs.get('max_epochs', 90))
        warmup_epochs = int(self.model_kwargs.get('warmup_epochs', 0))
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer,
                                                               max_epochs -
                                                               warmup_epochs,
                                                               eta_min=endlr)
        self.optimizer_scheduler = GradualWarmupScheduler(
            optimizer=self.optimizer,
            multiplier=1.0,
            total_epoch=warmup_epochs,
            after_scheduler=scheduler)
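        # Intended schedule (assuming the usual GradualWarmupScheduler semantics):
        # linear warmup for warmup_epochs, then cosine decay from the base lr to
        # endlr over the remaining (max_epochs - warmup_epochs) epochs.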

    def _notify(self):
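        # Estimate the remaining wall-clock time from the EMA-smoothed per-batch
        # train/val times and the number of batches still to run, then report it
        # via notify_callback.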
        epochs_left = self.end_epoch - self.current_epoch - 1
        num_train_batches_left = (
            epochs_left * self.train_batches_per_epoch +
            max(0, self.train_batches_per_epoch - self.train_batch_idx - 1))
        num_val_batches_left = (
            (1 + round(epochs_left / self.val_frequency)) *
            self.val_batches_per_epoch +
            max(0, self.val_batches_per_epoch - self.val_batch_idx - 1))
        time_left = (num_train_batches_left * self.train_batch_time.value +
                     num_val_batches_left * self.val_batch_time.value)
        self.notify_callback(**{"training_time_left": time_left})

    def setup_resume(self, train_positive_paths, train_negative_paths,
                     train_unlabeled_paths, val_positive_paths,
                     val_negative_paths, val_unlabeled_paths):
        self._setup_dataset(train_positive_paths, train_negative_paths,
                            train_unlabeled_paths, val_positive_paths,
                            val_negative_paths, val_unlabeled_paths)
        self.start_epoch = self.end_epoch
        self.current_epoch = self.start_epoch
        self.end_epoch = self.start_epoch + self.model_kwargs.get(
            'epochs_to_run', 1)

    def load_checkpoint(self, path: str, resume_training: bool = False):
        checkpoint_state = torch.load(path)
        self.model.load_state_dict(checkpoint_state['state_dict'])
        if resume_training:
            self.global_train_batch_idx = checkpoint_state[
                'global_train_batch_idx']
            self.global_val_batch_idx = checkpoint_state[
                'global_val_batch_idx']
            self.start_epoch = checkpoint_state['epoch'] + 1
            self.current_epoch = self.start_epoch
            self.end_epoch = (self.start_epoch +
                              self.model_kwargs.get('epochs_to_run', 1))
            self.optimizer.load_state_dict(checkpoint_state['optimizer'])
            self.optimizer_scheduler.load_state_dict(
                checkpoint_state['optimizer_scheduler'])
            # Copy tensorboard state
            prev_log_dir = checkpoint_state['model_kwargs']['log_dir']
            curr_log_dir = self.model_kwargs['log_dir']
            shutil.copytree(prev_log_dir, curr_log_dir)

    def save_checkpoint(self, epoch, checkpoint_path: str):
        kwargs = dict(self.model_kwargs)
        del kwargs['aux_labels']
        state = dict(
            global_train_batch_idx=self.global_train_batch_idx,
            global_val_batch_idx=self.global_val_batch_idx,
            model_kwargs=kwargs,
            epoch=epoch,
            state_dict=self.model.state_dict(),
            optimizer=self.optimizer.state_dict(),
            optimizer_scheduler=self.optimizer_scheduler.state_dict(),
        )
        os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
        torch.save(state, checkpoint_path)

    def _validate(self, dataloader):
        self.model.eval()
        loss_value = 0
        main_gts = []
        aux_gts = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(dataloader):
            batch_start = time.perf_counter()
            self.val_batch_idx = batch_idx
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()
            main_logits, aux_logits = self.model(images)
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1
            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])
            batch_loss = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                batch_loss += main_loss_value
            if valid_aux_labels.sum() > 0:
                batch_loss += aux_loss_value
            loss_value += batch_loss.item()

            if valid_main_labels.sum() > 0:
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())
            batch_end = time.perf_counter()
            self.val_batch_time += (batch_end - batch_start)
            self.global_val_batch_idx += 1
        # Compute F1 score
        if len(dataloader) > 0:
            loss_value /= (len(dataloader) + 1e-10)
            main_prec, main_recall, main_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    main_gts, main_preds, average='binary')
            aux_prec, aux_recall, aux_f1, _ = \
                sklearn.metrics.precision_recall_fscore_support(
                    aux_gts, aux_preds, average='micro')
        else:
            loss_value = 0
            main_prec = -1
            main_recall = -1
            main_f1 = -1
            aux_prec = -1
            aux_recall = -1
            aux_f1 = -1

        summary_data = [
            ('loss', loss_value),
            ('f1/main_head', main_f1),
            ('prec/main_head', main_prec),
            ('recall/main_head', main_recall),
            ('f1/aux_head', aux_f1),
            ('prec/aux_head', aux_prec),
            ('recall/aux_head', aux_recall),
        ]
        for k, v in [('val/epoch/' + tag, v) for tag, v in summary_data]:
            self.writer.add_scalar(k, v, self.current_epoch)

    def validate(self):
        self._validate(self.val_dataloader)

    def train(self):
        self.model.train()
        logger.info('Starting train epoch')
        load_start = time.perf_counter()
        self.train_epoch_loss = 0
        self.train_epoch_main_loss = 0
        self.train_epoch_aux_loss = 0
        main_gts = []
        aux_gts = []
        main_logits_all = []
        main_preds = []
        aux_preds = []
        for batch_idx, (images, main_labels,
                        aux_labels) in enumerate(self.train_dataloader):
            load_end = time.perf_counter()
            batch_start = time.perf_counter()
            self.train_batch_idx = batch_idx
            logger.debug('Train batch')
            if self.use_cuda:
                images = images.cuda()
                main_labels = main_labels.cuda()
                aux_labels = aux_labels.cuda()

            main_logits, aux_logits = self.model(images)
            # Compute loss
            valid_main_labels = main_labels != -1
            valid_aux_labels = aux_labels != -1

            main_loss_value = self.main_loss(main_logits[valid_main_labels],
                                             main_labels[valid_main_labels])
            aux_loss_value = self.aux_weight * self.auxiliary_loss(
                aux_logits[valid_aux_labels], aux_labels[valid_aux_labels])

            loss_value = torch.zeros_like(main_loss_value)
            if valid_main_labels.sum() > 0:
                loss_value += main_loss_value
            if valid_aux_labels.sum() > 0:
                loss_value += aux_loss_value

            self.train_epoch_loss += loss_value.item()
            if torch.sum(valid_main_labels) > 0:
                self.train_epoch_main_loss += main_loss_value.item()
            if torch.sum(valid_aux_labels) > 0:
                self.train_epoch_aux_loss += aux_loss_value.item()
            # Update gradients
            self.optimizer.zero_grad()
            loss_value.backward()
            self.optimizer.step()

            if valid_main_labels.sum() > 0:
                main_pred = F.softmax(main_logits[valid_main_labels], dim=1)
                main_logits_all += list(
                    main_logits[valid_main_labels].detach().cpu().numpy())
                main_preds += list(main_pred.argmax(dim=1).cpu().numpy())
                main_gts += list(main_labels[valid_main_labels].cpu().numpy())
            if valid_aux_labels.sum() > 0:
                aux_pred = F.softmax(aux_logits[valid_aux_labels], dim=1)
                aux_preds += list(aux_pred.argmax(dim=1).cpu().numpy())
                aux_gts += list(aux_labels[valid_aux_labels].cpu().numpy())

            batch_end = time.perf_counter()
            total_batch_time = (batch_end - batch_start)
            total_load_time = (load_end - load_start)
            self.train_batch_time += total_batch_time + total_load_time
            logger.debug(f'Train batch time: {self.train_batch_time.value}, '
                         f'this batch time: {total_batch_time}, '
                         f'this load time: {total_load_time}, '
                         f'batch epoch loss: {loss_value.item()}, '
                         f'main loss: {main_loss_value.item()}, '
                         f'aux loss: {aux_loss_value.item()}')
            summary_data = [
                ('loss', loss_value.item()),
                ('loss/main_head', main_loss_value.item()),
                ('loss/aux_head', aux_loss_value.item()),
            ]
            for k, v in [('train/batch/' + tag, v) for tag, v in summary_data]:
                self.writer.add_scalar(k, v, self.global_train_batch_idx)

            self._notify()
            self.global_train_batch_idx += 1
            load_start = time.perf_counter()

        model_lr = self.optimizer.param_groups[-1]['lr']
        self.optimizer_scheduler.step()
        logger.debug(f'Train epoch loss: {self.train_epoch_loss}, '
                     f'main loss: {self.train_epoch_main_loss}, '
                     f'aux loss: {self.train_epoch_aux_loss}')
        main_prec, main_recall, main_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                main_gts, main_preds, average='binary')
        aux_prec, aux_recall, aux_f1, _ = \
            sklearn.metrics.precision_recall_fscore_support(
                aux_gts, aux_preds, average='micro')
        logger.debug(
            f'Train epoch main: {main_prec}, {main_recall}, {main_f1}, '
            f'aux: {aux_prec}, {aux_recall}, {aux_f1}, '
            f'main loss: {self.train_epoch_main_loss}, '
            f'aux loss: {self.train_epoch_aux_loss}')
        summary_data = [('lr', model_lr), ('loss', self.train_epoch_loss),
                        ('loss/main_head', self.train_epoch_main_loss),
                        ('loss/aux_head', self.train_epoch_aux_loss),
                        ('f1/main_head', main_f1),
                        ('prec/main_head', main_prec),
                        ('recall/main_head', main_recall),
                        ('f1/aux_head', aux_f1), ('prec/aux_head', aux_prec),
                        ('recall/aux_head', aux_recall)]
        for k, v in [('train/epoch/' + tag, v) for tag, v in summary_data]:
            self.writer.add_scalar(k, v, self.current_epoch)

        if len(main_logits_all):
            self.writer.add_histogram(
                'train/epoch/softmax/main_head',
                scipy.special.softmax(main_logits_all, axis=1)[:, 1])

    def run(self):
        self.last_checkpoint_path = None
        for i in range(self.start_epoch, self.end_epoch):
            logger.info(f'Train: Epoch {i}')
            self.current_epoch = i
            self.train()
            if i % self.val_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Validate: Epoch {i}')
                self.validate()
            if i % self.checkpoint_frequency == 0 or i == self.end_epoch - 1:
                logger.info(f'Checkpoint: Epoch {i}')
                self.last_checkpoint_path = os.path.join(
                    self.model_dir, f'checkpoint_{i:03}.pth')
                self.save_checkpoint(i, self.last_checkpoint_path)
        return self.last_checkpoint_path
Example No. 14
0
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    num_classes = 1000

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch](stem_type=args.stem_type,
                                       num_classes=num_classes,
                                       block_type=models.PreBasicBlock,
                                       activation=nn.PReLU)
    bchef = BinaryChef('recepies/imagenet-baseline.yaml')
    model = bchef.run_step(model, args.step)
    print(model)

    print('Num parameters: {}'.format(count_parameters(model)))

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    parameters = model.parameters()
    if args.optimizer == 'adamw':
        wd = args.weight_decay if args.step == 0 else 0
        optimizer = torch.optim.AdamW(parameters, args.lr, weight_decay=wd)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(parameters, args.lr)
    elif args.optimizer == 'sgd':
        wd = 0 if args.step > 0 else args.weight_decay
        optimizer = torch.optim.SGD(parameters,
                                    args.lr,
                                    momentum=args.momentum,
                                    weight_decay=wd)
    else:
        raise ValueError('Unknown optimizer selected: {}'.format(
            args.optimizer))

    if args.scheduler == 'multistep':
        milestone = [40, 70, 80, 100, 110]
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[x - args.warmup for x in milestone],
            gamma=0.1)  #
    elif args.scheduler == 'cosine':
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(args.epochs - args.warmup), eta_min=0)
    else:
        raise ValueError('Unknown scheduler selected: {}'.format(
            args.scheduler))

    if args.warmup > 0:
        print('=> Applying warmup ({} epochs)'.format(args.warmup))
        lr_scheduler = GradualWarmupScheduler(optimizer,
                                              multiplier=1,
                                              total_epoch=args.warmup,
                                              after_scheduler=lr_scheduler)
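        # The warmup wrapper consumes the first args.warmup epochs, which is why
        # the milestones and the cosine horizon above are already shifted down
        # by args.warmup.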

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            if args.resume_epoch:
                args.start_epoch = checkpoint['epoch']
                best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                pass
                # best_acc1 may be from a checkpoint from a different GPU
                #best_acc1 = best_acc1.to(args.gpu)
            try:
                model.load_state_dict(checkpoint['state_dict'])
                if not ('adam' in args.optimizer and 'sgd' in args.resume):
                    print('=> Loading optimizer...')
                    #optimizer.load_state_dict(checkpoint['optimizer'])
            except:
                print(
                    '=> Warning: dict model mismatch, loading with strict = False'
                )
                model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

        # Reset learning rate
        for g in optimizer.param_groups:
            g['lr'] = args.lr

    if args.start_epoch > 0:
        print('Advancing the scheduler to epoch {}'.format(args.start_epoch))
        for i in range(args.start_epoch):
            lr_scheduler.step()
    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'valid')

    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transforms_train = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    transforms_val = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.ImageFolder(traindir, transforms_train)
    val_dataset = datasets.ImageFolder(valdir, transforms_val)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    show_logs = (not args.multiprocessing_distributed) or (
        args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        if args.scheduler == 'cosine':
            lr_scheduler.step(epoch)
        else:
            lr_scheduler.step()
        if show_logs:
            print('New lr: {}'.format(lr_scheduler.get_last_lr()))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args,
              show_logs)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args, show_logs)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        print('Current best: {}'.format(best_acc1))

        if show_logs:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.output_dir)
Example No. 15
0
def main():
    fold = 0
    epoch = 3
    mode = 1
    batch = 2
    num_workers = 1
    SEED = 13
    init_lr = 3e-4
    warmup_factor = 10  # lr multiplier reached at the end of warmup
    warmup_epo = 1  # warmup length in epochs
    log = True
    seed_everything(SEED)
    model = HUB_MODELS['efficientnet-b0']('efficientnet-b0')
    model.to(DEVICE)
    df = pd.read_csv(os.path.join(path_data, 'train_folds.csv'))

    kernel = type(model).__name__

    tr_idx = np.where(df.fold != fold)[0]
    vl_idx = np.where(df.fold == fold)[0]

    transforms_train = A.Compose([
        # A.OneOf([
        #     A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15),
        #     A.OpticalDistortion(distort_limit=0.11, shift_limit=0.15),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
        #     A.RandomGamma(gamma_limit=(50, 150)),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.RGBShift(r_shift_limit=20, b_shift_limit=15, g_shift_limit=15),
        #     A.FancyPCA(3),
        #     A.HueSaturationValue(hue_shift_limit=5, sat_shift_limit=5),
        #     A.NoOp()
        # ]),
        # A.OneOf([
        #     A.CLAHE(),
        #     A.NoOp()
        # ]),
        A.Transpose(p=0.5),
        A.VerticalFlip(p=0.5),
        A.HorizontalFlip(p=0.5),
    ])

    # transforms_val = albumentations.Compose([])

    dataset = {
        'npy': [trainDataset_npy, 16],
        'pkl': [trainDataset_pkl, 25],
        'insta': [trainDataset_insta, None]
    }

    trainDataset, num = dataset['pkl']

    td = trainDataset(df.iloc[tr_idx],
                      df.iloc[tr_idx].isup_grade,
                      num,
                      rand=True,
                      transform=transforms_train)
    vd = trainDataset(df.iloc[vl_idx],
                      df.iloc[vl_idx].isup_grade,
                      num,
                      rand=False,
                      transform=transforms_train)

    train_dl = DataLoader(td,
                          batch_size=batch,
                          sampler=RandomSampler(td),
                          num_workers=num_workers)
    val_dl = DataLoader(vd,
                        batch_size=batch,
                        sampler=SequentialSampler(vd),
                        num_workers=num_workers)

    optimizer = Adam(model.parameters(), lr=init_lr / warmup_factor)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, epoch - warmup_epo)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=warmup_factor,
                                       total_epoch=warmup_epo,
                                       after_scheduler=scheduler_cosine)
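    # The base lr is init_lr / warmup_factor and the multiplier is warmup_factor,
    # so (with the usual GradualWarmupScheduler semantics) the lr ramps up to
    # init_lr during the first warmup_epo epoch(s) and then follows the cosine
    # schedule for the remaining epochs.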
    criterion = nn.BCEWithLogitsLoss()
    scaler = amp.GradScaler()
    qwk_max = 0
    for i in range(1, epoch + 1):
        print(f'Epoch: {i}')
        scheduler.step(i - 1)
        model.train()
        loss = train_epoch(model, train_dl, criterion, scaler, optimizer)
        model.eval()
        with torch.no_grad():
            val_loss, pred, val_lab = train_epoch(model, val_dl, criterion,
                                                  None, None)
        p = torch.cat(pred).cpu().numpy()
        t = torch.cat(val_lab).cpu().numpy()
        acc = (p == t).mean() * 100.
        qwk = cohen_kappa_score(p, t, weights='quadratic')
        #sch.step(val_loss)          #  Plateau
        if log:
            print('Log.....')
            lg = time.ctime(
            ) + ' ' + f'Epoch {i}, lr: {optimizer.param_groups[0]["lr"]:.7f}, train loss: {np.mean(loss):.5f},  val loss: {np.mean(val_loss):.5f}, acc: {(acc):.5f}, qwk: {(qwk):.5f}, fold: {fold+1}'
            print(lg)
            with open(os.path.join(path_log, f'log_{kernel}_kaggle.txt'),
                      'a') as appender:
                appender.write(lg + '\n')

        if qwk > qwk_max:
            print('Best ({:.6f} --> {:.6f}).  Saving model ...'.format(
                qwk_max, qwk))
            torch.save(
                model.state_dict(),
                os.path.join(
                    path_model,
                    f'{kernel}_kaggle_best_fold{fold+1}_epoch_{i}.pth'))
            qwk_max = qwk

        #make checkpoint
        #problem in win
        # name_check = '_'.join(time.ctime().split(':')) + '_model.pt'

        # torch.save({
        #     'epoch': i,
        #     'model_state_dict': model.state_dict(),
        #     'optimizer_state_dict': optimizer.state_dict()
        #     }, os.path.join(path_checkpoint, name_check))

    torch.save(
        model.state_dict(),
        os.path.join(path_model, f'{kernel}_kaggle_final_fold{fold+1}.pth'))
Example No. 16
0
def main():
    best_test_loss = np.inf
    model = Yolov1_vgg16bn(pretrained=True)
    print('pre-trained vgg16 model has loaded!')

    previous_model_path = model_name
    exists = os.path.isfile(previous_model_path)
    if exists:
        print("Starting from previous result...")
        model.load_state_dict(torch.load(previous_model_path))
    else:
        print("Starting with new train")

    #print(model)

    print('')

    if use_gpu:
        model.cuda()

    # Data
    print('==> Preparing data..')
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])
    # transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    # DataGenerator args: parent_dir, img_size, S, B, C, transforms, num=15000

    train_dataset = DataGenerator(parent_dir=img_folder,
                                  img_size=img_size,
                                  S=S,
                                  B=B,
                                  C=C,
                                  transform=transform,
                                  num=train_num,
                                  train=True)

    train_loader = DataLoader(train_dataset,
                              batch_size=n_batch,
                              shuffle=True,
                              num_workers=8)

    test_dataset = DataGenerator(parent_dir=validate_folder,
                                 img_size=img_size,
                                 S=S,
                                 B=B,
                                 C=C,
                                 transform=transform,
                                 num=test_num,
                                 train=False)
    test_loader = DataLoader(test_dataset,
                             batch_size=n_batch,
                             shuffle=False,
                             num_workers=8)

    model.train()

    train_val_loss_log = open(
        os.path.join(results_folder, 'train_val_loss_log'), 'w+')
    #loss_fn = YoloLoss(B, S, lambda_coord, lambda_noobj)
    loss_fn = YoloLossNew(B, S, C, lambda_coord, lambda_noobj)

    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.0001,
                                momentum=0.9,
                                weight_decay=0.0005)
    #optimizer = torch.optim.SGD(model.parameters(),lr=0.0001)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=8, total_epoch=30)
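    # With multiplier=8 and no after_scheduler, the lr ramps from the base 1e-4
    # towards 8e-4 over 30 epochs and then (in the standard implementation) is
    # simply held at that value for the rest of training.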

    for epoch in range(num_epochs):
        scheduler.step(epoch)
        print(epoch, optimizer.param_groups[0]['lr'])
        for i, (img_name, images, target) in enumerate(train_loader):
            #images = images.float()
            #target = target.float()
            images = Variable(images)
            target = Variable(target)
            if use_gpu:
                images, target = images.cuda(), target.cuda()

            optimizer.zero_grad()

            pred = model(images)
            loss = loss_fn(pred, target)
            current_loss = loss.item()

            loss.backward()
            optimizer.step()
            if i % 20 == 0:
                print(
                    "\r%d/%d batches in %d/%d iteration, current error is %f" %
                    (i, len(train_loader), epoch + 1, num_epochs,
                     current_loss))

        save_model_by_epoch(epoch, model)

        # validate on the validation set
        validation_loss = 0.0
        model.eval()
        with torch.no_grad():
            for i, (img_name, images, target) in enumerate(test_loader):
                #image = images.float()
                #target = target.float()
                images = Variable(images)
                target = Variable(target)
                if use_gpu:
                    images, target = images.cuda(), target.cuda()

                pred = model(images)
                loss = loss_fn(pred, target)
                validation_loss += loss.item()

        validation_loss /= len(test_loader)
        # log the training loss and validation loss every epoch
        log_str = 'epoch: {}, train_loss: {}, val_loss: {} \n'.format(
            epoch + 1, current_loss, validation_loss)
        print(log_str)
        train_val_loss_log.writelines(log_str)
        train_val_loss_log.flush()
        if best_test_loss > validation_loss:
            best_test_loss = validation_loss
            save_torch_model(model, 'best.pth', epoch)

    train_val_loss_log.close()
Example No. 17
0
def main(args):
    torch.backends.cudnn.benchmark = True
    seed_all(args.seed)

    num_classes = 1

    d = Dataset(train_set_size=args.train_set_sz, num_cls=num_classes)
    train = d.train_set
    valid = d.test_set

    net = UNet(in_dim=1, out_dim=4).cuda()
    snake_approx_net = UNet(in_dim=1,
                            out_dim=1,
                            wf=3,
                            padding=True,
                            first_layer_pad=None,
                            depth=4,
                            last_layer_resize=True).cuda()
    best_val_dice = -np.inf

    optimizer = torch.optim.Adam(params=net.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    snake_approx_optimizer = torch.optim.Adam(
        params=snake_approx_net.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=10,
                                              total_epoch=50,
                                              after_scheduler=None)
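    # Note: this warmup scheduler is stepped once per training iteration in the
    # loop below, so total_epoch=50 means 50 optimizer steps of warmup; with
    # after_scheduler=None the lr then stays at 10x the base lr (assuming the
    # standard GradualWarmupScheduler behaviour).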

    # load model
    if args.ckpt:
        loaded = _pickle.load(open(args.ckpt, 'rb'))
        net.load_state_dict(loaded[0])
        optimizer.load_state_dict(loaded[1])
        snake_approx_net.load_state_dict(loaded[2])
        snake_approx_optimizer.load_state_dict(loaded[3])

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir, exist_ok=True)

    writer = tensorboardX.SummaryWriter(log_dir=args.log_dir)
    snake = SnakePytorch(args.delta, args.batch_sz * args.num_samples,
                         args.num_lines, args.radius)
    snake_eval = SnakePytorch(args.delta, args.batch_sz, args.num_lines,
                              args.radius)
    noises = torch.zeros(
        (args.batch_sz, args.num_samples, args.num_lines, args.radius)).cuda()

    step = 1
    start = timeit.default_timer()
    for epoch in range(1, args.n_epochs + 1):
        for iteration in range(
                1,
                int(np.ceil(train.dataset_sz() / args.batch_sz)) + 1):

            scheduler_warmup.step()

            imgs, masks, onehot_masks, centers, dts_modified, dts_original, jitter_radius, bboxes = \
                train.next_batch(args.batch_sz)

            xs = make_batch_input(imgs)
            xs = torch.cuda.FloatTensor(xs)

            net.train()
            unet_logits = net(xs)

            center_jitters, angle_jitters = [], []
            for img, mask, center in zip(imgs, masks, centers):
                c_j, a_j = get_random_jitter_by_mask(mask, center, [1],
                                                     args.theta_jitter)
                if not args.use_center_jitter:
                    c_j = np.zeros_like(c_j)
                center_jitters.append(c_j)
                angle_jitters.append(a_j)

            center_jitters = np.asarray(center_jitters)
            angle_jitters = np.asarray(angle_jitters)

            # args.radius + 1 because we need additional outermost points for the gradient
            gs_logits_whole_img = unet_logits[:, 3, ...]
            gs_logits, coords_r, coords_c = get_star_pattern_values(
                gs_logits_whole_img,
                None,
                centers,
                args.num_lines,
                args.radius + 1,
                center_jitters=center_jitters,
                angle_jitters=angle_jitters)

            # currently only class 1 is foreground
            # if there are multiple foreground classes, use a for loop
            gs = gs_logits[:, :,
                           1:] - gs_logits[:, :, :-1]  # compute the gradient

            noises.normal_(
                0, 1
            )  # noises here are only used for random exploration, so no mirrored sampling is needed
            gs_noisy = torch.unsqueeze(gs, 1) + noises

            def batch_eval_snake(snake, inputs, batch_sz):
                n_inputs = len(inputs)
                assert n_inputs % batch_sz == 0
                n_batches = int(np.ceil(n_inputs / batch_sz))
                ind_sets = []
                for j in range(n_batches):
                    inps = inputs[j * batch_sz:(j + 1) * batch_sz]
                    batch_ind_sets = snake(inps).data.cpu().numpy()
                    ind_sets.append(batch_ind_sets)
                ind_sets = np.concatenate(ind_sets, 0)
                return ind_sets

            gs_noisy = gs_noisy.reshape((args.batch_sz * args.num_samples,
                                         args.num_lines, args.radius))
            ind_sets = batch_eval_snake(snake, gs_noisy,
                                        args.batch_sz * args.num_samples)
            ind_sets = ind_sets.reshape(
                (args.batch_sz * args.num_samples, args.num_lines))
            ind_sets = np.expand_dims(
                smooth_ind(ind_sets, args.smoothing_window), -1)

            # loss layers
            m = torch.nn.LogSoftmax(dim=1)
            loss = torch.nn.NLLLoss()
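            # LogSoftmax over the radius dimension + NLLLoss treat the snake's chosen
            # index along each star-pattern line as a per-line classification target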

            # ===========================================================================
            # Inner loop: Train dice loss prediction network
            snake_approx_net.train()
            for _ in range(args.dice_approx_train_steps):

                snake_approx_logits = snake_approx_net(
                    gs_noisy.reshape(args.batch_sz * args.num_samples, 1,
                                     args.num_lines, args.radius).detach())
                snake_approx_train_loss = loss(
                    m(snake_approx_logits.squeeze().transpose(2, 1)),
                    torch.cuda.LongTensor(ind_sets.squeeze()))
                snake_approx_optimizer.zero_grad()
                snake_approx_train_loss.backward()
                snake_approx_optimizer.step()
            # ===========================================================================

            # ===========================================================================
            # Now, minimize the approximate dice loss
            snake_approx_net.eval()

            gt_indices = []
            for mask, center, cj, aj in zip(masks, centers, center_jitters,
                                            angle_jitters):
                gt_ind = mask_to_indices(mask, center, args.radius,
                                         args.num_lines, cj, aj)
                gt_indices.append(gt_ind)
            gt_indices = np.asarray(gt_indices).astype(int)

            gt_indices = gt_indices.reshape((args.batch_sz, args.num_lines))
            gt_indices = torch.cuda.LongTensor(gt_indices)

            snake_approx_logits = snake_approx_net(
                gs.reshape((args.batch_sz, 1, args.num_lines, args.radius)))
            nll_approx_loss = loss(
                m(snake_approx_logits.squeeze().transpose(2, 1)), gt_indices)

            total_loss = nll_approx_loss
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
            # ===========================================================================

            snake_approx_train_loss = snake_approx_train_loss.data.cpu().numpy(
            )
            nll_approx_loss = nll_approx_loss.data.cpu().numpy()
            total_loss = snake_approx_train_loss + nll_approx_loss

            if step % args.log_freq == 0:
                stop = timeit.default_timer()
                print(f"step={step}\tepoch={epoch}\titer={iteration}"
                      f"\tloss={total_loss}"
                      f"\tsnake_approx_train_loss={snake_approx_train_loss}"
                      f"\tnll_approx_loss={nll_approx_loss}"
                      f"\tlr={optimizer.param_groups[0]['lr']}"
                      f"\ttime={stop-start}")
                start = stop
                writer.add_scalar("total_loss", total_loss, step)
                writer.add_scalar("nll_approx_loss", nll_approx_loss, step)
                writer.add_scalar("lr", optimizer.param_groups[0]["lr"], step)

            if step % args.train_eval_freq == 0:
                train_dice = do_eval(
                    net,
                    snake_eval,
                    train.images,
                    train.masks,
                    train.centers,
                    args.batch_sz,
                    args.num_lines,
                    args.radius,
                    smoothing_window=args.smoothing_window).data.cpu().numpy()
                writer.add_scalar("train_dice", train_dice, step)
                print(
                    f"step={step}\tepoch={epoch}\titer={iteration}\ttrain_eval: train_dice={train_dice}"
                )

            if step % args.val_eval_freq == 0:
                val_dice = do_eval(
                    net,
                    snake_eval,
                    valid.images,
                    valid.masks,
                    valid.centers,
                    args.batch_sz,
                    args.num_lines,
                    args.radius,
                    smoothing_window=args.smoothing_window).data.cpu().numpy()
                writer.add_scalar("val_dice", val_dice, step)
                print(
                    f"step={step}\tepoch={epoch}\titer={iteration}\tvalid_dice={val_dice}"
                )
                if val_dice > best_val_dice:
                    best_val_dice = val_dice
                    _pickle.dump([
                        net.state_dict(),
                        optimizer.state_dict(),
                        snake_approx_net.state_dict(),
                        snake_approx_optimizer.state_dict()
                    ],
                                 open(
                                     os.path.join(args.log_dir,
                                                  'best_model.pth.tar'), 'wb'))
                    f = open(
                        os.path.join(args.log_dir, f"best_val_dice{step}.txt"),
                        'w')
                    f.write(str(best_val_dice))
                    f.close()
                    print(f"better val dice detected.")

            step += 1

    return best_val_dice
Ejemplo n.º 18
0
def main():

    data_dir = '../data/'
    df_biopsy = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    image_folder = os.path.join(data_dir, 'train_images')

    kernel_type = 'efficientnet-b3_36x256x256'
    enet_type = 'efficientnet-b3'
    num_folds = 5
    fold = 0
    tile_size = 256
    n_tiles = 32
    batch_size = 9
    num_workers = 24
    out_dim = 5
    init_lr = 3e-4
    warmup_factor = 10
    warmup_epo = 1
    n_epochs = 30
    use_amp = True

    writer = SummaryWriter(f'tensorboard_logs/{kernel_type}/fold-{fold}')

    if use_amp and not APEX_AVAILABLE:
        print("Error: could not import APEX module")
        exit()

    skf = StratifiedKFold(num_folds, shuffle=True, random_state=42)
    df_biopsy['fold'] = -1
    for i, (train_idx, valid_idx) in enumerate(
            skf.split(df_biopsy, df_biopsy['isup_grade'])):
        df_biopsy.loc[valid_idx, 'fold'] = i

    mean = [0.90949707, 0.8188697, 0.87795304]
    std = [0.36357649, 0.49984502, 0.40477625]
    transform_train = transforms.Compose([
        transforms.RandomChoice([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomVerticalFlip(p=0.5),
            RotationTransform([90, -90])
        ]),
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    transform_val = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize(mean, std)])

    df_train = df_biopsy.loc[df_biopsy['fold'] != fold]
    df_valid = df_biopsy.loc[df_biopsy['fold'] == fold]

    dataset_train = PANDADataset(df_train, image_folder, tile_size, n_tiles, \
        out_dim, transform=transform_train)
    dataset_valid = PANDADataset(df_valid, image_folder, tile_size, n_tiles, \
        out_dim, transform=transform_val)

    train_loader = DataLoader(
        dataset_train,
        batch_size=batch_size,
        sampler=RandomSampler(dataset_train),
        num_workers=num_workers,
    )
    valid_loader = DataLoader(dataset_valid,
                              batch_size=batch_size,
                              sampler=SequentialSampler(dataset_valid),
                              num_workers=num_workers)

    model = enetv2(enet_type, out_dim=out_dim)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=init_lr / warmup_factor)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, n_epochs - warmup_epo)
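    # warm the LR from init_lr / warmup_factor up to init_lr over the first
    # warmup_epo epoch(s), then follow the cosine schedule for the remaining epochs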
    scheduler = GradualWarmupScheduler(optimizer, multiplier=warmup_factor, \
                                    total_epoch=warmup_epo, after_scheduler=scheduler_cosine)

    criterion = nn.BCEWithLogitsLoss()

    if use_amp:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          keep_batchnorm_fp32=None,
                                          loss_scale="dynamic")
    model = nn.DataParallel(model)

    print("Number of train samples : {}".format(len(dataset_train)))
    print("Number of validation samples : {}".format(len(dataset_valid)))

    best_model = f'{kernel_type}_fold-{fold}_best.pth'
    save_path = f'../trained_models/{kernel_type}/fold-{fold}/'
    os.makedirs(save_path, exist_ok=True)

    qwk_max = 0.
    for epoch in range(1, n_epochs + 1):
        print(time.ctime(), 'Epoch:', epoch)
        scheduler.step(epoch - 1)

        train_loss = train_epoch(model,
                                 train_loader,
                                 optimizer,
                                 criterion,
                                 use_amp=use_amp)
        val_loss, acc, (qwk, qwk_k, qwk_r) = val_epoch(model, valid_loader,
                                                       criterion, df_valid)

        writer.add_scalars('loss', {
            'train': np.mean(train_loss),
            'val': val_loss
        }, epoch)
        writer.add_scalars('qwk', {
            'total': qwk,
            'Karolinska': qwk_k,
            'Radboud': qwk_r
        }, epoch)
        content = "{}, Epoch {}, lr: {:.7f}, train loss: {:.5f}," \
                " val loss: {:.5f}, acc: {:.5f}, qwk: {:.5f}".format(
                    time.ctime(), epoch, optimizer.param_groups[0]["lr"],
                    np.mean(train_loss), np.mean(val_loss), acc, qwk
                )
        print(content)

        with open('train_logs/log_{}_fold-{}.txt'.format(kernel_type, fold),
                  'a') as appender:
            appender.write(content + '\n')

        if qwk > qwk_max:
            print('score2 ({:.6f} --> {:.6f}).  Saving current best model ...'.
                  format(qwk_max, qwk))
            torch.save(model.state_dict(), os.path.join(save_path, best_model))
            qwk_max = qwk

        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'qwk_max': qwk_max
            }, os.path.join(save_path,
                            f'{kernel_type}_fold-{fold}_{epoch}.pth'))
Ejemplo n.º 19
0
def main_worker(gpu, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))
    # create model
    # if args.gen_map:
    #     args.qw = -1
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))

    else:
        print("=> creating model '{}'".format(args.arch))
    try:
        model = mnist_models.__dict__[args.arch](pretrained=args.pretrained)
    except KeyError:
        print('do not support {}'.format(args.arch))
        return

    print('model:\n=========\n{}\n=========='.format(model))

    if args.gpu is not None and args.gpus is None:
        #torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        print('Use {} gpus'.format(args.gpus))
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        print(args.resume)
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume, map_location='cpu')
            model.load_state_dict(
                checkpoint['state_dict'])  # GPU memory leak. todo

            if not args.quant_bias_scale:
                args.start_epoch = checkpoint['epoch']
                best_acc1 = checkpoint['best_acc1']
                optimizer.load_state_dict(checkpoint['optimizer'])
                print("=> loaded checkpoint '{}' (epoch {}) (acc: {})".format(
                    args.resume, checkpoint['epoch'], best_acc1))
                print('=> save only weights in {}.pth'.format(args.arch))
                model.cpu()
                torch.save(model.state_dict(), '{}.pth'.format(args.arch))
                model.cuda(args.gpu)
                # save pth here
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ConvQ + BN fusion
    if args.bn_fusion:
        print('BN fusion begin')
        model = wrapper.fuse_bn_recursively(model)
        print('after bn fusion: ')
        print(model)

    if args.resume_after:
        if os.path.isfile(args.resume_after):
            print('=> loading checkpoint {}'.format(args.resume_after))
            checkpoint = torch.load(args.resume_after, map_location='cpu')
            model.load_state_dict(checkpoint['state_dict'])
            model.cuda(args.gpu)
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    if args.extract_inner_data:
        print('extract inner feature map and weight')
        wrapper.save_inner_hooks(model)
        for k, v in model.state_dict().items():
            np.save('{}'.format(k), v.cpu().numpy())
    cudnn.benchmark = True

    # Data loading code
    print('==> Preparing data..')
    #    transform_train = transforms.Compose([
    #        transforms.RandomCrop(32, padding=4),
    #        transforms.RandomHorizontalFlip(),
    #        transforms.ToTensor(),
    #        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #    ])

    #    transform_test = transforms.Compose([
    #        transforms.ToTensor(),
    #        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    #    ])
    '''
    trainset = torchvision.datasets.CIFAR10(root=args.data, train=True, download=True, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.workers)
    args.batch_num = len(train_loader)
    testset = torchvision.datasets.CIFAR10(root=args.data, train=False, download=True, transform=transform_test)
    val_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers)
    '''
    train_loader = torch.utils.data.DataLoader(torchvision.datasets.MNIST(
        '~/dataset/mnist',
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                               batch_size=args.batch_size,
                                               shuffle=True)
    args.batch_num = len(train_loader)
    val_loader = torch.utils.data.DataLoader(torchvision.datasets.MNIST(
        '~/dataset/mnist',
        train=False,
        transform=transforms.Compose([
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs)
    scheduler_step = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[80, 160, 300])
    scheduler_next = scheduler_step
    if args.cosine:
        scheduler_next = scheduler_cosine
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=10,
                                              total_epoch=10,
                                              after_scheduler=scheduler_next)
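    # warms the LR from args.lr up to 10 * args.lr over the first 10 epochs,
    # then hands off to the step or cosine schedule selected above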
    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return
    args.log_name = 'logger/{}_{}'.format(args.arch, args.log_name)
    writer = SummaryWriter(args.log_name)
    with open('{}/{}.txt'.format(args.log_name, args.arch), 'w') as wf:
        wf.write(str(model))
    for epoch in range(args.start_epoch, args.epochs):
        # adjust_learning_rate(optimizer, epoch, args)
        scheduler_warmup.step()
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        writer.add_scalar('val/acc1', acc1, epoch)
        writer.add_scalar('train/lr', optimizer.param_groups[0]['lr'], epoch)
        if args.debug:
            cnt = 0
            for k, v in model.state_dict().items():
                if 'pos' in k or 'neg' in k or 'shift' in k:
                    writer.add_histogram(k, v, epoch)
                    cnt += 1
                    if cnt == 10:
                        break
        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            prefix='{}/{}_'.format(args.log_name, args.arch))
Ejemplo n.º 20
0
    for data in train_loader:
        data = data.to(device)

        optimizer.zero_grad()

        output = model(data)
        loss = F.l1_loss(output, data.y)
        loss_all += loss.item() * data.num_graphs
        loss.backward()
        clip_grad_norm_(model.parameters(), max_norm=1000, norm_type=2)
        optimizer.step()

        curr_epoch = epoch + float(step) / (len(train_dataset) /
                                            args.batch_size)
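        # stepping the warmup scheduler with a fractional epoch updates the LR
        # every batch rather than once per epoch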
        scheduler_warmup.step(curr_epoch)

        ema(model)
        step += 1

    train_loss = loss_all / len(train_loader.dataset)

    val_loss = test(val_loader)

    if best_val_loss is None or val_loss <= best_val_loss:
        test_loss = test(test_loader)
        best_epoch = epoch
        best_val_loss = val_loss

    print('Epoch: {:03d}, Train MAE: {:.7f}, Validation MAE: {:.7f}, '
          'Test MAE: {:.7f}'.format(epoch + 1, train_loss, val_loss,
                                    test_loss))
Ejemplo n.º 21
0

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    epochs = 20
    # scheduler_warmup is chained with lr_scheduler
    lr_scheduler = CosineAnnealingLR(optim, T_max=epochs - 5, eta_min=0.02)
    scheduler_warmup = GradualWarmupScheduler(optim,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=lr_scheduler)
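    # with multiplier=1 the LR climbs linearly from ~0 to the optimizer's base LR (0.1)
    # over the 5 warmup steps, then the cosine schedule anneals it towards eta_min=0.02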

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()
    scheduler_warmup.step()

    lr_list = list()
    for epoch in range(epochs):
        current_lr = optim.param_groups[0]['lr']

        optim.step()
        scheduler_warmup.step()

        print(epoch + 1, current_lr)
        lr_list.append(current_lr)

    plot(lr_list)
Ejemplo n.º 22
0
def train(net, loader):
    losses = []

    loss_fn = NTXentLoss(batch_size=BATCH_SIZE,
                         temperature=TEMPERATURE,
                         use_cosine_similarity=True)

    optimizer = SGD_with_lars(net.parameters(),
                              lr=0.1 * BATCH_SIZE / 256,
                              momentum=0.9,
                              weight_decay=1e-6)

    from warmup_scheduler import GradualWarmupScheduler

    cosine_scheduler = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, TOTAL_EPOCHS)
    scheduler = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=TOTAL_EPOCHS // 10,
        after_scheduler=cosine_scheduler,
    )
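    # linear warmup from ~0 up to the scaled LARS base LR (0.1 * BATCH_SIZE / 256)
    # over the first TOTAL_EPOCHS // 10 epochs, then cosine annealing for the rest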

    train_start = time.time()

    net.change_mode("pretrain")

    for epoch in range(1, TOTAL_EPOCHS + 1):
        train_loss = 0
        net.train()

        epoch_start = time.time()
        for idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()

            xi, xj, target = data[0].cuda(), data[1].cuda(), target.cuda()

            _, zis, _ = net(xi)
            _, zjs, _ = net(xj)

            loss = loss_fn(zis, zjs)
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        train_loss /= idx + 1
        losses.append(train_loss)
        scheduler.step()

        epoch_time = time.time() - epoch_start
        print(
            "Epoch\t",
            epoch,
            "\tLoss\t",
            train_loss,
            "\tTime\t",
            epoch_time,
        )

    elapsed_train_time = time.time() - train_start
    print("Finished training. Train time was:", elapsed_train_time)

    return losses
Ejemplo n.º 23
0
def main(pargs):

    # this should be global
    global have_wandb

    #init distributed training
    comm.init(pargs.wireup_method)
    comm_rank = comm.get_rank()
    comm_local_rank = comm.get_local_rank()
    comm_size = comm.get_size()

    # set up logging
    pargs.logging_frequency = max([pargs.logging_frequency, 1])
    log_file = os.path.normpath(
        os.path.join(pargs.output_dir, "logs", pargs.run_tag + ".log"))
    logger = mll.mlperf_logger(log_file, "deepcam", "Umbrella Corp.")
    logger.log_start(key="init_start", sync=True)
    logger.log_event(key="cache_clear")

    #set seed
    seed = 333
    logger.log_event(key="seed", value=seed)

    # Some setup
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        device = torch.device("cuda", comm_local_rank)
        torch.cuda.manual_seed(seed)
        #necessary for AMP to work
        torch.cuda.set_device(device)

        # TEST: allowed? Valuable?
        #torch.backends.cudnn.benchmark = True
    else:
        device = torch.device("cpu")

    #visualize?
    visualize = (pargs.training_visualization_frequency >
                 0) or (pargs.validation_visualization_frequency > 0)

    #set up directories
    root_dir = os.path.join(pargs.data_dir_prefix)
    output_dir = pargs.output_dir
    plot_dir = os.path.join(output_dir, "plots")
    if comm_rank == 0:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        if visualize and not os.path.isdir(plot_dir):
            os.makedirs(plot_dir)

    # Setup WandB
    if not pargs.enable_wandb:
        have_wandb = False
    if have_wandb and (comm_rank == 0):
        # get wandb api token
        certfile = os.path.join(pargs.wandb_certdir, ".wandbirc")
        try:
            with open(certfile) as f:
                token = f.readlines()[0].replace("\n", "").split()
                wblogin = token[0]
                wbtoken = token[1]
        except IOError:
            print("Error, cannot open WandB certificate {}.".format(certfile))
            have_wandb = False

        if have_wandb:
            # log in: that call can be blocking, it should be quick
            sp.call(["wandb", "login", wbtoken])

            #init db and get config
            resume_flag = pargs.run_tag if pargs.resume_logging else False
            wandb.init(entity=wblogin,
                       project='deepcam',
                       name=pargs.run_tag,
                       id=pargs.run_tag,
                       resume=resume_flag)
            config = wandb.config

            #set general parameters
            config.root_dir = root_dir
            config.output_dir = pargs.output_dir
            config.max_epochs = pargs.max_epochs
            config.local_batch_size = pargs.local_batch_size
            config.num_workers = comm_size
            config.channels = pargs.channels
            config.optimizer = pargs.optimizer
            config.start_lr = pargs.start_lr
            config.adam_eps = pargs.adam_eps
            config.weight_decay = pargs.weight_decay
            config.model_prefix = pargs.model_prefix
            config.amp_opt_level = pargs.amp_opt_level
            config.loss_weight_pow = pargs.loss_weight_pow
            config.lr_warmup_steps = pargs.lr_warmup_steps
            config.lr_warmup_factor = pargs.lr_warmup_factor

            # lr schedule if applicable
            if pargs.lr_schedule:
                for key in pargs.lr_schedule:
                    config.update(
                        {"lr_schedule_" + key: pargs.lr_schedule[key]},
                        allow_val_change=True)

    # Logging hyperparameters
    logger.log_event(key="global_batch_size",
                     value=(pargs.local_batch_size * comm_size))
    logger.log_event(key="opt_name", value=pargs.optimizer)
    logger.log_event(key="opt_base_learning_rate",
                     value=pargs.start_lr * pargs.lr_warmup_factor)
    logger.log_event(key="opt_learning_rate_warmup_steps",
                     value=pargs.lr_warmup_steps)
    logger.log_event(key="opt_learning_rate_warmup_factor",
                     value=pargs.lr_warmup_factor)
    logger.log_event(key="opt_epsilon", value=pargs.adam_eps)

    # Define architecture
    n_input_channels = len(pargs.channels)
    n_output_channels = 3
    net = deeplab_xception.DeepLabv3_plus(n_input=n_input_channels,
                                          n_classes=n_output_channels,
                                          os=16,
                                          pretrained=False,
                                          rank=comm_rank)
    net.to(device)

    #select loss
    loss_pow = pargs.loss_weight_pow
    #some magic numbers
    class_weights = [
        0.986267818390377**loss_pow, 0.0004578708870701058**loss_pow,
        0.01327431072255291**loss_pow
    ]
    fpw_1 = 2.61461122397522257612
    fpw_2 = 1.71641974795896018744
    criterion = losses.fp_loss

    #select optimizer
    optimizer = None
    if pargs.optimizer == "Adam":
        optimizer = optim.Adam(net.parameters(),
                               lr=pargs.start_lr,
                               eps=pargs.adam_eps,
                               weight_decay=pargs.weight_decay)
    elif pargs.optimizer == "AdamW":
        optimizer = optim.AdamW(net.parameters(),
                                lr=pargs.start_lr,
                                eps=pargs.adam_eps,
                                weight_decay=pargs.weight_decay)
    elif have_apex and (pargs.optimizer == "LAMB"):
        optimizer = aoptim.FusedLAMB(net.parameters(),
                                     lr=pargs.start_lr,
                                     eps=pargs.adam_eps,
                                     weight_decay=pargs.weight_decay)
    else:
        raise NotImplementedError("Error, optimizer {} not supported".format(
            pargs.optimizer))

    if have_apex:
        #wrap model and opt into amp
        net, optimizer = amp.initialize(net,
                                        optimizer,
                                        opt_level=pargs.amp_opt_level)

    #make model distributed
    net = DDP(net)

    #restart from checkpoint if desired
    #if (comm_rank == 0) and (pargs.checkpoint):
    #load it on all ranks for now
    if pargs.checkpoint:
        checkpoint = torch.load(pargs.checkpoint, map_location=device)
        start_step = checkpoint['step']
        start_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer'])
        net.load_state_dict(checkpoint['model'])
        if have_apex:
            amp.load_state_dict(checkpoint['amp'])
    else:
        start_step = 0
        start_epoch = 0

    #select scheduler
    if pargs.lr_schedule:
        scheduler_after = ph.get_lr_schedule(pargs.start_lr,
                                             pargs.lr_schedule,
                                             optimizer,
                                             last_step=start_step)

        # LR warmup
        if pargs.lr_warmup_steps > 0:
            if have_warmup_scheduler:
                scheduler = GradualWarmupScheduler(
                    optimizer,
                    multiplier=pargs.lr_warmup_factor,
                    total_epoch=pargs.lr_warmup_steps,
                    after_scheduler=scheduler_after)
            # Throw an error if the package is not found
            else:
                raise Exception(
                    f'Requested {pargs.lr_warmup_steps} LR warmup steps '
                    'but warmup scheduler not found. Install it from '
                    'https://github.com/ildoonet/pytorch-gradual-warmup-lr')
        else:
            scheduler = scheduler_after
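        # note: in the training loop below, scheduler.step() is called once per
        # optimizer step, so lr_warmup_steps counts iterations rather than epochs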

    #broadcast the starting step and epoch to all ranks
    steptens = torch.tensor(np.array([start_step, start_epoch]),
                            requires_grad=False).to(device)
    dist.broadcast(steptens, src=0)

    ##broadcast model and optimizer state
    #hvd.broadcast_parameters(net.state_dict(), root_rank = 0)
    #hvd.broadcast_optimizer_state(optimizer, root_rank = 0)

    #unpack the bcasted tensor
    start_step = steptens.cpu().numpy()[0]
    start_epoch = steptens.cpu().numpy()[1]

    # Set up the data feeder
    # train
    train_dir = os.path.join(root_dir, "train")
    train_set = cam.CamDataset(train_dir,
                               statsfile=os.path.join(root_dir, 'stats.h5'),
                               channels=pargs.channels,
                               allow_uneven_distribution=False,
                               shuffle=True,
                               preprocess=True,
                               comm_size=comm_size,
                               comm_rank=comm_rank)
    train_loader = DataLoader(
        train_set,
        pargs.local_batch_size,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        pin_memory=True,
        drop_last=True)

    # validation: we only want to shuffle the set if we are cutting off validation after a certain number of steps
    validation_dir = os.path.join(root_dir, "validation")
    validation_set = cam.CamDataset(validation_dir,
                                    statsfile=os.path.join(
                                        root_dir, 'stats.h5'),
                                    channels=pargs.channels,
                                    allow_uneven_distribution=True,
                                    shuffle=(pargs.max_validation_steps
                                             is not None),
                                    preprocess=True,
                                    comm_size=comm_size,
                                    comm_rank=comm_rank)
    # use batch size = 1 here to make sure that we do not drop a sample
    validation_loader = DataLoader(
        validation_set,
        1,
        num_workers=min([pargs.max_inter_threads, pargs.local_batch_size]),
        pin_memory=True,
        drop_last=True)

    # log size of datasets
    logger.log_event(key="train_samples", value=train_set.global_size)
    if pargs.max_validation_steps is not None:
        val_size = min([
            validation_set.global_size,
            pargs.max_validation_steps * pargs.local_batch_size * comm_size
        ])
    else:
        val_size = validation_set.global_size
    logger.log_event(key="eval_samples", value=val_size)

    # do sanity check
    if pargs.max_validation_steps is not None:
        logger.log_event(key="invalid_submission")

    #for visualization
    #if visualize:
    #    viz = vizc.CamVisualizer()

    # Train network
    if have_wandb and (comm_rank == 0):
        wandb.watch(net)

    step = start_step
    epoch = start_epoch
    current_lr = pargs.start_lr if not pargs.lr_schedule else scheduler.get_last_lr(
    )[0]
    stop_training = False
    net.train()

    # start training
    logger.log_end(key="init_stop", sync=True)
    logger.log_start(key="run_start", sync=True)

    # training loop
    while True:

        # start epoch
        logger.log_start(key="epoch_start",
                         metadata={
                             'epoch_num': epoch + 1,
                             'step_num': step
                         },
                         sync=True)

        # epoch loop
        for inputs, label, filename in train_loader:

            # send to device
            inputs = inputs.to(device)
            label = label.to(device)

            # forward pass
            outputs = net.forward(inputs)

            # Compute loss and average across nodes
            loss = criterion(outputs,
                             label,
                             weight=class_weights,
                             fpw_1=fpw_1,
                             fpw_2=fpw_2)

            # Backprop
            optimizer.zero_grad()
            if have_apex:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            # step counter
            step += 1

            if pargs.lr_schedule:
                current_lr = scheduler.get_last_lr()[0]
                scheduler.step()

            #visualize if requested
            #if (step % pargs.training_visualization_frequency == 0) and (comm_rank == 0):
            #    # Compute predictions
            #    predictions = torch.max(outputs, 1)[1]
            #
            #    # extract sample id and data tensors
            #    sample_idx = np.random.randint(low=0, high=label.shape[0])
            #    plot_input = inputs.detach()[sample_idx, 0,...].cpu().numpy()
            #    plot_prediction = predictions.detach()[sample_idx,...].cpu().numpy()
            #    plot_label = label.detach()[sample_idx,...].cpu().numpy()
            #
            #    # create filenames
            #    outputfile = os.path.basename(filename[sample_idx]).replace("data-", "training-").replace(".h5", ".png")
            #    outputfile = os.path.join(plot_dir, outputfile)
            #
            #    # plot
            #    viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label)
            #
            #    #log if requested
            #    if have_wandb:
            #        img = Image.open(outputfile)
            #        wandb.log({"train_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step)

            #log if requested
            if (step % pargs.logging_frequency == 0):

                # allreduce for loss
                loss_avg = loss.detach()
                dist.reduce(loss_avg, dst=0, op=dist.ReduceOp.SUM)
                loss_avg_train = loss_avg.item() / float(comm_size)

                # Compute score
                predictions = torch.max(outputs, 1)[1]
                iou = utils.compute_score(predictions,
                                          label,
                                          device_id=device,
                                          num_classes=3)
                iou_avg = iou.detach()
                dist.reduce(iou_avg, dst=0, op=dist.ReduceOp.SUM)
                iou_avg_train = iou_avg.item() / float(comm_size)

                logger.log_event(key="learning_rate",
                                 value=current_lr,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="train_accuracy",
                                 value=iou_avg_train,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="train_loss",
                                 value=loss_avg_train,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })

                if have_wandb and (comm_rank == 0):
                    wandb.log(
                        {"train_loss": loss_avg.item() / float(comm_size)},
                        step=step)
                    wandb.log(
                        {"train_accuracy": iou_avg.item() / float(comm_size)},
                        step=step)
                    wandb.log({"learning_rate": current_lr}, step=step)
                    wandb.log({"epoch": epoch + 1}, step=step)

            # validation step if desired
            if (step % pargs.validation_frequency == 0):

                logger.log_start(key="eval_start",
                                 metadata={'epoch_num': epoch + 1})

                #eval
                net.eval()

                count_sum_val = torch.Tensor([0.]).to(device)
                loss_sum_val = torch.Tensor([0.]).to(device)
                iou_sum_val = torch.Tensor([0.]).to(device)

                # disable gradients
                with torch.no_grad():

                    # iterate over validation sample
                    step_val = 0
                    # only print once per eval at most
                    visualized = False
                    for inputs_val, label_val, filename_val in validation_loader:

                        #send to device
                        inputs_val = inputs_val.to(device)
                        label_val = label_val.to(device)

                        # forward pass
                        outputs_val = net.forward(inputs_val)

                        # Compute loss and average across nodes
                        loss_val = criterion(outputs_val,
                                             label_val,
                                             weight=class_weights,
                                             fpw_1=fpw_1,
                                             fpw_2=fpw_2)
                        loss_sum_val += loss_val

                        #increase counter
                        count_sum_val += 1.

                        # Compute score
                        predictions_val = torch.max(outputs_val, 1)[1]
                        iou_val = utils.compute_score(predictions_val,
                                                      label_val,
                                                      device_id=device,
                                                      num_classes=3)
                        iou_sum_val += iou_val

                        # Visualize
                        #if (step_val % pargs.validation_visualization_frequency == 0) and (not visualized) and (comm_rank == 0):
                        #    #extract sample id and data tensors
                        #    sample_idx = np.random.randint(low=0, high=label_val.shape[0])
                        #    plot_input = inputs_val.detach()[sample_idx, 0,...].cpu().numpy()
                        #    plot_prediction = predictions_val.detach()[sample_idx,...].cpu().numpy()
                        #    plot_label = label_val.detach()[sample_idx,...].cpu().numpy()
                        #
                        #    #create filenames
                        #    outputfile = os.path.basename(filename[sample_idx]).replace("data-", "validation-").replace(".h5", ".png")
                        #    outputfile = os.path.join(plot_dir, outputfile)
                        #
                        #    #plot
                        #    viz.plot(filename[sample_idx], outputfile, plot_input, plot_prediction, plot_label)
                        #    visualized = True
                        #
                        #    #log if requested
                        #    if have_wandb:
                        #        img = Image.open(outputfile)
                        #        wandb.log({"eval_examples": [wandb.Image(img, caption="Prediction vs. Ground Truth")]}, step = step)

                        #increase eval step counter
                        step_val += 1

                        if (pargs.max_validation_steps is not None
                            ) and step_val > pargs.max_validation_steps:
                            break

                # average the validation loss
                dist.all_reduce(count_sum_val, op=dist.ReduceOp.SUM)
                dist.all_reduce(loss_sum_val, op=dist.ReduceOp.SUM)
                dist.all_reduce(iou_sum_val, op=dist.ReduceOp.SUM)
                loss_avg_val = loss_sum_val.item() / count_sum_val.item()
                iou_avg_val = iou_sum_val.item() / count_sum_val.item()

                # print results
                logger.log_event(key="eval_accuracy",
                                 value=iou_avg_val,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })
                logger.log_event(key="eval_loss",
                                 value=loss_avg_val,
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 })

                # log in wandb
                if have_wandb and (comm_rank == 0):
                    wandb.log({"eval_loss": loss_avg_val}, step=step)
                    wandb.log({"eval_accuracy": iou_avg_val}, step=step)

                if (iou_avg_val >= pargs.target_iou):
                    logger.log_event(key="target_accuracy_reached",
                                     value=pargs.target_iou,
                                     metadata={
                                         'epoch_num': epoch + 1,
                                         'step_num': step
                                     })
                    stop_training = True

                # set to train
                net.train()

                logger.log_end(key="eval_stop",
                               metadata={'epoch_num': epoch + 1})

            #save model if desired
            if (pargs.save_frequency > 0) and (step % pargs.save_frequency
                                               == 0):
                logger.log_start(key="save_start",
                                 metadata={
                                     'epoch_num': epoch + 1,
                                     'step_num': step
                                 },
                                 sync=True)
                if comm_rank == 0:
                    checkpoint = {
                        'step': step,
                        'epoch': epoch,
                        'model': net.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }
                    if have_apex:
                        checkpoint['amp'] = amp.state_dict()
                    torch.save(
                        checkpoint,
                        os.path.join(
                            output_dir, pargs.model_prefix + "_step_" +
                            str(step) + ".cpt"))
                logger.log_end(key="save_stop",
                               metadata={
                                   'epoch_num': epoch + 1,
                                   'step_num': step
                               },
                               sync=True)

            # Stop training?
            if stop_training:
                break

        # log the epoch
        logger.log_end(key="epoch_stop",
                       metadata={
                           'epoch_num': epoch + 1,
                           'step_num': step
                       },
                       sync=True)
        epoch += 1

        # are we done?
        if epoch >= pargs.max_epochs or stop_training:
            break

    # run done
    logger.log_end(key="run_stop", sync=True, metadata={'status': 'success'})
Ejemplo n.º 24
0
def train(args, train_dataloader, val_dataloader, test_dataloader, criterion):
    model = LSTMCrossCycleGCNDropout(args.voc_len,
                                     args.rnn_layers,
                                     args.birnn,
                                     'gru',
                                     args.word_matrix,
                                     args.resnet_input_size,
                                     args.c3d_input_size,
                                     args.rnn_layers,
                                     args.birnn,
                                     'gru',
                                     args.hidden_size,
                                     dropout_p=args.dropout,
                                     gcn_layers=args.gcn_layers,
                                     num_heads=8,
                                     answer_vocab_size=args.answer_vocab_size,
                                     q_max_len=args.q_max_length,
                                     v_max_len=args.v_max_length,
                                     tf_layers=args.tf_layers,
                                     two_loss=args.two_loss,
                                     fusion_type=args.fusion_type,
                                     ablation=args.ablation)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    model.to(device)

    if args.change_lr == 'none':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.change_lr == 'acc':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr / 5.,
                                     weight_decay=args.weight_decay)
        # val plateau scheduler
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode='max',
                                                               factor=0.1,
                                                               patience=3,
                                                               verbose=True)
        # target lr = args.lr * multiplier
        scheduler_warmup = GradualWarmupScheduler(optimizer,
                                                  multiplier=5,
                                                  total_epoch=5,
                                                  after_scheduler=scheduler)
    elif args.change_lr == 'loss':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr / 5.,
                                     weight_decay=args.weight_decay)
        # val plateau scheduler
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                               mode='min',
                                                               factor=0.1,
                                                               patience=3,
                                                               verbose=True)
        # target lr = args.lr * multiplier
        scheduler_warmup = GradualWarmupScheduler(optimizer,
                                                  multiplier=5,
                                                  total_epoch=5,
                                                  after_scheduler=scheduler)
    elif args.change_lr == 'cos':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr / 5.,
                                     weight_decay=args.weight_decay)
        # consine annealing
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.max_epoch)
        # target lr = args.lr * multiplier
        scheduler_warmup = GradualWarmupScheduler(optimizer,
                                                  multiplier=5,
                                                  total_epoch=5,
                                                  after_scheduler=scheduler)
    elif args.change_lr == 'step':
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.lr_list, gamma=0.1)
        # scheduler_warmup = GradualWarmupScheduler(
        #     optimizer, multiplier=5, total_epoch=5, after_scheduler=scheduler)

    best_val_acc = 0. if args.task != 'Count' else -100.

    for epoch in range(args.max_epoch):
        print('Start Training Epoch: {}'.format(epoch))

        model.train()

        loss_list = []
        prediction_list = []
        correct_answer_list = []

        if args.change_lr == 'cos':
            # consine annealing
            scheduler_warmup.step(epoch=epoch)

        for ii, data in enumerate(train_dataloader):

            if epoch == 0 and ii == 0:
                print([d.dtype for d in data], [d.size() for d in data])
            # print([d.dtype for d in data], [d.size() for d in data])
            data = [d.to(device) for d in data]

            optimizer.zero_grad()
            out, predictions, answers, _ = model(args.task, *data)
            loss = criterion(out, answers)
            loss.backward()
            optimizer.step()

            correct_answer_list.append(answers)
            loss_list.append(loss.item())
            prediction_list.append(predictions.detach())
            if ii % 100 == 0:
                print("Batch: ", ii)

        train_loss = np.mean(loss_list)
        correct_answer = torch.cat(correct_answer_list, dim=0).long()
        predict_answer = torch.cat(prediction_list, dim=0).long()
        assert correct_answer.shape == predict_answer.shape

        current_num = torch.sum(predict_answer == correct_answer).cpu().numpy()
        acc = current_num / len(correct_answer) * 100.

        # print('Learning Rate: {}'.format(optimizer.param_groups[0]['lr']))
        if args.change_lr == 'step':
            scheduler.step()

        print("Train|Epoch: {}, Acc : {:.3f}={}/{}, Train Loss: {:.3f}".format(
            epoch, acc, current_num, len(correct_answer), train_loss))
        if args.task == 'Count':
            count_loss = F.mse_loss(predict_answer.float(),
                                    correct_answer.float())
            print('Train|Count Real Loss:\t {:.3f}'.format(count_loss))

        val_acc, val_loss = val(args, model, val_dataloader, epoch, criterion)

        # the plateau-based schedulers step on this epoch's validation metric
        if args.change_lr == 'acc':
            scheduler_warmup.step(epoch, val_acc)
        elif args.change_lr == 'loss':
            scheduler_warmup.step(epoch, val_loss)

        if val_acc > best_val_acc:
            print('Best Val Acc ======')
            best_val_acc = val_acc
        if epoch % args.val_epoch_step == 0 or val_acc >= best_val_acc:
            test(args, model, test_dataloader, epoch, criterion)
Ejemplo n.º 25
0
lr = 0.001
optim = torch.optim.SGD([v], lr=lr)
optim.param_groups[0]['initial_lr'] = lr

last_epoch = -1
scheduler = lr_scheduler.MultiStepLR(optim,
                                     milestones=[4],
                                     gamma=0.1,
                                     last_epoch=-1)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=10, eta_min=0.00001, last_epoch=-1)
# scheduler = lr_scheduler.OneCycleLR(optim, max_lr=0.001, total_steps=6000, pct_start=0.033, anneal_strategy='cos', last_epoch=last_epoch)

warmup = True
if warmup:
    scheduler = GradualWarmupScheduler(optim,
                                       multiplier=5,
                                       total_epoch=5,
                                       after_scheduler=scheduler)

# if last_epoch != -1:
#     scheduler.step()

lrs = []
for epoch in range(last_epoch + 1, 30):
    print(epoch, optim.param_groups[0]['lr'])
    lrs.append(optim.param_groups[0]['lr'])

    scheduler.step()

plt.plot(lrs)
plt.show()
Ejemplo n.º 26
0
            x, y_a, y_b, lam = cutmix_data(x, y, config.cutmix_beta)
            pred = model(x)
            loss = criterion(pred, y_a * lam + y_b * (1 - lam))
        else:
            pred = model(speech)
            loss = criterion(pred, speech_label)
        if config.amp:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item() / len(train_loader)
        scheduler.step(step)
        step += 1
        progress_bar.set_description(
            'Step: {}. LR : {:.5f}. Epoch: {}/{}. Iteration: {}/{}. current loss: {:.5f}'
            .format(step, optimizer.param_groups[0]['lr'], epoch,
                    config.n_epoch, idx + 1, len(train_loader), loss.item()))

    valid_loss = 0
    valid_acc = 0
    model.eval()
    for idx, data in enumerate(tqdm(valid_loader)):
        x = data['x'].cuda()
        y = data['y'].cuda()
        with torch.no_grad():
            pred = model(x)
            loss = criterion(pred, y)
Ejemplo n.º 27
0
import torch

from warmup_scheduler import GradualWarmupScheduler


if __name__ == '__main__':
    v = torch.zeros(10)
    optim = torch.optim.SGD([v], lr=0.01)
    scheduler = GradualWarmupScheduler(optim, multiplier=8, total_epoch=10)
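    # with no after_scheduler the LR ramps from 0.01 up to 0.08 (multiplier=8)
    # over 10 epochs and then stays constant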

    for epoch in range(1, 20):
        scheduler.step(epoch)

        print(epoch, optim.param_groups[0]['lr'])
Ejemplo n.º 28
0
import torch
from torch.optim.lr_scheduler import StepLR
from torch.optim.sgd import SGD

from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    model = [torch.nn.Parameter(torch.randn(2, 2, requires_grad=True))]
    optim = SGD(model, 0.1)

    # scheduler_warmup is chained with scheduler_steplr
    scheduler_steplr = StepLR(optim, step_size=10, gamma=0.1)
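    # multiplier=1 ramps the LR from 0 up to the base LR over 5 epochs, after which StepLR takes over.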
    scheduler_warmup = GradualWarmupScheduler(optim,
                                              multiplier=1,
                                              total_epoch=5,
                                              after_scheduler=scheduler_steplr)

    # this zero gradient update is needed to avoid a warning message, issue #8.
    optim.zero_grad()
    optim.step()

    for epoch in range(1, 20):
        scheduler_warmup.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])

        optim.step()  # optimizer update (a real training loop would call loss.backward() first)
Ejemplo n.º 29
0
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=warmup_factor,
                                   total_epoch=warmup_epo,
                                   after_scheduler=scheduler_cosine)

# optimizer = Radam.Over9000(model.parameters(), lr = init_lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = torch.nn.DataParallel(model,
                              device_ids=list(range(len(gpus.split(",")))))

qwk_max = 0.
for epoch in range(1, n_epochs + 1):
    printOut(time.ctime(), 'Epoch:', epoch)
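    # Epochs are 1-based here, so step the scheduler with epoch - 1 to keep it 0-indexed.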
    scheduler.step(epoch - 1)

    train_loss = train_epoch(train_loader, optimizer)
    val_loss, acc, qwk = val_epoch(valid_loader, epoch == n_epochs)

    content = time.ctime() + ' ' + (
        f'Epoch {epoch}, lr: {optimizer.param_groups[0]["lr"]:.7f}, '
        f'train loss: {np.mean(train_loss):.5f}, val loss: {np.mean(val_loss):.5f}, '
        f'acc: {acc:.5f}, qwk: {qwk:.5f}')
    printOut(content)

    if qwk > qwk_max:
        printOut('score2 ({:.6f} --> {:.6f}).  Saving model ...'.format(
            qwk_max, qwk))
        torch.save(model.module.state_dict(), modelpath)
        qwk_max = qwk

torch.save(model.module.state_dict(),
Ejemplo n.º 30
0
def main(opt):
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not opt.not_cuda_benchmark and not opt.test
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = torch.device('cuda' if opt.gpus[0] >= 0 else 'cpu')

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    opt.lr = 5e-3
    optimizer = torch.optim.Adam(model.parameters(), opt.lr, weight_decay=0)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                                  T_max=310,
                                                                  eta_min=0)
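    # multiplier=1: ramp the LR from 0 to opt.lr over the first 10 epochs, then follow cosine annealing.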
    scheduler_warmup = GradualWarmupScheduler(
        optimizer,
        multiplier=1,
        total_epoch=10,
        after_scheduler=scheduler_cosine)
    start_epoch = 0
    if opt.load_model != '':
        model, optimizer, start_epoch = load_model(model, opt.load_model,
                                                   optimizer, opt.resume,
                                                   opt.lr, opt.lr_step)

    Trainer = train_factory[opt.task]
    trainer = Trainer(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(Dataset(opt, 'val'),
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=1,
                                             pin_memory=True)

    if opt.test:
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(Dataset(opt, 'train'),
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True,
                                               drop_last=True)

    print('Starting training...')
    best = 1e10
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))
        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(mark)),
                       epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
            for k, v in log_dict_val.items():
                logger.scalar_summary('val_{}'.format(k), v, epoch)
                logger.write('{} {:8f} | '.format(k, v))
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                save_model(os.path.join(opt.save_dir, 'model_best.pth'), epoch,
                           model)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pth'), epoch,
                       model, optimizer)
        logger.write('\n')
        scheduler_warmup.step(epoch)
        # if epoch in opt.lr_step:
        #  save_model(os.path.join(opt.save_dir, 'model_{}.pth'.format(epoch)),
        #             epoch, model, optimizer)
        #  lr = opt.lr * (0.1 ** (opt.lr_step.index(epoch) + 1))
        #  print('Drop LR to', lr)
        #  for param_group in optimizer.param_groups:
        #      param_group['lr'] = lr
    logger.close()