Example #1
    if opt.TRAIN.OPTIMIZER == 'Adam':
        optimizer = optim.Adam(model.parameters(), opt.TRAIN.LEARNING_RATE)
    elif opt.TRAIN.OPTIMIZER == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              opt.TRAIN.LEARNING_RATE,
                              momentum=0.9,
                              nesterov=True)
    else:
        raise RuntimeError(f'unknown optimizer: {opt.TRAIN.OPTIMIZER}')

    if opt.TRAIN.COSINE.ENABLE:
        set_lr(optimizer, opt.TRAIN.COSINE.LR)
        lr_scheduler = CosineLRWithRestarts(
            optimizer,
            opt.TRAIN.BATCH_SIZE,
            opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
            restart_period=opt.TRAIN.COSINE.PERIOD,
            t_mult=opt.TRAIN.COSINE.COEFF)
    else:
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            patience=opt.TRAIN.PATIENCE,
            factor=opt.TRAIN.LR_REDUCE_FACTOR,
            verbose=True,
            min_lr=opt.TRAIN.MIN_LR,
            threshold=opt.TRAIN.MIN_IMPROVEMENT,
            threshold_mode='abs')

    if args.pretrained is None:
        last_epoch = 0
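
Example #1 wires up the optimizer and LR scheduler from a config object: cosine annealing with warm restarts when COSINE.ENABLE is set, otherwise ReduceLROnPlateau keyed on a validation metric. It relies on a set_lr helper that is not shown here; a minimal sketch of that helper (and of the read_lr counterpart used in later examples), assuming standard PyTorch parameter groups:

def set_lr(optimizer, lr: float) -> None:
    # overwrite the learning rate of every parameter group
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def read_lr(optimizer) -> float:
    # report the current learning rate (first parameter group)
    return optimizer.param_groups[0]['lr']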
Example #2
def train_model(params: Dict[str, Any]) -> float:
    np.random.seed(0)
    model_dir = opt.EXPERIMENT_DIR

    logger.info('=' * 50)
    logger.info(f'hyperparameters: {params}')

    train_loader, val_loader, test_loader = load_data(args.fold, params)
    model = create_model(args.predict, float(params['dropout']))
    # freeze_layers(model)

    # if torch.cuda.device_count() == 1:
    #     torchsummary.summary(model, (3, 224, 224))

    if opt.TRAIN.OPTIMIZER == 'Adam':
        optimizer = optim.Adam(model.parameters(), opt.TRAIN.LEARNING_RATE)
    elif opt.TRAIN.OPTIMIZER == 'SGD':
        optimizer = optim.SGD(model.parameters(),
                              opt.TRAIN.LEARNING_RATE,
                              momentum=0.9,
                              nesterov=True)
    else:
        raise RuntimeError(f'unknown optimizer: {opt.TRAIN.OPTIMIZER}')

    if opt.TRAIN.COSINE.ENABLE:
        set_lr(optimizer, opt.TRAIN.COSINE.LR)
        lr_scheduler = CosineLRWithRestarts(
            optimizer,
            opt.TRAIN.BATCH_SIZE,
            opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
            restart_period=opt.TRAIN.COSINE.PERIOD,
            t_mult=opt.TRAIN.COSINE.COEFF)
    else:
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='max',
            patience=opt.TRAIN.PATIENCE,
            factor=opt.TRAIN.LR_REDUCE_FACTOR,
            verbose=True,
            min_lr=opt.TRAIN.MIN_LR,
            threshold=opt.TRAIN.MIN_IMPROVEMENT,
            threshold_mode='abs')

    if args.weights is None:
        last_epoch = 0
        logger.info(f'training will start from epoch {last_epoch+1}')
    else:
        last_checkpoint = torch.load(args.weights)
        assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
        model.load_state_dict(last_checkpoint['state_dict'])
        optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint {args.weights} was loaded.')

        last_epoch = last_checkpoint['epoch']
        logger.info(f'loaded the model from epoch {last_epoch}')
        set_lr(optimizer, opt.TRAIN.LEARNING_RATE)

    if args.predict:
        print('inference mode')
        generate_submission(val_loader, test_loader, model, last_epoch,
                            args.weights)
        sys.exit(0)

    if opt.TRAIN.LOSS == 'BCE':
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise RuntimeError('unknown loss specified')

    best_score = 0.0
    best_epoch = 0

    last_lr = read_lr(optimizer)
    best_model_path = None

    for epoch in range(last_epoch + 1, opt.TRAIN.EPOCHS + 1):
        logger.info('-' * 50)

        if not opt.TRAIN.COSINE.ENABLE:
            lr = read_lr(optimizer)
            if lr < last_lr - 1e-10 and best_model_path is not None:
                # reload the best model
                last_checkpoint = torch.load(
                    os.path.join(model_dir, best_model_path))
                assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint {best_model_path} was loaded.')
                set_lr(optimizer, lr)
                last_lr = lr

            if lr < opt.TRAIN.MIN_LR * 1.01:
                logger.info('reached minimum LR, stopping')
                break

                # logger.info(f'lr={lr}, start cosine annealing!')
                # set_lr(optimizer, opt.TRAIN.COSINE.LR)
                # opt.TRAIN.COSINE.ENABLE = True
                #
                # lr_scheduler = CosineLRWithRestarts(optimizer, opt.TRAIN.BATCH_SIZE,
                #     opt.TRAIN.BATCH_SIZE * opt.TRAIN.STEPS_PER_EPOCH,
                #     restart_period=opt.TRAIN.COSINE.PERIOD, t_mult=opt.TRAIN.COSINE.COEFF)

        if opt.TRAIN.COSINE.ENABLE:
            lr_scheduler.step()

        read_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, lr_scheduler)
        score, _ = validate(val_loader, model, epoch)

        if not opt.TRAIN.COSINE.ENABLE:
            lr_scheduler.step(score)  # type: ignore

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        data_to_save = {
            'epoch': epoch,
            'arch': opt.MODEL.ARCH,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'score': score,
            'optimizer': optimizer.state_dict(),
            'options': opt
        }

        filename = opt.MODEL.VERSION
        if is_best:
            best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth'
            save_checkpoint(data_to_save, best_model_path, model_dir)

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
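
Example #2 is a complete training driver around the same setup: it reloads the best checkpoint whenever ReduceLROnPlateau drops the LR, stops at the minimum LR, and snapshots the model on every new best score. The save_checkpoint helper it calls is not shown; a plausible minimal version, assuming it simply serializes the state dict into the experiment directory:

import os
import torch

def save_checkpoint(state: dict, filename: str, model_dir: str) -> None:
    # the dict carries epoch, arch, weights and optimizer state:
    # everything the resume branch reads back via torch.load()
    torch.save(state, os.path.join(model_dir, filename))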
Example #3
def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)
    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size, config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    if not args.cosine:
        lr_scheduler = get_scheduler(
            config.scheduler, optimizer,
            last_epoch=(last_epoch if config.scheduler.name != 'cyclic_lr' else -1))
        # note: this assert forces the second scheduler off, so the
        # conditional below always yields None
        assert config.scheduler2.name == ''
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer, last_epoch=last_epoch) \
                        if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(optimizer,
                                            batch_size=config.train.batch_size,
                                            epoch_size=epoch_size,
                                            restart_period=config.cosine.period,
                                            period_inc=config.cosine.period_inc,
                                            max_period=config.cosine.max_period)
        lr_scheduler2 = None

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)
                last_lr = lr

        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer,
                                         coeff=total_coeff, last_epoch=-1)
                                         # (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if isinstance(lr_scheduler, ReduceLROnPlateau):
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if isinstance(lr_scheduler2, ReduceLROnPlateau):
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch
            best_model_path = os.path.join(model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
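
Example #3 generalizes the loop to configurable schedulers and tells per-batch ("continuous") schedulers apart from per-epoch ones via is_scheduler_continuous. That helper is not shown; a plausible sketch, assuming "continuous" means the scheduler is stepped inside train_epoch on every batch:

from torch.optim.lr_scheduler import CyclicLR

def is_scheduler_continuous(scheduler) -> bool:
    # per-batch schedulers are advanced inside train_epoch(), not per epoch
    return isinstance(scheduler, (CosineLRWithRestarts, CyclicLR))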
Example #4
# create model
logger.info(f"using pre-trained model {opt.MODEL.ARCH}")
model = cbam_resnet50(num_classes=DATA_INFO.NUM_CLASSES)
print(model)

model = torch.nn.DataParallel(model).cuda()

if torch.cuda.device_count() == 1:
    torchsummary.summary(model,
                         (3, opt.MODEL.INPUT_SIZE, opt.MODEL.INPUT_SIZE))

optimizer = optim.Adam(model.module.parameters(), opt.TRAIN.LEARNING_RATE)
lr_scheduler = CosineLRWithRestarts(optimizer,
                                    opt.TRAIN.BATCH_SIZE,
                                    opt.TRAIN.BATCH_SIZE *
                                    opt.TRAIN.STEPS_PER_EPOCH,
                                    restart_period=opt.TRAIN.COSINE.PERIOD,
                                    t_mult=opt.TRAIN.COSINE.COEFF,
                                    min_lr=1e-6)

if opt.TRAIN.RESUME is None:
    last_epoch = 0
    logger.info(f"Training will start from epoch {last_epoch+1}")

else:
    last_checkpoint = torch.load(opt.TRAIN.RESUME)
    assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
    model.module.load_state_dict(last_checkpoint['state_dict'])
    optimizer.load_state_dict(last_checkpoint['optimizer'])
    logger.info(f"Checkpoint {opt.TRAIN.RESUME} was loaded.")
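
Example #4 builds the scheduler but elides the training loop. The CosineLRWithRestarts implementation used across these snippets is conventionally advanced once per epoch (step() in Example #2, epoch_step() in Example #3, depending on the version) and once per batch via batch_step(). A sketch of that loop, with train_one_batch as a hypothetical helper:

for epoch in range(last_epoch + 1, opt.TRAIN.EPOCHS + 1):
    lr_scheduler.step()  # advance the restart schedule
    for images, targets in train_loader:
        optimizer.zero_grad()
        loss = train_one_batch(model, images, targets)  # hypothetical helper
        loss.backward()
        optimizer.step()
        lr_scheduler.batch_step()  # anneal the LR within the epoch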
Example #5
    Path(opt.out_path).mkdir(parents=True, exist_ok=True)

    train_set = HalfHalfDataset(opt.real_path, opt.syn_path, opt.params_path,
                                opt.blend, opt.channels, opt.split)
    train_loader = DataLoader(dataset=train_set, num_workers=opt.threads,
                              batch_size=opt.batch_size, shuffle=True,
                              pin_memory=True)

    val_set = RealDataset(opt.real_path, opt.channels, split='val')
    val_loader = DataLoader(dataset=val_set, num_workers=0, batch_size=1, shuffle=False)

    test_set = RealDataset(opt.real_path, opt.channels, split='test')
    test_loader = DataLoader(dataset=test_set, num_workers=0, batch_size=1, shuffle=False)

    opt.n_classes = train_set.n_classes
    net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
    net = net.cuda()
    optimizer = AdamW([{'params': get_1x_lr_params(net)},
                       {'params': get_10x_lr_params(net), 'lr': opt.lr * 10}],
                      lr=opt.lr, weight_decay=opt.decay)
    scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set), opt.period, opt.t_mult)
    vis = Visualizer(server=opt.server, env=opt.env)
    start_epoch = 0
    if opt.resume is not None:
        checkpoint = torch.load(opt.resume)
        old_opt = checkpoint['opt']
        assert old_opt.channels == opt.channels
        assert old_opt.bands == opt.bands
        assert old_opt.arch == opt.arch
        assert old_opt.blend == opt.blend
        assert old_opt.lr == opt.lr
        assert old_opt.decay == opt.decay
        assert old_opt.period == opt.period
        assert old_opt.t_mult == opt.t_mult
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
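
Example #5 trains PowderNet with AdamW and two parameter groups: the base LR for the backbone and 10x for the head, via get_1x_lr_params / get_10x_lr_params. Those helpers are not shown; a minimal sketch of the usual pattern, where the backbone/classifier attribute names are assumptions:

def get_1x_lr_params(net):
    # pretrained backbone: fine-tune at the base learning rate
    return net.backbone.parameters()  # assumed attribute name

def get_10x_lr_params(net):
    # freshly initialized head: train ten times faster
    return net.classifier.parameters()  # assumed attribute name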
Example #6

# assumed model creation: a `pretrainedmodels`-style constructor, matching
# the `last_linear` attribute used below
model = pretrainedmodels.__dict__[opt.MODEL.ARCH](num_classes=1000,
                                                  pretrained='imagenet')

assert (opt.MODEL.INPUT_SIZE % 32 == 0)
model.avgpool = nn.AvgPool2d(opt.MODEL.INPUT_SIZE // 32, stride=1)
model.last_linear = nn.Linear(model.last_linear.in_features,
                              DATA_INFO.NUM_CLASSES)
model = torch.nn.DataParallel(model).cuda()

if torch.cuda.device_count() == 1:
    torchsummary.summary(model,
                         (3, opt.MODEL.INPUT_SIZE, opt.MODEL.INPUT_SIZE))

optimizer = optim.Adam(model.module.parameters(), opt.TRAIN.LEARNING_RATE)
lr_scheduler = CosineLRWithRestarts(optimizer,
                                    opt.TRAIN.BATCH_SIZE,
                                    opt.TRAIN.BATCH_SIZE *
                                    opt.TRAIN.STEPS_PER_EPOCH,
                                    restart_period=50,
                                    t_mult=1.2)

if opt.TRAIN.RESUME is None:
    last_epoch = 0
    logger.info(f"Training will start from epoch {last_epoch+1}")

else:
    last_checkpoint = torch.load(opt.TRAIN.RESUME)
    assert (last_checkpoint['arch'] == opt.MODEL.ARCH)
    model.module.load_state_dict(last_checkpoint['state_dict'])
    optimizer.load_state_dict(last_checkpoint['optimizer'])
    logger.info(f"Checkpoint {opt.TRAIN.RESUME} was loaded.")

    last_epoch = last_checkpoint['epoch']
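
Example #6 replaces the pooling and classifier layers by hand; the AvgPool2d kernel is hard-coded to INPUT_SIZE // 32, which is exactly what the divisibility assert guards. As a suggestion rather than a fix of the original, an adaptive pool lifts that restriction:

model.avgpool = nn.AdaptiveAvgPool2d(1)  # 1x1 output for any input size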
Example #7
    test_loader = DataLoader(dataset=test_set,
                             num_workers=0,
                             batch_size=1,
                             shuffle=False)

    opt.n_classes = train_set.n_classes
    net = PowderNet(opt.arch, opt.n_channels, train_set.n_classes)
    net = net.cuda()
    optimizer = AdamW([{
        'params': get_1x_lr_params(net)
    }, {
        'params': get_10x_lr_params(net),
        'lr': opt.lr * 10
    }],
                      lr=opt.lr,
                      weight_decay=opt.decay)
    scheduler = CosineLRWithRestarts(optimizer, opt.batch_size, len(train_set),
                                     opt.period, opt.t_mult)
    vis = Visualizer(server=opt.server, env=opt.env)
    start_epoch = 0
    if opt.resume is not None:
        checkpoint = torch.load(opt.resume)
        old_opt = checkpoint['opt']
        assert (old_opt.channels == opt.channels)
        assert (old_opt.bands == opt.bands)
        assert (old_opt.arch == opt.arch)
        assert (old_opt.blend == opt.blend)
        assert (old_opt.lr == opt.lr)
        assert (old_opt.decay == opt.decay)
        assert (old_opt.period == opt.period)
        assert (old_opt.t_mult == opt.t_mult)
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
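
Example #7 is the same PowderNet setup as Example #5, reformatted, and is cut off before the epoch counter is restored. A plausible continuation inside the resume branch, assuming the checkpoint stores an 'epoch' key like the checkpoints in Examples #2 and #6 do:

        start_epoch = checkpoint['epoch'] + 1  # assumed checkpoint key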