Example #1
0
def get_model(cfg):
    """Build the network named by ``cfg.model.name`` with a resized head.

    Reads ``cfg.model.name``, ``cfg.model.pretrained`` and
    ``cfg.model.n_output``.

    * ``resnext101_32x8d_wsl`` is fetched through ``torch.hub`` from
      ``facebookresearch/WSL-Images`` and its ``fc`` layer is replaced.
    * Any other name is looked up in ``pretrainedmodels``; when it is not
      found there, the name is evaluated as a Python expression. The
      resulting constructor is called with ``num_classes=1000`` and the
      model's ``avg_pool`` / ``last_linear`` layers are replaced.

    Returns:
        The model with its final linear layer producing
        ``cfg.model.n_output`` values.
    """
    log.info(f'model: {cfg.model.name}')
    log.info(f'pretrained: {cfg.model.pretrained}')

    if cfg.model.name in ['resnext101_32x8d_wsl']:
        log.info('backbone get from: facebookresearch/WSL-Images')
        model = torch.hub.load('facebookresearch/WSL-Images', cfg.model.name)
        # Read the width from the existing head instead of hard-coding it,
        # the same way the last_linear replacement below does.
        model.fc = torch.nn.Linear(model.fc.in_features, cfg.model.n_output)
        return model

    try:
        model_func = pretrainedmodels.__dict__[cfg.model.name]
    except KeyError:
        # SECURITY NOTE(review): eval() executes whatever string the config
        # holds. This is only acceptable while config files are trusted;
        # consider an explicit name -> constructor registry instead.
        model_func = eval(cfg.model.name)
    else:
        log.info('pretrained weight from: pretrainedmodels')

    model = model_func(num_classes=1000, pretrained=cfg.model.pretrained)
    model.avg_pool = nn.AdaptiveAvgPool2d(1)
    model.last_linear = nn.Linear(
        model.last_linear.in_features,
        cfg.model.n_output,
    )
    log.info(f'last layer: {cfg.model.n_output}')

    return model
Example #2
0
def get_optim(cfg, parameters):
    """Instantiate the ``torch.optim`` optimizer described by ``cfg.optim``.

    ``cfg.optim.name`` selects the optimizer class and ``cfg.optim.params``
    is expanded into its keyword arguments.
    """
    optim_cls = getattr(torch.optim, cfg.optim.name)
    optim = optim_cls(parameters, **cfg.optim.params)
    log.info(f'optim: {cfg.optim}')
    return optim
Example #3
0
def save_model(model, optim, detail, fold, dirname):
    """Write a checkpoint for ``fold`` at epoch ``detail['epoch']``.

    The file ``fold<fold>_ep<epoch>.pt`` inside ``dirname`` receives the
    model state dict, the optimizer state dict and ``detail`` itself.
    """
    filename = 'fold%d_ep%d.pt' % (fold, detail['epoch'])
    path = os.path.join(dirname, filename)
    checkpoint = {
        'model': model.state_dict(),
        'optim': optim.state_dict(),
        'detail': detail,
    }
    torch.save(checkpoint, path)
    log.info('saved model to %s' % path)
Example #4
0
def get_loss(cfg):
    """Instantiate the ``torch.nn`` loss described by ``cfg.loss``.

    ``cfg.loss.name`` selects the loss class and ``cfg.loss.params`` is
    expanded into its keyword arguments. A fixed six-element ``weight``
    tensor (first entry 2, the rest 1) is always passed, placed on
    ``cfg.device``.
    """
    loss_cls = getattr(nn, cfg.loss.name)
    class_weight = torch.FloatTensor([2, 1, 1, 1, 1, 1]).to(cfg.device)
    loss = loss_cls(weight=class_weight, **cfg.loss.params)
    log.info(f'criterion: {cfg.loss}')
    return loss
Example #5
0
def do_test(cfg, model):
    """Run test inference ``cfg.n_tta`` times and pickle the results.

    Loads weights from ``cfg.snapshot``, runs ``run_nn`` in ``'test'`` mode
    once per TTA round under ``torch.no_grad()``, and writes the list of
    per-round results to ``cfg.output``.
    """
    assert cfg.output
    load_model(cfg.snapshot, model)
    loader_test = factory.get_dataloader(cfg.data.test)

    results = []
    with torch.no_grad():
        for _ in range(cfg.n_tta):
            results.append(run_nn(cfg.data.test, 'test', model, loader_test))

    with open(cfg.output, 'wb') as out_file:
        pickle.dump(results, out_file)
    log.info('saved to %s' % cfg.output)
Example #6
0
    def __init__(self, cfg, folds):
        """Load the pickled annotations and keep the requested folds.

        Stores ``cfg`` and ``folds``, builds the transforms through
        ``factory.get_transforms``, unpickles ``cfg.annotations`` into a
        frame, keeps only rows whose ``fold`` is in ``folds`` (when
        ``folds`` is truthy) and finally applies ``cfg.dataset_policy``.
        """
        self.cfg = cfg
        self.folds = folds
        self.transforms = factory.get_transforms(self.cfg)

        with open(cfg.annotations, 'rb') as f:
            log.info(f'loaded file: {cfg.annotations}')
            frame = pickle.load(f)

        # A falsy ``folds`` (None / empty) means "use every fold".
        if folds:
            frame = frame[frame.fold.isin(folds)]

        self.df = apply_dataset_policy(frame, self.cfg.dataset_policy)
Example #7
0
def run_train():
    """Entry point: parse arguments, build the model and dispatch on mode.

    Copies the command-line arguments onto the loaded config, logs the
    configuration, seeds the RNGs, creates the working sub-directories,
    builds the model (wrapped in ``nn.DataParallel`` when more than one
    CUDA device is visible) and then runs ``do_train`` / ``do_valid`` /
    ``do_test`` according to ``cfg.mode``.

    Raises:
        SystemExit: with status 1 when ``cfg.mode`` is not one of
            ``train``, ``valid`` or ``test``.
    """
    args = get_args()
    cfg = Config.fromfile(args.config)

    # copy command line args to cfg
    cfg.mode = args.mode
    cfg.debug = args.debug
    cfg.fold = args.fold
    cfg.snapshot = args.snapshot
    cfg.output = args.output
    cfg.n_tta = args.n_tta
    cfg.gpu = args.gpu
    cfg.device = device

    # print setting
    show_config(cfg)

    # torch.cuda.set_device(cfg.gpu)
    set_seed(cfg.seed)

    # setup -------------------------------------
    for sub_dir in ['checkpoint', 'train', 'valid', 'test', 'backup']:
        os.makedirs(os.path.join(cfg.workdir, sub_dir), exist_ok=True)
    if 0:  # deliberately disabled: the project backup does not work reliably
        backup_project_as_zip(
            PROJECT_PATH,
            cfg.workdir + '/backup/code.train.%s.zip' % IDENTIFIER)

    ## model ------------------------------------
    log.info('\n')
    log.info('** model setting **')
    model = factory.get_model(cfg)

    # multi-gpu----------------------------------
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model.to(device)

    ## ------------------------------------------
    if cfg.mode == 'train':
        do_train(cfg, model)
    elif cfg.mode == 'valid':
        do_valid(cfg, model)
    elif cfg.mode == 'test':
        do_test(cfg, model)
    else:
        log.error(f"mode '{cfg.mode}' is not in [train, valid, test]")
        # Exit with a non-zero status so shells and schedulers see the
        # failure; SystemExit also avoids relying on the site-provided
        # ``exit`` helper.
        raise SystemExit(1)
Example #8
0
def apply_dataset_policy(df, policy):
    """Return ``df`` re-sampled according to ``policy``.

    Args:
        df: frame with a ``labels`` column; an empty string in that column
            marks a negative row.
        policy: ``'all'`` keeps ``df`` unchanged. ``'pos==neg'`` keeps every
            positive row and adds a random sample of negative rows of the
            same size.

    Returns:
        The resulting frame.

    Raises:
        ValueError: if ``policy`` is not one of the supported names.
    """
    if policy == 'all':
        pass
    elif policy == 'pos==neg':
        df_positive = df[df.labels != '']
        # Negatives are the rows with an empty label string.
        df_negative = df[df.labels == '']
        df_sampled = df_negative.sample(len(df_positive))
        df = pd.concat([df_positive, df_sampled], sort=False)
    else:
        raise ValueError(
            f"unknown dataset policy: {policy!r} "
            "(expected 'all' or 'pos==neg')")
    log.info(f'apply_dataset_policy: {policy}')

    return df
Example #9
0
def do_valid(cfg, model):
    """Run validation on fold ``cfg.fold`` ``cfg.n_tta`` times and pickle it.

    Builds the loss, loads weights from ``cfg.snapshot``, runs ``run_nn`` in
    ``'valid'`` mode once per TTA round under ``torch.no_grad()``, and writes
    the list of per-round results to ``cfg.output``.
    """
    assert cfg.output
    criterion = factory.get_loss(cfg)
    load_model(cfg.snapshot, model)
    loader_valid = factory.get_dataloader(cfg.data.valid, [cfg.fold])

    results = []
    with torch.no_grad():
        for _ in range(cfg.n_tta):
            round_result = run_nn(cfg.data.valid,
                                  'valid',
                                  model,
                                  loader_valid,
                                  criterion=criterion)
            results.append(round_result)

    with open(cfg.output, 'wb') as out_file:
        pickle.dump(results, out_file)
    log.info('saved to %s' % cfg.output)
Example #10
0
def get_scheduler(cfg, optim, last_epoch):
    """Instantiate the LR scheduler described by ``cfg.scheduler``.

    ``ReduceLROnPlateau`` is built without a ``last_epoch`` argument and has
    the attribute assigned afterwards; every other scheduler receives
    ``last_epoch`` as a constructor keyword. ``cfg.scheduler.params`` is
    expanded into keyword arguments in both cases.
    """
    name = cfg.scheduler.name

    if name != 'ReduceLROnPlateau':
        scheduler_cls = getattr(lr_scheduler, name)
        scheduler = scheduler_cls(
            optim,
            last_epoch=last_epoch,
            **cfg.scheduler.params,
        )
    else:
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optim,
            **cfg.scheduler.params,
        )
        scheduler.last_epoch = last_epoch

    log.info(f'last_epoch: {last_epoch}')
    return scheduler
Example #11
0
def load_model(path, model, optim=None):
    """Restore ``model`` (and optionally ``optim``) from a checkpoint file.

    Args:
        path: checkpoint location; converted with ``str()`` before loading.
        model: module whose ``load_state_dict`` receives ``state['model']``.
        optim: optional optimizer; when given, ``state['optim']`` is loaded
            into it.

    Returns:
        The ``detail`` entry stored in the checkpoint.
    """
    # Remap everything onto CPU so the checkpoint loads regardless of the
    # device it was saved from.
    state = torch.load(str(path), map_location=lambda storage, location: storage)

    model.load_state_dict(state['model'])
    if optim is not None:
        log.info('loading optim too')
        optim.load_state_dict(state['optim'])
    else:
        log.info('not loading optim')

    # Move to GPU only when one exists; an unconditional .cuda() would make
    # the CPU-mapped load above pointless on CPU-only machines.
    if torch.cuda.is_available():
        model.cuda()

    detail = state['detail']
    log.info('loaded model from %s' % path)

    return detail
Example #12
0
def show_config(cfg):
    """Log the run banner followed by the configuration values in ``cfg``."""
    info = log.info

    # banner
    info('---[START %s] %s' % (IDENTIFIER, '-' * 32))
    info('\n')
    info('** show config **')

    # paths, seed and component settings
    info(f'workdir:     {cfg.workdir}')
    info(f'logpath:     {logger.path}')
    info(f'seed:        {cfg.seed}')
    info(f'model:       {cfg.model}')
    info(f'optim:       {cfg.optim}')
    info(f'loss:        {cfg.loss}')
    info(f'scheduler:   {cfg.scheduler}')

    # run parameters
    info(f'mode:        {cfg.mode}')
    info(f'fold:        {cfg.fold}')
    info(f'epoch:       {cfg.epoch}')
    info(f'batch size:  {cfg.batch_size}')
    info(f'acc:         {cfg.data.train.n_grad_acc}')
    info(f'n_workers:   {cfg.num_workers}')
    info(f'apex:        {cfg.apex}')
    info(f'imgsize:     {cfg.imgsize}')
    info(f'normalize:   {cfg.normalize}')

    info(f'debug:       {cfg.debug}')
    info(f'n_tta:       {cfg.n_tta}')
    info(f'resume_from: {cfg.resume_from}')

    # device
    info(f'gpu:         {cfg.gpu}')
    info(f'device:      {cfg.device}')
Example #13
0
def run_nn(cfg,
           mode,
           model,
           loader,
           criterion=None,
           optim=None,
           scheduler=None,
           apex=None):
    """Run one pass of ``model`` over ``loader`` and collect the outputs.

    Args:
        cfg: per-split config; only ``cfg.n_grad_acc`` is read, and only in
            ``'train'`` mode.
        mode: ``'train'`` (model in train mode, gradients accumulated and
            applied every ``cfg.n_grad_acc`` batches), ``'valid'`` (eval
            mode, loss computed) or ``'test'`` (eval mode, no loss).
        model: the network to run.
        loader: iterable yielding ``(inputs, targets, ids)`` batches; must
            support ``len()``.
        criterion: loss callable, required for ``'train'`` and ``'valid'``.
        optim: optimizer, required for ``'train'``; also handed to
            ``get_lr`` for the progress line in every mode.
        scheduler: accepted for interface compatibility; not used here.
        apex: when truthy, the backward pass goes through ``amp.scale_loss``.

    Returns:
        dict with ``ids``, ``targets``, ``outputs`` (sigmoid of the model
        output) and ``loss`` (sum of batch losses divided by the number of
        batches). For ``'train'`` / ``'valid'`` the dicts returned by
        ``calc_auc`` and ``calc_logloss`` are merged in and ``score`` is set
        to ``logloss``.

    Raises:
        ValueError: if ``mode`` is not one of the three supported values.
    """
    if mode in ['train']:
        model.train()
    elif mode in ['valid', 'test']:
        model.eval()
    else:
        raise ValueError(f"mode '{mode}' is not in [train, valid, test]")

    t1 = time.time()
    losses = []
    ids_all = []
    targets_all = []
    outputs_all = []
    # Defaults so an empty loader does not leave these names undefined.
    n_batches = 0
    progress = ''

    for i, (inputs, targets, ids) in enumerate(loader):
        n_batches = i + 1

        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)

        if mode in ['train', 'valid']:
            loss = criterion(outputs, targets)
            losses.append(loss.item())

        if mode in ['train']:
            if apex:
                with amp.scale_loss(loss, optim) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()  # accumulate gradients
            # NOTE(review): when len(loader) is not a multiple of n_grad_acc
            # the gradients of the trailing batches are never stepped or
            # cleared here - confirm this is intended.
            if n_batches % cfg.n_grad_acc == 0:
                optim.step()  # update
                optim.zero_grad()  # flush

        with torch.no_grad():
            ids_all.extend(ids)
            targets_all.extend(targets.cpu().numpy())
            outputs_all.extend(torch.sigmoid(outputs).cpu().numpy())

        elapsed = int(time.time() - t1)
        eta = int(elapsed / n_batches * (len(loader) - n_batches))
        progress = f'\r[{mode}] {i+1}/{len(loader)} {elapsed}(s) eta:{eta}(s) loss:{(np.sum(losses)/(i+1)):.6f} loss200:{(np.sum(losses[-200:])/(min(i+1,200))):.6f} lr:{get_lr(optim):.2e}'
        print(progress, end='')
        sys.stdout.flush()

    result = {
        'ids': ids_all,
        'targets': np.array(targets_all),
        'outputs': np.array(outputs_all),
        # max(..., 1) only matters for an empty loader, where the sum is 0.
        'loss': np.sum(losses) / max(n_batches, 1),
    }

    if mode in ['train', 'valid']:
        result.update(calc_auc(result['targets'], result['outputs']))
        result.update(calc_logloss(result['targets'], result['outputs']))
        result['score'] = result['logloss']

        log.info(progress + ' auc:%.4f micro:%.4f macro:%.4f' %
                 (result['auc'], result['auc_micro'], result['auc_macro']))
        log.info('%.6f %s' %
                 (result['logloss'], np.round(result['logloss_classes'], 6)))
    else:
        log.info('')

    return result
Example #14
0
def do_train(cfg, model):
    """Train ``model`` on every fold except ``cfg.fold``, validating on it.

    Builds the loss and optimizer, optionally resumes from
    ``cfg.resume_from`` (restoring model, optimizer and the best-so-far
    record), then for each remaining epoch up to ``cfg.epoch``: seeds with
    the epoch number, runs one training pass and one validation pass, saves
    a checkpoint into ``<cfg.workdir>/checkpoint``, and steps the scheduler.
    The best record is the epoch with the lowest validation loss.
    """
    log.info('\n')
    log.info('** start training **')

    # get criterion -----------------------------
    criterion = factory.get_loss(cfg)

    # get optimization --------------------------
    optim = factory.get_optim(cfg, model.parameters())

    # initial -----------------------------------
    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }

    # re-load model -----------------------------
    if cfg.resume_from:
        log.info('\n')
        log.info(f're-load model from {cfg.resume_from}')
        detail = load_model(cfg.resume_from, model, optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })

    # setting dataset ---------------------------
    log.info('\n')
    log.info('** dataset **')
    folds = [fold for fold in range(cfg.n_fold) if cfg.fold != fold]
    log.info(f'fold_train:    {folds}')
    log.info(f'fold_valid:    [{cfg.fold}]')

    loader_train = factory.get_dataloader(cfg.data.train, folds)
    loader_valid = factory.get_dataloader(cfg.data.valid, [cfg.fold])
    log.info(loader_train)
    log.info(loader_valid)

    # scheduler ---------------------------------
    scheduler = factory.get_scheduler(cfg, optim, best['epoch'])

    if cfg.apex:
        # Use the objects amp hands back, as its documentation prescribes.
        model, optim = amp.initialize(model, optim, opt_level='O1')

    # Only ReduceLROnPlateau takes the monitored metric in step(); any other
    # scheduler would interpret a positional argument as an epoch index.
    steps_on_metric = cfg.scheduler.name == 'ReduceLROnPlateau'

    for epoch in range(best['epoch'] + 1, cfg.epoch):
        log.info(f'---epoch {epoch}---')
        set_seed(epoch)

        ## train model --------------------------
        run_nn(cfg.data.train,
               'train',
               model,
               loader_train,
               criterion=criterion,
               optim=optim,
               apex=cfg.apex)

        ## valid model --------------------------
        with torch.no_grad():
            val = run_nn(cfg.data.valid,
                         'valid',
                         model,
                         loader_valid,
                         criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)

        # A checkpoint is written every epoch, not only on improvement.
        save_model(model, optim, detail, cfg.fold,
                   os.path.join(cfg.workdir, 'checkpoint'))

        log.info('[best] ep:%d loss:%.4f score:%.4f' %
                 (best['epoch'], best['loss'], best['score']))

        if steps_on_metric:
            scheduler.step(val['loss'])
        else:
            scheduler.step()