def get_model(cfg):
    log.info(f'model: {cfg.model.name}')
    log.info(f'pretrained: {cfg.model.pretrained}')

    if cfg.model.name in ['resnext101_32x8d_wsl']:
        log.info('backbone from: facebookresearch/WSL-Images')
        model = torch.hub.load('facebookresearch/WSL-Images', cfg.model.name)
        model.fc = torch.nn.Linear(2048, cfg.model.n_output)
        return model

    try:
        model_func = pretrainedmodels.__dict__[cfg.model.name]
        log.info('pretrained weight from: pretrainedmodels')
    except KeyError:
        # fall back to a model constructor visible in this module's namespace
        model_func = eval(cfg.model.name)

    model = model_func(num_classes=1000, pretrained=cfg.model.pretrained)
    model.avg_pool = nn.AdaptiveAvgPool2d(1)
    model.last_linear = nn.Linear(
        model.last_linear.in_features,
        cfg.model.n_output,
    )
    log.info(f'last layer: {cfg.model.n_output}')
    return model

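# Usage sketch for get_model (hypothetical, not part of the pipeline): the
# function only needs a config exposing model.name, model.pretrained and
# model.n_output. SimpleNamespace stands in for the project's own Config class,
# and 'se_resnext50_32x4d' is just one valid pretrainedmodels key.
def _example_get_model():
    from types import SimpleNamespace
    cfg = SimpleNamespace(model=SimpleNamespace(
        name='se_resnext50_32x4d',
        pretrained='imagenet',
        n_output=6,
    ))
    # last_linear is replaced with a fresh n_output-way head
    return get_model(cfg)
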
def get_optim(cfg, parameters):
    optim = getattr(torch.optim, cfg.optim.name)(parameters, **cfg.optim.params)
    log.info(f'optim: {cfg.optim}')
    return optim

def save_model(model, optim, detail, fold, dirname):
    path = os.path.join(dirname, 'fold%d_ep%d.pt' % (fold, detail['epoch']))
    torch.save({
        'model': model.state_dict(),
        'optim': optim.state_dict(),
        'detail': detail,
    }, path)
    log.info('saved model to %s' % path)

def get_loss(cfg):
    loss = getattr(nn, cfg.loss.name)(
        weight=torch.FloatTensor([2, 1, 1, 1, 1, 1]).to(cfg.device),
        **cfg.loss.params,
    )
    log.info(f'criterion: {cfg.loss}')
    return loss

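# For reference, a sketch of how the hard-coded weights act, assuming
# cfg.loss.name is 'BCEWithLogitsLoss' (the sigmoid in run_nn suggests a
# multi-label setup): the (6,) weight tensor broadcasts over the (N, 6)
# element-wise loss, so errors on the first class count twice as much as
# errors on the other five.
def _example_weighted_loss():
    criterion = nn.BCEWithLogitsLoss(
        weight=torch.FloatTensor([2, 1, 1, 1, 1, 1]))
    logits = torch.zeros(4, 6)    # hypothetical batch of 4 samples, 6 classes
    targets = torch.zeros(4, 6)
    return criterion(logits, targets)
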
def do_test(cfg, model):
    assert cfg.output
    load_model(cfg.snapshot, model)
    loader_test = factory.get_dataloader(cfg.data.test)
    with torch.no_grad():
        results = [run_nn(cfg.data.test, 'test', model, loader_test)
                   for i in range(cfg.n_tta)]
    with open(cfg.output, 'wb') as f:
        pickle.dump(results, f)
    log.info('saved to %s' % cfg.output)

def __init__(self, cfg, folds):
    self.cfg = cfg
    self.folds = folds
    self.transforms = factory.get_transforms(self.cfg)
    with open(cfg.annotations, 'rb') as f:
        self.df = pickle.load(f)
    log.info(f'loaded file: {cfg.annotations}')
    if folds:
        self.df = self.df[self.df.fold.isin(folds)]
    self.df = apply_dataset_policy(self.df, self.cfg.dataset_policy)

def run_train():
    args = get_args()
    cfg = Config.fromfile(args.config)

    # copy command-line args into cfg
    cfg.mode = args.mode
    cfg.debug = args.debug
    cfg.fold = args.fold
    cfg.snapshot = args.snapshot
    cfg.output = args.output
    cfg.n_tta = args.n_tta
    cfg.gpu = args.gpu
    cfg.device = device

    # print settings
    show_config(cfg)

    # torch.cuda.set_device(cfg.gpu)
    set_seed(cfg.seed)

    # setup -------------------------------------
    for f in ['checkpoint', 'train', 'valid', 'test', 'backup']:
        os.makedirs(cfg.workdir + '/' + f, exist_ok=True)

    if 0:  # disabled: does not work reliably
        backup_project_as_zip(
            PROJECT_PATH,
            cfg.workdir + '/backup/code.train.%s.zip' % IDENTIFIER)

    ## model ------------------------------------
    log.info('\n')
    log.info('** model setting **')
    model = factory.get_model(cfg)

    # multi-gpu ---------------------------------
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model.to(device)

    ## ------------------------------------------
    if cfg.mode == 'train':
        do_train(cfg, model)
    elif cfg.mode == 'valid':
        do_valid(cfg, model)
    elif cfg.mode == 'test':
        do_test(cfg, model)
    else:
        log.error(f"mode '{cfg.mode}' is not in [train, valid, test]")
        exit(1)

def apply_dataset_policy(df, policy):
    if policy == 'all':
        pass
    elif policy == 'pos==neg':
        df_positive = df[df.labels != '']
        df_negative = df[df.labels == '']  # negatives have an empty labels string
        df_sampled = df_negative.sample(len(df_positive))
        df = pd.concat([df_positive, df_sampled], sort=False)
    else:
        raise ValueError(f'unknown dataset_policy: {policy}')
    log.info(f'apply_dataset_policy: {policy}')
    return df

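# Toy illustration of the 'pos==neg' policy on hypothetical data: rows with a
# non-empty labels string are positives, and an equally sized random sample of
# the empty-label rows is kept alongside them.
def _example_dataset_policy():
    df = pd.DataFrame({
        'id': range(6),
        'labels': ['a', '', '', 'b', '', ''],
    })
    balanced = apply_dataset_policy(df, 'pos==neg')
    return balanced  # 2 positives + 2 randomly sampled negatives
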
def do_valid(cfg, model):
    assert cfg.output
    criterion = factory.get_loss(cfg)
    load_model(cfg.snapshot, model)
    loader_valid = factory.get_dataloader(cfg.data.valid, [cfg.fold])
    with torch.no_grad():
        results = [run_nn(cfg.data.valid, 'valid', model, loader_valid, criterion=criterion)
                   for i in range(cfg.n_tta)]
    with open(cfg.output, 'wb') as f:
        pickle.dump(results, f)
    log.info('saved to %s' % cfg.output)

def get_scheduler(cfg, optim, last_epoch):
    if cfg.scheduler.name == 'ReduceLROnPlateau':
        # ReduceLROnPlateau does not accept last_epoch in its constructor,
        # so it is set as an attribute after construction
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optim,
            **cfg.scheduler.params,
        )
        scheduler.last_epoch = last_epoch
    else:
        scheduler = getattr(lr_scheduler, cfg.scheduler.name)(
            optim,
            last_epoch=last_epoch,
            **cfg.scheduler.params,
        )
    log.info(f'last_epoch: {last_epoch}')
    return scheduler

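# Minimal sketch of what get_scheduler builds (hypothetical config values):
# SimpleNamespace again stands in for the project's Config class, and the
# params shown are ordinary ReduceLROnPlateau keyword arguments.
def _example_get_scheduler():
    from types import SimpleNamespace
    optim = torch.optim.SGD([nn.Parameter(torch.zeros(1))], lr=0.1)
    cfg = SimpleNamespace(scheduler=SimpleNamespace(
        name='ReduceLROnPlateau',
        params={'mode': 'min', 'factor': 0.5, 'patience': 2},
    ))
    return get_scheduler(cfg, optim, last_epoch=-1)
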
def load_model(path, model, optim=None):
    # remap everything onto the CPU
    state = torch.load(str(path), map_location=lambda storage, location: storage)
    model.load_state_dict(state['model'])
    if optim:
        log.info('loading optim too')
        optim.load_state_dict(state['optim'])
    else:
        log.info('not loading optim')
    model.cuda()
    detail = state['detail']
    log.info('loaded model from %s' % path)
    return detail

def show_config(cfg):
    log.info('---[START %s] %s' % (IDENTIFIER, '-' * 32))
    log.info('\n')
    log.info('** show config **')
    log.info(f'workdir: {cfg.workdir}')
    log.info(f'logpath: {logger.path}')
    log.info(f'seed: {cfg.seed}')
    log.info(f'model: {cfg.model}')
    log.info(f'optim: {cfg.optim}')
    log.info(f'loss: {cfg.loss}')
    log.info(f'scheduler: {cfg.scheduler}')
    log.info(f'mode: {cfg.mode}')
    log.info(f'fold: {cfg.fold}')
    log.info(f'epoch: {cfg.epoch}')
    log.info(f'batch size: {cfg.batch_size}')
    log.info(f'acc: {cfg.data.train.n_grad_acc}')
    log.info(f'n_workers: {cfg.num_workers}')
    log.info(f'apex: {cfg.apex}')
    log.info(f'imgsize: {cfg.imgsize}')
    log.info(f'normalize: {cfg.normalize}')
    log.info(f'debug: {cfg.debug}')
    log.info(f'n_tta: {cfg.n_tta}')
    log.info(f'resume_from: {cfg.resume_from}')
    # device
    log.info(f'gpu: {cfg.gpu}')
    log.info(f'device: {cfg.device}')

def run_nn(cfg, mode, model, loader, criterion=None, optim=None, scheduler=None, apex=None):
    if mode in ['train']:
        model.train()
    elif mode in ['valid', 'test']:
        model.eval()
    else:
        raise ValueError(f'unknown mode: {mode}')

    t1 = time.time()
    losses = []
    ids_all = []
    targets_all = []
    outputs_all = []

    for i, (inputs, targets, ids) in enumerate(loader):
        batch_size = len(inputs)

        inputs = inputs.to(device)
        targets = targets.to(device)
        outputs = model(inputs)

        if mode in ['train', 'valid']:
            loss = criterion(outputs, targets)
            with torch.no_grad():
                losses.append(loss.item())

        if mode in ['train']:
            if apex:
                with amp.scale_loss(loss, optim) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()  # accumulate gradients
            if (i + 1) % cfg.n_grad_acc == 0:
                optim.step()  # update
                optim.zero_grad()  # flush

        with torch.no_grad():
            ids_all.extend(ids)
            targets_all.extend(targets.cpu().numpy())
            outputs_all.extend(torch.sigmoid(outputs).cpu().numpy())
            #outputs_all.append(torch.softmax(outputs, dim=1).cpu().numpy())

        elapsed = int(time.time() - t1)
        eta = int(elapsed / (i + 1) * (len(loader) - (i + 1)))
        progress = f'\r[{mode}] {i+1}/{len(loader)} {elapsed}(s) eta:{eta}(s) loss:{(np.sum(losses)/(i+1)):.6f} loss200:{(np.sum(losses[-200:])/(min(i+1,200))):.6f} lr:{get_lr(optim):.2e}'
        print(progress, end='')
        sys.stdout.flush()

    result = {
        'ids': ids_all,
        'targets': np.array(targets_all),
        'outputs': np.array(outputs_all),
        'loss': np.sum(losses) / (i + 1),
    }

    if mode in ['train', 'valid']:
        result.update(calc_auc(result['targets'], result['outputs']))
        result.update(calc_logloss(result['targets'], result['outputs']))
        result['score'] = result['logloss']
        log.info(progress + ' auc:%.4f micro:%.4f macro:%.4f' % (result['auc'], result['auc_micro'], result['auc_macro']))
        log.info('%.6f %s' % (result['logloss'], np.round(result['logloss_classes'], 6)))
    else:
        log.info('')

    return result

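# Note on the n_grad_acc branch in run_nn: backward() runs every batch but
# optim.step()/zero_grad() only every n_grad_acc batches, so gradients sum
# across those batches and the effective batch size is batch_size * n_grad_acc.
# Because the loss is not divided by n_grad_acc, the update uses the summed
# rather than averaged gradient. A standalone demonstration of the summing:
def _example_grad_accumulation():
    w = nn.Parameter(torch.zeros(1))
    (w * 1.0).backward()   # w.grad == tensor([1.])
    (w * 2.0).backward()   # grads accumulate: w.grad == tensor([3.])
    return w.grad
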
def do_train(cfg, model):
    log.info('\n')
    log.info('** start training **')

    # get criterion -----------------------------
    criterion = factory.get_loss(cfg)

    # get optimizer -----------------------------
    optim = factory.get_optim(cfg, model.parameters())

    # initial -----------------------------------
    best = {
        'loss': float('inf'),
        'score': 0.0,
        'epoch': -1,
    }

    # re-load model -----------------------------
    if cfg.resume_from:
        log.info('\n')
        log.info(f're-load model from {cfg.resume_from}')
        detail = load_model(cfg.resume_from, model, optim)
        best.update({
            'loss': detail['loss'],
            'score': detail['score'],
            'epoch': detail['epoch'],
        })

    # setting dataset ---------------------------
    log.info('\n')
    log.info('** dataset **')
    folds = [fold for fold in range(cfg.n_fold) if cfg.fold != fold]
    log.info(f'fold_train: {folds}')
    log.info(f'fold_valid: [{cfg.fold}]')
    loader_train = factory.get_dataloader(cfg.data.train, folds)
    loader_valid = factory.get_dataloader(cfg.data.valid, [cfg.fold])
    log.info(loader_train)
    log.info(loader_valid)

    # scheduler ---------------------------------
    scheduler = factory.get_scheduler(cfg, optim, best['epoch'])

    if cfg.apex:
        # amp.initialize returns the patched model and optimizer
        model, optim = amp.initialize(model, optim, opt_level='O1')

    for epoch in range(best['epoch'] + 1, cfg.epoch):
        log.info(f'---epoch {epoch}---')
        set_seed(epoch)

        ## train model --------------------------
        run_nn(cfg.data.train, 'train', model, loader_train, criterion=criterion, optim=optim, apex=cfg.apex)

        ## valid model --------------------------
        with torch.no_grad():
            val = run_nn(cfg.data.valid, 'valid', model, loader_valid, criterion=criterion)

        detail = {
            'score': val['score'],
            'loss': val['loss'],
            'epoch': epoch,
        }
        if val['loss'] <= best['loss']:
            best.update(detail)
            save_model(model, optim, detail, cfg.fold, os.path.join(cfg.workdir, 'checkpoint'))

        log.info('[best] ep:%d loss:%.4f score:%.4f' % (best['epoch'], best['loss'], best['score']))
        scheduler.step(val['loss'])  # ReduceLROnPlateau
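
# Cross-validation wiring in do_train, for reference: cfg.fold is held out for
# validation and every other fold is used for training. With hypothetical
# n_fold = 5 and fold = 0:
#
#   [fold for fold in range(5) if fold != 0]  # -> [1, 2, 3, 4]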