# CamVid semantic segmentation: trains a LinkNet34 model with catalyst.
# Assumed imports for this script; helpers such as CamVidDataset, LinkNet34, CLASS_NAMES,
# CLASS_COLORS, get_training_augmentation, get_validation_augmentation, JointLoss,
# MulticlassDiceLoss, JaccardScoreCallback, ShowPolarBatchesCallback,
# draw_semantic_segmentation_predictions and SupervisedRunner come from the surrounding
# repository / catalyst / pytorch_toolbelt, and their exact import paths depend on the
# versions used there.
import os
from collections import OrderedDict
from datetime import datetime
from functools import partial

import numpy as np
from torch import nn
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader, WeightedRandomSampler


def main():
    DATA_DIR = os.path.join(os.path.dirname(__file__), 'CamVid')
    x_train_dir = os.path.join(DATA_DIR, 'train')
    y_train_dir = os.path.join(DATA_DIR, 'trainannot')
    x_valid_dir = os.path.join(DATA_DIR, 'val')
    y_valid_dir = os.path.join(DATA_DIR, 'valannot')

    train_ds = CamVidDataset(x_train_dir, y_train_dir, transform=get_training_augmentation())
    valid_ds = CamVidDataset(x_valid_dir, y_valid_dir, transform=get_validation_augmentation())

    # Oversample the training set: each epoch draws len(train_ds) * mul_factor samples
    mul_factor = 10
    data_loaders = OrderedDict()
    data_loaders['train'] = DataLoader(train_ds,
                                       batch_size=4,
                                       num_workers=4,
                                       pin_memory=True,
                                       drop_last=True,
                                       sampler=WeightedRandomSampler(np.ones(len(train_ds)),
                                                                     len(train_ds) * mul_factor))
    data_loaders['valid'] = DataLoader(valid_ds,
                                       batch_size=4,
                                       num_workers=0,
                                       pin_memory=True,
                                       drop_last=False)

    num_classes = len(CLASS_NAMES)
    model = LinkNet34(num_classes).cuda()

    num_epochs = 50
    parameters = model.parameters()
    # parameters = get_lr_decay_parameters(model.named_parameters(),
    #                                      1e-4,
    #                                      {
    #                                          "firstconv": 0.1,
    #                                          "firstbn": 0.1,
    #                                          "encoder1": 0.1,
    #                                          "encoder2": 0.2,
    #                                          "encoder3": 0.3,
    #                                          "encoder4": 0.4,
    #                                      })
    # print(parameters)

    adam = Adam(parameters, lr=1e-4)

    # one_cycle = OneCycleLR(adam,
    #                        init_lr=1e-6,
    #                        lr_range=(1e-2, 1e-5),
    #                        num_steps=num_epochs)
    multistep = MultiStepLR(adam, [10, 20, 30, 40], gamma=0.5)

    # model runner
    runner = SupervisedRunner()

    # model training
    visualize_predictions = partial(draw_semantic_segmentation_predictions,
                                    mode='side-by-side',
                                    class_colors=CLASS_COLORS)

    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_linknet34'
    log_dir = os.path.join('runs', prefix)
    os.makedirs(log_dir, exist_ok=False)

    runner.train(
        model=model,
        criterion=JointLoss(nn.CrossEntropyLoss(),
                            MulticlassDiceLoss(classes=np.arange(11)),
                            second_weight=0.5),
        optimizer=adam,
        callbacks=[
            JaccardScoreCallback(
                mode='multiclass',
                # We exclude the 'unlabeled' class from the evaluation
                class_names=CLASS_NAMES[:11],
                classes_of_interest=np.arange(11),
                prefix='iou'),
            ShowPolarBatchesCallback(visualize_predictions, metric='iou', minimize=True),
        ],
        logdir=log_dir,
        loaders=data_loaders,
        num_epochs=num_epochs,
        scheduler=multistep,
        verbose=True,
        main_metric='iou',
        minimize_metric=False)
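# The commented-out get_lr_decay_parameters call above applies layer-wise learning-rate
# decay to the encoder. A minimal sketch of such a helper, assuming only the interface
# implied by that call (an iterable of (name, parameter) pairs, a base LR and a
# {name-prefix: multiplier} mapping); the repository's own implementation may differ:
def get_lr_decay_parameters(named_parameters, base_lr, multiplier_by_prefix):
    """Build optimizer param groups, scaling the LR of parameters whose name
    starts with one of the given prefixes."""
    param_groups = []
    for name, param in named_parameters:
        if not param.requires_grad:
            continue
        lr = base_lr
        for prefix, multiplier in multiplier_by_prefix.items():
            if name.startswith(prefix):
                lr = base_lr * multiplier
                break
        param_groups.append({'params': [param], 'lr': lr})
    return param_groups


# Entry-point guard; assumed, since the training script above is normally launched directly.
if __name__ == '__main__':
    main()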
# Diabetic retinopathy grading: multi-stage training (warmup, main, fine-tune) with catalyst.
# Assumed imports; helpers such as get_class_names, get_random_name, get_model, get_datasets,
# get_dataloaders, get_optimizer, get_scheduler, get_reg_callbacks, get_ord_callbacks,
# get_cls_callbacks, LPRegularizationCallback, CustomOptimizerCallback, EarlyStoppingCallback,
# SupervisedRunner, set_trainable, get_optimizable_parameters, set_manual_seed, fs,
# load_checkpoint, unpack_checkpoint, report_checkpoint, clean_checkpoint and
# count_parameters come from the surrounding repository / catalyst / pytorch_toolbelt,
# and their exact import paths depend on the versions used there.
import argparse
import collections
import json
import multiprocessing
import os
from datetime import datetime

import torch


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('--mixup', action='store_true')
    parser.add_argument('--balance', action='store_true')
    parser.add_argument('--balance-datasets', action='store_true')
    parser.add_argument('--swa', action='store_true')
    parser.add_argument('--show', action='store_true')
    parser.add_argument('--use-idrid', action='store_true')
    parser.add_argument('--use-messidor', action='store_true')
    parser.add_argument('--use-aptos2015', action='store_true')
    parser.add_argument('--use-aptos2019', action='store_true')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--coarse', action='store_true')
    parser.add_argument('-acc', '--accumulation-steps', type=int, default=1,
                        help='Number of batches to accumulate gradients over')
    parser.add_argument('-dd', '--data-dir', type=str, default='data', help='Data directory')
    parser.add_argument('-m', '--model', type=str, default='resnet18_gap', help='Model name')
    parser.add_argument('-b', '--batch-size', type=int, default=8,
                        help='Batch size during training, e.g. -b 64')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Number of epochs to run')
    parser.add_argument('-es', '--early-stopping', type=int, default=None,
                        help='Maximum number of epochs without improvement')
    parser.add_argument('-f', '--fold', action='append', type=int, default=None)
    parser.add_argument('-ft', '--fine-tune', default=0, type=int)
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('--criterion-reg', type=str, default=None, nargs='+',
                        help='Regression criterion')
    parser.add_argument('--criterion-ord', type=str, default=None, nargs='+',
                        help='Ordinal criterion')
    parser.add_argument('--criterion-cls', type=str, default=['ce'], nargs='+',
                        help='Classification criterion')
    parser.add_argument('-l1', type=float, default=0, help='L1 regularization loss')
    parser.add_argument('-l2', type=float, default=0, help='L2 regularization loss')
    parser.add_argument('-o', '--optimizer', default='Adam', help='Name of the optimizer')
    parser.add_argument('-p', '--preprocessing', default=None, help='Preprocessing method')
    parser.add_argument('-c', '--checkpoint', type=str, default=None,
                        help='Checkpoint filename to use as initial model weights')
    parser.add_argument('-w', '--workers', default=multiprocessing.cpu_count(), type=int,
                        help='Num workers')
    parser.add_argument('-a', '--augmentations', default='medium', type=str,
                        help='Augmentations preset')
    parser.add_argument('-tta', '--tta', default=None, type=str,
                        help='Type of TTA to use [fliplr, d4]')
    parser.add_argument('-t', '--transfer', default=None, type=str,
                        help='Checkpoint to transfer weights from')
    parser.add_argument('--fp16', action='store_true')
    parser.add_argument('-s', '--scheduler', default='multistep', type=str,
                        help='Learning rate scheduler name')
    parser.add_argument('--size', default=512, type=int,
                        help='Image size for training & inference')
    parser.add_argument('-wd', '--weight-decay', default=0, type=float, help='L2 weight decay')
    parser.add_argument('-wds', '--weight-decay-step', default=None, type=float,
                        help='L2 weight decay step to add after each epoch')
    parser.add_argument('-d', '--dropout', default=0.0, type=float,
                        help='Dropout before head layer')
    parser.add_argument('--warmup', default=0, type=int,
                        help='Number of warmup epochs with 0.1 of the initial LR and a frozen encoder')
    parser.add_argument('-x', '--experiment', default=None, type=str,
                        help='Experiment name (overrides the auto-generated checkpoint prefix)')

    args = parser.parse_args()
    data_dir = args.data_dir
    num_workers = args.workers
    num_epochs = args.epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    l1 = args.l1
    l2 = args.l2
    early_stopping = args.early_stopping
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = (args.size, args.size)
    fast = args.fast
    augmentations = args.augmentations
    fp16 = args.fp16
    fine_tune = args.fine_tune
    criterion_reg_name = args.criterion_reg
    criterion_cls_name = args.criterion_cls
    criterion_ord_name = args.criterion_ord
    folds = args.fold
    mixup = args.mixup
    balance = args.balance
    balance_datasets = args.balance_datasets
    use_swa = args.swa
    show_batches = args.show
    scheduler_name = args.scheduler
    verbose = args.verbose
    weight_decay = args.weight_decay
    use_idrid = args.use_idrid
    use_messidor = args.use_messidor
    use_aptos2015 = args.use_aptos2015
    use_aptos2019 = args.use_aptos2019
    warmup = args.warmup
    dropout = args.dropout
    use_unsupervised = False
    experiment = args.experiment
    preprocessing = args.preprocessing
    weight_decay_step = args.weight_decay_step
    coarse_grading = args.coarse
    class_names = get_class_names(coarse_grading)

    # At least one of the supported datasets must be selected
    assert use_aptos2015 or use_aptos2019 or use_idrid or use_messidor

    current_time = datetime.now().strftime('%b%d_%H_%M')
    random_name = get_random_name()

    if folds is None or len(folds) == 0:
        folds = [None]

    for fold in folds:
        torch.cuda.empty_cache()
        checkpoint_prefix = f'{model_name}_{args.size}_{augmentations}'

        if preprocessing is not None:
            checkpoint_prefix += f'_{preprocessing}'
        if use_aptos2019:
            checkpoint_prefix += '_aptos2019'
        if use_aptos2015:
            checkpoint_prefix += '_aptos2015'
        if use_messidor:
            checkpoint_prefix += '_messidor'
        if use_idrid:
            checkpoint_prefix += '_idrid'
        if coarse_grading:
            checkpoint_prefix += '_coarse'
        if fold is not None:
            checkpoint_prefix += f'_fold{fold}'

        checkpoint_prefix += f'_{random_name}'

        if experiment is not None:
            checkpoint_prefix = experiment

        directory_prefix = f'{current_time}/{checkpoint_prefix}'
        log_dir = os.path.join('runs', directory_prefix)
        os.makedirs(log_dir, exist_ok=False)

        # Save the full set of command-line arguments next to the logs
        config_fname = os.path.join(log_dir, f'{checkpoint_prefix}.json')
        with open(config_fname, 'w') as f:
            train_session_args = vars(args)
            f.write(json.dumps(train_session_args, indent=2))

        set_manual_seed(args.seed)
        num_classes = len(class_names)
        model = get_model(model_name, num_classes=num_classes, dropout=dropout).cuda()

        if args.transfer:
            transfer_checkpoint = fs.auto_file(args.transfer)
            print("Transferring weights from model checkpoint", transfer_checkpoint)
            checkpoint = load_checkpoint(transfer_checkpoint)
            pretrained_dict = checkpoint['model_state_dict']

            # Load weights tensor-by-tensor so that mismatched layers are skipped
            for name, value in pretrained_dict.items():
                try:
                    model.load_state_dict(collections.OrderedDict([(name, value)]), strict=False)
                except Exception as e:
                    print(e)

            report_checkpoint(checkpoint)

        if args.checkpoint:
            checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
            unpack_checkpoint(checkpoint, model=model)
            report_checkpoint(checkpoint)

        train_ds, valid_ds, train_sizes = get_datasets(data_dir=data_dir,
                                                       use_aptos2019=use_aptos2019,
                                                       use_aptos2015=use_aptos2015,
                                                       use_idrid=use_idrid,
                                                       use_messidor=use_messidor,
                                                       use_unsupervised=False,
                                                       coarse_grading=coarse_grading,
                                                       image_size=image_size,
                                                       augmentation=augmentations,
                                                       preprocessing=preprocessing,
                                                       target_dtype=int,
                                                       fold=fold,
                                                       folds=4)

        train_loader, valid_loader = get_dataloaders(train_ds, valid_ds,
                                                     batch_size=batch_size,
                                                     num_workers=num_workers,
                                                     train_sizes=train_sizes,
                                                     balance=balance,
                                                     balance_datasets=balance_datasets,
                                                     balance_unlabeled=False)

        loaders = collections.OrderedDict()
loaders["train"] = train_loader loaders["valid"] = valid_loader print('Datasets :', data_dir) print(' Train size :', len(train_loader), len(train_loader.dataset)) print(' Valid size :', len(valid_loader), len(valid_loader.dataset)) print(' Aptos 2019 :', use_aptos2019) print(' Aptos 2015 :', use_aptos2015) print(' IDRID :', use_idrid) print(' Messidor :', use_messidor) print('Train session :', directory_prefix) print(' FP16 mode :', fp16) print(' Fast mode :', fast) print(' Mixup :', mixup) print(' Balance cls. :', balance) print(' Balance ds. :', balance_datasets) print(' Warmup epoch :', warmup) print(' Train epochs :', num_epochs) print(' Fine-tune ephs :', fine_tune) print(' Workers :', num_workers) print(' Fold :', fold) print(' Log dir :', log_dir) print(' Augmentations :', augmentations) print('Model :', model_name) print(' Parameters :', count_parameters(model)) print(' Image size :', image_size) print(' Dropout :', dropout) print(' Classes :', class_names, num_classes) print('Optimizer :', optimizer_name) print(' Learning rate :', learning_rate) print(' Batch size :', batch_size) print(' Criterion (cls):', criterion_cls_name) print(' Criterion (reg):', criterion_reg_name) print(' Criterion (ord):', criterion_ord_name) print(' Scheduler :', scheduler_name) print(' Weight decay :', weight_decay, weight_decay_step) print(' L1 reg. :', l1) print(' L2 reg. :', l2) print(' Early stopping :', early_stopping) # model training callbacks = [] criterions = {} main_metric = 'cls/kappa' if criterion_reg_name is not None: cb, crits = get_reg_callbacks(criterion_reg_name, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if criterion_ord_name is not None: cb, crits = get_ord_callbacks(criterion_ord_name, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if criterion_cls_name is not None: cb, crits = get_cls_callbacks(criterion_cls_name, num_classes=num_classes, num_epochs=num_epochs, class_names=class_names, show=show_batches) callbacks += cb criterions.update(crits) if l1 > 0: callbacks += [ LPRegularizationCallback(start_wd=l1, end_wd=l1, schedule=None, prefix='l1', p=1) ] if l2 > 0: callbacks += [ LPRegularizationCallback(start_wd=l2, end_wd=l2, schedule=None, prefix='l2', p=2) ] callbacks += [CustomOptimizerCallback()] runner = SupervisedRunner(input_key='image') # Pretrain/warmup if warmup: set_trainable(model.encoder, False, False) optimizer = get_optimizer('Adam', get_optimizable_parameters(model), learning_rate=learning_rate * 0.1) runner.train(fp16=fp16, model=model, criterion=criterions, optimizer=optimizer, scheduler=None, callbacks=callbacks, loaders=loaders, logdir=os.path.join(log_dir, 'warmup'), num_epochs=warmup, verbose=verbose, main_metric=main_metric, minimize_metric=False, checkpoint_data={"cmd_args": vars(args)}) del optimizer # Main train if num_epochs: set_trainable(model.encoder, True, False) optimizer = get_optimizer(optimizer_name, get_optimizable_parameters(model), learning_rate=learning_rate, weight_decay=weight_decay) if use_swa: from torchcontrib.optim import SWA optimizer = SWA(optimizer, swa_start=len(train_loader), swa_freq=512) scheduler = get_scheduler(scheduler_name, optimizer, lr=learning_rate, num_epochs=num_epochs, batches_in_epoch=len(train_loader)) # Additional callbacks that specific to main stage only added here to copy of callbacks main_stage_callbacks = callbacks if early_stopping: es_callback = EarlyStoppingCallback(early_stopping, min_delta=1e-4, metric=main_metric, 
                main_stage_callbacks = callbacks + [es_callback]

            runner.train(fp16=fp16,
                         model=model,
                         criterion=criterions,
                         optimizer=optimizer,
                         scheduler=scheduler,
                         callbacks=main_stage_callbacks,
                         loaders=loaders,
                         logdir=os.path.join(log_dir, 'main'),
                         num_epochs=num_epochs,
                         verbose=verbose,
                         main_metric=main_metric,
                         minimize_metric=False,
                         checkpoint_data={"cmd_args": vars(args)})

            del optimizer, scheduler

            best_checkpoint = os.path.join(log_dir, 'main', 'checkpoints', 'best.pth')
            model_checkpoint = os.path.join(log_dir, 'main', 'checkpoints', f'{checkpoint_prefix}.pth')
            clean_checkpoint(best_checkpoint, model_checkpoint)

            # Restore the best model from the checkpoint
            checkpoint = load_checkpoint(best_checkpoint)
            unpack_checkpoint(checkpoint, model=model)
            report_checkpoint(checkpoint)

        # Stage 3 - fine-tuning with a frozen encoder
        if fine_tune:
            set_trainable(model.encoder, False, False)

            optimizer = get_optimizer(optimizer_name,
                                      get_optimizable_parameters(model),
                                      learning_rate=learning_rate)
            scheduler = get_scheduler('multistep',
                                      optimizer,
                                      lr=learning_rate,
                                      num_epochs=fine_tune,
                                      batches_in_epoch=len(train_loader))

            runner.train(fp16=fp16,
                         model=model,
                         criterion=criterions,
                         optimizer=optimizer,
                         scheduler=scheduler,
                         callbacks=callbacks,
                         loaders=loaders,
                         logdir=os.path.join(log_dir, 'finetune'),
                         num_epochs=fine_tune,
                         verbose=verbose,
                         main_metric=main_metric,
                         minimize_metric=False,
                         checkpoint_data={"cmd_args": vars(args)})

            best_checkpoint = os.path.join(log_dir, 'finetune', 'checkpoints', 'best.pth')
            model_checkpoint = os.path.join(log_dir, 'finetune', 'checkpoints', f'{checkpoint_prefix}.pth')
            clean_checkpoint(best_checkpoint, model_checkpoint)
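# Entry-point guard, assumed as for the other scripts. A typical invocation (the script
# name is hypothetical; flags come from the argparse definition above, dataset flags and
# fold are only an example):
#   python train.py -m resnet18_gap --use-aptos2019 --use-idrid -b 16 -e 100 -f 0
if __name__ == '__main__':
    main()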
# Adversarial validation ("test-in-train") classifier: trains a binary model to
# distinguish train from test images and then scores the training set with it.
# Assumed imports; helpers such as get_model, get_loss, get_optimizer, get_dataloaders,
# get_test_aug, RetinopathyDataset, maybe_cuda, set_manual_seed, set_trainable,
# get_optimizable_parameters, fs, load_checkpoint, unpack_checkpoint, count_parameters,
# to_numpy, F1ScoreCallback, AUCCallback, EarlyStoppingCallback, ShowPolarBatchesCallback,
# draw_classification_predictions and SupervisedRunner come from the surrounding
# repository / catalyst / pytorch_toolbelt, and their exact import paths depend on the
# versions used there.
import argparse
import collections
import multiprocessing
import os
from datetime import datetime
from functools import partial

import pandas as pd
import torch
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('-dd', '--data-dir', type=str, default='data', help='Data directory')
    parser.add_argument('-m', '--model', type=str, default='cls_resnet18', help='Model name')
    parser.add_argument('-b', '--batch-size', type=int, default=8,
                        help='Batch size during training, e.g. -b 64')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Number of epochs to run')
    parser.add_argument('-es', '--early-stopping', type=int, default=None,
                        help='Maximum number of epochs without improvement')
    parser.add_argument('-fe', '--freeze-encoder', action='store_true')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-l', '--criterion', type=str, default='bce', help='Criterion')
    parser.add_argument('-o', '--optimizer', default='Adam', help='Name of the optimizer')
    parser.add_argument('-c', '--checkpoint', type=str, default=None,
                        help='Checkpoint filename to use as initial model weights')
    parser.add_argument('-w', '--workers', default=multiprocessing.cpu_count(), type=int,
                        help='Num workers')
    parser.add_argument('-a', '--augmentations', default='hard', type=str,
                        help='Augmentations preset')
    parser.add_argument('-tta', '--tta', default=None, type=str,
                        help='Type of TTA to use [fliplr, d4]')
    parser.add_argument('-tm', '--train-mode', default='random', type=str, help='Train mode')
    parser.add_argument('-rm', '--run-mode', default='fit_predict', type=str,
                        help='Run mode: fit, predict or fit_predict')
    parser.add_argument('--transfer', default=None, type=str,
                        help='Checkpoint to transfer weights from')
    parser.add_argument('--fp16', action='store_true')

    args = parser.parse_args()
    set_manual_seed(args.seed)

    data_dir = args.data_dir
    num_workers = args.workers
    num_epochs = args.epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    early_stopping = args.early_stopping
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = (512, 512)
    fast = args.fast
    augmentations = args.augmentations
    train_mode = args.train_mode
    run_mode = args.run_mode
    log_dir = None
    fp16 = args.fp16
    freeze_encoder = args.freeze_encoder

    run_train = run_mode == 'fit_predict' or run_mode == 'fit'
    run_predict = run_mode == 'fit_predict' or run_mode == 'predict'

    model = maybe_cuda(get_model(model_name, num_classes=1))

    if args.transfer:
        transfer_checkpoint = fs.auto_file(args.transfer)
        print("Transferring weights from model checkpoint", transfer_checkpoint)
        checkpoint = load_checkpoint(transfer_checkpoint)
        pretrained_dict = checkpoint['model_state_dict']

        # Load weights tensor-by-tensor so that mismatched layers are skipped
        for name, value in pretrained_dict.items():
            try:
                model.load_state_dict(collections.OrderedDict([(name, value)]), strict=False)
            except Exception as e:
                print(e)

    checkpoint = None
    if args.checkpoint:
        checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
        unpack_checkpoint(checkpoint, model=model)

        checkpoint_epoch = checkpoint['epoch']
        print('Loaded model weights from:', args.checkpoint)
        print('Epoch                    :', checkpoint_epoch)
        print('Metrics (Train):',
              'f1  :', checkpoint['epoch_metrics']['train']['f1_score'],
              'loss:', checkpoint['epoch_metrics']['train']['loss'])
        print('Metrics (Valid):',
              'f1  :', checkpoint['epoch_metrics']['valid']['f1_score'],
              'loss:', checkpoint['epoch_metrics']['valid']['loss'])

        log_dir = os.path.dirname(os.path.dirname(fs.auto_file(args.checkpoint)))

    if run_train:
        if freeze_encoder:
            set_trainable(model.encoder, trainable=False, freeze_bn=True)

        criterion = get_loss(args.criterion)
        parameters = get_optimizable_parameters(model)
        optimizer = get_optimizer(optimizer_name, parameters, learning_rate)

        if checkpoint is not None:
            try:
                unpack_checkpoint(checkpoint, optimizer=optimizer)
                print('Restored optimizer state from checkpoint')
            except Exception as e:
                print('Failed to restore optimizer state from checkpoint', e)

        train_loader, valid_loader = get_dataloaders(data_dir=data_dir,
                                                     batch_size=batch_size,
                                                     num_workers=num_workers,
                                                     image_size=image_size,
                                                     augmentation=augmentations,
                                                     fast=fast)

        loaders = collections.OrderedDict()
        loaders["train"] = train_loader
        loaders["valid"] = valid_loader

        current_time = datetime.now().strftime('%b%d_%H_%M')
        prefix = f'adversarial/{args.model}/{current_time}_{args.criterion}'

        if fp16:
            prefix += '_fp16'
        if fast:
            prefix += '_fast'

        log_dir = os.path.join('runs', prefix)
        os.makedirs(log_dir, exist_ok=False)

        scheduler = MultiStepLR(optimizer, milestones=[10, 30, 50, 70, 90], gamma=0.5)

        print('Train session    :', prefix)
        print('\tFP16 mode      :', fp16)
        print('\tFast mode      :', args.fast)
        print('\tTrain mode     :', train_mode)
        print('\tEpochs         :', num_epochs)
        print('\tEarly stopping :', early_stopping)
        print('\tWorkers        :', num_workers)
        print('\tData dir       :', data_dir)
        print('\tLog dir        :', log_dir)
        print('\tAugmentations  :', augmentations)
        print('\tTrain size     :', len(train_loader), len(train_loader.dataset))
        print('\tValid size     :', len(valid_loader), len(valid_loader.dataset))
        print('Model            :', model_name)
        print('\tParameters     :', count_parameters(model))
        print('\tImage size     :', image_size)
        print('\tFreeze encoder :', freeze_encoder)
        print('Optimizer        :', optimizer_name)
        print('\tLearning rate  :', learning_rate)
        print('\tBatch size     :', batch_size)
        print('\tCriterion      :', args.criterion)

        # model training
        visualization_fn = partial(draw_classification_predictions,
                                   class_names=['Train', 'Test'])

        callbacks = [
            F1ScoreCallback(),
            AUCCallback(),
            ShowPolarBatchesCallback(visualization_fn, metric='f1_score', minimize=False),
        ]

        if early_stopping:
            callbacks += [
                EarlyStoppingCallback(early_stopping, metric='auc', minimize=False)
            ]

        runner = SupervisedRunner(input_key='image')
        runner.train(fp16=fp16,
                     model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     callbacks=callbacks,
                     loaders=loaders,
                     logdir=log_dir,
                     num_epochs=num_epochs,
                     verbose=True,
                     main_metric='auc',
                     minimize_metric=False,
                     state_kwargs={"cmd_args": vars(args)})

    if run_predict and not fast:
        # Training is finished. Run predictions on the train set using the best checkpoint weights
        best_checkpoint = load_checkpoint(fs.auto_file('best.pth', where=log_dir))
        unpack_checkpoint(best_checkpoint, model=model)

        model.eval()

        train_csv = pd.read_csv(os.path.join(data_dir, 'train.csv'))
        train_csv['id_code'] = train_csv['id_code'].apply(
            lambda x: os.path.join(data_dir, 'train_images', f'{x}.png'))

        test_ds = RetinopathyDataset(train_csv['id_code'], None,
                                     get_test_aug(image_size),
                                     target_as_array=True)
        test_dl = DataLoader(test_ds, batch_size, pin_memory=True, num_workers=num_workers)

        test_ids = []
        test_preds = []

        # Run inference without tracking gradients
        with torch.no_grad():
            for batch in tqdm(test_dl, desc='Inference'):
                image = batch['image'].cuda()
                outputs = model(image)
                predictions = to_numpy(outputs['logits'].sigmoid().squeeze(1))
                test_ids.extend(batch['image_id'])
                test_preds.extend(predictions)

        df = pd.DataFrame.from_dict({'id_code': test_ids, 'is_test': test_preds})
        df.to_csv(os.path.join(log_dir, 'test_in_train.csv'), index=False)
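# Entry-point guard, assumed. Example run (the script name is hypothetical; flags come
# from the argparse definition above):
#   python adversarial.py -m cls_resnet18 -l bce -b 32 -e 50 -rm fit_predict
if __name__ == '__main__':
    main()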