def run(config_file):
    config = load_config(config_file)
    os.makedirs(config.work_dir, exist_ok=True)
    save_config(config, config.work_dir + '/config.yml')
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

    all_transforms = {}
    all_transforms['train'] = get_transforms(config.transforms.train)
    all_transforms['valid'] = get_transforms(config.transforms.test)

    dataloaders = {
        phase: make_loader(
            data_folder=config.data.train_dir,
            df_path=config.data.train_df_path,
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx_fold,
            transforms=all_transforms[phase],
            num_classes=config.data.num_classes,
            pseudo_label_path=config.train.pseudo_label_path,
            task='cls'
        )
        for phase in ['train', 'valid']
    }

    # create model
    model = CustomNet(config.model.encoder, config.data.num_classes)

    # train setting
    criterion = get_loss(config)
    params = [
        {'params': model.base_params(), 'lr': config.optimizer.params.encoder_lr},
        {'params': model.fresh_params(), 'lr': config.optimizer.params.decoder_lr}
    ]
    optimizer = get_optimizer(params, config)
    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(model=model)

    callbacks = [MultiClassAccuracyCallback(threshold=0.5), F1ScoreCallback()]
    if os.path.exists(config.work_dir + '/checkpoints/best.pth'):
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/best_full.pth'))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        callbacks=callbacks,
        verbose=True,
        fp16=True,
    )
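# run() above relies on CustomNet exposing base_params()/fresh_params() so the
# pretrained encoder and the new head get different learning rates. CustomNet
# itself is not part of this snippet; below is a minimal sketch of that
# interface, with a torchvision ResNet-34 backbone standing in as an assumed
# encoder (the real CustomNet may differ).
import torch.nn as nn
import torchvision


class CustomNetSketch(nn.Module):
    def __init__(self, num_classes: int):
        super().__init__()
        backbone = torchvision.models.resnet34(pretrained=True)
        # drop the original fc layer; keep the conv trunk + global pooling
        self.encoder = nn.Sequential(*list(backbone.children())[:-1])
        self.head = nn.Linear(512, num_classes)  # 512 = resnet34 feature dim

    def forward(self, x):
        features = self.encoder(x).flatten(1)
        return self.head(features)

    def base_params(self):
        # pretrained trunk: paired with the lower encoder_lr in run()
        return self.encoder.parameters()

    def fresh_params(self):
        # freshly initialized head: paired with the higher decoder_lr
        return self.head.parameters()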
def get_callbacks(class_names):
    num_classes = len(class_names)
    return [
        AccuracyCallback(num_classes=num_classes),
        AUCCallback(
            num_classes=num_classes,
            input_key="targets_one_hot",
            class_names=class_names
        ),
        F1ScoreCallback(
            input_key="targets_one_hot",
            activation="Softmax"
        )
    ]
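# Hypothetical wiring for get_callbacks() above: model, criterion, optimizer,
# and loaders are placeholders, and the loaders' batches are assumed to carry
# a one-hot "targets_one_hot" key for the AUC/F1 callbacks.
from catalyst.dl import SupervisedRunner

class_names = ["cat", "dog", "bird"]  # illustrative labels only
runner = SupervisedRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=get_callbacks(class_names),
    logdir="./logs/example",
    num_epochs=10,
    verbose=True,
)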
def train(num_epochs, model, loaders, logdir):
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2)
    callbacks = [F1ScoreCallback()]

    # model runner
    runner = SupervisedRunner()

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        logdir=logdir,
        num_epochs=num_epochs,
        callbacks=callbacks,
        verbose=True
    )
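# Sketch of driving train() above; `model`, `train_loader`, and `valid_loader`
# are hypothetical placeholders. Note BCELoss expects probabilities, so the
# model should end in a sigmoid (or swap in BCEWithLogitsLoss).
import collections

loaders = collections.OrderedDict([
    ("train", train_loader),
    ("valid", valid_loader),
])
train(num_epochs=10, model=model, loaders=loaders, logdir="./logs/example")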
# runner.train call reconstructed around the keyword arguments of this excerpt
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    # We can specify the callbacks list for the experiment;
    # for this task, we will check accuracy, AUC and F1 metrics
    callbacks=[
        AccuracyCallback(num_classes=config.num_classes),
        AUCCallback(
            num_classes=config.num_classes,
            input_key="targets_one_hot",
            class_names=config.class_names
        ),
        F1ScoreCallback(
            input_key="targets_one_hot",
            activation="Softmax"
        ),
        CheckpointCallback(
            save_n_best=1,
            # resume_dir="./models/classification",
            metrics_filename="metrics.json"
        ),
        EarlyStoppingCallback(
            patience=config.patience,
            metric="auc/_mean",
            minimize=False
        )
    ],
    # path to save logs
    logdir=config.logdir,
)
# left-hand side of the scheduler assignment reconstructed from the truncated excerpt
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.25, patience=3)
num_epochs = args.e
logdir = "./logs/effnet-b0"
fp16_params = None  # dict(opt_level="O1")
runner = SupervisedRunner(device='cuda')

runner.train(
    model=model,
    criterion=criterion,
    scheduler=scheduler,
    optimizer=optimizer,
    loaders=loaders,
    callbacks=[
        # wAUC(),
        F1ScoreCallback(),
        AUCCallback(num_classes=4),
        AccuracyCallback(prefix='ACC'),
        OptimizerCallback(accumulation_steps=args.acc)
    ],
    logdir=logdir,
    num_epochs=num_epochs,
    fp16=fp16_params,
    verbose=True
)

if args.test > 0:
    test_preds_proba: Union[List, Iterable, np.ndarray] = []
    model.eval()
    progress_bar_test = tqdm(test_dataset)
    with torch.no_grad():
        for i, im in enumerate(progress_bar_test):
            inputs = im.to('cuda')
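# The test-time loop above is cut off mid-body. A self-contained sketch of the
# intended pattern under the same assumptions (single-sample dataset items,
# a 4-way softmax matching AUCCallback(num_classes=4)):
import numpy as np
import torch
from tqdm import tqdm


def predict_proba(model, test_dataset, device='cuda'):
    model.eval()
    preds = []
    with torch.no_grad():
        for im in tqdm(test_dataset):
            inputs = im.to(device)
            logits = model(inputs.unsqueeze(0))  # add a batch dimension
            preds.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.concatenate(preds, axis=0)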
"valid": valid_dl } criterion = torch.nn.CrossEntropyLoss() optimizer = torch.optim.SGD([ {'params': model.layer1.parameters(), 'lr': LR / 10}, {'params': model.layer2.parameters(), 'lr': LR / 5}, {'params': model.layer3.parameters(), 'lr': LR / 2}, {'params': model.layer4.parameters(), 'lr': LR / 1}, ], lr=LR) # scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=[LR / 10, LR / 5, LR / 2, LR / 1], total_steps=100) scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=1e-7) # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, cooldown=2, min_lr=1e-7) callbacks = [ AccuracyCallback(num_classes=5, threshold=0.5, activation='Softmax'), F1ScoreCallback(input_key="targets_one_hot", activation='Softmax', threshold=0.5), ] runner = SupervisedRunner() ## Step 1. runner.train( model=model, criterion=criterion, optimizer=optimizer, callbacks=callbacks, loaders=loaders, logdir=logdir, num_epochs=num_epochs, verbose=1, scheduler=scheduler,
def get_callbacks(num_classes):
    callbacks = [
        AccuracyCallback(num_classes=num_classes),
        F1ScoreCallback(input_key="targets_one_hot", activation="Softmax")
    ]
    return callbacks
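# The F1 callback above reads "targets_one_hot" from the batch, so the dataset
# has to supply one-hot targets next to the integer labels. A minimal sketch
# of that conversion (the helper name is ours, not from the source):
import torch

def to_one_hot(label: int, num_classes: int) -> torch.Tensor:
    one_hot = torch.zeros(num_classes, dtype=torch.float32)
    one_hot[label] = 1.0
    return one_hot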
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--seed', type=int, default=1234, help='Random seed')
    arg('--model-name', type=str, default='seresnext101', help='String model name used for saving')
    arg('--run-root', type=Path, default=Path('../results'), help='Directory for saving model')
    arg('--data-root', type=Path, default=Path('../data'))
    arg('--image-size', type=int, default=224, help='Image size for training')
    arg('--batch-size', type=int, default=16, help='Batch size during training')
    arg('--fold', type=int, default=0, help='Validation fold')
    arg('--n-epochs', type=int, default=10, help='Epochs to run')
    arg('--learning-rate', type=float, default=1e-3, help='Initial learning rate')
    arg('--step', type=int, default=1, help='Current training step')
    arg('--patience', type=int, default=4)
    arg('--criterion', type=str, default='bce', help='Criterion')
    arg('--optimizer', default='Adam', help='Name of the optimizer')
    # argparse's type=bool treats any non-empty string as True, so flags are safer
    arg('--continue_train', action='store_true')
    arg('--checkpoint', type=str, default='../results', help='Checkpoint file path')
    arg('--workers', type=int, default=2)
    arg('--debug', action='store_true')
    args = parser.parse_args()

    set_seed(args.seed)

    """ SET PARAMS """
    args.debug = True
    ON_KAGGLE = configs.ON_KAGGLE
    N_CLASSES = configs.NUM_CLASSES
    args.image_size = configs.SIZE
    args.data_root = configs.DATA_ROOT
    use_cuda = cuda.is_available()
    fold = args.fold
    num_workers = args.workers
    num_epochs = args.n_epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate

    """ LOAD DATA """
    print(os.listdir(args.data_root))
    folds = pd.read_csv(args.data_root / 'folds.csv')
    train_root = args.data_root / 'train'
    if args.debug:
        folds = folds.head(50)
    train_fold = folds[folds['fold'] != fold]
    valid_fold = folds[folds['fold'] == fold]
    check_fold(train_fold, valid_fold)

    def get_dataloader(df: pd.DataFrame, image_transform) -> DataLoader:
        """Builds a DataLoader over the Imet dataset."""
        return DataLoader(
            ImetDataset(train_root, df, image_transform),
            shuffle=True,
            batch_size=batch_size,
            num_workers=num_workers,
        )

    train_loader = get_dataloader(train_fold, image_transform=albu_transform)
    valid_loader = get_dataloader(valid_fold, image_transform=valid_transform)
    print('{} items in train, {} in valid'.format(
        len(train_loader.dataset), len(valid_loader.dataset)))

    loaders = OrderedDict()
    loaders["train"] = train_loader
    loaders["valid"] = valid_loader

    """ MODEL """
    model = seresnext101(num_classes=N_CLASSES)
    if use_cuda:
        model = model.cuda()
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=args.patience)

    """ MODEL RUNNER """
    # call an instance of the model runner
    runner = SupervisedRunner()

    # logs folder
    current_time = datetime.now().strftime('%b%d_%H_%M')
    prefix = f'{current_time}_{args.model_name}'
    logdir = os.path.join(args.run_root, prefix)
    os.makedirs(logdir, exist_ok=False)

    print('\tTrain session :', prefix)
    print('\tOn KAGGLE :', ON_KAGGLE)
    print('\tDebug :', args.debug)
    print('\tClasses number :', N_CLASSES)
    print('\tModel :', args.model_name)
    print('\tParameters :', sum(p.numel() for p in model.parameters()))  # total count
    print('\tImage size :', args.image_size)
    print('\tEpochs :', num_epochs)
    print('\tWorkers :', num_workers)
    print('\tLog dir :', logdir)
    print('\tLearning rate :', learning_rate)
    print('\tBatch size :', batch_size)
    print('\tPatience :', args.patience)

    if args.continue_train:
        state = load_model(model, args.checkpoint)
        epoch = state['epoch']
        step = state['step']
        print('Loaded model weights from {}, epoch {}, step {}'.format(
            args.checkpoint, epoch, step))

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        callbacks=[
            F1ScoreCallback(threshold=0.5),
            # F2ScoreCallback(num_classes=N_CLASSES),
            EarlyStoppingCallback(patience=args.patience, min_delta=0.01)
        ],
        logdir=logdir,
        num_epochs=num_epochs,
        verbose=True)

    # by default it only plots loss, works in IPython Notebooks
    # utils.plot_metrics(logdir=logdir, metrics=["loss", "_base/lr"])

    """ INFERENCE TEST """
    loaders = OrderedDict([("infer", loaders["train"])])
    runner.infer(
        model=model,
        loaders=loaders,
        callbacks=[
            CheckpointCallback(resume=f"{logdir}/checkpoints/best.pth"),
            InferCallback()
        ],
    )
    print(runner.callbacks[1].predictions["logits"])
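# Optional post-processing sketch for the inference block above: InferCallback
# collects raw logits, and sigmoid matches the BCEWithLogitsLoss criterion used
# in training (per-class multi-label probabilities, not a softmax distribution).
logits = runner.callbacks[1].predictions["logits"]
probabilities = torch.sigmoid(torch.as_tensor(logits))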
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    parser.add_argument('--fast', action='store_true')
    parser.add_argument('-dd', '--data-dir', type=str, default='data',
                        help='Data directory for INRIA satellite dataset')
    parser.add_argument('-m', '--model', type=str, default='cls_resnet18', help='')
    parser.add_argument('-b', '--batch-size', type=int, default=8,
                        help='Batch Size during training, e.g. -b 64')
    parser.add_argument('-e', '--epochs', type=int, default=100, help='Epochs to run')
    parser.add_argument('-es', '--early-stopping', type=int, default=None,
                        help='Maximum number of epochs without improvement')
    parser.add_argument('-fe', '--freeze-encoder', action='store_true')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4,
                        help='Initial learning rate')
    parser.add_argument('-l', '--criterion', type=str, default='bce', help='Criterion')
    parser.add_argument('-o', '--optimizer', default='Adam', help='Name of the optimizer')
    parser.add_argument('-c', '--checkpoint', type=str, default=None,
                        help='Checkpoint filename to use as initial model weights')
    parser.add_argument('-w', '--workers', default=multiprocessing.cpu_count(),
                        type=int, help='Num workers')
    parser.add_argument('-a', '--augmentations', default='hard', type=str, help='')
    parser.add_argument('-tta', '--tta', default=None, type=str,
                        help='Type of TTA to use [fliplr, d4]')
    parser.add_argument('-tm', '--train-mode', default='random', type=str, help='')
    parser.add_argument('-rm', '--run-mode', default='fit_predict', type=str, help='')
    parser.add_argument('--transfer', default=None, type=str, help='')
    parser.add_argument('--fp16', action='store_true')
    args = parser.parse_args()

    set_manual_seed(args.seed)

    data_dir = args.data_dir
    num_workers = args.workers
    num_epochs = args.epochs
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    early_stopping = args.early_stopping
    model_name = args.model
    optimizer_name = args.optimizer
    image_size = (512, 512)
    fast = args.fast
    augmentations = args.augmentations
    train_mode = args.train_mode
    run_mode = args.run_mode
    log_dir = None
    fp16 = args.fp16
    freeze_encoder = args.freeze_encoder

    run_train = run_mode in ('fit_predict', 'fit')
    run_predict = run_mode in ('fit_predict', 'predict')

    model = maybe_cuda(get_model(model_name, num_classes=1))

    if args.transfer:
        transfer_checkpoint = fs.auto_file(args.transfer)
        print("Transferring weights from model checkpoint", transfer_checkpoint)
        checkpoint = load_checkpoint(transfer_checkpoint)
        pretrained_dict = checkpoint['model_state_dict']
        # copy weights tensor-by-tensor so that mismatched keys are skipped gracefully
        for name, value in pretrained_dict.items():
            try:
                model.load_state_dict(collections.OrderedDict([(name, value)]), strict=False)
            except Exception as e:
                print(e)

    checkpoint = None
    if args.checkpoint:
        checkpoint = load_checkpoint(fs.auto_file(args.checkpoint))
        unpack_checkpoint(checkpoint, model=model)

        checkpoint_epoch = checkpoint['epoch']
        print('Loaded model weights from:', args.checkpoint)
        print('Epoch :', checkpoint_epoch)
        print('Metrics (Train):',
              'f1 :', checkpoint['epoch_metrics']['train']['f1_score'],
              'loss:', checkpoint['epoch_metrics']['train']['loss'])
        print('Metrics (Valid):',
              'f1 :', checkpoint['epoch_metrics']['valid']['f1_score'],
              'loss:', checkpoint['epoch_metrics']['valid']['loss'])

        log_dir = os.path.dirname(os.path.dirname(fs.auto_file(args.checkpoint)))

    if run_train:
        if freeze_encoder:
            set_trainable(model.encoder, trainable=False, freeze_bn=True)

        criterion = get_loss(args.criterion)
        parameters = get_optimizable_parameters(model)
        optimizer = get_optimizer(optimizer_name, parameters, learning_rate)

        if checkpoint is not None:
            try:
                unpack_checkpoint(checkpoint, optimizer=optimizer)
                print('Restored optimizer state from checkpoint')
            except Exception as e:
                print('Failed to restore optimizer state from checkpoint', e)

        train_loader, valid_loader = get_dataloaders(
            data_dir=data_dir,
            batch_size=batch_size,
            num_workers=num_workers,
            image_size=image_size,
            augmentation=augmentations,
            fast=fast)

        loaders = collections.OrderedDict()
        loaders["train"] = train_loader
        loaders["valid"] = valid_loader

        current_time = datetime.now().strftime('%b%d_%H_%M')
        prefix = f'adversarial/{args.model}/{current_time}_{args.criterion}'
        if fp16:
            prefix += '_fp16'
        if fast:
            prefix += '_fast'

        log_dir = os.path.join('runs', prefix)
        os.makedirs(log_dir, exist_ok=False)

        scheduler = MultiStepLR(optimizer, milestones=[10, 30, 50, 70, 90], gamma=0.5)

        print('Train session :', prefix)
        print('\tFP16 mode :', fp16)
        print('\tFast mode :', args.fast)
        print('\tTrain mode :', train_mode)
        print('\tEpochs :', num_epochs)
        print('\tEarly stopping :', early_stopping)
        print('\tWorkers :', num_workers)
        print('\tData dir :', data_dir)
        print('\tLog dir :', log_dir)
        print('\tAugmentations :', augmentations)
        print('\tTrain size :', len(train_loader), len(train_loader.dataset))
        print('\tValid size :', len(valid_loader), len(valid_loader.dataset))
        print('Model :', model_name)
        print('\tParameters :', count_parameters(model))
        print('\tImage size :', image_size)
        print('\tFreeze encoder :', freeze_encoder)
        print('Optimizer :', optimizer_name)
        print('\tLearning rate :', learning_rate)
        print('\tBatch size :', batch_size)
        print('\tCriterion :', args.criterion)

        # model training
        visualization_fn = partial(draw_classification_predictions,
                                   class_names=['Train', 'Test'])

        callbacks = [
            F1ScoreCallback(),
            AUCCallback(),
            ShowPolarBatchesCallback(visualization_fn, metric='f1_score', minimize=False),
        ]

        if early_stopping:
            callbacks += [
                EarlyStoppingCallback(early_stopping, metric='auc', minimize=False)
            ]

        runner = SupervisedRunner(input_key='image')
        runner.train(fp16=fp16,
                     model=model,
                     criterion=criterion,
                     optimizer=optimizer,
                     scheduler=scheduler,
                     callbacks=callbacks,
                     loaders=loaders,
                     logdir=log_dir,
                     num_epochs=num_epochs,
                     verbose=True,
                     main_metric='auc',
                     minimize_metric=False,
                     state_kwargs={"cmd_args": vars(args)})

    if run_predict and not fast:
        # Training is finished. Let's run predictions using best checkpoint weights
        best_checkpoint = load_checkpoint(fs.auto_file('best.pth', where=log_dir))
        unpack_checkpoint(best_checkpoint, model=model)

        model.eval()

        train_csv = pd.read_csv(os.path.join(data_dir, 'train.csv'))
        train_csv['id_code'] = train_csv['id_code'].apply(
            lambda x: os.path.join(data_dir, 'train_images', f'{x}.png'))

        test_ds = RetinopathyDataset(train_csv['id_code'], None,
                                     get_test_aug(image_size), target_as_array=True)
        test_dl = DataLoader(test_ds, batch_size, pin_memory=True, num_workers=num_workers)

        test_ids = []
        test_preds = []

        # a bare `torch.no_grad()` statement has no effect; wrap the loop instead
        with torch.no_grad():
            for batch in tqdm(test_dl, desc='Inference'):
                input = batch['image'].cuda()
                outputs = model(input)
                predictions = to_numpy(outputs['logits'].sigmoid().squeeze(1))

                test_ids.extend(batch['image_id'])
                test_preds.extend(predictions)

        df = pd.DataFrame.from_dict({'id_code': test_ids, 'is_test': test_preds})
        df.to_csv(os.path.join(log_dir, 'test_in_train.csv'), index=False)
print('Training only head for {} epochs with initial lr {}'.format(head_n_epochs, head_lr))
# freeze everything except the classification head
for p in model.parameters():
    p.requires_grad = False
for p in model._fc.parameters():
    p.requires_grad = True

optimizer = torch.optim.Adam(model.parameters(), lr=head_lr)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.75, patience=2)
runner.train(model=model,
             criterion=criterion,
             optimizer=optimizer,
             loaders=loaders,
             logdir=logdir,
             scheduler=scheduler,
             callbacks=[
                 F1ScoreCallback(),
                 MixupCallback()
             ],
             num_epochs=head_n_epochs,
             verbose=True)

print('Train whole net for {} epochs with initial lr {}'.format(full_n_epochs, full_lr))
# unfreeze all layers for full fine-tuning
for p in model.parameters():
    p.requires_grad = True

optimizer = torch.optim.Adam(model.parameters(), lr=full_lr)
criterion = nn.CrossEntropyLoss()
scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.5, patience=5)
runner.train(model=model,
             criterion=criterion,
             optimizer=optimizer,
             loaders=loaders,
             logdir=logdir,
             scheduler=scheduler,
             # closing arguments mirrored from the head-training call above;
             # the original excerpt is truncated here
             num_epochs=full_n_epochs,
             verbose=True)