from torch.optim import SGD, Adam, AdamW, RMSprop

# RAdam, Ranger, Lamb, DiffGrad and Novograd are assumed to come from a
# third-party optimizer package (e.g. torch_optimizer) or local modules.


def get_optimizer(optimizer_name: str, parameters, learning_rate: float, weight_decay=0.0, **kwargs):
    name = optimizer_name.lower()

    if name == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9,
                   weight_decay=weight_decay, **kwargs)

    if name == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay,
                    eps=1e-5, **kwargs)  # As Jeremy suggests

    if name == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if name == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay,
                     eps=1e-5, **kwargs)

    if name == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay,
                     eps=1e-5, **kwargs)  # As Jeremy suggests

    if name == "ranger":
        return Ranger(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    # if name == "qhadamw":
    #     return QHAdamW(parameters, learning_rate, weight_decay=weight_decay,
    #                    **kwargs)
    #
    if name == "lamb":
        return Lamb(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if name == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if name == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters, learning_rate, eps=1e-5,
                         weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    if name == "fused_sgd":
        from apex.optimizers import FusedSGD

        return FusedSGD(parameters, learning_rate, weight_decay=weight_decay,
                        momentum=0.9, **kwargs)

    if name == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=1e-5,
                        weight_decay=weight_decay, **kwargs)

    if name == "novograd":
        return Novograd(parameters, learning_rate, eps=1e-5,
                        weight_decay=weight_decay, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
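# A minimal usage sketch for get_optimizer (an illustration, not part of the
# original source). The torchvision model and the "adamw" choice here are
# assumptions made purely for the example.
import torchvision.models as tv_models

example_model = tv_models.resnet18()
example_optimizer = get_optimizer("adamw", example_model.parameters(),
                                  learning_rate=3e-4, weight_decay=1e-2)
example_optimizer.zero_grad()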
# In[24]:


learning_rate = 0.001
encoder_learning_rate = 0.0005

# Since we use a pre-trained encoder, we reduce the learning rate on it.
layerwise_params = {"encoder*": dict(lr=encoder_learning_rate, weight_decay=0.00003)}

# This function removes weight_decay for biases and applies our layerwise_params
model_params = utils.process_model_params(model, layerwise_params=layerwise_params)

# Catalyst has new SOTA optimizers out of the box
base_optimizer = RAdam(model_params, lr=learning_rate, weight_decay=0.0003)
optimizer = Lookahead(base_optimizer)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.25, patience=2)


# In[25]:


num_epochs = 3
logdir = "./logs/segmentation"

device = utils.get_device()
print(f"device: {device}")

if is_fp16_used:
        args.data_folder,
        args.meta_info_file,
        one_hot_encoding=args.one_hot_encoding,
        bs=args.batch_size,
        num_classes=args.num_classes,
        num_workers=args.num_workers,
        augmenters=augmenters,
    )

    if args.optimizer == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.max_lr, weight_decay=1e-4)
    elif args.optimizer == 'RAdam':
        base_optimizer = RAdam(model.parameters(), lr=args.max_lr, weight_decay=1e-4)
        optimizer = Lookahead(base_optimizer)
    elif args.optimizer == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), momentum=0.95,
                                    lr=args.max_lr, weight_decay=1e-4)
    else:
        print('You have chosen the default optimizer - Adam')
        optimizer = torch.optim.Adam(model.parameters(), lr=args.max_lr, weight_decay=1e-4)

    criterion = OrderedDict({
        "ce": CustomCrossEntropyLoss(),
    }),
    'attn_linknet': (LinkNetGated, {
        'num_classes': 4,
        'in_channels': 3
    })
}

preprocessing_fn = smp.encoders.get_preprocessing_fn(encoder_name=args.encoder,
                                                     pretrained='imagenet')

model = models[args.model.lower()][0](**models[args.model.lower()][1]).cuda()

layerwise_params = {"enc*": dict(lr=args.lr_e, weight_decay=0.00001)}
model_params = utils.process_model_params(model, layerwise_params=layerwise_params)
base_optimizer = RAdam(model_params, lr=args.lr_d, weight_decay=1e-6)
optimizer = Lookahead(base_optimizer)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.15)

criteria = {'dice': DiceLoss(), 'bce': torch.nn.BCEWithLogitsLoss()}

train = pd.read_csv('train_preprocessed.csv')
train_ids = pd.read_csv(f'./folds/fold_{args.fold}_train.csv').values.ravel()
valid_ids = pd.read_csv(f'./folds/fold_{args.fold}_val.csv').values.ravel()

num_workers = 4
bs = args.bs

train_dataset = CloudDataset(df=train, image_size=(args.size, args.size * 2),
                             path=path, datatype='train',
def run(config_file):
    config = load_config(config_file)

    config.work_dir = 'result/' + config.work_dir
    print('working directory:', config.work_dir)

    all_transforms = {}
    all_transforms['train'] = Transform(
        size=config.data.image_size,
        threshold=20.,
        sigma=-1.,
        blur_ratio=0.2,
        noise_ratio=0.2,
        cutout_ratio=0.2,
        grid_distortion_ratio=0.2,
        random_brightness_ratio=0.2,
        piece_affine_ratio=0.2,
        ssr_ratio=0.2,
    )
    all_transforms['valid'] = Transform(size=config.data.image_size)

    dataloaders = {
        phase: make_loader(
            phase=phase,
            batch_size=config.train.batch_size,
            num_workers=config.num_workers,
            idx_fold=config.data.params.idx,
            fold_csv=config.data.params.fold_csv,
            transforms=all_transforms[phase],
            # debug=config.debug
        )
        for phase in ['train', 'valid']
    }

    model = get_model(config)
    model = model.to(device)

    # we have multiple criterions
    criterion = {
        "ce": nn.CrossEntropyLoss(),
        # Define your awesome losses here, e.g. Focal, Lovasz, etc.
    }

    optimizer = RAdam(model.parameters(), lr=config.optimizer.params.lr)
    if config.optimizer.lookahead.apply:
        optimizer = Lookahead(optimizer)

    scheduler = get_scheduler(optimizer, config)

    # model runner
    runner = SupervisedRunner(
        device=device,
        input_key="images",
        output_key=("logit_grapheme_root", "logit_vowel_diacritic",
                    "logit_consonant_diacritic"),
        input_target_key=("grapheme_roots", "vowel_diacritics",
                          "consonant_diacritics"),
    )

    callbacks = []
    if config.train.early_stop_patience > 0:
        callbacks.append(
            EarlyStoppingCallback(patience=config.train.early_stop_patience))

    if config.train.accumulation_size > 0:
        accumulation_steps = config.train.accumulation_size // config.train.batch_size
        callbacks.extend(
            [OptimizerCallback(accumulation_steps=accumulation_steps)])

    # resume from a checkpoint if one exists
    if os.path.exists(config.work_dir + '/checkpoints/best.pth') and config.train.resume:
        callbacks.append(
            CheckpointCallback(resume=config.work_dir + '/checkpoints/last_full.pth'))

    if config.train.mixup:
        CC = MixupCallback
    else:
        CC = CriterionCallback

    callbacks.extend([
        CC(
            input_key="grapheme_roots",
            output_key="logit_grapheme_root",
            criterion_key='ce',
            prefix='loss_gr',
        ),
        CC(
            input_key="vowel_diacritics",
            output_key="logit_vowel_diacritic",
            criterion_key='ce',
            prefix='loss_wd',
        ),
        CC(
            input_key="consonant_diacritics",
            output_key="logit_consonant_diacritic",
            criterion_key='ce',
            prefix='loss_cd',
        ),
        CriterionAggregatorCallback(
            prefix="loss",
            loss_aggregate_fn="weighted_sum",
            loss_keys={"loss_gr": 2.0, "loss_wd": 1.0, "loss_cd": 1.0},
        ),
        # metrics
        HMacroAveragedRecall(),
    ])

    # model training
    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=dataloaders,
        logdir=config.work_dir,
        num_epochs=config.train.num_epochs,
        main_metric="hmar",
        minimize_metric=False,
        monitoring_params=None,
        callbacks=callbacks,
        verbose=True,
        fp16=False,
    )
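# A hedged entry-point sketch (not part of the original snippet) showing how
# run() might be invoked from the command line; the "--config_file" argument
# name is an assumption made for illustration only.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file', type=str, required=True,
                        help='Path to the config consumed by load_config')
    args = parser.parse_args()
    run(args.config_file)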