def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)

    train_loader, val_loader, test_loader = load_data(args.fold)

    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (3, config.model.input_size, config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)

        freeze_layers(model)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.steps} steps')
        logger.info(f'max_lr will be {config.train.warmup.max_lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer)
        train_epoch(train_loader, model, criterion, optimizer, 0,
                    warmup_scheduler, None, config.train.warmup.steps)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    if not args.cosine:
        lr_scheduler = get_scheduler(config.scheduler, optimizer, last_epoch=(
            last_epoch if config.scheduler.name != 'cyclic_lr' else -1))
        assert config.scheduler2.name == ''
        lr_scheduler2 = get_scheduler(config.scheduler2, optimizer, last_epoch=last_epoch) \
                        if config.scheduler2.name else None
    else:
        epoch_size = min(len(train_loader), config.train.max_steps_per_epoch) \
                     * config.train.batch_size

        set_lr(optimizer, float(config.cosine.start_lr))
        lr_scheduler = CosineLRWithRestarts(optimizer,
                                            batch_size=config.train.batch_size,
                                            epoch_size=epoch_size,
                                            restart_period=config.cosine.period,
                                            period_inc=config.cosine.period_inc,
                                            max_period=config.cosine.max_period)
        lr_scheduler2 = None

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler) and lr_scheduler2 is None:
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)

            last_lr = lr

        if config.train.lr_decay_coeff != 0 and epoch in config.train.lr_decay_milestones:
            n_cycles = config.train.lr_decay_milestones.index(epoch) + 1
            total_coeff = config.train.lr_decay_coeff ** n_cycles
            logger.info(f'artificial LR scheduler: made {n_cycles} cycles, '
                        f'decreasing LR by {total_coeff}')

            set_lr(optimizer, config.scheduler.params.base_lr * total_coeff)
            lr_scheduler = get_scheduler(config.scheduler, optimizer, coeff=total_coeff,
                                         last_epoch=-1)
                                         # (last_epoch if config.scheduler.name != 'cyclic_lr' else -1))

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.cosine.min_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch,
                    lr_scheduler, lr_scheduler2, config.train.max_steps_per_epoch)
        score, _, _ = validate(val_loader, model, epoch)

        if type(lr_scheduler) == ReduceLROnPlateau:
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        if type(lr_scheduler2) == ReduceLROnPlateau:
            lr_scheduler2.step(metrics=score)
        elif lr_scheduler2 and not is_scheduler_continuous(lr_scheduler2):
            lr_scheduler2.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(
                model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
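

# run() above relies on set_lr()/get_lr() helpers that are defined elsewhere in
# this repo. A minimal sketch of what they are assumed to do, using only the
# standard torch.optim param_groups API; the actual implementations may differ.

def set_lr(optimizer: Any, lr: float) -> None:
    ''' Overwrites the learning rate of every parameter group (hypothetical sketch). '''
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

def get_lr(optimizer: Any) -> float:
    ''' Returns the learning rate of the first parameter group (hypothetical sketch). '''
    return optimizer.param_groups[0]['lr']
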

def run() -> float:
    np.random.seed(0)
    model_dir = config.experiment_dir

    logger.info('=' * 50)
    # logger.info(f'hyperparameters: {params}')

    train_loader, val_loader, test_loader, label_encoder = load_data(args.fold)

    model = create_model()
    optimizer = get_optimizer(config, model.parameters())
    lr_scheduler = get_scheduler(config, optimizer)
    lr_scheduler2 = get_scheduler(config, optimizer) if config.scheduler2.name else None
    criterion = get_loss(config)

    if args.weights is None:
        last_epoch = 0
        logger.info(f'training will start from epoch {last_epoch + 1}')
    else:
        last_checkpoint = torch.load(args.weights)
        assert last_checkpoint['arch'] == config.model.arch
        model.load_state_dict(last_checkpoint['state_dict'])
        optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint {args.weights} was loaded.')

        last_epoch = last_checkpoint['epoch']
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr_override != 0:
            set_lr(optimizer, float(args.lr_override))
        elif 'lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.lr)

    if args.gen_predict:
        print('inference mode')
        generate_submission(val_loader, test_loader, model, label_encoder,
                            last_epoch, args.weights)
        sys.exit(0)

    if args.gen_features:
        print('inference mode')
        generate_features(test_loader, model, args.weights)
        sys.exit(0)

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs + 1):
        logger.info('-' * 50)

        # if not is_scheduler_continuous(config.scheduler.name):
        #     # if we have just reduced LR, reload the best saved model
        #     lr = get_lr(optimizer)
        #     logger.info(f'learning rate {lr}')
        #
        #     if lr < last_lr - 1e-10 and best_model_path is not None:
        #         last_checkpoint = torch.load(os.path.join(model_dir, best_model_path))
        #         assert last_checkpoint['arch'] == config.model.arch
        #         model.load_state_dict(last_checkpoint['state_dict'])
        #         optimizer.load_state_dict(last_checkpoint['optimizer'])
        #         logger.info(f'checkpoint {best_model_path} was loaded.')
        #         set_lr(optimizer, lr)
        #         last_lr = lr
        #
        #     if lr < config.train.min_lr * 1.01:
        #         logger.info('reached minimum LR, stopping')
        #         break

        get_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, lr_scheduler, lr_scheduler2)
        score = validate(val_loader, model, epoch)

        if not is_scheduler_continuous(config.scheduler.name):
            lr_scheduler.step(score)
        if lr_scheduler2 and not is_scheduler_continuous(config.scheduler.name):
            lr_scheduler2.step(score)

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        data_to_save = {
            'epoch': epoch,
            'arch': config.model.arch,
            'state_dict': model.state_dict(),
            'best_score': best_score,
            'score': score,
            'optimizer': optimizer.state_dict(),
            'options': config
        }

        filename = config.version
        if is_best:
            best_model_path = f'{filename}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth'
            save_checkpoint(data_to_save, best_model_path, model_dir)

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
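

# save_checkpoint() above is assumed to be a thin wrapper around torch.save() that
# writes the snapshot dict into model_dir; a minimal sketch under that assumption
# (the real helper may also prune old snapshots or keep a separate 'best' copy).

def save_checkpoint(state: Dict[str, Any], filename: str, model_dir: str) -> None:
    ''' Saves a training snapshot under model_dir (hypothetical sketch). '''
    torch.save(state, os.path.join(model_dir, filename))
    logger.info(f'a snapshot was saved to {filename}')
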

def lr_finder(train_loader: Any, model: Any, criterion: Any, optimizer: Any) -> None:
    ''' Finds the optimal LR range and sets up first optimizer parameters. '''
    logger.info('lr_finder called')

    batch_time = AverageMeter()
    num_steps = min(len(train_loader), config.train.lr_finder.num_steps)
    logger.info(f'total batches: {num_steps}')

    end = time.time()
    lr_str = ''
    model.train()

    init_value = config.train.lr_finder.init_value
    final_value = config.train.lr_finder.final_value
    beta = config.train.lr_finder.beta
    mult = (final_value / init_value) ** (1 / (num_steps - 1))
    lr = init_value

    avg_loss = best_loss = 0.0
    losses = np.zeros(num_steps)
    logs = np.zeros(num_steps)

    for i, (input_, target) in enumerate(train_loader):
        if i >= num_steps:
            break

        set_lr(optimizer, lr)

        output = model(input_.cuda())
        loss = criterion(output, target.cuda())
        loss_val = loss.data.item()

        predict = (output.detach() > 0.1).type(torch.FloatTensor)
        f2 = F_score(predict, target, beta=2)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lr_str = f'\tlr {lr:.08f}'

        # compute the smoothed loss
        avg_loss = beta * avg_loss + (1 - beta) * loss_val
        smoothed_loss = avg_loss / (1 - beta ** (i + 1))

        # stop if the loss is exploding
        if i > 0 and smoothed_loss > 4 * best_loss:
            break

        # record the best loss
        if smoothed_loss < best_loss or i == 0:
            best_loss = smoothed_loss

        # store the values
        losses[i] = smoothed_loss
        logs[i] = math.log10(lr)

        # update the lr for the next step
        lr *= mult

        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.train.log_freq == 0:
            logger.info(f'lr_finder [{i}/{num_steps}]\t'
                        f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        f'loss {loss:.4f} ({smoothed_loss:.4f})\t'
                        f'F2 {f2:.4f} {lr_str}')

    np.savez(os.path.join(config.experiment_dir, f'lr_finder_{config.version}'),
             logs=logs, losses=losses)

    d1 = np.zeros_like(losses)
    d1[1:] = losses[1:] - losses[:-1]
    first, last = np.argmin(d1), np.argmin(losses)

    MAGIC_COEFF = 4

    highest_lr = 10 ** logs[last]
    best_high_lr = highest_lr / MAGIC_COEFF
    best_low_lr = 10 ** logs[first]
    logger.info(f'best_low_lr={best_low_lr} best_high_lr={best_high_lr} '
                f'highest_lr={highest_lr}')

    def find_nearest(array: np.array, value: float) -> int:
        return (np.abs(array - value)).argmin()

    last = find_nearest(logs, math.log10(best_high_lr))
    logger.info(f'first={first} last={last}')

    import matplotlib.pyplot as plt
    plt.plot(logs, losses, '-D', markevery=[first, last])
    plt.savefig(os.path.join(config.experiment_dir, 'lr_finder_plot.png'))
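

# lr_finder() above logs an F2 metric through F_score(), which is defined elsewhere
# in the repo. A hedged sketch of a multi-label F-beta score consistent with how it
# is called (0/1 tensors of shape [batch, num_classes]); the real metric may differ.

def F_score(predict: torch.Tensor, target: torch.Tensor, beta: float = 2,
            eps: float = 1e-9) -> torch.Tensor:
    ''' Mean F-beta score over the batch for binarized multi-label tensors (sketch). '''
    true_positive = (predict * target).sum(dim=1)
    precision = true_positive / (predict.sum(dim=1) + eps)
    recall = true_positive / (target.sum(dim=1) + eps)
    return ((1 + beta ** 2) * precision * recall /
            (beta ** 2 * precision + recall + eps)).mean()
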

def run(hyperparams: Optional[Dict[str, str]] = None) -> float:
    np.random.seed(0)
    logger.info('=' * 50)

    if hyperparams:
        hash = hashlib.sha224(str(hyperparams).encode()).hexdigest()[:8]
        model_dir = os.path.join(config.general.experiment_dir, f'{hash}')

        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        str_params = str(hyperparams)
        logger.info(f'hyperparameters: {hyperparams}')
        config.augmentations.update(hyperparams)
    else:
        model_dir = config.general.experiment_dir

    train_loader, val_loader, test_loader = load_data(args.fold)
    epoch_size = min(len(train_loader), config.train.max_steps_per_epoch)

    logger.info(f'creating a model {config.model.arch}')
    model = create_model(config, pretrained=args.weights is None).cuda()
    criterion = get_loss(config)

    if args.summary:
        torchsummary.summary(model, (config.model.num_channels * 2,
                                     config.model.input_size,
                                     config.model.input_size))

    if args.lr_finder:
        optimizer = get_optimizer(config, model.parameters())
        lr_finder(train_loader, model, criterion, optimizer)
        sys.exit()

    if args.weights is None and config.train.head_only_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.epochs} epochs')
        logger.info(f'max lr will be {config.optimizer.params.lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer, epoch_size)

        freeze_layers(model)
        for epoch in range(config.train.warmup.epochs):
            train_epoch(train_loader, model, criterion, optimizer, epoch,
                        warmup_scheduler)
        unfreeze_layers(model)

    if args.weights is None and config.train.enable_warmup:
        logger.info('-' * 50)
        logger.info(f'doing warmup for {config.train.warmup.epochs} epochs')
        logger.info(f'max lr will be {config.optimizer.params.lr}')

        optimizer = get_optimizer(config, model.parameters())
        warmup_scheduler = get_warmup_scheduler(config, optimizer, epoch_size)

        for epoch in range(config.train.warmup.epochs):
            train_epoch(train_loader, model, criterion, optimizer, epoch,
                        warmup_scheduler)

    optimizer = get_optimizer(config, model.parameters())

    if args.weights is None:
        last_epoch = -1
    else:
        last_checkpoint = torch.load(args.weights)
        model_arch = last_checkpoint['arch'].replace('se_', 'se')

        if model_arch != config.model.arch:
            dprint(model_arch)
            dprint(config.model.arch)
            assert model_arch == config.model.arch

        model.load_state_dict(last_checkpoint['state_dict'])
        if 'optimizer' in last_checkpoint.keys():
            optimizer.load_state_dict(last_checkpoint['optimizer'])
        logger.info(f'checkpoint loaded: {args.weights}')

        last_epoch = last_checkpoint['epoch'] if 'epoch' in last_checkpoint.keys() else 99
        logger.info(f'loaded the model from epoch {last_epoch}')

        if args.lr != 0:
            set_lr(optimizer, float(args.lr))
        elif 'lr' in config.optimizer.params:
            set_lr(optimizer, config.optimizer.params.lr)
        elif 'base_lr' in config.scheduler.params:
            set_lr(optimizer, config.scheduler.params.base_lr)

    lr_scheduler = get_scheduler(config, optimizer, epoch_size=epoch_size)

    if args.predict_oof or args.predict_test:
        print('inference mode')
        assert args.weights is not None

        if args.predict_oof:
            gen_train_prediction(val_loader, model, last_epoch, args.weights)
        else:
            gen_test_prediction(test_loader, model, args.weights)

        sys.exit()

    logger.info(f'training will start from epoch {last_epoch + 1}')

    best_score = 0.0
    best_epoch = 0

    last_lr = get_lr(optimizer)
    best_model_path = args.weights

    for epoch in range(last_epoch + 1, config.train.num_epochs):
        logger.info('-' * 50)

        if not is_scheduler_continuous(lr_scheduler):
            # if we have just reduced LR, reload the best saved model
            lr = get_lr(optimizer)

            if lr < last_lr - 1e-10 and best_model_path is not None:
                logger.info(f'learning rate dropped: {lr}, reloading')
                last_checkpoint = torch.load(best_model_path)

                assert last_checkpoint['arch'] == config.model.arch
                model.load_state_dict(last_checkpoint['state_dict'])
                optimizer.load_state_dict(last_checkpoint['optimizer'])
                logger.info(f'checkpoint loaded: {best_model_path}')
                set_lr(optimizer, lr)

            last_lr = lr

        if isinstance(lr_scheduler, CosineLRWithRestarts):
            restart = lr_scheduler.epoch_step()
            if restart:
                logger.info('cosine annealing restarted, resetting the best metric')
                best_score = min(config.train.restart_metric_val, best_score)

        train_epoch(train_loader, model, criterion, optimizer, epoch, lr_scheduler)
        score, _ = validate(val_loader, model, epoch)

        if type(lr_scheduler) == ReduceLROnPlateau:
            lr_scheduler.step(metrics=score)
        elif not is_scheduler_continuous(lr_scheduler):
            lr_scheduler.step()

        is_best = score > best_score
        best_score = max(score, best_score)
        if is_best:
            best_epoch = epoch

        if is_best:
            best_model_path = os.path.join(
                model_dir,
                f'{config.version}_f{args.fold}_e{epoch:02d}_{score:.04f}.pth')

            data_to_save = {
                'epoch': epoch,
                'arch': config.model.arch,
                'state_dict': model.state_dict(),
                'score': score,
                'optimizer': optimizer.state_dict(),
                'config': config
            }

            torch.save(data_to_save, best_model_path)
            logger.info(f'a snapshot was saved to {best_model_path}')

    logger.info(f'best score: {best_score:.04f}')
    return -best_score
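

# is_scheduler_continuous() decides whether a scheduler is stepped on every batch
# (inside train_epoch) or once per epoch (in the loop above). Its definition is not
# part of this file; a plausible sketch matching the object-based calls here, with
# the exact set of per-batch schedulers being an assumption.

def is_scheduler_continuous(scheduler: Any) -> bool:
    ''' True for schedulers that must be stepped on every batch (hypothetical sketch). '''
    return isinstance(scheduler, (CosineLRWithRestarts,
                                  torch.optim.lr_scheduler.CyclicLR))
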