def train_stage_one(args, model, train_loader, valid_loader, criterion):
    optimizer = WeightDecayOptimizerWrapper(
        torch.optim.Adam(model.parameters(), lr=2e-3), 0.1)
    # Stage 1: freeze the first two layer groups; only the head is trained.
    freeze_layers(model, [True, True, False])
    n_steps = len(train_loader) // 2
    bot = ImageClassificationBot(
        model=model, train_loader=train_loader,
        val_loader=valid_loader, clip_grad=10.,
        optimizer=optimizer, echo=not ON_KAGGLE,
        criterion=criterion,
        avg_window=len(train_loader) // 10,
        callbacks=[
            LearningRateSchedulerCallback(
                TriangularLR(
                    optimizer, 100, ratio=3, steps_per_cycle=n_steps))
        ],
        pbar=not ON_KAGGLE, use_tensorboard=False)
    bot.logger.info(bot.criterion)
    bot.train(
        n_steps,
        log_interval=len(train_loader) // 10,
        snapshot_interval=len(train_loader) // 4)
    bot.load_model(bot.best_performers[0][1])
    torch.save(
        bot.model.state_dict(),
        str(CACHE_DIR / f"stage1_{args.fold}.pth"))
    bot.remove_checkpoints(keep=0)
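
# `freeze_layers(model, [True, True, False])` above is a project utility; the
# sketch below only illustrates its assumed behaviour (toggling `requires_grad`
# per layer group) and is NOT the actual implementation. The attribute
# `model.layer_groups` is an assumption for illustration.
def freeze_layers_sketch(model, freeze_flags):
    for group, freeze in zip(model.layer_groups, freeze_flags):
        for param in group.parameters():
            param.requires_grad = not freeze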
def train():
    train_dl, valid_dl = get_cifar10_dataset(batch_size=1024)
    steps_per_epoch = len(train_dl)
    model = get_wide_resnet()
    # optimizer = WeightDecayOptimizerWrapper(optim.SGD(
    #     model.parameters(), lr=0.1,
    #     momentum=0.9, weight_decay=0), 0.05)
    # optimizer = WeightDecayOptimizerWrapper(AdaBound(
    #     model.parameters(), lr=1e-3, final_lr=0.1,
    #     gamma=1 / steps_per_epoch / 2.5, weight_decay=0
    # ), 0.05)
    optimizer = WeightDecayOptimizerWrapper(optim.Adam(
        model.parameters(), lr=1.5e-3), 0.1)
    model, optimizer = amp.initialize(
        model, optimizer, opt_level="O2",
        keep_batchnorm_fp32=True, loss_scale="dynamic"
    )
    n_epochs = 50
    n_steps = n_epochs * steps_per_epoch
    bot = CifarBot(
        model=model, train_loader=train_dl, val_loader=valid_dl,
        optimizer=optimizer, echo=True,
        avg_window=steps_per_epoch // 5,
        criterion=nn.CrossEntropyLoss(),
        device=DEVICE, clip_grad=1.,
        callbacks=[
            LearningRateSchedulerCallback(
                TriangularLR(
                    optimizer, 100, ratio=4, steps_per_cycle=n_steps
                )
            )
        ],
        metrics=[SoftmaxAccuracy()],
        pbar=True, use_amp=True
    )
    bot.train(
        n_steps,
        snapshot_interval=steps_per_epoch,
        log_interval=steps_per_epoch // 5,
        keep_n_snapshots=1
    )
    print(f"GPU Memory Used: {get_gpu_memory_map()} MB")
    bot.load_model(bot.best_performers[0][1])
    torch.save(bot.model.state_dict(), "cache/baseline.pth")
    bot.remove_checkpoints(keep=0)
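
# `get_gpu_memory_map()` is the small utility used above to report GPU memory
# after training. One common way to implement it (an assumption, not
# necessarily this project's version) is to query `nvidia-smi`:
import subprocess

def get_gpu_memory_map_sketch():
    """Return the memory currently used by each visible GPU, in MiB."""
    output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used",
         "--format=csv,nounits,noheader"],
        encoding="utf-8")
    return [int(line) for line in output.strip().split("\n")]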
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('config', type=str)
    arg('--from-checkpoint', type=str, default='')
    args = parser.parse_args()

    with open(args.config) as fin:
        config = yaml.safe_load(fin)
    train_loader, valid_loader = get_loaders(config["video"])
    model_config = config["video"]["model"]
    training_config = config["video"]["training"]
    model = create_video_model(model_config)
    print(model)

    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in NO_DECAY)
            ],
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in NO_DECAY)
            ],
        }
    ]
    optimizer = WeightDecayOptimizerWrapper(
        torch.optim.Adam(
            optimizer_grouped_parameters,
            lr=float(training_config['lr']),
            eps=float(training_config['eps'])
        ),
        [float(training_config['weight_decay']), 0]
    )

    if args.from_checkpoint:
        bot = resume_training(
            training_config, args.from_checkpoint, model, optimizer,
            train_loader, valid_loader)
    else:
        bot = train_from_start(
            training_config, model, optimizer, train_loader, valid_loader)

    target_dir = (MODEL_DIR / datetime.now().strftime("%Y%m%d_%H%M"))
    target_dir.mkdir(parents=True)
    torch.save(bot.model.state_dict(), target_dir / "model.pth")
    with open(target_dir / "config.yaml", "w") as fout:
        fout.write(yaml.dump(config, default_flow_style=False))
def get_optimizer(model, lr):
    return WeightDecayOptimizerWrapper(
        torch.optim.Adam([
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in NO_DECAY)
                ],
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in NO_DECAY)
                ],
            }
        ], weight_decay=0, lr=lr),
        weight_decay=[1e-1, 0],
        change_with_lr=True)
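
# The wrapper used throughout these scripts decouples weight decay from the
# gradient update (AdamW-style): parameters whose names match `NO_DECAY`
# (typically biases and normalization weights) get zero decay, everything else
# gets the listed rate. The real `WeightDecayOptimizerWrapper` lives in the
# helper-bot library; the sketch below is only an illustration of the idea
# under assumed semantics, not the library implementation.
class DecoupledWeightDecayWrapperSketch:
    def __init__(self, optimizer, weight_decay, change_with_lr=False):
        self.optimizer = optimizer
        # Accept a scalar or one decay value per parameter group.
        if not isinstance(weight_decay, (list, tuple)):
            weight_decay = [weight_decay] * len(optimizer.param_groups)
        self.weight_decay = weight_decay
        self.change_with_lr = change_with_lr

    def step(self, closure=None):
        # Shrink weights directly instead of adding an L2 term to the gradient.
        for group, decay in zip(
                self.optimizer.param_groups, self.weight_decay):
            if decay == 0:
                continue
            factor = decay * group["lr"] if self.change_with_lr else decay
            for param in group["params"]:
                param.data.add_(param.data, alpha=-factor)
        self.optimizer.step(closure)

    def zero_grad(self):
        self.optimizer.zero_grad()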
def train_from_scratch(args, model, train_loader, valid_loader, criterion):
    n_steps = len(train_loader) * args.epochs
    optimizer = WeightDecayOptimizerWrapper(
        torch.optim.Adam([
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in NO_DECAY)
                ],
            },
            {
                'params': [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in NO_DECAY)
                ],
            }
        ], weight_decay=0, lr=args.lr),
        weight_decay=[1e-1, 0],
        change_with_lr=True)
    if args.debug:
        print("No decay:", [
            n for n, p in model.named_parameters()
            if any(nd in n for nd in NO_DECAY)
        ])
    if args.amp:
        if not APEX_AVAILABLE:
            raise ValueError("Apex is not installed!")
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=args.amp)
    callbacks = [
        LearningRateSchedulerCallback(
            # TriangularLR(
            #     optimizer, 100, ratio=4, steps_per_cycle=n_steps
            # )
            GradualWarmupScheduler(
                optimizer, 100, len(train_loader),
                after_scheduler=CosineAnnealingLR(
                    optimizer, n_steps - len(train_loader))))
    ]
    if args.mixup_alpha:
        callbacks.append(
            MixUpCallback(alpha=args.mixup_alpha, softmax_target=True))
    bot = ImageClassificationBot(
        model=model, train_loader=train_loader,
        val_loader=valid_loader, clip_grad=10.,
        optimizer=optimizer, echo=True,
        criterion=criterion,
        avg_window=len(train_loader) // 5,
        callbacks=callbacks,
        pbar=True, use_tensorboard=True,
        use_amp=(args.amp != ''))
    bot.train(
        n_steps,
        log_interval=len(train_loader) // 6,
        snapshot_interval=len(train_loader) // 2,
        # early_stopping_cnt=8, min_improv=1e-2,
        keep_n_snapshots=1)
    bot.remove_checkpoints(keep=1)
    bot.load_model(bot.best_performers[0][1])
    torch.save(bot.model.state_dict(), CACHE_DIR / "final_weights.pth")
    bot.remove_checkpoints(keep=0)
def train_stage_two(args, model, train_loader, valid_loader, criterion):
    n_steps = len(train_loader) * args.epochs
    optimizer = WeightDecayOptimizerWrapper(
        setup_differential_learning_rates(
            partial(
                torch.optim.Adam, weight_decay=0
                # AdaBound, weight_decay=0, gamma=1/5000, betas=(.8, .999)
                # torch.optim.SGD, momentum=0.9
            ),
            model, [1e-5, 8e-5, 5e-4], [1., 1., 1.]),
        weight_decay=5e-2, change_with_lr=True)
    # Stage 2: unfreeze all layer groups and fine-tune the whole network
    # with differential learning rates per layer group.
    freeze_layers(model, [False, False, False])
    bot = ImageClassificationBot(
        model=model, train_loader=train_loader,
        val_loader=valid_loader, clip_grad=10.,
        optimizer=optimizer, echo=not ON_KAGGLE,
        criterion=criterion,
        avg_window=len(train_loader) // 15,
        callbacks=[
            LearningRateSchedulerCallback(
                TriangularLR(
                    optimizer, 100, ratio=4, steps_per_cycle=n_steps)
                # GradualWarmupScheduler(
                #     optimizer, 100, len(train_loader),
                #     after_scheduler=CosineAnnealingLR(
                #         optimizer, n_steps - len(train_loader)
                #     )
                # )
            ),
            MixUpCallback(alpha=0.2)
        ],
        pbar=not ON_KAGGLE, use_tensorboard=not ON_KAGGLE)
    bot.logger.info(bot.criterion)
    # Warm-start from the stage-1 checkpoint saved by train_stage_one().
    bot.model.load_state_dict(
        torch.load(CACHE_DIR / f"stage1_{args.fold}.pth"))

    # def snapshot_or_not(step):
    #     if step < 4000:
    #         if step % 2000 == 0:
    #             return True
    #     elif (step - 4000) % 1000 == 0:
    #         return True
    #     return False

    bot.train(
        n_steps,
        log_interval=len(train_loader) // 20,
        snapshot_interval=len(train_loader) // 2,
        # snapshot_interval=snapshot_or_not,
        early_stopping_cnt=args.early_stop,
        min_improv=1e-4,
        keep_n_snapshots=1)
    bot.load_model(bot.best_performers[0][1])
    bot.remove_checkpoints(keep=0)
    # Final model
    torch.save(bot.model, MODEL_DIR / f"final_{args.fold}.pth")
    # Failover (args + state dict)
    torch.save(
        [args.arch, bot.model.state_dict()],
        MODEL_DIR / f"failover_{args.arch}_{args.fold}.pth")
def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('config')
    arg('context_model_dir', type=str)
    arg('segment_model_dir', type=str)
    arg('--steps', type=int, default=-1)
    arg('--fold', type=int, default=0)
    arg('--name', type=str, default="context_model")
    args = parser.parse_args()

    with open(args.config) as fin:
        config = yaml.safe_load(fin)
    training_config = config["segment_w_context"]["training"]
    train_loader, valid_loader = get_loaders(
        training_config["batch_size"], fold=args.fold,
        seed=int(os.environ.get("SEED", "9293")),
        offset=training_config["offset"])
    if args.steps > 0:
        # override
        training_config["steps"] = args.steps

    context_model_dir = Path(args.context_model_dir)
    with open(context_model_dir / "config.yaml") as fin:
        context_config = yaml.safe_load(fin)
    config["context_base"] = context_config["video"]
    context_state_dict = torch.load(str(context_model_dir / "model.pth"))

    segment_model_dir = Path(args.segment_model_dir)
    with open(segment_model_dir / "config.yaml") as fin:
        segment_config = yaml.safe_load(fin)
    config["segment_base"] = segment_config["video"]
    segment_state_dict = torch.load(str(segment_model_dir / "model.pth"))

    model = prepare_model(
        config,
        context_state_dict=context_state_dict,
        segment_state_dict=segment_state_dict)
    print(model)

    # optimizer_grouped_parameters = []
    lr = float(training_config["lr"])
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.segment_model.named_parameters()
            if not any(nd in n for nd in NO_DECAY)
        ],
        'lr': lr / 2
    }, {
        'params': [
            p for n, p in model.segment_model.named_parameters()
            if any(nd in n for nd in NO_DECAY)
        ],
        'lr': lr / 2
    }]
    if config["segment_w_context"]["model"]["finetune_context"]:
        optimizer_grouped_parameters += [{
            'params': [
                p for n, p in model.context_model.named_parameters()
                if not any(nd in n for nd in NO_DECAY)
            ],
            'lr': lr / 4
        }, {
            'params': [
                p for n, p in model.context_model.named_parameters()
                if any(nd in n for nd in NO_DECAY)
            ],
            'lr': lr / 4
        }]
    for module in (model.expert_fc, model.gating_fc, model.intermediate_fc):
        optimizer_grouped_parameters += [{
            'params': [
                p for n, p in module.named_parameters()
                if not any(nd in n for nd in NO_DECAY)
            ],
            'lr': lr
        }, {
            'params': [
                p for n, p in module.named_parameters()
                if any(nd in n for nd in NO_DECAY)
            ],
            'lr': lr
        }]
    optimizer = WeightDecayOptimizerWrapper(
        torch.optim.Adam(
            optimizer_grouped_parameters, lr=lr,
            eps=float(training_config["eps"])),
        [training_config["weight_decay"], 0] *
        (len(optimizer_grouped_parameters) // 2))

    n_steps = training_config["steps"]
    checkpoints = CheckpointCallback(
        keep_n_checkpoints=1,
        checkpoint_dir=CACHE_DIR / "model_cache/",
        monitor_metric="roc_auc")
    break_points = [0, int(n_steps * 0.25)]
    lr_durations = np.diff(break_points + [n_steps])
    bot = YoutubeBot(
        model=model, train_loader=train_loader,
        valid_loader=valid_loader, clip_grad=10.,
        optimizer=optimizer, echo=True,
        criterion=SampledCrossEntropyLoss(),
        callbacks=[
            LearningRateSchedulerCallback(
                MultiStageScheduler(
                    [
                        LinearLR(optimizer, 0.01, lr_durations[0]),
                        LinearLR(
                            optimizer, 0.001, lr_durations[1], upward=False)
                        # CosineAnnealingLR(optimizer, lr_durations[1])
                    ],
                    start_at_epochs=break_points)),
            MovingAverageStatsTrackerCallback(
                avg_window=1200,
                log_interval=1000,
            ),
            checkpoints,
        ],
        pbar=True, use_tensorboard=False)
    bot.train(
        total_steps=n_steps,
        checkpoint_interval=training_config["ckpt_interval"])
    bot.load_model(checkpoints.best_performers[0][1])
    checkpoints.remove_checkpoints(keep=0)

    # save the model
    target_dir = (
        MODEL_DIR /
        f"{args.name}_{args.fold}_{datetime.now().strftime('%Y%m%d-%H%M')}")
    target_dir.mkdir(parents=True)
    torch.save(bot.model.state_dict(), target_dir / "model.pth")
    with open(target_dir / "config.yaml", "w") as fout:
        fout.write(yaml.dump(config, default_flow_style=False))