def __init__(self, model, device, config, folder):
    self.config = config
    self.epoch = 0
    # set up the working directory
    self.base_dir = f'./model/seresnext_512/{folder}'
    if not os.path.exists(self.base_dir):
        os.makedirs(self.base_dir)
    self.log_path = f'{self.base_dir}/log.txt'
    self.best_score = 0
    self.best_loss = 10**5
    self.best_ap = 0
    self.model = model
    self.device = device
    self.best_true = np.array([])
    self.best_pred = np.array([])

    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # pass the grouped parameters so the no-decay split actually takes effect
    # (previously model.parameters() was passed and the groups were unused)
    self.optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config.lr)

    self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
    # self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1, total_epoch=5, after_scheduler=self.scheduler)
    self.scheduler_warmup = GradualWarmupScheduler(self.optimizer, multiplier=1, total_epoch=6)

    # self.criterion = FocalLoss(logits=True).to(self.device)
    self.criterion = LabelSmoothing().to(self.device)
    self.log(f'Fitter prepared. Device is {self.device}')
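# The no-decay split above is a standard AdamW pattern: matrix weights get weight
# decay, biases and LayerNorm parameters do not. A minimal self-contained sketch
# (model and lr are placeholders; the name filter relies on modules being named
# 'LayerNorm', as in HuggingFace BERT-style models):
import torch
import torch.nn as nn

class Tiny(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 32)
        self.LayerNorm = nn.LayerNorm(32)
        self.fc2 = nn.Linear(32, 2)

model = Tiny()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=3e-4)  # decay applies only to the fc weights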
def train2(net, train_loader, test_loader):
    loss_fn = nn.CrossEntropyLoss()
    net2 = BYOL_Classification(net, 10)
    net2.eval()
    net2.cuda()
    # freeze the BYOL backbone; only the classification head is trained
    for pq in net.parameters():
        pq.requires_grad = False
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, net2.parameters()), lr=1e-3)

    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer, multiplier=1, total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=80))

    train_start = time.time()
    for epoch in range(1, 100 + 1):
        train_loss = 0
        net2.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad()
            data = data.cuda()
            target = target.cuda()
            data = net2(data)[1]
            loss = loss_fn(data, target)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= (idx + 1)
        scheduler.step()
        epoch_time = time.time() - epoch_start

        if epoch % 10 == 0:
            net2.eval()  # was net.eval(); net2 is the model that runs inference
            total = 0.0
            correct = 0.0
            with torch.no_grad():
                for test_data in test_loader:
                    images, labels = test_data
                    images = images.cuda()
                    labels = labels.cuda()
                    outputs = net2(images)[1]
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            print("Epoch\t", epoch, "\tTest accuracy\t", correct / total * 100)

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
def train(args, config, loader, device):
    logging.info('Start training...')
    model = getattr(net, config.model.name)(**config.model.args, **config.embedder)
    model = model.to(device)
    criterion = getattr(nn, config.loss.name)(**config.loss.args).to(device)
    optimizer = getattr(torch.optim, config.optimizer.name)(model.parameters(),
                                                            **config.optimizer.args)
    if hasattr(config, 'lr_scheduler'):
        if hasattr(config.lr_scheduler, 'name'):
            scheduler = getattr(torch.optim.lr_scheduler, config.lr_scheduler.name)(
                optimizer, **config.lr_scheduler.args)
        else:
            scheduler = None
        if hasattr(config.lr_scheduler, 'warm_up'):
            scheduler_warm_up = GradualWarmupScheduler(
                optimizer,
                multiplier=config.lr_scheduler.warm_up.multiplier,
                total_epoch=config.lr_scheduler.warm_up.epoch,
                after_scheduler=scheduler)

    loss = Box({'train': 0.0, 'val': 0.0})
    metrics = Box({'train': [Accuracy()], 'val': [Accuracy()]})
    for epoch in range(config.train.n_epoch):
        if hasattr(config, 'lr_scheduler'):
            if hasattr(config.lr_scheduler, 'warm_up'):
                scheduler_warm_up.step()
            elif scheduler is not None:  # guard: a nameless lr_scheduler config leaves scheduler as None
                scheduler.step()
        loss.train, metrics.train = run_epoch(
            model, optimizer, criterion, loader.train, train=True,
            metrics=metrics.train,
            max_norm=config.max_norm if hasattr(config, 'max_norm') else -1)
        loss.val, metrics.val = run_epoch(model, optimizer, criterion, loader.val,
                                          train=False, metrics=metrics.val)
        saved_path = os.path.join(args.model_folder, 'checkpoints', f'epoch_{epoch}.pt')
        save_model(saved_path, epoch, model, optimizer)
        log_metrics(epoch, args.model_folder, loss, metrics)
def train(net, loader):
    optimizer = SGD_with_lars(net.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-6)

    from warmup_scheduler import GradualWarmupScheduler
    scheduler = GradualWarmupScheduler(
        optimizer, multiplier=1, total_epoch=20,
        after_scheduler=optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=180))

    train_start = time.time()
    for epoch in range(1, 100 + 1):
        train_loss = 0
        net.train()
        epoch_start = time.time()
        for idx, (data, target) in enumerate(loader):
            optimizer.zero_grad()
            dat1 = data[0].cuda()
            dat2 = data[1].cuda()
            loss = net(dat1, dat2)
            train_loss += loss.item()
            loss.backward()
            optimizer.step()
        train_loss /= (idx + 1)
        scheduler.step()
        epoch_time = time.time() - epoch_start
        print("Epoch\t", epoch, "\tLoss\t", train_loss, "\tTime\t", epoch_time)

    elapsed_train_time = time.time() - train_start
    print('Finished training. Train time was:', elapsed_train_time)
def get_scheduler(args, optimizer):
    args = vars(args)
    if args['scheduler'] == "warmup":
        print('Using warmup scheduler with cosine annealing')
        print(f"warmup epochs : {args['warmup_epochs']} | total epochs {args['epochs']}")
        print(f"lr_start : {args['lr']} ---> lr_end : {args['lr_end']}")
        scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args['epochs'], eta_min=args['lr_end'])
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=1,
                                           total_epoch=args['warmup_epochs'],
                                           after_scheduler=scheduler_cosine)
    elif args['scheduler'] == "multistep":
        print(f"Using multistep scheduler with gamma = {args['gamma']} "
              f"and milestones = {args['milestones']}")
        scheduler = MultiStepLR(optimizer,
                                milestones=args['milestones'],
                                gamma=args['gamma'])
    elif args['scheduler'] == "cosine":
        print(f"Using cosine annealing from {args['lr']} to {args['lr_end']}")
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args['epochs'], eta_min=args['lr_end'])
    else:
        # previously fell through and returned an undefined name
        raise ValueError(f"unknown scheduler: {args['scheduler']}")
    return scheduler
def get_scheduler(optimizer, args):
    if args.lr_scheduler == 'CosineAnnealingLR':
        print('Use cosine scheduler')
        scheduler_next = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs)
    elif args.lr_scheduler == 'StepLR':
        print('Use step scheduler, step size: {}, gamma: {}'.format(
            args.step_size, args.gamma))
        scheduler_next = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=args.step_size, gamma=args.gamma)
    elif args.lr_scheduler == 'MultiStepLR':
        print('Use MultiStepLR scheduler, milestones: {}, gamma: {}'.format(
            args.milestones, args.gamma))
        scheduler_next = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.milestones, gamma=args.gamma)
    else:
        raise NotImplementedError
    if args.warmup_epoch <= 0:
        return scheduler_next
    print('Use warmup scheduler')
    lr_scheduler = GradualWarmupScheduler(optimizer,
                                          multiplier=args.warmup_multiplier,
                                          total_epoch=args.warmup_epoch,
                                          after_scheduler=scheduler_next)
    return lr_scheduler
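# For reference, a self-contained check of what the warmup wrapper in the two
# get_scheduler variants above produces. This assumes the ildoonet
# warmup_scheduler package; the tensor, lr, and epoch counts are arbitrary:
import torch
from warmup_scheduler import GradualWarmupScheduler

param = torch.zeros(3, requires_grad=True)
opt = torch.optim.SGD([param], lr=0.1)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=90)
sched = GradualWarmupScheduler(opt, multiplier=1, total_epoch=10, after_scheduler=cosine)
for epoch in range(100):
    opt.step()    # step the optimizer first to avoid PyTorch's ordering warning
    sched.step()  # multiplier=1: lr ramps linearly from 0 to 0.1, then cosine-decays
    print(epoch, opt.param_groups[0]['lr'])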
def configure_optimizers(self):
    if self.hparams['opt'] == 'sgd':
        opt = torch.optim.SGD(self.parameters(), lr=self.hparams.lr,
                              momentum=0.9, weight_decay=5e-4)
    elif self.hparams['opt'] == 'adam':
        opt = torch.optim.Adam(self.parameters(), lr=self.hparams.lr,
                               weight_decay=5e-4)

    if self.hparams['sched'] == 'cyclic':  # was 'cyclic:', which never matched
        scheduler = CyclicLR(optimizer=opt,
                             base_lr=self.hparams.lr / 500,
                             max_lr=self.hparams.lr / 10)
    elif self.hparams['sched'] == 'cosine_annealing_warm_restarts':
        scheduler = CosineAnnealingWarmRestarts(
            optimizer=opt,
            T_0=2000,
            eta_min=self.hparams.lr / 1000.0,
            T_mult=1,
        )
    elif self.hparams['sched'] == 'exp':
        scheduler_steplr = ExponentialLR(opt, gamma=0.95)
        scheduler = GradualWarmupScheduler(
            opt, multiplier=1, total_epoch=5, after_scheduler=scheduler_steplr)
    self.sched = scheduler
    self.opt = opt
    return opt
def configure_optimizers(self):
    self.optim = Adam(
        self.parameters(),
        lr=self.cfg["train"]["lr"],
        weight_decay=self.cfg["train"]["l2"],
    )
    self.sched = CosineAnnealingLR(self.optim, T_max=self.cfg["train"]["lr_restart"])
    self.warmup = GradualWarmupScheduler(
        self.optim,
        multiplier=1,
        total_epoch=self.cfg["train"]["warmup"],
        after_scheduler=self.sched,
    )
    # return the warmup wrapper, not the bare cosine schedule,
    # so Lightning actually steps through the warmup phase
    return [self.optim], [self.warmup]
def get_optimizer_and_scheduler(args, model):
    """Return a PyTorch optimizer and scheduler for each specific dataset.

    Args:
        args: arguments of the program
        model: the model we use for training and testing

    Returns:
        optimizer, scheduler
    """
    if args.dataset == 'QM9':
        optimizer = optim.Adam(model.parameters(), lr=args.lr,
                               weight_decay=args.wd, amsgrad=False)
        scheduler_ = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9961697)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=1.0, total_epoch=1,
                                           after_scheduler=scheduler_)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        scheduler = None
    return optimizer, scheduler
def __init__(self, model, device, config):
    self.config = config
    self.epoch = 0
    self.base_dir = f'./{config.folder}'
    if not os.path.exists(self.base_dir):
        os.makedirs(self.base_dir)
    self.log_path = f'{self.base_dir}/log.txt'
    self.best_summary_loss = 10**5
    self.model = model
    self.device = device

    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.001},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # use the grouped parameters; passing model.parameters() here
    # would silently discard the no-decay split built above
    self.optimizer = config.OptimizerClass(optimizer_grouped_parameters, lr=config.lr)
    self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
    if self.config.warmup:
        self.warmup_scheduler = GradualWarmupScheduler(
            self.optimizer, multiplier=1, total_epoch=5,
            after_scheduler=self.scheduler)
    self.log(f'Fitter prepared. Device is {self.device}')
    if self.config.apex:
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer,
                                                    opt_level='O1')
def configure_optimizers(self):
    self.optimizer = RAdam(self.parameters(), lr=self.cfg.train.lr,
                           weight_decay=2e-5)
    warmup_epo = 1
    warmup_factor = 10
    scheduler_cos = CosineAnnealingLR(self.optimizer,
                                      T_max=self.cfg.train.epoch - warmup_epo,
                                      eta_min=0)
    self.scheduler = GradualWarmupScheduler(self.optimizer,
                                            multiplier=warmup_factor,
                                            total_epoch=warmup_epo,
                                            after_scheduler=scheduler_cos)
    return [self.optimizer], [self.scheduler]
def configure_optimizers(self):
    # self.parameters(), not a global model's, inside a LightningModule
    optimizer = optim.Adam(self.parameters(), lr=init_lr / warmup_factor)
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, n_epochs - warmup_epo)
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=warmup_factor,
                                       total_epoch=warmup_epo,
                                       after_scheduler=scheduler_cosine)
    return [optimizer], [scheduler]
def make_scheduler(optimizer, stage):
    if stage['scheduler'] == 'OneCycleLR':
        return OneCycleLR(optimizer=optimizer, **stage['scheduler_params'])
    elif stage['scheduler'] == 'GradualWarmupScheduler':
        return GradualWarmupScheduler(optimizer=optimizer, **stage['scheduler_params'])
    return getattr(torch.optim.lr_scheduler,
                   stage['scheduler'])(optimizer=optimizer, **stage['scheduler_params'])
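# make_scheduler reads only 'scheduler' and 'scheduler_params' from the stage
# dict. Two hypothetical stage entries (all values illustrative); note that
# GradualWarmupScheduler needs a live after_scheduler object, so that entry
# cannot come straight from YAML/JSON and must be injected before the call:
stage_onecycle = {
    'scheduler': 'OneCycleLR',
    'scheduler_params': {'max_lr': 1e-3, 'total_steps': 10000},
}
stage_warmup = {
    'scheduler': 'GradualWarmupScheduler',
    'scheduler_params': {'multiplier': 1, 'total_epoch': 5, 'after_scheduler': None},
}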
def build_model(config, device, train=True):
    # load model
    if config['model'] == 'default':
        net = model.Resnet50()
    elif config['model'] == 'fused':
        net = model_fused.Resnet50()
    elif config['model'] == 'quant':
        net = model_quant.Resnet50()
    elif config['model'] == 'tf':
        net = model_tf.Resnet50()
    elif config['model'] == 'tf_fused':
        net = model_tf_fused.Resnet50()
    else:
        raise ValueError('cannot load model, check config file')

    # load loss
    if config['loss'] == 'cross_entropy':
        loss_fn = nn.CrossEntropyLoss()
    else:
        raise ValueError('cannot load loss, check config file')

    net = net.to(device)
    loss_fn = loss_fn.to(device)
    if not train:
        return net, loss_fn

    # load optimizer
    if config['optimizer'] == 'sgd':
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, net.parameters()),
                                    lr=config['learning_rate'],
                                    momentum=0.9,
                                    weight_decay=config['weight_decay'])
    elif config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
                                     lr=config['learning_rate'],
                                     weight_decay=config['weight_decay'])
    else:
        raise ValueError('cannot load optimizer, check config file')

    # load scheduler
    if config['scheduler'] == 'cosine':
        scheduler_step = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=config['t_max'])
    elif config['scheduler'] == 'step':
        scheduler_step = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=config['lr_decay_every'], gamma=config["lr_decay"])
    else:
        raise ValueError('cannot load scheduler, check config file')
    scheduler = GradualWarmupScheduler(optimizer,
                                       multiplier=config['lr_multiplier'],
                                       total_epoch=config['lr_epoch'],
                                       after_scheduler=scheduler_step)
    return net, loss_fn, optimizer, scheduler
def get_scheduler(optimizer, name):
    if name == "gradual_warmup":
        exp_lr = ExponentialLR(optimizer, gamma=0.996)
        return GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=800,
                                      after_scheduler=exp_lr)
    if name == "cosine":
        return CosineAnnealingWarmRestarts(optimizer, 100, 2)
    raise ValueError("incorrect scheduler name: %s" % name)
def _setup_model(self):
    num_classes = 2
    num_aux_classes = self.train_dataloader.dataset.num_auxiliary_classes
    freeze_backbone = self.model_kwargs.get('freeze_backbone', False)
    self.model_kwargs['num_aux_classes'] = num_aux_classes
    self.model = Model(num_main_classes=num_classes,
                       num_aux_classes=num_aux_classes,
                       freeze_backbone=freeze_backbone)
    if self.model_kwargs.get('aux_labels_type', None) == "imagenet":
        # Initialize auxiliary head to imagenet fc
        self.model.auxiliary_head.weight = self.model.backbone.fc.weight
        self.model.auxiliary_head.bias = self.model.backbone.fc.bias
    if self.use_cuda:
        self.model = self.model.cuda()
        self.model = nn.DataParallel(self.model)
    self.main_loss = nn.CrossEntropyLoss()
    self.auxiliary_loss = nn.CrossEntropyLoss()
    self.start_epoch = 0
    self.end_epoch = self.model_kwargs.get('epochs_to_run', 1)
    self.current_epoch = 0
    self.global_train_batch_idx = 0
    self.global_val_batch_idx = 0
    lr = float(self.model_kwargs.get('initial_lr', 0.01))
    endlr = float(self.model_kwargs.get('endlr', 0.0))
    optim_params = dict(
        lr=lr,
        momentum=float(self.model_kwargs.get('momentum', 0.9)),
        weight_decay=float(self.model_kwargs.get('weight_decay', 0.0001)),
    )
    self.optimizer = optim.SGD(self.model.parameters(), **optim_params)
    max_epochs = int(self.model_kwargs.get('max_epochs', 90))
    warmup_epochs = int(self.model_kwargs.get('warmup_epochs', 0))
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        self.optimizer, max_epochs - warmup_epochs, eta_min=endlr)
    self.optimizer_scheduler = GradualWarmupScheduler(
        optimizer=self.optimizer,
        multiplier=1.0,
        total_epoch=warmup_epochs,  # the library's kwarg; 'warmup_epochs' is not accepted
        after_scheduler=scheduler)
def create_lr_scheduler(
        conf_lrs: Config, epochs: int, optimizer: Optimizer,
        steps_per_epoch: Optional[int]) -> Tuple[Optional[_LRScheduler], bool]:
    # epoch_or_step - apply every epoch or every step
    scheduler, epoch_or_step = None, True
    if conf_lrs is not None:
        lr_scheduler_type = conf_lrs['type']  # TODO: default should be none?

        if lr_scheduler_type == 'cosine':
            # adjust max epochs for warmup
            # TODO: shouldn't we be increasing epochs or schedule lr only after warmup?
            if conf_lrs.get('warmup', None):
                epochs -= conf_lrs['warmup']['epochs']
            scheduler = lr_scheduler.CosineAnnealingLR(
                optimizer, T_max=epochs, eta_min=conf_lrs['min_lr'])
        elif lr_scheduler_type == 'resnet':
            scheduler = _adjust_learning_rate_resnet(optimizer, epochs)
        elif lr_scheduler_type == 'pyramid':
            scheduler = _adjust_learning_rate_pyramid(optimizer, epochs,
                                                      get_optim_lr(optimizer))
        elif lr_scheduler_type == 'step':
            decay_period = conf_lrs['decay_period']
            gamma = conf_lrs['gamma']
            scheduler = lr_scheduler.StepLR(optimizer, decay_period, gamma=gamma)
        elif lr_scheduler_type == 'one_cycle':
            assert steps_per_epoch is not None
            ensure_pytorch_ver('1.3.0', 'LR scheduler OneCycleLR is not available.')
            max_lr = conf_lrs['max_lr']
            epoch_or_step = False
            scheduler = lr_scheduler.OneCycleLR(
                optimizer, max_lr=max_lr, epochs=epochs,
                steps_per_epoch=steps_per_epoch,
            )  # TODO: other params
        elif not lr_scheduler_type:
            scheduler = None  # TODO: check support for this or use StepLR
        else:
            raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

        # wrap the schedule with warmup if requested
        if conf_lrs.get('warmup', None):
            scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=conf_lrs['warmup']['multiplier'],
                total_epoch=conf_lrs['warmup']['epochs'],
                after_scheduler=scheduler)
    return scheduler, epoch_or_step
def get_scheduler(optimizer, opt):
    """Return a learning rate scheduler

    Parameters:
        optimizer          -- the optimizer of the network
        opt (option class) -- stores all the experiment flags; needs to be a
                              subclass of BaseOptions. opt.lr_policy is the name of
                              the learning rate policy: linear | step | plateau | cosine

    For 'linear', we keep the same learning rate for the first <opt.niter> epochs
    and linearly decay the rate to zero over the next <opt.niter_decay> epochs.
    For other schedulers (step, plateau, and cosine), we use the default PyTorch
    schedulers. See https://pytorch.org/docs/stable/optim.html for more details.
    """
    if opt.lr_policy == 'linear':
        def lambda_rule(epoch):
            lr_l = 1.0 - max(0, epoch + opt.epoch_count - opt.niter) / float(opt.niter_decay + 1)
            return lr_l
        scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_rule)
    elif opt.lr_policy == 'step':
        scheduler = lr_scheduler.StepLR(optimizer, step_size=opt.lr_decay_iters, gamma=0.1)
    elif opt.lr_policy == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.2,
                                                   threshold=0.01, patience=5)
    elif opt.lr_policy == 'cosine':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0)
    elif opt.lr_policy == 'warmup':
        scheduler_cosine = lr_scheduler.CosineAnnealingLR(optimizer, T_max=opt.niter, eta_min=0)
        scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=10,
                                           after_scheduler=scheduler_cosine)
    else:
        # raise, don't return, the exception; use real string formatting
        raise NotImplementedError(
            'learning rate policy [%s] is not implemented' % opt.lr_policy)
    return scheduler
def configure_optimizers(self):
    optimizer_cls, scheduler_cls = get_optimizer(self.cfg)
    conf_optim = self.cfg.Optimizer
    optimizer = optimizer_cls(self.parameters(), **conf_optim.optimizer.params)
    if scheduler_cls is None:
        return [optimizer]
    else:
        scheduler_default = scheduler_cls(optimizer, **conf_optim.lr_scheduler.params)
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=10,
            total_epoch=1,
            after_scheduler=scheduler_default,
        )
        return [optimizer], [scheduler]
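# With multiplier=10 and total_epoch=1, the wrapper above ramps the LR from
# base_lr toward 10 * base_lr over one epoch, then hands control to
# after_scheduler at the scaled LR. A small sketch of that behavior (assuming
# the ildoonet implementation; optimizer and epoch counts are placeholders):
import torch
from warmup_scheduler import GradualWarmupScheduler

opt = torch.optim.SGD([torch.zeros(1, requires_grad=True)], lr=1e-4)
after = torch.optim.lr_scheduler.StepLR(opt, step_size=30, gamma=0.1)
sched = GradualWarmupScheduler(opt, multiplier=10, total_epoch=1, after_scheduler=after)
for epoch in range(3):
    opt.step()
    sched.step()
    print(epoch, opt.param_groups[0]['lr'])  # reaches 1e-3 after the warmup epoch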
def fit(model, data, optimizer, scheduler, loss, augmentation, parameters):
    model = model.train()
    # 'name' picks the class; the remaining keys are its keyword arguments
    optimizer_args = {k: v for k, v in optimizer.items() if k != 'name'}
    scheduler_args = {k: v for k, v in scheduler.items() if k not in ('name', 'warmup')}
    loss_args = {k: v for k, v in loss.items() if k != 'name'}
    optimizer_fun = getattr(optim, optimizer['name'])(model.parameters(), **optimizer_args)
    # schedulers live in optim.lr_scheduler and wrap the optimizer instance,
    # not the config dict
    scheduler_fun = getattr(optim.lr_scheduler, scheduler['name'])(optimizer_fun,
                                                                   **scheduler_args)
    loss_fun = getattr(nn, loss['name'])(**loss_args)
    if scheduler.get("warmup", None) is not None:
        nb_epoch_warmup = int(parameters['epoch'] * scheduler["warmup"])
        optimizer_fun.defaults['lr'] *= 0.01
        scheduler_fun = GradualWarmupScheduler(optimizer_fun,
                                               multiplier=100,
                                               total_epoch=nb_epoch_warmup,
                                               after_scheduler=scheduler_fun)

    for ep in range(parameters['epoch']):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, d in enumerate(data):
            # get the inputs; d is a list of [inputs, labels]
            inputs, labels = d

            # zero the parameter gradients
            optimizer_fun.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss_measure = loss_fun(outputs, labels)
            loss_measure.backward()
            optimizer_fun.step()

            # print statistics
            running_loss += loss_measure.item()
            if i % 10 == 0:  # print every 10 mini-batches
                print('[%d, %5d] loss: %.3f' % (ep + 1, i + 1, running_loss / 10))
                running_loss = 0.0
        scheduler_fun.step()  # step once per epoch, not per batch

    print('Finished Training')
    return model
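# A hypothetical invocation of fit(), showing the dict shapes it expects; every
# name and value here is illustrative, and 'warmup' is the fraction of epochs
# spent warming up:
optimizer_cfg = {'name': 'SGD', 'lr': 0.1, 'momentum': 0.9}
scheduler_cfg = {'name': 'CosineAnnealingLR', 'T_max': 90, 'warmup': 0.05}
loss_cfg = {'name': 'CrossEntropyLoss'}
parameters = {'epoch': 90}
# model and data come from elsewhere; augmentation is unused by fit():
# model = fit(model, data, optimizer_cfg, scheduler_cfg, loss_cfg, None, parameters)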
def get_scheduler(hparams, optimizer):
    eps = 1e-8
    if hparams.lr_scheduler == 'steplr':
        scheduler = MultiStepLR(optimizer, milestones=hparams.decay_step,
                                gamma=hparams.decay_gamma)
    elif hparams.lr_scheduler == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=hparams.num_epochs, eta_min=eps)
    else:
        raise ValueError('scheduler not recognized!')

    if hparams.warmup_epochs > 0 and hparams.optimizer not in ['radam', 'ranger']:
        scheduler = GradualWarmupScheduler(optimizer,
                                           multiplier=hparams.warmup_multiplier,
                                           total_epoch=hparams.warmup_epochs,
                                           after_scheduler=scheduler)
    return scheduler
def main():
    lr = 0.00001
    PATH = '/home/ruoyaow/imageqa-qgen/evaluation'
    if len(sys.argv) > 1 and sys.argv[1] == 'c':
        pretrained = PATH
    else:
        pretrained = 'bert-base-uncased'
    model = BertForMaskedLM.from_pretrained(pretrained,
                                            output_hidden_states=True,
                                            output_attentions=False)
    if GPU:
        model = model.cuda()
    with open('nouns_unbalance.pkl', 'rb') as f:
        word_dict = pickle.load(f)
    max_epoch = 10
    batch_size = 32
    optimizer = AdamW(model.parameters(), lr=lr)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, max_epoch)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=5,
                                              total_epoch=max_epoch,
                                              after_scheduler=scheduler_cosine)
    train_data = 'noun_blank_unbalance.txt'
    evaluation, trainld, testld = loadData(train_data, batch_size)
    eval(testld, model, tokenizer, word_dict)
    eval(evaluation, model, tokenizer, word_dict)
    dataset_valid,
    batch_size=batch_size,
    sampler=SequentialSampler(dataset_valid),
    num_workers=num_workers)

model = enetv2(enet_type, out_dim=out_dim)
model = model.to(device)
criterion = LabelSmoothingLoss(out_dim, smoothing=0.1)
# criterion = nn.CrossEntropyLoss()
# criterion = MyBCELoss()

optimizer = optim.Adam(model.parameters(), lr=init_lr / warmup_factor)
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, n_epochs - warmup_epo)
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=warmup_factor,
                                   total_epoch=warmup_epo,
                                   after_scheduler=scheduler_cosine)
# optimizer = Radam.Over9000(model.parameters(), lr=init_lr)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, n_epochs)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = torch.nn.DataParallel(model, device_ids=list(range(len(gpus.split(",")))))

qwk_max = 0.
for epoch in range(1, n_epochs + 1):
    printOut(time.ctime(), 'Epoch:', epoch)
    scheduler.step(epoch - 1)
    train_loss = train_epoch(train_loader, optimizer)
def training(train_data_list, val_data_list, test_files, fold):
    os.makedirs(os.path.join(config.weights, config.model_name) + os.sep + str(fold),
                exist_ok=True)
    os.makedirs(config.best_models, exist_ok=True)

    ### ---------- get model ------------------------------------------
    model = FF3DNet(drop=0.5)

    ### ---------- set lr, opt, loss ------------------------------------------
    img_params = list(map(id, model.img_encoder.parameters()))
    rest_params = filter(lambda p: id(p) not in img_params, model.parameters())
    params = [
        {'params': rest_params, 'lr': config.lr},
        {'params': model.img_encoder.parameters(), 'lr': config.lr * 3},
    ]
    optimizer = torch.optim.SGD(params, momentum=0.9, weight_decay=1e-4)
    scheduler = lr_scheduler.CosineAnnealingLR(optimizer,
                                               T_max=config.epochs - 5,
                                               eta_min=config.lr / 100)
    scheduler_warmup = GradualWarmupScheduler(optimizer,
                                              multiplier=10,
                                              total_epoch=5,
                                              after_scheduler=scheduler)
    criterion = nn.CrossEntropyLoss().to(device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.to(device)
    best_results = [0, np.inf, 0]
    val_metrics = [0, np.inf, 0]

    ### ---------- load dataset ------------------------------------------
    train_gen = MultiModalDataset(train_data_list, config.train_data,
                                  config.train_vis, mode="train")
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True,
                              pin_memory=True, num_workers=4)
    # val_data = getfiles("val")
    # val_data.sort()
    val_csv = "/root/userfolder/linan/C/preliminary/val.csv"
    val_data = pd.read_csv(val_csv)
    val_gen = MultiModalDataset(val_data, config.train_data, config.train_vis,
                                augument=False, mode="val")
    val_loader = DataLoader(val_gen, 512, shuffle=False, pin_memory=True, num_workers=4)
    test_gen = MultiModalDataset(test_files, config.test_data, config.test_vis,
                                 augument=False, mode="test")
    test_loader = DataLoader(test_gen, 512, shuffle=False, pin_memory=True, num_workers=4)

    # --- train, val, test -------------------------
    resume = False
    start = timer()

    print("multi fold val")
    # ________________________________________________________________________________
    for index in [1, 2, 3]:
        print(index)
        checkpoint_loss = torch.load('checkpoints/best_models/0626_debug_fold_' +
                                     str(index) + '_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(val_loader, model, fold, checkpoint_loss, 'best_loss', False, index)
        checkpoint_acc = torch.load('checkpoints/best_models/0626_debug_fold_' +
                                    str(index) + '_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(val_loader, model, fold, checkpoint_acc, 'best_acc', False, index)
        # test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc], 'ensemble', True)
    return  # was `0 / 0`, a deliberate crash used to stop after multi-fold validation
    # ________________________________________________________________________________

    if resume:
        checkpoint_loss = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_loss.pth.tar')
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/0616_coslr_55_fold_0_model_best_acc.pth.tar')
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc],
                               'ensemble', True)
    else:
        ### ---------- train loop ----------------
        for epoch in range(4, config.epochs):
            scheduler_warmup.step(metrics=val_metrics[0])
            for param_group in optimizer.param_groups:
                log.write(str(param_group['lr']) + '\n')
            train_metrics = train(train_loader, model, criterion, optimizer, epoch,
                                  val_metrics, best_results, start)
            # val_metrics_tta = evaluate(val_loader_tta, model, criterion, epoch, train_metrics, best_results, start)
            val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics,
                                   best_results, start)
            is_best_acc = val_metrics[0] > best_results[0]
            best_results[0] = max(val_metrics[0], best_results[0])
            is_best_loss = val_metrics[1] < best_results[1]
            best_results[1] = min(val_metrics[1], best_results[1])
            is_best_f1 = val_metrics[2] > best_results[2]
            best_results[2] = max(val_metrics[2], best_results[2])
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "model_name": config.model_name,
                    "state_dict": model.state_dict(),
                    "best_acc": best_results[0],
                    "best_loss": best_results[1],
                    "optimizer": optimizer.state_dict(),
                    "fold": fold,
                    "best_f1": best_results[2],
                }, is_best_acc, is_best_loss, is_best_f1, fold)
            print('\r', end='', flush=True)
            print(val_metrics[0], val_metrics[1], val_metrics[2], "val")
            log.write(
                '%s %5.1f %6.1f | %0.3f %0.3f %0.3f | %0.3f %0.3f %0.3f | %s %s %s | %s'
                % ("best", epoch, epoch,
                   train_metrics[0], train_metrics[1], train_metrics[2],
                   val_metrics[0], val_metrics[1], val_metrics[2],
                   str(best_results[0])[:8], str(best_results[1])[:8],
                   str(best_results[2])[:8],
                   time_to_str((timer() - start), 'min')))
            log.write("\n")
            time.sleep(0.01)
        # log.write("\n----------------------------------------------- [START %s] %s\n\n" % (
        #     datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 51))
        # log.write(' |------------ Train -------|----------- Valid ---------|----------Best Results---|------------|\n')
        # log.write('mode iter epoch | acc loss f1_macro | acc loss f1_macro | acc loss f1_macro | time |\n')
        # log.write('----------------------------------------------------------------------------------|\n')

        ### ---------- per fold ensemble best loss ckpt and best acc ckpt
        checkpoint_loss = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar'
            % (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_loss["state_dict"])
        test(test_loader, model, fold, checkpoint_loss, 'best_loss', False)
        checkpoint_acc = torch.load(
            'checkpoints/best_models/%s_fold_%s_model_best_acc.pth.tar'
            % (config.model_name, str(fold)))
        model.load_state_dict(checkpoint_acc["state_dict"])
        test(test_loader, model, fold, checkpoint_acc, 'best_acc', False)
        test_ensemble_loss_acc(test_loader, fold, [checkpoint_loss, checkpoint_acc],
                               'ensemble', not config.k_fold)

        ### ----------- last kfold ensemble all before k ensemble ckpts
        if config.k_fold and fold == config.num_kf:
            mean_npy = np.zeros([10000, 9])
            for i in range(1, config.num_kf + 1):
                checkpoint = torch.load(
                    'checkpoints/best_models/%s_fold_%s_model_best_loss.pth.tar'
                    % (config.model_name, str(i)))
                loss_pred = np.load('preds_9/%s/%s_val_fold%s_%s.npy'
                                    % (checkpoint["model_name"],
                                       checkpoint["model_name"], str(i), 'ensemble'))
                mean_npy += loss_pred
            mean_npy = mean_npy / config.num_kf
            np.save('preds_9/%s/%s_val_fold%s_%s.npy'
                    % (checkpoint["model_name"], checkpoint["model_name"],
                       'cv', 'ensemble'), mean_npy)
            gen_txt(mean_npy, checkpoint, 'cv', 'ensemble')
elif args.opt == "sgd":
    optimizer = optim.SGD(net.parameters(), lr=args.lr)

if not args.cos:
    from torch.optim import lr_scheduler
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3,
                                               verbose=True, min_lr=1e-3 * 1e-5,
                                               factor=0.1)
else:
    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, args.n_epochs - 1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=1,
                                       after_scheduler=scheduler_cosine)

##### Training
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
import torch
from warmup_scheduler import GradualWarmupScheduler

if __name__ == '__main__':
    v = torch.zeros(10)
    optim = torch.optim.SGD([v], lr=0.01)
    scheduler = GradualWarmupScheduler(optim, multiplier=8, total_epoch=10)
    for epoch in range(1, 20):
        scheduler.step(epoch)
        print(epoch, optim.param_groups[0]['lr'])
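# The same toy setup chained with a cosine annealer, which is the pattern most
# of the snippets in this file use (a sketch assuming the same package; the
# epoch counts are arbitrary):
import torch
from warmup_scheduler import GradualWarmupScheduler

v = torch.zeros(10, requires_grad=True)
optim = torch.optim.SGD([v], lr=0.01)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=90)
scheduler = GradualWarmupScheduler(optim, multiplier=8, total_epoch=10,
                                   after_scheduler=cosine)
for epoch in range(1, 101):
    optim.step()  # silences the step-order warning on recent PyTorch
    scheduler.step(epoch)
    print(epoch, optim.param_groups[0]['lr'])  # ramps to 0.08, then cosine decay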
def train(name, df, VAL_FOLD=0, resume=False):
    dt_string = datetime.now().strftime("%d|%m_%H|%M|%S")
    print("Starting -->", dt_string)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    os.makedirs('checkpoint', exist_ok=True)
    run = f"{name}_[{dt_string}]"

    wandb.init(project="imanip", config=config_defaults, name=run)
    config = wandb.config

    # model = SRM_Classifer(num_classes=1, encoder_checkpoint='weights/pretrain_[31|03_12|16|32].h5')
    model = SMP_SRM_UPP(classifier_only=True)
    # for name_, param in model.named_parameters():
    #     if 'classifier' in name_:
    #         continue
    #     else:
    #         param.requires_grad = False
    print("Parameters : ",
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    wandb.save('segmentation/smp_srm.py')
    wandb.save('dataset.py')

    train_imgaug, train_geo_aug = get_train_transforms()
    transforms_normalize = get_transforms_normalize()

    #region ########################-- CREATE DATASET and DATALOADER --########################
    train_dataset = DATASET(dataframe=df,
                            mode="train",
                            val_fold=VAL_FOLD,
                            test_fold=TEST_FOLD,
                            transforms_normalize=transforms_normalize,
                            imgaug_augment=train_imgaug,
                            geo_augment=train_geo_aug)
    train_loader = DataLoader(train_dataset, batch_size=config.train_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True,
                              drop_last=False)
    valid_dataset = DATASET(
        dataframe=df,
        mode="val",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    valid_loader = DataLoader(valid_dataset, batch_size=config.valid_batch_size,
                              shuffle=True, num_workers=4, pin_memory=True,
                              drop_last=False)
    test_dataset = DATASET(
        dataframe=df,
        mode="test",
        val_fold=VAL_FOLD,
        test_fold=TEST_FOLD,
        transforms_normalize=transforms_normalize,
    )
    test_loader = DataLoader(test_dataset, batch_size=config.valid_batch_size,
                             shuffle=True, num_workers=4, pin_memory=True,
                             drop_last=False)
    #endregion ######################################################################

    optimizer = get_optimizer(model, config.optimizer, config.learning_rate,
                              config.weight_decay)
    # after_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    #     optimizer,
    #     patience=config.schedule_patience,
    #     mode="min",
    #     factor=config.schedule_factor,
    # )
    # T_0/T_mult are CosineAnnealingWarmRestarts arguments; plain
    # CosineAnnealingLR rejects these kwargs
    after_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=35, T_mult=2)
    scheduler = GradualWarmupScheduler(optimizer=optimizer,
                                       multiplier=1,
                                       total_epoch=config.warmup + 1,
                                       after_scheduler=after_scheduler)
    # this zero gradient update is needed to avoid a warning message, issue #8.
    # optimizer.zero_grad()
    # optimizer.step()

    criterion = nn.BCEWithLogitsLoss()
    es = EarlyStopping(patience=200, mode="min")

    model = nn.DataParallel(model).to(device)
    # wandb.watch(model, log_freq=50, log='all')

    start_epoch = 0
    if resume:
        checkpoint = torch.load(
            'checkpoint/(using pretrain)COMBO_ALL_FULL_[09|04_12|46|35].pt')
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        print("-----------> Resuming <------------")

    for epoch in range(start_epoch, config.epochs):
        print(f"Epoch = {epoch}/{config.epochs-1}")
        print("------------------")

        train_metrics = train_epoch(model, train_loader, optimizer, scheduler,
                                    criterion, epoch)
        valid_metrics = valid_epoch(model, valid_loader, criterion, epoch)
        # the cosine after_scheduler takes no metric; passing the validation
        # loss here would be interpreted as an epoch number
        scheduler.step()

        print(f"TRAIN_ACC = {train_metrics['train_acc_05']}, "
              f"TRAIN_LOSS = {train_metrics['train_loss']}")
        print(f"VALID_ACC = {valid_metrics['valid_acc_05']}, "
              f"VALID_LOSS = {valid_metrics['valid_loss']}")
        print("Optimizer LR", optimizer.param_groups[0]['lr'])
        print("Scheduler LR", scheduler.get_lr()[0])
        wandb.log({
            'optim_lr': optimizer.param_groups[0]['lr'],
            'schedule_lr': scheduler.get_lr()[0]
        })

        es(
            valid_metrics["valid_loss"],
            model,
            model_path=os.path.join(OUTPUT_DIR, f"{run}.h5"),
        )
        if es.early_stop:
            print("Early stopping")
            break

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(checkpoint, os.path.join('checkpoint', f"{run}.pt"))

    if os.path.exists(os.path.join(OUTPUT_DIR, f"{run}.h5")):
        print(model.load_state_dict(torch.load(os.path.join(OUTPUT_DIR, f"{run}.h5"))))
        print("LOADED FOR TEST")

    test_metrics = test(model, test_loader, criterion)
    wandb.save(os.path.join(OUTPUT_DIR, f"{run}.h5"))
    return test_metrics
lr = 0.001
optim = torch.optim.SGD([v], lr=lr)
optim.param_groups[0]['initial_lr'] = lr
last_epoch = -1
scheduler = lr_scheduler.MultiStepLR(optim, milestones=[4], gamma=0.1, last_epoch=-1)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=10, eta_min=0.00001, last_epoch=-1)
# scheduler = lr_scheduler.OneCycleLR(optim, max_lr=0.001, total_steps=6000, pct_start=0.033, anneal_strategy='cos', last_epoch=last_epoch)
warmup = True
if warmup:
    scheduler = GradualWarmupScheduler(optim, multiplier=5, total_epoch=5,
                                       after_scheduler=scheduler)
    # if last_epoch != -1:
    #     scheduler.step()
lrs = []
for epoch in range(last_epoch + 1, 30):
    print(epoch, optim.param_groups[0]['lr'])
    lrs.append(optim.param_groups[0]['lr'])
    scheduler.step()
plt.plot(lrs)
plt.show()
def train(args, train_dataset, model):
    tb_writer = SummaryWriter(args.tb_writer_dir)
    result_writer = ResultWriter(args.eval_results_dir)

    if args.weighted_sampling == 1:
        # The three pitch types are unevenly distributed, so sample them at equal rates.
        # In the end this did not help, so weighted sampling was not used.
        ball_type, counts = np.unique(train_dataset.pitch, return_counts=True)
        count_dict = dict(zip(ball_type, counts))
        weights = [1.0 / count_dict[p] for p in train_dataset.pitch]
        sampler = WeightedRandomSampler(weights, len(train_dataset), replacement=True)
        logger.info("Do Weighted Sampling")
    else:
        sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size,
                                  sampler=sampler)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // len(train_dataloader) + 1
    else:
        t_total = len(train_dataloader) * args.num_train_epochs
    args.warmup_step = int(args.warmup_percent * t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = [
        "bias",
        "layernorm.weight",
    ]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate,
                           eps=args.adam_epsilon)
    if args.warmup_step != 0:
        scheduler_cosine = CosineAnnealingLR(optimizer, t_total)
        scheduler = GradualWarmupScheduler(optimizer, 1, args.warmup_step,
                                           after_scheduler=scheduler_cosine)
    else:
        scheduler = CosineAnnealingLR(optimizer, t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    loss_fct = torch.nn.NLLLoss()

    # Train!
logger.info("***** Running Baseball Transformer *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Warmup Steps = %d", args.warmup_step) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info(" Total train batch size = %d", args.train_batch_size) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 best_step = 0 steps_trained_in_current_epoch = 0 tr_loss, logging_loss, logging_val_loss = 0.0, 0.0, 0.0 best_pitch_micro_f1, best_pitch_macro_f1, = 0, 0 best_loss = 1e10 best_pitch_macro_f1 = 0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): ( pitcher, batter, state, pitch, label, pitch_memory, label_memory, memory_mask, ) = list(map(lambda x: x.to(args.device), batch)) model.train() pitching_score, memories = model( pitcher, batter, state, pitch_memory, label_memory, memory_mask, ) pitching_score = pitching_score.log_softmax(dim=-1) loss = loss_fct(pitching_score, pitch) if args.n_gpu > 1: loss = loss.mean() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: if args.evaluate_during_training: results, f1_results, f1_log, cm = evaluate( args, args.eval_data_file, model) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") print_result(output_eval_file, results, f1_log, cm) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) logging_val_loss = results["loss"] tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss # best 모델 선정 지표를 loss말고 macro-f1으로 설정(trade-off 존재) # if best_loss > results["loss"]: if best_pitch_macro_f1 < results["pitch_macro_f1"]: best_pitch_micro_f1 = results["pitch_micro_f1"] best_pitch_macro_f1 = results["pitch_macro_f1"] best_loss = results["loss"] results["best_step"] = best_step = global_step output_dir = os.path.join(args.output_dir, "best_model/") os.makedirs(output_dir, exist_ok=True) torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin")) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving best model to %s", output_dir) result_path = os.path.join(output_dir, "best_results.txt") print_result(result_path, results, f1_log, cm, off_logger=True) results.update(dict(f1_results)) result_writer.update(args, **results) logger.info(" best pitch micro f1 : %s", best_pitch_micro_f1) logger.info(" best pitch macro f1 : %s", best_pitch_macro_f1) logger.info(" best loss : %s", best_loss) logger.info(" best step : %s", best_step) if args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) 
                torch.save(model.state_dict(),
                           os.path.join(output_dir, "pytorch_model.bin"))
                torch.save(args, os.path.join(output_dir, "training_args.bin"))
                logger.info("Saving model checkpoint to %s", output_dir)
                rotate_checkpoints(args, checkpoint_prefix)
                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s", output_dir)

    tb_writer.close()
    return global_step, tr_loss / global_step
model_restoration.cuda()

device_ids = [i for i in range(torch.cuda.device_count())]
if torch.cuda.device_count() > 1:
    print("\n\nLet's use", torch.cuda.device_count(), "GPUs!\n\n")

new_lr = opt.OPTIM.LR_INITIAL
optimizer = optim.Adam(model_restoration.parameters(), lr=new_lr,
                       betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-8)

######### Scheduler ###########
if warmup:
    warmup_epochs = 3
    scheduler_cosine = optim.lr_scheduler.CosineAnnealingLR(
        optimizer, opt.OPTIM.NUM_EPOCHS - warmup_epochs, eta_min=1e-6)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=1,
                                       total_epoch=warmup_epochs,
                                       after_scheduler=scheduler_cosine)
    scheduler.step()

######### Resume ###########
if opt.TRAINING.RESUME:
    path_chk_rest = utils.get_last_path(model_dir, '_latest.pth')
    utils.load_checkpoint(model_restoration, path_chk_rest)
    start_epoch = utils.load_start_epoch(path_chk_rest) + 1
    utils.load_optim(optimizer, path_chk_rest)

    # replay the schedule up to the resume point to rebuild the LR state
    for i in range(1, start_epoch):
        scheduler.step()
    new_lr = scheduler.get_lr()[0]
    print('------------------------------------------------------------------------------')
    print("==> Resuming Training with learning rate:", new_lr)
    print('------------------------------------------------------------------------------')
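# The resume block above replays scheduler.step() once per elapsed epoch to
# rebuild the LR state. An alternative sketch (not the original code) is to
# checkpoint the scheduler's own state_dict, which GradualWarmupScheduler
# supports since it subclasses _LRScheduler; path and key names are illustrative:
checkpoint = {
    'epoch': epoch,
    'state_dict': model_restoration.state_dict(),
    'optimizer': optimizer.state_dict(),
    'scheduler': scheduler.state_dict(),
}
torch.save(checkpoint, 'model_latest.pth')
# ...and on resume:
checkpoint = torch.load('model_latest.pth')
model_restoration.load_state_dict(checkpoint['state_dict'])
optimizer.load_state_dict(checkpoint['optimizer'])
scheduler.load_state_dict(checkpoint['scheduler'])
start_epoch = checkpoint['epoch'] + 1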