def test_invalid_lr():
    weight = torch.randn(10, 5).float().cuda().requires_grad_()
    bias = torch.randn(10).float().cuda().requires_grad_()

    with pytest.raises(ValueError):
        MADGRAD([weight, bias], lr=0)

    with pytest.raises(ValueError):
        MADGRAD([weight, bias], lr=-1e-2)

def set_optimizer(optimizer, model, learning_rate):
    if optimizer == 'Adam':
        optimizer_ft = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'SGD':
        optimizer_ft = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-4)
    elif optimizer == 'MADGRAD':
        optimizer_ft = MADGRAD(model.parameters(), lr=learning_rate)
    else:
        raise NameError(f'!!!!! optimizer ERROR : {optimizer} !!!!!')
    return optimizer_ft

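
# A hypothetical usage sketch of set_optimizer above; the model and learning
# rate are placeholders rather than values from the original project.
import torch

model = torch.nn.Linear(10, 2)
optimizer_ft = set_optimizer('MADGRAD', model, learning_rate=1e-3)   # -> MADGRAD instance
# set_optimizer('RMSprop', model, learning_rate=1e-3)                # -> raises NameError
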
def configure_optimizers(self):
    if cfg['optimizer'] == "Adam":
        optimizer = torch.optim.Adam(self.netD.parameters(), lr=cfg['lr'])
    elif cfg['optimizer'] == "AdamP":
        optimizer = AdamP(self.netD.parameters(), lr=cfg['lr'], betas=(0.9, 0.999), weight_decay=1e-2)
    elif cfg['optimizer'] == "SGDP":
        optimizer = SGDP(self.netD.parameters(), lr=cfg['lr'], weight_decay=1e-5, momentum=0.9, nesterov=True)
    elif cfg['optimizer'] == "MADGRAD":
        from madgrad import MADGRAD
        optimizer = MADGRAD(self.netD.parameters(), lr=cfg['lr'], momentum=0.9, weight_decay=0.01, eps=1e-6)
    else:
        raise ValueError(f"Unknown optimizer: {cfg['optimizer']}")
    return optimizer

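
# For reference, a minimal direct construction of MADGRAD with its default
# hyperparameters spelled out (defaults as shipped in the `madgrad` pip
# package; double-check against the installed version). The method above
# overrides momentum, weight_decay and eps with cfg-driven values.
import torch
from madgrad import MADGRAD

_model = torch.nn.Linear(8, 1)       # stand-in module for illustration only
_optimizer = MADGRAD(
    _model.parameters(),
    lr=1e-2,            # learning rate, must be positive
    momentum=0.9,       # set to 0.0 when gradients are sparse
    weight_decay=0.0,   # L2 penalty
    eps=1e-6,           # term added to the denominator for numerical stability
)
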
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")
    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'), cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,                          # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,              # max number of saved checkpoints
        save_steps=cfg.values.train_args.save_steps,                          # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,                    # total number of training epochs
        learning_rate=cfg.values.train_args.lr,                               # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,   # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,     # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,                      # warmup steps for the LR scheduler
        weight_decay=cfg.values.train_args.weight_decay,                      # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,                        # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,                    # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,        # `no`, `steps`, or `epoch`
        eval_steps=cfg.values.train_args.eval_steps,                          # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1

            # train model
            trainer.train()
    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(whole_df,
                                                test_size=cfg.values.val_args.test_size,
                                                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)

            optimizer = transformers.AdamW(model.parameters(), lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                eval_dataset=RE_val_dataset,      # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()
        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(import_module('transformers'),
                                       cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME, config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(), lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,                      # the instantiated 🤗 Transformers model to be trained
                args=training_args,               # training arguments, defined above
                train_dataset=RE_train_dataset,   # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()

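
# All three branches above share the same wiring: build an optimizer and LR
# scheduler by hand and pass them to the 🤗 Trainer as a tuple, which bypasses
# its default AdamW setup. A condensed sketch of just that pattern, reusing the
# names defined in the function above (model, training_args, RE_train_dataset,
# RE_val_dataset), with MADGRAD as the optimizer:
optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
num_training_steps = int(len(RE_train_dataset) / training_args.per_device_train_batch_size
                         * training_args.num_train_epochs)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=num_training_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=RE_train_dataset,
    eval_dataset=RE_val_dataset,
    optimizers=(optimizer, scheduler),  # custom optimizer/scheduler pair
)
trainer.train()
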
def main(config):
    transforms = get_augmentation(config)

    # Split folds stratified on gender_age_class so its distribution is preserved per fold.
    df = pd.read_csv('/opt/ml/input/data/train/train.csv')
    skf = StratifiedKFold(n_splits=config.TRAIN.KFOLD, shuffle=True, random_state=42)
    for fold, (_, val_) in enumerate(skf.split(X=df, y=df.gender_age_class)):
        df.loc[val_, 'kfold'] = int(fold)
    df['kfold'] = df['kfold'].astype(int)

    # Build a dataset & dataloader pair for each fold and keep them in a list.
    dataloaders = []
    for fold in range(config.TRAIN.KFOLD):
        train_df = df[df.kfold != fold]
        valid_df = df[df.kfold == fold]

        # Oversampling
        # https://www.kaggle.com/tanlikesmath/diabetic-retinopathy-with-resnet50-oversampling/notebook
        train_df = balance_data(
            train_df.pivot_table(index='gender_age_class', aggfunc=len).max().max(),
            train_df)
        valid_df = valid_df.reset_index(drop=True)

        train_dataset = MaskDataset(train_df, config.PATH.ROOT, transforms['train'])
        valid_dataset = MaskDataset(valid_df, config.PATH.ROOT, transforms['valid'])

        train_loader = DataLoader(train_dataset,
                                  batch_size=config.TRAIN.BATCH_SIZE,
                                  num_workers=config.TRAIN.NUM_WORKERS,
                                  pin_memory=True,
                                  shuffle=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=config.TRAIN.BATCH_SIZE,
                                  num_workers=config.TRAIN.NUM_WORKERS,
                                  pin_memory=True,
                                  shuffle=False)
        dataloaders.append((train_loader, valid_loader))

    # Select the loss function.
    loss_collection = {
        'TaylorCE': TaylorCrossEntropyLoss(),
        'CE': CustomLoss(config.TRAIN.T),
        'F1Loss': F1Loss(),
        'KD-Reg': loss_kd_regularization(config),
        'Cumbo': Cumbo_Loss(config)
    }
    loss = loss_collection[config.TRAIN.LOSS]
    print(f'Loss : {config.TRAIN.LOSS}')

    f1_scores = []
    for fold, dataloader in enumerate(dataloaders):
        print(f'\n----------- FOLD {fold} TRAINING START --------------\n')
        model = EfficientNet_b0(6, True, config.MODEL.FREEZE)
        # model = DenseNet(6, config.MODEL.HIDDEN, config.MODEL.FREEZE)
        # model = ResNext(6, config.MODEL.FREEZE)
        optimizer = MADGRAD(model.parameters(), lr=1e-4)
        # optimizer = get_adamp(lr=config.TRAIN.BASE_LR, model=model, weight_decay=1e-6)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1)

        model_set = {
            'model': model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'criterion': loss
        }

        train_loader, valid_loader = dataloader
        trainer = Trainer(model_set, config, train_loader, valid_loader, fold)
        best_f1 = trainer.train(config.TRAIN.EPOCH)

        print(f'\nFOLD {fold} F1: {best_f1:.3f}\n')
        f1_scores.append(best_f1)

    print(f'MEAN F1 - {sum(f1_scores)/len(f1_scores)}')

def test_sparse():
    weight = torch.randn(5, 1).cuda().requires_grad_()
    weight_sparse = weight.detach().clone().requires_grad_()

    optimizer_dense = MADGRAD([weight], lr=1e-3, momentum=0)
    optimizer_sparse = MADGRAD([weight_sparse], lr=1e-3, momentum=0)

    # Step both optimizers with the same gradient, once dense and once sparse
    # (with a zero entry at a varying index), and check they stay in agreement.
    for zero_index in (0, 1, 0):
        weight.grad = torch.rand_like(weight)
        weight.grad[zero_index] = 0.0  # add a zero
        weight_sparse.grad = weight.grad.to_sparse()

        optimizer_dense.step()
        optimizer_sparse.step()

        assert torch.allclose(weight, weight_sparse)

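
# The momentum=0 setting above is what makes the sparse path usable: MADGRAD
# rejects sparse gradients when momentum is non-zero. A small illustrative
# sketch (not part of the test suite) of where sparse gradients typically come
# from, using nn.Embedding(sparse=True):
import torch
from madgrad import MADGRAD

embedding = torch.nn.Embedding(1000, 16, sparse=True)    # produces sparse gradients
optimizer = MADGRAD(embedding.parameters(), lr=1e-3, momentum=0)

loss = embedding(torch.tensor([3, 14, 159])).sum()
loss.backward()            # embedding.weight.grad is a torch sparse tensor
optimizer.step()
optimizer.zero_grad()
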
def test_momentum_zero():
    weight, bias, input = make_full_precision_params()
    optimizer = MADGRAD([weight, bias], lr=1e-3, momentum=0)

    step_test(optimizer, weight, bias, input)

def test_step_full_precision_inferred():
    weight, bias, input = make_full_precision_params()
    optimizer = MADGRAD([weight, bias], lr=1e-3)

    step_test(optimizer, weight, bias, input)

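
# The two tests above rely on helpers that are not shown here. A plausible,
# hypothetical sketch of what they could look like (the real helpers in the
# MADGRAD test suite may differ):
import torch

def make_full_precision_params():
    # Hypothetical: a float32 weight/bias pair plus a matching input on GPU.
    weight = torch.randn(10, 5).float().cuda().requires_grad_()
    bias = torch.randn(10).float().cuda().requires_grad_()
    input = torch.randn(5).float().cuda()
    return weight, bias, input

def step_test(optimizer, weight, bias, input):
    # Hypothetical: take a few optimizer steps on a simple objective and check
    # that the loss decreases.
    def loss_fn():
        return (weight.mv(input) + bias).pow(2).sum()

    initial_loss = loss_fn().item()
    for _ in range(10):
        optimizer.zero_grad()
        loss = loss_fn()
        loss.backward()
        optimizer.step()
    assert loss_fn().item() < initial_loss
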
def train(self):
    device = self.device
    print(f'Running on device: {device}, start training...')
    print(f'Setting - Epochs: {self.num_epochs}, Learning rate: {self.learning_rate}')

    train_loader = self.train_loader
    valid_loader = self.valid_loader
    model = self.model.to(device)

    if self.optimizer == 0:
        optimizer = torch.optim.Adam(model.parameters(), lr=self.learning_rate, weight_decay=1e-5)
    elif self.optimizer == 1:
        optimizer = torch.optim.AdamW(model.parameters(), lr=self.learning_rate, weight_decay=1e-5)
    elif self.optimizer == 2:
        optimizer = MADGRAD(model.parameters(), lr=self.learning_rate, weight_decay=1e-5)
    elif self.optimizer == 3:
        optimizer = AdamP(model.parameters(), lr=self.learning_rate, weight_decay=1e-5)
    else:
        raise ValueError(f'Unknown optimizer id: {self.optimizer}')

    criterion = torch.nn.CrossEntropyLoss().to(device)

    if self.use_swa:
        optimizer = SWA(optimizer, swa_start=2, swa_freq=2, swa_lr=1e-5)

    # scheduler
    scheduler_dct = {
        0: None,
        1: StepLR(optimizer, 10, gamma=0.5),
        2: ReduceLROnPlateau(optimizer, 'min', factor=0.4,
                             patience=int(0.3 * self.early_stopping_patience)),
        3: CosineAnnealingLR(optimizer, T_max=5, eta_min=0.)
    }
    scheduler = scheduler_dct[self.scheduler]

    # early stopping
    early_stopping = EarlyStopping(patience=self.early_stopping_patience,
                                   verbose=True,
                                   path=f'checkpoint_{self.job}.pt')

    # training
    self.train_loss_lst = list()
    self.train_acc_lst = list()
    self.val_loss_lst = list()
    self.val_acc_lst = list()

    for epoch in range(1, self.num_epochs + 1):
        with tqdm(train_loader, unit='batch') as tepoch:
            avg_val_loss, avg_val_acc = None, None
            for idx, (img, label) in enumerate(tepoch):
                tepoch.set_description(f"Epoch {epoch}")

                model.train()
                optimizer.zero_grad()

                img, label = img.float().to(device), label.long().to(device)
                output = model(img)
                loss = criterion(output, label)

                predictions = output.argmax(dim=1, keepdim=True).squeeze()
                correct = (predictions == label).sum().item()
                accuracy = correct / len(img)

                loss.backward()
                optimizer.step()

                # validate on the last training batch of each epoch
                if idx == len(train_loader) - 1:
                    val_loss_lst, val_acc_lst = list(), list()
                    model.eval()
                    with torch.no_grad():
                        for val_img, val_label in valid_loader:
                            val_img, val_label = val_img.float().to(device), val_label.long().to(device)
                            val_out = model(val_img)
                            val_loss = criterion(val_out, val_label)
                            val_pred = val_out.argmax(dim=1, keepdim=True).squeeze()
                            val_acc = (val_pred == val_label).sum().item() / len(val_img)

                            val_loss_lst.append(val_loss.item())
                            val_acc_lst.append(val_acc)

                    avg_val_loss = np.mean(val_loss_lst)
                    avg_val_acc = np.mean(val_acc_lst) * 100.

                    self.train_loss_lst.append(loss.item())
                    self.train_acc_lst.append(accuracy)
                    self.val_loss_lst.append(avg_val_loss)
                    self.val_acc_lst.append(avg_val_acc)

                if scheduler is not None:
                    current_lr = optimizer.param_groups[0]['lr']
                else:
                    current_lr = self.learning_rate

                # log
                tepoch.set_postfix(loss=loss.item(),
                                   accuracy=100. * accuracy,
                                   val_loss=avg_val_loss,
                                   val_acc=avg_val_acc,
                                   current_lr=current_lr)

        # early stopping check
        early_stopping(avg_val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            break

        # scheduler update
        if scheduler is not None:
            if self.scheduler == 2:
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

    if self.use_swa:
        optimizer.swap_swa_sgd()

    self.model.load_state_dict(torch.load(f'checkpoint_{self.job}.pt'))

def train(cfg, train_loader, val_loader, val_labels, k):
    # Set Config
    MODEL_ARC = cfg.values.model_arc
    OUTPUT_DIR = cfg.values.output_dir
    NUM_CLASSES = cfg.values.num_classes
    SAVE_PATH = os.path.join(OUTPUT_DIR, MODEL_ARC)

    best_score = 0.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    os.makedirs(SAVE_PATH, exist_ok=True)
    if k > 0:
        os.makedirs(SAVE_PATH + f'/{k}_fold', exist_ok=True)

    num_epochs = cfg.values.train_args.num_epochs
    max_lr = cfg.values.train_args.max_lr
    min_lr = cfg.values.train_args.min_lr
    weight_decay = cfg.values.train_args.weight_decay
    log_intervals = cfg.values.train_args.log_intervals

    model = CNNModel(model_arc=MODEL_ARC, num_classes=NUM_CLASSES)
    model.to(device)

    # base_optimizer = SGDP
    # optimizer = SAM(model.parameters(), base_optimizer, lr=max_lr, momentum=momentum)
    optimizer = MADGRAD(model.parameters(), lr=max_lr, weight_decay=weight_decay)

    first_cycle_steps = len(train_loader) * num_epochs // 2
    scheduler = CosineAnnealingWarmupRestarts(
        optimizer,
        first_cycle_steps=first_cycle_steps,
        cycle_mult=1.0,
        max_lr=max_lr,
        min_lr=min_lr,
        warmup_steps=int(first_cycle_steps * 0.2),
        gamma=0.5)

    criterion = nn.BCEWithLogitsLoss()

    wandb.watch(model)

    for epoch in range(num_epochs):
        model.train()
        loss_values = AverageMeter()
        scaler = GradScaler()

        for step, (images, labels) in enumerate(tqdm(train_loader)):
            images = images.to(device)
            labels = labels.to(device)
            batch_size = labels.size(0)

            with autocast():
                logits = model(images)
                loss = criterion(logits.view(-1), labels)

            loss_values.update(loss.item(), batch_size)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            wandb.log({
                'Learning rate': get_learning_rate(optimizer)[0],
                'Train Loss': loss_values.val
            })

            if step % log_intervals == 0:
                tqdm.write(
                    f'Epoch : [{epoch + 1}/{num_epochs}][{step}/{len(train_loader)}] || '
                    f'LR : {get_learning_rate(optimizer)[0]:.6e} || '
                    f'Train Loss : {loss_values.val:.4f} ({loss_values.avg:.4f}) ||')

        with torch.no_grad():
            model.eval()
            loss_values = AverageMeter()
            preds = []

            for step, (images, labels) in enumerate(tqdm(val_loader)):
                images = images.to(device)
                labels = labels.to(device)
                batch_size = labels.size(0)

                logits = model(images)
                loss = criterion(logits.view(-1), labels)
                preds.append(logits.sigmoid().to('cpu').numpy())
                loss_values.update(loss.item(), batch_size)

            predictions = np.concatenate(preds)

            # f1, roc_auc = get_score(val_labels, predictions)
            roc_auc = get_score(val_labels, predictions)

            is_best = roc_auc >= best_score
            best_score = max(roc_auc, best_score)

            if is_best:
                if k > 0:
                    remove_all_file(SAVE_PATH + f'/{k}_fold')
                    print(f"Save checkpoints {SAVE_PATH + f'/{k}_fold/{epoch + 1}_epoch_{best_score * 100.0:.2f}%.pth'}...")
                    torch.save(
                        model.state_dict(),
                        SAVE_PATH + f'/{k}_fold/{epoch + 1}_epoch_{best_score * 100.0:.2f}%.pth')
                else:
                    remove_all_file(SAVE_PATH)
                    print(f"Save checkpoints {SAVE_PATH + f'/{epoch + 1}_epoch_{best_score * 100.0:.2f}%.pth'}...")
                    torch.save(
                        model.state_dict(),
                        SAVE_PATH + f'/{epoch + 1}_epoch_{best_score * 100.0:.2f}%.pth')

            wandb.log({
                'Validation Loss average': loss_values.avg,
                'ROC AUC Score': roc_auc,
                # 'F1 Score': f1
            })

            tqdm.write(f'Epoch : [{epoch + 1}/{num_epochs}] || '
                       f'Val Loss : {loss_values.avg:.4f} || '
                       f'ROC AUC score : {roc_auc:.4f} ||')
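
# The training loop above assumes two small utilities that are not shown.
# Hypothetical versions consistent with how they are called here
# (get_learning_rate(optimizer)[0] and AverageMeter().update(val, n)):
def get_learning_rate(optimizer):
    # One learning rate per parameter group; the loop above logs the first one.
    return [group['lr'] for group in optimizer.param_groups]

class AverageMeter:
    """Tracks the most recent value and a running (weighted) average."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count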