def test_invalid_lr():
    weight = torch.randn(10, 5).float().cuda().requires_grad_()
    bias = torch.randn(10).float().cuda().requires_grad_()
    with pytest.raises(ValueError):
        MADGRAD([weight, bias], lr=0)
    with pytest.raises(ValueError):
        MADGRAD([weight, bias], lr=-1e-2)
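For orientation, here is a minimal MADGRAD training step. This is an illustrative sketch, not part of the examples on this page: the toy model, data, and hyperparameter values are assumptions, while the constructor arguments mirror the ones used throughout the snippets below.

import torch
import torch.nn.functional as F
from madgrad import MADGRAD

model = torch.nn.Linear(5, 1)  # hypothetical toy model
optimizer = MADGRAD(model.parameters(), lr=1e-3, momentum=0.9, weight_decay=0.0)

x, y = torch.randn(8, 5), torch.randn(8, 1)  # hypothetical batch
loss = F.mse_loss(model(x), y)

optimizer.zero_grad()
loss.backward()
optimizer.step()  # standard torch.optim-style update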
Example #2
def set_optimizer(optimizer, model, learning_rate):
    if optimizer == 'Adam':
        optimizer_ft = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer == 'SGD':
        optimizer_ft = optim.SGD(model.parameters(),
                                 lr=learning_rate,
                                 momentum=0.9,
                                 weight_decay=1e-4)
    elif optimizer == 'MADGRAD':
        optimizer_ft = MADGRAD(model.parameters(),
                               lr=learning_rate)
    else:
        raise NameError(f'!!!!! optimizer ERROR : {optimizer} !!!!!')
    return optimizer_ft
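A quick usage sketch for set_optimizer; the model below is a hypothetical placeholder.

import torch.nn as nn

model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))  # hypothetical model
optimizer_ft = set_optimizer('MADGRAD', model, learning_rate=3e-4)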
def configure_optimizers(self):
    if cfg['optimizer'] == "Adam":
        optimizer = torch.optim.Adam(self.netD.parameters(), lr=cfg['lr'])
    elif cfg['optimizer'] == "AdamP":
        optimizer = AdamP(self.netD.parameters(),
                          lr=cfg['lr'],
                          betas=(0.9, 0.999),
                          weight_decay=1e-2)
    elif cfg['optimizer'] == "SGDP":
        optimizer = SGDP(self.netD.parameters(),
                         lr=cfg['lr'],
                         weight_decay=1e-5,
                         momentum=0.9,
                         nesterov=True)
    elif cfg['optimizer'] == "MADGRAD":
        from madgrad import MADGRAD
        optimizer = MADGRAD(self.netD.parameters(),
                            lr=cfg['lr'],
                            momentum=0.9,
                            weight_decay=0.01,
                            eps=1e-6)
    return optimizer
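configure_optimizers reads its settings from a cfg mapping that is not shown in this listing; a hypothetical example of the keys it relies on:

cfg = {
    'optimizer': 'MADGRAD',  # one of 'Adam', 'AdamP', 'SGDP', 'MADGRAD'
    'lr': 1e-3,              # hypothetical learning rate
}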
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")

    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'),
                                   cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,  # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,  # number of saved checkpoints to keep
        save_steps=cfg.values.train_args.save_steps,  # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,  # total number of training epochs
        learning_rate=cfg.values.train_args.lr,  # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,  # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,  # number of warmup steps for the learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,  # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,  # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,  # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,  # evaluation strategy during training
        # `no`: no evaluation during training
        # `steps`: evaluate every `eval_steps`
        # `epoch`: evaluate at the end of every epoch
        eval_steps=cfg.values.train_args.eval_steps,  # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(),
                                lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1
            # train model
            trainer.train()

    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(
                whole_df,
                test_size=cfg.values.val_args.test_size,
                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            optimizer = transformers.AdamW(model.parameters(),
                                           lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()

        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(
                    whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_NAME, config=model_config)
            except Exception:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(),
                              lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) /
                          training_args.per_device_train_batch_size *
                          training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
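train(cfg) above expects a nested configuration object addressed as cfg.values.<section>.<key>, presumably loaded from a config file elsewhere in the project. A hypothetical stand-in built with SimpleNamespace, covering only the keys the function actually reads (all values are placeholders):

from types import SimpleNamespace

cfg = SimpleNamespace(values=SimpleNamespace(
    seed=42,
    model_name='bert-base-multilingual-cased',  # hypothetical checkpoint
    model_arc='Bert',
    tokenizer_arc=None,
    train_only=False,
    val_args=SimpleNamespace(use_kfold=False, num_k=5, test_size=0.2),
    train_args=SimpleNamespace(
        output_dir='./results', save_total_limit=3, save_steps=500,
        num_epochs=4, lr=5e-5, train_batch_size=16, eval_batch_size=16,
        warmup_steps=300, weight_decay=0.01, max_grad_norm=1.0,
        logging_dir='./logs', logging_steps=100,
        evaluation_strategy='steps', eval_steps=500,
        label_smoothing_factor=0.0),
))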
Example #5
def main(config):

    transforms = get_augmentation(config)

    # Split the folds taking the gender_age_class distribution into account
    df = pd.read_csv('/opt/ml/input/data/train/train.csv')
    skf = StratifiedKFold(n_splits=config.TRAIN.KFOLD,
                          shuffle=True,
                          random_state=42)
    for fold, (_, val_) in enumerate(skf.split(X=df, y=df.gender_age_class)):
        df.loc[val_, 'kfold'] = int(fold)
    df['kfold'] = df['kfold'].astype(int)

    # Build a dataset & dataloader for each fold and keep them in a list.
    dataloaders = []
    for fold in range(config.TRAIN.KFOLD):
        train_df = df[df.kfold != fold]
        valid_df = df[df.kfold == fold]

        # Oversampling
        # https://www.kaggle.com/tanlikesmath/diabetic-retinopathy-with-resnet50-oversampling/notebook
        train_df = balance_data(
            train_df.pivot_table(index='gender_age_class',
                                 aggfunc=len).max().max(), train_df)
        valid_df = valid_df.reset_index(drop=True)
        train_dataset = MaskDataset(train_df, config.PATH.ROOT,
                                    transforms['train'])
        valid_dataset = MaskDataset(valid_df, config.PATH.ROOT,
                                    transforms['valid'])

        train_loader = DataLoader(train_dataset,
                                  batch_size=config.TRAIN.BATCH_SIZE,
                                  num_workers=config.TRAIN.NUM_WORKERS,
                                  pin_memory=True,
                                  shuffle=True)
        valid_loader = DataLoader(valid_dataset,
                                  batch_size=config.TRAIN.BATCH_SIZE,
                                  num_workers=config.TRAIN.NUM_WORKERS,
                                  pin_memory=True,
                                  shuffle=False)
        dataloaders.append((train_loader, valid_loader))

    # Select the loss function
    loss_collection = {
        'TaylorCE': TaylorCrossEntropyLoss(),
        'CE': CustomLoss(config.TRAIN.T),
        'F1Loss': F1Loss(),
        'KD-Reg': loss_kd_regularization(config),
        'Cumbo': Cumbo_Loss(config)
    }
    loss = loss_collection[config.TRAIN.LOSS]
    print(f'Loss : {config.TRAIN.LOSS}')

    f1_scores = []
    for fold, dataloader in enumerate(dataloaders):
        print(f'\n----------- FOLD {fold} TRAINING START --------------\n')
        model = EfficientNet_b0(6, True, config.MODEL.FREEZE)
        # model = DenseNet(6, config.MODEL.HIDDEN, config.MODEL.FREEZE)
        # model = ResNext(6, config.MODEL.FREEZE)
        optimizer = MADGRAD(model.parameters(), lr=1e-4)
        # optimizer = get_adamp(lr=config.TRAIN.BASE_LR, model=model, weight_decay=1e-6)
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=10, T_mult=1, eta_min=1e-6, last_epoch=-1)
        model_set = {
            'model': model,
            'optimizer': optimizer,
            'scheduler': scheduler,
            'criterion': loss
        }
        train_loader = dataloader[0]
        valid_loader = dataloader[1]
        trainer = Trainer(model_set, config, train_loader, valid_loader, fold)
        best_f1 = trainer.train(config.TRAIN.EPOCH)
        print(f'\nFOLD F{fold}: {best_f1:.3f}\n')
        f1_scores.append(best_f1)

    print(f'MEAN F1 - {sum(f1_scores)/len(f1_scores)}')
def test_sparse():
    weight = torch.randn(5, 1).cuda().requires_grad_()
    weight_sparse = weight.detach().clone().requires_grad_()
    optimizer_dense = MADGRAD([weight], lr=1e-3, momentum=0)
    optimizer_sparse = MADGRAD([weight_sparse], lr=1e-3, momentum=0)

    weight.grad = torch.rand_like(weight)
    weight.grad[0] = 0.0  # Add a zero
    weight_sparse.grad = weight.grad.to_sparse()

    optimizer_dense.step()
    optimizer_sparse.step()
    assert torch.allclose(weight, weight_sparse)

    weight.grad = torch.rand_like(weight)
    weight.grad[1] = 0.0  # Add a zero
    weight_sparse.grad = weight.grad.to_sparse()

    optimizer_dense.step()
    optimizer_sparse.step()
    assert torch.allclose(weight, weight_sparse)

    weight.grad = torch.rand_like(weight)
    weight.grad[0] = 0.0  # Add a zero
    weight_sparse.grad = weight.grad.to_sparse()

    optimizer_dense.step()
    optimizer_sparse.step()
    assert torch.allclose(weight, weight_sparse)
def test_momentum_zero():
    weight, bias, input = make_full_precision_params()
    optimizer = MADGRAD([weight, bias], lr=1e-3, momentum=0)

    step_test(optimizer, weight, bias, input)
def test_step_full_precision_inferred():
    weight, bias, input = make_full_precision_params()
    optimizer = MADGRAD([weight, bias], lr=1e-3)

    step_test(optimizer, weight, bias, input)
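The helpers make_full_precision_params and step_test used by the two tests above are not included in this listing. A rough sketch of what they could look like, purely as an assumption for readability (not the actual test utilities):

def make_full_precision_params():
    # Hypothetical: fp32 CUDA parameters plus an input vector for a tiny linear layer.
    weight = torch.randn(10, 5).float().cuda().requires_grad_()
    bias = torch.randn(10).float().cuda().requires_grad_()
    input = torch.randn(5).float().cuda()
    return weight, bias, input

def step_test(optimizer, weight, bias, input):
    # Hypothetical: take a few optimizer steps and check that the loss goes down.
    def loss_fn():
        return (weight.mv(input) + bias).pow(2).sum()
    initial_loss = loss_fn().item()
    for _ in range(5):
        optimizer.zero_grad()
        loss = loss_fn()
        loss.backward()
        optimizer.step()
    assert loss_fn().item() < initial_loss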
Example #9
    def train(self):
        device = self.device
        print('Running on device: {}'.format(device), 'start training...')
        print(
            f'Setting - Epochs: {self.num_epochs}, Learning rate: {self.learning_rate} '
        )

        train_loader = self.train_loader
        valid_loader = self.valid_loader

        model = self.model.to(device)
        if self.optimizer == 0:
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=self.learning_rate,
                                         weight_decay=1e-5)
        elif self.optimizer == 1:
            optimizer = torch.optim.AdamW(model.parameters(),
                                          lr=self.learning_rate,
                                          weight_decay=1e-5)
        elif self.optimizer == 2:
            optimizer = MADGRAD(model.parameters(),
                                lr=self.learning_rate,
                                weight_decay=1e-5)
        elif self.optimizer == 3:
            optimizer = AdamP(model.parameters(),
                              lr=self.learning_rate,
                              weight_decay=1e-5)
        criterion = torch.nn.CrossEntropyLoss().to(device)

        if self.use_swa:
            optimizer = SWA(optimizer, swa_start=2, swa_freq=2, swa_lr=1e-5)

        # scheduler #
        scheduler_dct = {
            0: None,
            1: StepLR(optimizer, 10, gamma=0.5),
            2: ReduceLROnPlateau(optimizer, 'min', factor=0.4,
                                 patience=int(0.3 * self.early_stopping_patience)),
            3: CosineAnnealingLR(optimizer, T_max=5, eta_min=0.)
        }
        scheduler = scheduler_dct[self.scheduler]

        # early stopping
        early_stopping = EarlyStopping(patience=self.early_stopping_patience,
                                       verbose=True,
                                       path=f'checkpoint_{self.job}.pt')

        # training
        self.train_loss_lst = list()
        self.train_acc_lst = list()
        self.val_loss_lst = list()
        self.val_acc_lst = list()
        for epoch in range(1, self.num_epochs + 1):
            with tqdm(train_loader, unit='batch') as tepoch:
                avg_val_loss, avg_val_acc = None, None

                for idx, (img, label) in enumerate(tepoch):
                    tepoch.set_description(f"Epoch {epoch}")

                    model.train()
                    optimizer.zero_grad()

                    img, label = img.float().to(device), label.long().to(device)

                    output = model(img)
                    loss = criterion(output, label)
                    predictions = output.argmax(dim=1, keepdim=True).squeeze()
                    correct = (predictions == label).sum().item()
                    accuracy = correct / len(img)

                    loss.backward()
                    optimizer.step()

                    if idx == len(train_loader) - 1:

                        val_loss_lst, val_acc_lst = list(), list()

                        model.eval()
                        with torch.no_grad():
                            for val_img, val_label in valid_loader:
                                val_img = val_img.float().to(device)
                                val_label = val_label.long().to(device)

                                val_out = model(val_img)
                                val_loss = criterion(val_out, val_label)
                                val_pred = val_out.argmax(dim=1, keepdim=True).squeeze()
                                val_acc = (val_pred == val_label).sum().item() / len(val_img)

                                val_loss_lst.append(val_loss.item())
                                val_acc_lst.append(val_acc)

                        avg_val_loss = np.mean(val_loss_lst)
                        avg_val_acc = np.mean(val_acc_lst) * 100.

                        self.train_loss_lst.append(loss.item())  # store the scalar value, not the graph-attached tensor
                        self.train_acc_lst.append(accuracy)
                        self.val_loss_lst.append(avg_val_loss)
                        self.val_acc_lst.append(avg_val_acc)

                    if scheduler is not None:
                        current_lr = optimizer.param_groups[0]['lr']
                    else:
                        current_lr = self.learning_rate

                    # log
                    tepoch.set_postfix(loss=loss.item(),
                                       accuracy=100. * accuracy,
                                       val_loss=avg_val_loss,
                                       val_acc=avg_val_acc,
                                       current_lr=current_lr)

                # early stopping check
                early_stopping(avg_val_loss, model)
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

                # scheduler update
                if scheduler is not None:
                    if self.scheduler == 2:
                        scheduler.step(avg_val_loss)
                    else:
                        scheduler.step()
        if self.use_swa:
            optimizer.swap_swa_sgd()
        self.model.load_state_dict(torch.load(f'checkpoint_{self.job}.pt'))
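The EarlyStopping class used above is an external utility that is not part of this listing. Based solely on how it is called here (constructed with patience, verbose, and path, invoked as early_stopping(avg_val_loss, model), and exposing an early_stop flag), a minimal sketch could look like this; treat it as an assumption rather than the original implementation:

import numpy as np
import torch

class EarlyStopping:
    # Hypothetical sketch: stop when validation loss stops improving and checkpoint the best model.
    def __init__(self, patience=7, verbose=False, path='checkpoint.pt', delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.delta = delta
        self.counter = 0
        self.best_loss = np.inf
        self.early_stop = False

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            # Improvement: reset the counter and save a checkpoint.
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.path)
            if self.verbose:
                print(f'Validation loss improved to {val_loss:.6f}; checkpoint saved to {self.path}.')
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True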
Example #10
def train(cfg, train_loader, val_loader, val_labels, k):
    # Set Config
    MODEL_ARC = cfg.values.model_arc
    OUTPUT_DIR = cfg.values.output_dir
    NUM_CLASSES = cfg.values.num_classes

    SAVE_PATH = os.path.join(OUTPUT_DIR, MODEL_ARC)

    best_score = 0.

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    os.makedirs(SAVE_PATH, exist_ok=True)

    if k > 0:
        os.makedirs(SAVE_PATH + f'/{k}_fold', exist_ok=True)

    num_epochs = cfg.values.train_args.num_epochs
    max_lr = cfg.values.train_args.max_lr
    min_lr = cfg.values.train_args.min_lr
    weight_decay = cfg.values.train_args.weight_decay
    log_intervals = cfg.values.train_args.log_intervals

    model = CNNModel(model_arc=MODEL_ARC, num_classes=NUM_CLASSES)
    model.to(device)

    # base_optimizer = SGDP
    # optimizer = SAM(model.parameters(), base_optimizer, lr=max_lr, momentum=momentum)
    optimizer = MADGRAD(model.parameters(),
                        lr=max_lr,
                        weight_decay=weight_decay)
    first_cycle_steps = len(train_loader) * num_epochs // 2

    scheduler = CosineAnnealingWarmupRestarts(
        optimizer,
        first_cycle_steps=first_cycle_steps,
        cycle_mult=1.0,
        max_lr=max_lr,
        min_lr=min_lr,
        warmup_steps=int(first_cycle_steps * 0.2),
        gamma=0.5)

    criterion = nn.BCEWithLogitsLoss()

    wandb.watch(model)

    for epoch in range(num_epochs):
        model.train()

        loss_values = AverageMeter()

        scaler = GradScaler()

        for step, (images, labels) in enumerate(tqdm(train_loader)):
            images = images.to(device)
            labels = labels.to(device)
            batch_size = labels.size(0)

            with autocast():
                logits = model(images)
                loss = criterion(logits.view(-1), labels)

            loss_values.update(loss.item(), batch_size)

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            wandb.log({
                'Learning rate': get_learning_rate(optimizer)[0],
                'Train Loss': loss_values.val
            })

            if step % log_intervals == 0:
                tqdm.write(
                    f'Epoch : [{epoch + 1}/{num_epochs}][{step}/{len(train_loader)}] || '
                    f'LR : {get_learning_rate(optimizer)[0]:.6e} || '
                    f'Train Loss : {loss_values.val:.4f} ({loss_values.avg:.4f}) ||'
                )

        with torch.no_grad():
            model.eval()

            loss_values = AverageMeter()
            preds = []

            for step, (images, labels) in enumerate(tqdm(val_loader)):
                images = images.to(device)
                labels = labels.to(device)
                batch_size = labels.size(0)

                logits = model(images)
                loss = criterion(logits.view(-1), labels)

                preds.append(logits.sigmoid().to('cpu').numpy())

                loss_values.update(loss.item(), batch_size)

        predictions = np.concatenate(preds)

        # f1, roc_auc = get_score(val_labels, predictions)
        roc_auc = get_score(val_labels, predictions)
        is_best = roc_auc >= best_score
        best_score = max(roc_auc, best_score)

        if is_best:
            save_dir = SAVE_PATH + f'/{k}_fold' if k > 0 else SAVE_PATH
            ckpt_path = save_dir + f'/{epoch + 1}_epoch_{best_score * 100.0:.2f}%.pth'
            remove_all_file(save_dir)
            print(f"Save checkpoints {ckpt_path}...")
            torch.save(model.state_dict(), ckpt_path)

        wandb.log({
            'Validation Loss average': loss_values.avg,
            'ROC AUC Score': roc_auc,
            # 'F1 Score' : f1
        })

        tqdm.write(f'Epoch : [{epoch + 1}/{num_epochs}] || '
                   f'Val Loss : {loss_values.avg:.4f} || '
                   f'ROC AUC score : {roc_auc:.4f} ||')
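AverageMeter and get_learning_rate are small utilities referenced above but not defined in this listing; plausible sketches under that assumption (not necessarily the original implementations):

class AverageMeter:
    # Hypothetical sketch: track the latest value and a running average.
    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def get_learning_rate(optimizer):
    # One learning rate per parameter group; the code above reads get_learning_rate(optimizer)[0].
    return [group['lr'] for group in optimizer.param_groups]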