Example #1
def _eval(model, dataloader, CONFIG):
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0

    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda(CONFIG['NUM_GPU'])
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda(CONFIG['NUM_GPU'])

    for test_data in dataloader:
        X, y = test_data
        X, y = X.cuda(CONFIG['NUM_GPU']), y.cuda(CONFIG['NUM_GPU'])

        y_pred = model(X)

        all_pred = torch.cat((all_pred, torch.argmax(y_pred, dim=1)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))
        total += y.size(0)
        correct += accuracy(torch.argmax(y_pred, dim=1),
                            y,
                            method='5_class_vec_output') * y.size(0)
    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())
    model.train()
    torch.set_grad_enabled(True)
    return acc, c_matrix, kappa, all_pred.cpu().numpy()
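The quadratic_weighted_kappa helper that several of these examples unpack into (c_matrix, kappa) is not shown on this page. A minimal sketch consistent with that return shape, assuming scikit-learn is available:

from sklearn.metrics import cohen_kappa_score, confusion_matrix

def quadratic_weighted_kappa(y_true, y_pred):
    # Quadratic weighted kappa is Cohen's kappa with quadratic penalty
    # weights; returning the confusion matrix alongside it matches the
    # (c_matrix, kappa) unpacking used in the examples above.
    c_matrix = confusion_matrix(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return c_matrix, kappa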
Example #2
def _eval(model, dataloader, CONFIG):
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0

    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()

    for test_data in dataloader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)
    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())
    model.train()
    torch.set_grad_enabled(True)
    return acc, c_matrix, kappa, all_pred.cpu().numpy()
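Example #2 relies on a classify helper that is not defined on this page; it maps a single regression output to an integer grade. A plausible sketch, assuming fixed cut points at 0.5/1.5/2.5/3.5 (the thresholds are an assumption):

def classify(prediction, cut_points=(0.5, 1.5, 2.5, 3.5)):
    # Hypothetical reconstruction: count how many cut points the continuous
    # prediction exceeds, yielding an ordinal class in {0, ..., 4}.
    value = float(prediction)
    return sum(value >= t for t in cut_points)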
Example #3
def run():
    data_df = pd.read_csv('../input/train.csv')
    train_df, valid_df = train_test_split(data_df,
                                          random_state=42,
                                          test_size=0.1)
    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    train_y = train_df['median_relevance'].values
    valid_y = valid_df['median_relevance'].values

    train_dataset = CrowdFlowerDataset(
        query=train_df['query'].values,
        prod_title=train_df['product_title'].values,
        prod_description=train_df['product_description'].values,
        targets=train_y)
    valid_dataset = CrowdFlowerDataset(
        query=valid_df['query'].values,
        prod_title=valid_df['product_title'].values,
        prod_description=valid_df['product_description'].values,
        targets=valid_y)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=configs.TRAIN_BATCH_SIZE, shuffle=True)

    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=configs.VALID_BATCH_SIZE, shuffle=False)

    num_train_steps = int(
        len(train_dataset) / configs.TRAIN_BATCH_SIZE * configs.EPOCHS)
    device = configs.DEVICE
    model = BERTBaseUncased().to(device)
    optimizer = configs.OPTIMIZER(model.parameters(), lr=configs.LR)
    scheduler = configs.SCHEDULER(optimizer,
                                  num_warmup_steps=0,
                                  num_training_steps=num_train_steps)

    for epoch in range(configs.EPOCHS):

        epoch_start = time.time()

        epoch_train_loss = train_loop_fn(train_dataloader, model, optimizer,
                                         scheduler)
        outputs, targets, epoch_valid_loss = eval_loop_fn(
            valid_dataloader, model)

        epoch_end = time.time()
        epoch_time_elapsed = (epoch_end - epoch_start) / 60.0
        print(f'time taken to run an epoch - {epoch_time_elapsed:.2f} minutes')
        print(
            f'Epoch {epoch} - Training loss - {epoch_train_loss} Valid loss - {epoch_valid_loss}'
        )

        qw_kappa = quadratic_weighted_kappa(targets.flatten(),
                                            outputs.flatten())
        print(f'Quadratic Weighted Kappa: {qw_kappa}')
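Quadratic weighted kappa is defined over discrete labels, so if eval_loop_fn returns continuous regression scores they must be mapped back to integer relevance grades before the kappa call. A hedged sketch of that step (the rounding itself is an assumption; CrowdFlower's median_relevance takes integer values 1 to 4):

import numpy as np

def to_grades(scores, lo=1, hi=4):
    # Round continuous predictions and clip them into the valid label
    # range before passing them to quadratic_weighted_kappa.
    return np.clip(np.rint(scores), lo, hi).astype(int)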
Example #4
def evaluate(model_path, test_dataset, CONFIG):
    c_matrix = np.zeros((5, 5), dtype=int)
    trained_model = torch.load(model_path).cuda(CONFIG['NUM_GPU'])
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    test_acc = _eval(trained_model, test_loader, c_matrix)  # fills c_matrix in place, returns accuracy
    print('==============================')
    print('Finished! test acc: {}'.format(test_acc))
    print('Confusion Matrix:')
    print(c_matrix)
    print('quadratic kappa: {}'.format(quadratic_weighted_kappa(c_matrix)))
    print('==============================')
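Example #4 deserializes a whole pickled model with torch.load, which only works when the original class definition is importable at load time. A sketch of the more portable state_dict pattern (model here stands for a freshly constructed instance of the same architecture, so this fragment is illustrative rather than drop-in):

# Save only the weights instead of the pickled module...
torch.save(trained_model.state_dict(), model_path)
# ...and restore them into a newly built model of the same architecture.
model.load_state_dict(torch.load(model_path, map_location='cpu'))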
Example #5
def evaluate(CONFIG):

    # create result folder
    if not os.path.isdir(CONFIG['SAVE_PATH']):
        os.makedirs(CONFIG['SAVE_PATH'])

    # create dataset
    test_dataset = generate_stem_dataset(CONFIG['DATA_PATH'],
                                         CONFIG['INPUT_SIZE'],
                                         CONFIG['DATA_AUGMENTATION'],
                                         cv=False,
                                         mode='evaluate')

    # create dataloader
    test_loader = DataLoader(test_dataset,
                             batch_size=CONFIG['BATCH_SIZE'],
                             num_workers=CONFIG['NUM_WORKERS'],
                             shuffle=False)
    # define model
    model_name = CONFIG['MODEL_NAME']
    model = EfficientNet.from_pretrained(model_name)
    feature = model._fc.in_features
    model._fc = nn.Linear(in_features=feature, out_features=1, bias=True)

    # multi-GPU setting
    torch.cuda.set_device(CONFIG['GPU_NUM'][0])
    model = torch.nn.DataParallel(
        model, device_ids=CONFIG['GPU_NUM']).to(device=torch.device('cuda'))

    # load pretrained weights
    if CONFIG['PRETRAINED_PATH']:
        state_dict = torch.load(CONFIG['PRETRAINED_PATH'])
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if 'module' not in k:
                k = 'module.' + k
            else:
                k = k.replace('features.module.', 'module.features.')
            new_state_dict[k] = v
        model.load_state_dict(new_state_dict)

    # evaluate
    model.eval()
    torch.set_grad_enabled(False)

    correct = 0
    total = 0

    all_targ = torch.tensor([]).to(dtype=torch.int64).cuda()
    all_pred = torch.tensor([]).to(dtype=torch.int64).cuda()
    logit_pred_y = []
    logit_targ_y = []
    for test_data in test_loader:
        X, y = test_data
        X, y = X.cuda(), y.cuda()

        y_pred = model(X)

        y_pred_classified = y_pred.view(-1).clone()
        for i in range(len(y_pred)):
            y_pred_classified[i] = classify(y_pred[i])

        all_pred = torch.cat((all_pred, y_pred_classified.to(torch.int64)))
        all_targ = torch.cat((all_targ, y.to(torch.int64)))

        total += y.size(0)
        correct += accuracy(y_pred.cpu(), y.cpu().float()) * y.size(0)

        logit_pred_y += list(y_pred.view(-1).cpu().numpy())
        logit_targ_y += list(y.cpu().float().numpy())

    acc = round(correct / total, 4)
    c_matrix, kappa = quadratic_weighted_kappa(all_targ.cpu().numpy(),
                                               all_pred.cpu().numpy())

    ks_dataframe = pd.DataFrame({'pred': logit_pred_y, 'targ': logit_targ_y})
    ks_dataframe.to_csv(os.path.join(CONFIG['SAVE_PATH'],
                                     model_name + '_eval_results.csv'),
                        index=False,
                        sep=',')

    print('==============================')
    print('Test acc: {}'.format(acc))
    print('Confusion Matrix:\n{}'.format(c_matrix))
    print('quadratic kappa: {}'.format(kappa))
    print('==============================')
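The key-remapping loop in Example #5 adds the module. prefix that nn.DataParallel expects. The inverse direction is just as common: a checkpoint saved from a DataParallel model must have that prefix stripped before it loads into a plain single-GPU model. A short sketch (state_dict and model as in the example above):

# Strip the first 'module.' prefix from every key so DataParallel-saved
# weights load into an unwrapped model.
plain_state_dict = {k.replace('module.', '', 1): v for k, v in state_dict.items()}
model.load_state_dict(plain_state_dict)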
Example #6
def main(argv):
    
    if len(argv) < 5:
        sys.exit('please provide\n the path to your data sets;\n train or test or both keyword;\n data file(s) extension (pk or csv);\n \
what column to fit if train mode is set/how to call output if test mode is set;\n \
id column for the final result;\n \
optional:\n \
simple fit or grid search;\n \
save the model created\n')
    
    path=argv[1]
    if not path.endswith('/'):
        path=path+'/'
    train_or_test=argv[2]
    ext=argv[3]
    fit_y=argv[4]
    id='none'
    if len(argv) > 5:
        id=argv[5]
    simple_fit=0
    dump_model=0
    if len(argv) > 6:
        if (argv[6] == 'simple'):
            simple_fit=1
            if (len(argv) > 7 and argv[7] == 'save'):
                dump_model=1
        elif (argv[6] == 'save'):
            dump_model=1
    
    if train_or_test != 'test':
        if ext == 'pk':
            train_features = pd.read_pickle(path + 'train.pk')
        else:
            train_features = pd.read_csv(path + 'train.csv').fillna("")
        if fit_y not in train_features:
            sys.exit(fit_y+' not found in the provided data set, verify your data and try again')
        y = train_features[fit_y]
        train_features = train_features.drop([fit_y], axis=1)
        if id in train_features:
            train_features = train_features.drop([id], axis=1)

        # -- the model
        svd = TruncatedSVD()
        scl = StandardScaler()
        model = LinearSVC()
        pip = pipeline.Pipeline([('svd', svd),('scl', scl),('svm', model)])

        if simple_fit:
            X_train, X_test, y_train, y_test = train_test_split(train_features, y, test_size=0.1, random_state=0)
    
    
            pip.fit(X_train, y_train)        
            predicted = pip.predict(X_test)
    
            sc = quadratic_weighted_kappa(y_test, predicted)
            print("score: %0.3f" % sc)
            best_model = pip
        else:
        # -- Grid parameter search
            param_grid = {'svd__n_components' : [2,3],'svm__C': [5,10] }
    
            scorer = make_scorer(quadratic_weighted_kappa, greater_is_better = True)
    
            model = grid_search.GridSearchCV(estimator = pip, 
                                             param_grid=param_grid, 
                                             scoring=scorer,
                                             verbose=10, 
                                             n_jobs=-1, 
                                             iid=True, 
                                             refit=True, 
                                             cv=3)
            model.fit(train_features, y)
    
            print("Best score: %0.3f" % model.best_score_)
            print("Best parameters set:")
            best_parameters = model.best_estimator_.get_params()
            for param_name in sorted(param_grid.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
            best_model = model.best_estimator_
            best_model.fit(train_features,y)
    
        if dump_model:
            with open(path+'model.dmp', 'wb') as f:
                pickle.dump(best_model, f)
            result="model.dmp"
        else:
            result="N/A"
    
    if train_or_test != 'train':
        if ext == 'pk':
            test_features = pd.read_pickle(path + 'test.pk')
        else:
            test_features = pd.read_csv(path + 'test.csv').fillna("")
    
        if train_or_test == 'test':        
            with open(path+'model.dmp', 'rb') as f:
                best_model = pickle.load(f)

        out_id = None
        if id in test_features:
            out_id = test_features[id]
            test_features = test_features.drop([id], axis=1)

        predictions = best_model.predict(test_features)
      
        if out_id is not None:
            result = pd.DataFrame({id : out_id, fit_y : predictions})
        else:
            result = pd.DataFrame({"ID": test_features.index.tolist(), fit_y : predictions})
        result.to_csv(path+"result.csv", index=False)
    
        result="result.csv"
    
    if result == 'N/A':
        return 'none'
    else:
        return path+result
Example #7
def main(argv):

    if len(argv) < 5:
        sys.exit(
            'please provide\n the path to your data sets;\n train or test or both keyword;\n data file(s) extension (pk or csv);\n \
what column to fit if train mode is set/how to call output if test mode is set;\n \
id column for the final result;\n \
optional:\n \
simple fit or grid search;\n \
save the model created\n')

    path = argv[1]
    if not path.endswith('/'):
        path = path + '/'
    train_or_test = argv[2]
    ext = argv[3]
    fit_y = argv[4]
    id = 'none'
    if len(argv) > 5:
        id = argv[5]
    simple_fit = 0
    dump_model = 0
    if len(argv) > 6:
        if (argv[6] == 'simple'):
            simple_fit = 1
            if (len(argv) > 7 and argv[7] == 'save'):
                dump_model = 1
        elif (argv[6] == 'save'):
            dump_model = 1

    if train_or_test != 'test':
        if ext == 'pk':
            train_features = pd.read_pickle(path + 'train.pk')
        else:
            train_features = pd.read_csv(path + 'train.csv').fillna("")
        if fit_y not in train_features:
            sys.exit(
                fit_y +
                ' not found in the provided data set, verify your data and try again'
            )
        y = train_features[fit_y]
        train_features = train_features.drop([fit_y], axis=1)
        if id in train_features:
            train_features = train_features.drop([id], axis=1)

        # -- the model
        svd = TruncatedSVD()
        scl = StandardScaler()
        model = LinearSVC()
        pip = pipeline.Pipeline([('svd', svd), ('scl', scl), ('svm', model)])

        if simple_fit:
            X_train, X_test, y_train, y_test = train_test_split(train_features,
                                                                y,
                                                                test_size=0.1,
                                                                random_state=0)

            pip.fit(X_train, y_train)
            predicted = pip.predict(X_test)

            sc = quadratic_weighted_kappa(y_test, predicted)
            print("score: %0.3f" % sc)
            best_model = pip
        else:
            # -- Grid parameter search
            param_grid = {'svd__n_components': [2, 3], 'svm__C': [5, 10]}

            scorer = make_scorer(quadratic_weighted_kappa,
                                 greater_is_better=True)

            model = grid_search.GridSearchCV(estimator=pip,
                                             param_grid=param_grid,
                                             scoring=scorer,
                                             verbose=10,
                                             n_jobs=-1,
                                             iid=True,
                                             refit=True,
                                             cv=3)
            model.fit(train_features, y)

            print("Best score: %0.3f" % model.best_score_)
            print("Best parameters set:")
            best_parameters = model.best_estimator_.get_params()
            for param_name in sorted(param_grid.keys()):
                print("\t%s: %r" % (param_name, best_parameters[param_name]))

            best_model = model.best_estimator_
            best_model.fit(train_features, y)

        if dump_model:
            with open(path + 'model.dmp', 'wb') as f:
                pickle.dump(best_model, f)
            result = "model.dmp"
        else:
            result = "N/A"

    if train_or_test != 'train':
        if ext == 'pk':
            test_features = pd.read_pickle(path + 'test.pk')
        else:
            test_features = pd.read_csv(path + 'test.csv').fillna("")

        if train_or_test == 'test':
            with open(path + 'model.dmp', 'rb') as f:
                best_model = pickle.load(f)

        out_id = None
        if id in test_features:
            out_id = test_features[id]
            test_features = test_features.drop([id], axis=1)

        predictions = best_model.predict(test_features)

        if out_id is not None:
            result = pd.DataFrame({id: out_id, fit_y: predictions})
        else:
            result = pd.DataFrame({
                "ID": test_features.index.tolist(),
                fit_y: predictions
            })
        result.to_csv(path + "result.csv", index=False)

        result = "result.csv"

    if result == 'N/A':
        return 'none'
    else:
        return path + result
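Examples #6 and #7 use the legacy sklearn.grid_search module, which was removed in scikit-learn 0.20, and the iid argument, which was removed in 0.24. Under a current scikit-learn the equivalent construction is:

from sklearn.model_selection import GridSearchCV  # replaces sklearn.grid_search

model = GridSearchCV(estimator=pip,
                     param_grid=param_grid,
                     scoring=scorer,
                     verbose=10,
                     n_jobs=-1,
                     refit=True,  # 'iid' no longer exists and is simply dropped
                     cv=3)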
Example #8
def train_cnn(run_name, trn_x, val_x, trn_y, val_y, cfg):

    train_loader = factory.get_dataloader(trn_x, trn_y, cfg.data.train)
    valid_loader = factory.get_dataloader(val_x, val_y, cfg.data.valid)

    model = factory.get_model(cfg).to(device)
    
    criterion = factory.get_loss(cfg)
    optimizer = factory.get_optim(cfg, model.parameters())
    scheduler = factory.get_scheduler(cfg, optimizer)

    best_epoch = -1
    best_val_score = -np.inf
    best_coef = []
    mb = master_bar(range(cfg.data.train.epochs))

    train_loss_list = []
    val_loss_list = []
    val_score_list = []
    initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

    for epoch in mb:
        start_time = time.time()

        model, avg_loss = train_epoch(model, train_loader, criterion, optimizer, mb, cfg)

        valid_preds, avg_val_loss = val_epoch(model, valid_loader, criterion, cfg)

        if cfg.model.n_classes > 1:
            val_score = quadratic_weighted_kappa(val_y, valid_preds.argmax(1))
            cm = confusion_matrix(val_y, valid_preds.argmax(1))
        else:
            optR = QWKOptimizedRounder()
            optR.fit(valid_preds.copy(), val_y, initial_coef)
            coef = optR.coefficients()
            valid_preds_class = optR.predict(valid_preds.copy(), coef)
            val_score = quadratic_weighted_kappa(val_y, valid_preds_class)
            cm = confusion_matrix(val_y, valid_preds_class)
        
        # cm = np.round(cm / np.sum(cm, axis=1, keepdims=True), 3)

        train_loss_list.append(avg_loss)
        val_loss_list.append(avg_val_loss)
        val_score_list.append(val_score)

        if cfg.scheduler.name == 'ReduceLROnPlateau':
            scheduler.step(avg_val_loss)
        else:
            scheduler.step()
        
        elapsed = time.time() - start_time
        mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s')
        logging.debug(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} val_score: {val_score:.4f} time: {elapsed:.0f}s')

        if val_score > best_val_score:
            best_epoch = epoch + 1
            best_val_score = val_score
            best_valid_preds = valid_preds
            if cfg.model.multi_gpu:
                best_model = model.module.state_dict()
            else:
                best_model = model.state_dict()
            if cfg.model.n_classes == 1:
                best_coef = coef
            best_cm = cm

    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(best_cm)  # the matrix from the best epoch, matching the saved weights
    logging.debug(best_cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    print(f'BEST EPOCH: {best_epoch}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    logging.debug(f'BEST EPOCH: {best_epoch}\n\n')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/oof.npy', best_valid_preds)
    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)
    torch.save(best_model, f'../logs/{run_name}/weight_best.pt')
    save_png(run_name, cfg, train_loss_list, val_loss_list, val_score_list)
    
    return result
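QWKOptimizedRounder is not defined on this page. A common reconstruction from Kaggle kernels (a sketch, not necessarily the original implementation) searches for the cut points that maximize quadratic weighted kappa with Nelder-Mead:

import numpy as np
import scipy.optimize as opt
from sklearn.metrics import cohen_kappa_score

class QWKOptimizedRounder:
    def __init__(self):
        self.coef_ = None

    def _kappa_loss(self, coef, X, y):
        # Negative kappa, since scipy minimizes.
        return -cohen_kappa_score(y, self.predict(X, coef), weights='quadratic')

    def fit(self, X, y, initial_coef):
        self.coef_ = opt.minimize(self._kappa_loss, initial_coef, args=(X, y),
                                  method='nelder-mead').x

    def predict(self, X, coef):
        # A prediction's class is the number of cut points it exceeds.
        return np.digitize(np.ravel(X), np.sort(coef))

    def coefficients(self):
        return self.coef_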
Example #9
def train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg):

    # np.zeros_like on an integer DataFrame yields an int array that would
    # truncate the sigmoid outputs assigned below, so allocate floats.
    ordinal_val_preds = np.zeros(val_y.shape, dtype=np.float64)

    for i, col in enumerate(trn_y.columns[1:]):
        print(f'\n\n====================  {col}  ====================')
        logging.debug(f'\n\n====================  {col}  ====================')

        train_loader = factory.get_dataloader(trn_x, trn_y[col], cfg.data.train)
        valid_loader = factory.get_dataloader(val_x, val_y[col], cfg.data.valid)

        model = factory.get_model(cfg).to(device)
        
        criterion = factory.get_loss(cfg)
        optimizer = factory.get_optim(cfg, model.parameters())
        scheduler = factory.get_scheduler(cfg, optimizer)

        best_epoch = -1
        best_val_loss = np.inf
        mb = master_bar(range(cfg.data.train.epochs))

        train_loss_list = []
        val_loss_list = []
        val_score_list = []
        initial_coef = [0.5, 1.5, 2.5, 3.5, 4.5]

        for epoch in mb:
            start_time = time.time()

            model, avg_loss = train_epoch(model, train_loader, criterion, optimizer, mb, cfg)

            valid_preds, avg_val_loss = val_epoch(model, valid_loader, criterion, cfg)

            train_loss_list.append(avg_loss)
            val_loss_list.append(avg_val_loss)

            if cfg.scheduler.name == 'ReduceLROnPlateau':
                scheduler.step(avg_val_loss)
            else:
                scheduler.step()

            elapsed = time.time() - start_time
            mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')
            logging.debug(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f} time: {elapsed:.0f}s')

            if avg_val_loss < best_val_loss:
                best_epoch = epoch + 1
                best_val_loss = avg_val_loss
                best_valid_preds = valid_preds
                if cfg.model.multi_gpu:
                    best_model = model.module.state_dict()
                else:
                    best_model = model.state_dict()

        print(f'epoch: {best_epoch}   loss: {best_val_loss}')

        ordinal_val_preds[:, i] = 1 / (1 + np.exp(-1 * best_valid_preds))

        np.save(f'../logs/{run_name}/oof_{col}.npy', best_valid_preds)
        torch.save(best_model, f'../logs/{run_name}/weight_best_{col}.pt')

    valid_preds = np.sum(ordinal_val_preds, axis=1)
    val_y = (np.sum(val_y.values, axis=1) - 1).astype(int)

    optR = QWKOptimizedRounder()
    optR.fit(valid_preds.copy(), val_y, initial_coef)
    best_coef = optR.coefficients()
    valid_preds_class = optR.predict(valid_preds.copy(), best_coef)
    best_val_score = quadratic_weighted_kappa(val_y, valid_preds_class)
    cm = confusion_matrix(val_y, valid_preds_class)

    print('\n\nCONFUSION MATRIX')
    logging.debug('\n\nCONFUSION MATRIX')
    print(cm)
    logging.debug(cm)

    print('\n\n===================================\n')
    print(f'CV: {best_val_score:.6f}')
    logging.debug(f'\n\nCV: {best_val_score:.6f}')
    print('\n===================================\n\n')

    result = {
        'cv': best_val_score,
    }

    np.save(f'../logs/{run_name}/best_coef.npy', best_coef)
    
    return result
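Example #9 assumes an ordinal (cumulative) label encoding: the code is consistent with each grade g in {0..4} being stored as the five binary columns [1, g>=1, g>=2, g>=3, g>=4], which is why the training loop skips the always-1 first column and why summing the target columns minus 1 recovers the grade. On the prediction side, the raw sum of sigmoid probabilities is left continuous for QWKOptimizedRounder to threshold. A sketch of the assumed encoding:

import numpy as np

def encode_ordinal(grade, n_classes=5):
    # Grade g becomes [1, g >= 1, g >= 2, g >= 3, g >= 4].
    return (np.arange(n_classes) <= grade).astype(int)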
Example #10
def train(net,
          net_size,
          input_size,
          feature_dim,
          train_dataset,
          val_dataset,
          epochs,
          learning_rate,
          batch_size,
          save_path,
          pretrained_model=None):
    # create dataloader
    train_targets = [sampler[1] for sampler in train_dataset.imgs]
    weighted_sampler = ScheduledWeightedSampler(len(train_dataset),
                                                train_targets, 0.975, True)
    train_loader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              sampler=weighted_sampler,
                              drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # define model
    model = net(net_size, input_size, feature_dim).cuda()
    print_msg('Trainable layers: ',
              ['{}\t{}'.format(k, v) for k, v in model.layer_configs()])

    # load pretrained weights
    if pretrained_model:
        pretrained_dict = model.load_weights(pretrained_model, ['fc', 'dense'])
        print_msg('Loaded weights from {}: '.format(pretrained_model),
                  sorted(pretrained_dict.keys()))

    # define loss and optimizier
    MSELoss = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=learning_rate,
                                momentum=0.9,
                                nesterov=True,
                                weight_decay=0.0005)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0005)

    # learning rate warmup and decay
    milestones = [160, 230]
    warmup_epoch = 10
    warmup_batch = len(train_loader) * warmup_epoch  # len(train_loader) already counts batches per epoch

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                        milestones=milestones,
                                                        gamma=0.1)
    warmup_scheduler = WarmupLRScheduler(optimizer, warmup_batch,
                                         learning_rate)

    # train
    max_kappa = 0
    record_epochs, accs, losses = [], [], []
    model.train()
    for epoch in range(1, epochs + 1):
        # resampling weight update
        weighted_sampler.step()

        # learning rate update
        lr_scheduler.step()
        if epoch in milestones:
            curr_lr = optimizer.param_groups[0]['lr']
            print_msg('Learning rate decayed to {}'.format(curr_lr))
        if epoch > 1 and epoch <= warmup_epoch:
            curr_lr = optimizer.param_groups[0]['lr']
            print_msg('Learning rate warmup to {}'.format(curr_lr))

        epoch_loss = 0
        correct = 0
        total = 0
        progress = tqdm(enumerate(train_loader))
        for step, train_data in progress:
            if epoch <= warmup_epoch:
                warmup_scheduler.step()

            X, y = train_data
            X, y = X.cuda(), y.float().cuda()

            # forward
            y_pred = model(X)
            loss = MSELoss(y_pred, y)

            # backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # metrics
            epoch_loss += loss.item()
            total += y.size(0)
            correct += accuracy(y_pred, y) * y.size(0)
            avg_loss = epoch_loss / (step + 1)
            avg_acc = correct / total
            progress.set_description(
                'epoch: {}, loss: {:.6f}, acc: {:.4f}'.format(
                    epoch, avg_loss, avg_acc))

        # save model
        c_matrix = np.zeros((5, 5), dtype=int)
        acc = _eval(model, val_loader, c_matrix)
        kappa = quadratic_weighted_kappa(c_matrix)
        print('validation accuracy: {}, kappa: {}'.format(acc, kappa))
        if kappa > max_kappa:
            torch.save(model, save_path)
            max_kappa = kappa
            print_msg('Model saved at {}'.format(save_path))

        # record
        record_epochs.append(epoch)
        accs.append(acc)
        losses.append(avg_loss)

    return record_epochs, accs, losses
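WarmupLRScheduler in Example #10 is stepped once per batch during the first warmup_epoch epochs. A minimal linear-warmup sketch consistent with that usage (an assumption; the original class is not shown on this page):

class WarmupLRScheduler:
    # Linearly ramp the learning rate up to target_lr over warmup_steps
    # batches; afterwards step() is a no-op and MultiStepLR takes over.
    def __init__(self, optimizer, warmup_steps, target_lr):
        self.optimizer = optimizer
        self.warmup_steps = warmup_steps
        self.target_lr = target_lr
        self.batch_idx = 0

    def step(self):
        if self.batch_idx < self.warmup_steps:
            self.batch_idx += 1
            lr = self.target_lr * self.batch_idx / self.warmup_steps
            for group in self.optimizer.param_groups:
                group['lr'] = lr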