Example #1
def evaluate(model, test_loader, device, criterion, n_class=41):
    total_iou = 0.0
    total_num_data = 0.0
    total_loss = 0.0
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['audio']
            xlabel = data['label']
            num_data = len(xlabel[0])
            x = x.to(device)
            xlabel_enc = two_hot_encode(xlabel[0], xlabel[1], n_dim=n_class)
            xlabel_enc = xlabel_enc.to(device)
            out = model(x)

            logit, pred = out
            loss = criterion(logit, xlabel_enc)

            total_iou += calculate_iou(pred, xlabel)
            total_num_data += num_data
            total_loss += loss.item()
        del x, xlabel

    torch.cuda.empty_cache()
    logger.info('test loss: {loss:.4f}\ttest iou: {iou:.4f}'.format(
        loss=total_loss / (i + 1), iou=total_iou / total_num_data))
    return total_loss / (i + 1), total_iou / total_num_data
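Examples #1 and #6 rely on two helpers that are not shown on this page, two_hot_encode and calculate_iou. A minimal sketch of what they could look like, assuming each sample carries exactly two tag indices (xlabel[0], xlabel[1]) and the model emits a 0/1 tag vector per sample; this is an illustration, not the original implementation:

import torch

def two_hot_encode(labels_a, labels_b, n_dim=41):
    # Hypothetical: build a multi-hot target with both label positions of
    # each sample set to 1.
    encoded = torch.zeros(len(labels_a), n_dim)
    for row, (a, b) in enumerate(zip(labels_a, labels_b)):
        encoded[row, int(a)] = 1.0
        encoded[row, int(b)] = 1.0
    return encoded

def calculate_iou(pred, xlabel):
    # Hypothetical: intersection over union between the predicted tag set
    # (pred assumed to be a batch of 0/1 vectors) and the true tag pair,
    # summed over the batch.
    total = 0.0
    for p, a, b in zip(pred, xlabel[0], xlabel[1]):
        pred_set = {int(t) for t in torch.nonzero(p).flatten()}
        true_set = {int(a), int(b)}
        total += len(pred_set & true_set) / max(len(pred_set | true_set), 1)
    return total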
Example #2
def evaluate(model, test_loader, device, criterion):
    correct = 0.0
    num_data = 0.0
    total_loss = 0.0
    label = []
    prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image']
            xlabel = data['label']
            x = x.to(device)
            xlabel = xlabel.to(device)
            out = model(x)

            logit, pred = out
            loss = criterion(logit, xlabel)

            correct += torch.sum(pred == xlabel).item()
            num_data += xlabel.size(0)
            total_loss += loss.item()
            label = label + xlabel.tolist()
            prediction = prediction + pred.detach().cpu().tolist()
        del x, xlabel

    torch.cuda.empty_cache()

    f1_array = f1_score(label, prediction, average=None)
    f1_mean = gmean(f1_array)
    logger.info(
        'test loss: {loss:.4f}\ttest acc: {acc:.4f}\ttest F1: {f1:.4f}'.format(
            loss=total_loss / (i + 1), acc=correct / num_data, f1=f1_mean))
    return total_loss / (i + 1), correct / num_data, f1_mean
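The test metric here is the geometric mean of the per-class F1 scores (sklearn's f1_score with average=None combined with scipy.stats.gmean), which drops sharply if any single class is predicted poorly. A tiny self-contained illustration:

from scipy.stats import gmean
from sklearn.metrics import f1_score

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 1, 1, 1, 2, 0]
per_class_f1 = f1_score(y_true, y_pred, average=None)  # one score per class
print(per_class_f1, gmean(per_class_f1))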
Example #3
def unclassified_predict(model, unclassified_loader, device, n_class=5):
    predictedData = [[] for i in range(n_class)]
    lenul = len(unclassified_loader)
    x2 = None
    with torch.no_grad():
        for i, data in enumerate(unclassified_loader):
            img_name = data['image_name']
            x = data['image']
            if "Ensemble" in model.name:
                x2 = data['image_2']
                x2 = x2.to(device)
            category_oneh = data['category_onehot']
            category = data['category']

            category = category.to(device)
            category_oneh = category_oneh.to(device)
            x = x.to(device)

            if 'Ensemble' in model.name:
                out = model(x, x2, category_oneh, category)
            elif 'Trainable' in model.name:
                out = model(x, category_oneh, category)
            else:
                out = model(x, category_oneh)
            logit, pred = out

            for item in zip(img_name, pred, logit):
                predict = int(item[1])
                predictedData[predict].append(
                    (float(item[2][predict]), item[0],
                     predict))  # prob, fname, predict

            if i % 100 == 0:  # log progress
                logger.info(f'predict unclassified data {i} / {lenul}')
    return predictedData
def embedding_training(model, train_loader, optimizer, criterion, device,
                       epoch, total_epochs):
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0

    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        category_pos = data['category_possible']
        category = data['category']
        category_oneh = data['category_onehot']

        x = x.to(device)
        xlabel = xlabel.to(device)
        category = category.to(device)
        category_pos = category_pos.to(device)
        category_oneh = category_oneh.to(device)

        optimizer.zero_grad()  # think of step() and zero_grad() as a pair

        out = model(x, category_oneh, category)
        logit, pred = out

        if isinstance(criterion, torch.nn.CrossEntropyLoss):
            loss = criterion(logit, xlabel)
        elif isinstance(criterion, LabelSmoothingLoss):
            loss = criterion(logit, xlabel, category_pos)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()

        category_pred = torch.argmax(logit * category_pos, dim=-1)
        category_correct += torch.sum(category_pred == xlabel).item()

        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)
        if i % 100 == 0:  # print every 100 mini-batches
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / 100,
                       time.time() - start))
            running_loss = 0.0

    logger.info(
        '[{}/{}]\tloss: {:.4f}\tacc: {:.4f} \tcategory_acc : {:.4f}'.format(
            epoch, total_epochs, total_loss / (i + 1), correct / num_data,
            category_correct / num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
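embedding_training switches between a plain CrossEntropyLoss and a custom LabelSmoothingLoss that also takes a category_possible mask. The original class is not shown here; a hedged sketch of one way such a loss could work, smoothing the target only over the classes flagged as possible:

import torch
import torch.nn as nn
import torch.nn.functional as F

class LabelSmoothingLoss(nn.Module):
    # Hypothetical reconstruction: distribute the smoothing mass only over
    # the classes marked in `category_possible` (a 0/1 mask per sample).
    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, logit, target, category_possible):
        log_prob = F.log_softmax(logit, dim=-1)
        n_possible = category_possible.sum(dim=-1, keepdim=True).clamp(min=1)
        smooth_target = category_possible * self.smoothing / n_possible
        smooth_target.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        return torch.mean(torch.sum(-smooth_target * log_prob, dim=-1))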
Example #5
    def rotate_checkpoints(use_mtime=False) -> None:
        if training_args.save_total_limit is None or training_args.save_total_limit <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = sorted_checkpoints(use_mtime=use_mtime)
        if len(checkpoints_sorted) <= training_args.save_total_limit:
            return

        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - training_args.save_total_limit)
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
            shutil.rmtree(checkpoint)
Example #6
def train(model,
          train_loader,
          optimizer,
          criterion,
          device,
          epoch,
          total_epochs,
          n_class=41):
    running_loss = 0.0
    total_loss = 0.0
    total_iou = 0.0
    total_num_data = 0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['audio']
        xlabel = data['label']
        num_data = len(xlabel[0])
        xlabel_enc = two_hot_encode(xlabel[0], xlabel[1], n_dim=n_class)
        x = x.to(device)
        xlabel_enc = xlabel_enc.to(device)

        optimizer.zero_grad()  # think of step() and zero_grad() as a pair
        out, pred = model(x)

        logit = out
        loss = criterion(logit, xlabel_enc)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        total_iou += calculate_iou(pred, xlabel)
        total_num_data += num_data
        if i % 100 == 0:  # print every 100 mini-batches
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / 100,
                       time.time() - start))
            running_loss = 0.0

    logger.info('[{}/{}]\tloss: {:.4f}\tiou: {:.4f}'.format(
        epoch, total_epochs, total_loss / (i + 1), total_iou / total_num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), total_iou / total_num_data
Example #7
def select_samples(labeled_file,
                   unlabeled_file,
                   o_labeled_file,
                   o_unlabeled_file,
                   query_strategy='random'):
    labeled_file = f'labeled_{labeled_file}.json'
    unlabeled_file = f'unlabeled_{unlabeled_file}.json'
    o_labeled_file = f'labeled_{o_labeled_file}.json'
    o_unlabeled_file = f'unlabeled_{o_unlabeled_file}.json'

    low_cnt = 5740

    d_unlabeled = json.load((Path(data_dir) / unlabeled_file).open())
    d_labeled = json.load((Path(data_dir) / labeled_file).open())

    # mid = len(d_unlabeled) // 2
    # span = (len(d_labeled) + len(d_unlabeled)) // 10
    # start = max(0, mid - span)
    # end = mid + span

    d_unlabeled = sorted(d_unlabeled, key=lambda x: float(x['score']))

    if query_strategy != 'random':
        low_samples, d_unlabeled = d_unlabeled[:low_cnt], d_unlabeled[low_cnt:]
        random_samples = []
    else:
        low_samples = []
        random_samples = random.sample(d_unlabeled, low_cnt)

    for _ in random_samples:
        d_unlabeled.remove(_)

    d_to_add = low_samples + random_samples

    def trans(data):
        for dic in data:
            dic['label'] = dic['true_label']
            dic.pop('pred_label')
            dic.pop('true_label')
            dic.pop('score')

    trans(d_to_add)
    trans(d_unlabeled)

    json.dump(d_to_add + d_labeled,
              (Path(data_dir) / o_labeled_file).open('w'),
              ensure_ascii=False,
              indent=2)
    json.dump(d_unlabeled, (Path(data_dir) / o_unlabeled_file).open('w'),
              ensure_ascii=False,
              indent=2)

    logger.info(f'input: {labeled_file} - {unlabeled_file}')
    logger.info(f'd_to_add size: {len(d_to_add)}')
    logger.info(f'd_as_unlabeled size: {len(d_unlabeled)}')
    logger.info(f'd_to_add + d_labeled size: {len(d_to_add) + len(d_labeled)}')
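select_samples assumes each record in the unlabeled JSON file carries pred_label, true_label and score fields; after selection only the text and the restored label remain. An illustration of the assumed layout (the text key and the call arguments below are placeholders, not taken from the source):

# Hypothetical record layout consumed by select_samples; only the field
# names used in the function body come from the source code.
unlabeled_record = {
    'text': 'some user utterance',   # assumed payload key
    'pred_label': 'intent_a',
    'true_label': 'intent_b',
    'score': '0.4321',
}
# Example call (file name stems and strategy value are placeholders):
# select_samples('round1', 'round1', 'round2', 'round2', query_strategy='low_score')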
Example #8
def train_process(args, model, train_loader, test_loader, optimizer, criterion,
                  device):
    best_acc = 0.0
    for epoch in range(args.num_epoch):
        model.train()
        train_loss, train_acc = train(model=model,
                                      train_loader=train_loader,
                                      optimizer=optimizer,
                                      criterion=criterion,
                                      device=device,
                                      epoch=epoch,
                                      total_epochs=args.num_epoch)
        model.eval()
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)

        report_dict = dict()
        report_dict["train__loss"] = train_loss
        report_dict["train__acc"] = train_acc
        report_dict["test__loss"] = test_loss
        report_dict["test__acc"] = test_acc
        report_dict["test__f1"] = test_f1
        report_dict["train__lr"] = optimizer.param_groups[0]['lr']
        nsml.report(False, step=epoch, **report_dict)
        if best_acc < test_acc:
            checkpoint = 'best'
            logger.info(
                f'[{epoch}] Find the best model! Change the best model.')
            nsml.save(checkpoint)
            best_acc = test_acc
        if (epoch + 1) % 5 == 0:
            checkpoint = f'ckpt_{epoch + 1}'
            nsml.save(checkpoint)

        if (epoch + 1) % args.annealing_period == 0:
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] / args.learning_anneal
            logger.info(
                'Learning rate annealed to : {lr:.6f} @epoch{epoch}'.format(
                    epoch=epoch, lr=optimizer.param_groups[0]['lr']))
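The annealing block above divides the learning rate by learning_anneal every annealing_period epochs, i.e. lr after epoch e is lr_0 / learning_anneal**((e + 1) // annealing_period). A quick standalone check of that schedule with the defaults from Example #12 (lr=1e-2, learning_anneal=1.1, annealing_period=10):

lr, learning_anneal, annealing_period = 1e-2, 1.1, 10
for epoch in range(30):
    if (epoch + 1) % annealing_period == 0:
        lr = lr / learning_anneal
        print(f'epoch {epoch}: lr annealed to {lr:.6f}')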
Example #9
def train(model, train_loader, optimizer, criterion, device, epoch,
          total_epochs):
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    num_data = 0.0
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        x = x.to(device)
        xlabel = xlabel.to(device)

        optimizer.zero_grad()  # think of step() and zero_grad() as a pair
        out = model(x)

        logit, pred = out
        loss = criterion(logit, xlabel)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()
        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)
        if i % 100 == 0:  # print every 100 mini-batches
            logger.info(
                "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec".
                format(epoch, total_epochs, i, len(train_loader),
                       running_loss / 100,
                       time.time() - start))
            running_loss = 0.0

    logger.info('[{}/{}]\tloss: {:.4f}\tacc: {:.4f}'.format(
        epoch, total_epochs, total_loss / (i + 1), correct / num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
Example #10
    def save_model(output_dir, model):
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f'Saving model checkpoint to {output_dir}')

        model_to_save = model.module if hasattr(model, 'module') else model

        model_to_save.config.architectures = [model_to_save.__class__.__name__]  # what is architectures for?

        output_model_file = os.path.join(output_dir, 'pytorch.bin')
        torch.save(model_to_save.state_dict(), output_model_file)
        logger.info(f'Model weights saved in {output_model_file}')

        output_config_file = os.path.join(output_dir, 'config.json')
        model_to_save.config.to_json_file(output_config_file)
        logger.info(f'Configuration saved in {output_config_file}')

        torch.save(training_args, os.path.join(output_dir, 'training_args.bin'))
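Because the weights are written to pytorch.bin rather than pytorch_model.bin, from_pretrained() will not pick this checkpoint up on its own. A hedged reload sketch, assuming the model class matches the one that was saved:

import os
import torch
from transformers import AutoConfig, BertForSequenceClassification  # assumed model class

def load_saved_model(output_dir, device='cpu'):
    # Rebuild the model from the saved config, then load the raw state dict.
    config = AutoConfig.from_pretrained(os.path.join(output_dir, 'config.json'))
    model = BertForSequenceClassification(config)
    state_dict = torch.load(os.path.join(output_dir, 'pytorch.bin'), map_location=device)
    model.load_state_dict(state_dict)
    return model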
Example #11
def evaluate(model, test_loader, device, criterion):
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    total_loss = 0.0
    cat2correct = 0.0
    label = []
    prediction = []
    cat_prediction = []
    cat2_prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image']
            xlabel = data['label']
            category_pos = data['category_possible']
            category_oneh = data['category_onehot']
            cat2possible = data['cat2possible']

            cat2possible = cat2possible.to(device)
            category_pos = category_pos.to(device)
            category_oneh = category_oneh.to(device)
            x = x.to(device)
            xlabel = xlabel.to(device)

            out = model(x, category_oneh)

            logit, pred = out
            if isinstance(criterion, torch.nn.CrossEntropyLoss):
                loss = criterion(logit, xlabel)
            elif isinstance(criterion, LabelSmoothingLoss):
                loss = criterion(logit, xlabel, category_pos)

            correct += torch.sum(pred == xlabel).item()

            # category_pred = torch.argmax(logit*category_pos, dim=-1)
            # category_correct += torch.sum(category_pred == xlabel).item()

            cat2pred = torch.argmax(logit * cat2possible, dim=-1)
            cat2correct += torch.sum(cat2pred == xlabel).item()

            num_data += xlabel.size(0)
            total_loss += loss.item()
            label = label + xlabel.tolist()

            prediction = prediction + pred.detach().cpu().tolist()
            # cat_prediction = cat_prediction + category_pred.cpu().tolist()
            cat2_prediction = cat2_prediction + cat2pred.cpu().tolist()
        del x, xlabel

    torch.cuda.empty_cache()

    confusion = confusion_matrix(label, cat2_prediction)
    confusion_norm = confusion_matrix(label, cat2_prediction, normalize='true')
    logger.info(f'\n{confusion}')
    logger.info(f'\n{confusion_norm}')

    f1_array = f1_score(label, cat2_prediction, average=None)

    logger.info(f"f1 score : {f1_array}")
    f1_mean = gmean(f1_array)
    logger.info(
        'validation loss: {loss:.4f}\tvalidation acc: {acc:.4f}\tvalidation category acc: {cat_acc:.4f}\tvalidation F1: {f1:.4f}'
        .format(loss=total_loss / (i + 1),
                acc=correct / num_data,
                f1=f1_mean,
                cat_acc=cat2correct / num_data))
    return total_loss / (i + 1), correct / num_data, f1_mean
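Note that torch.argmax(logit * cat2possible, dim=-1) can still pick a masked-out class whenever every allowed logit is negative, because the zeros introduced by the mask then win the argmax. If that matters, a safer masked argmax fills the impossible classes with -inf instead:

import torch

def masked_argmax(logit, possible_mask):
    # possible_mask: 0/1 tensor with the same shape as logit.
    masked = logit.masked_fill(possible_mask == 0, float('-inf'))
    return torch.argmax(masked, dim=-1)

# cat2pred = masked_argmax(logit, cat2possible)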
Example #12
def main():
    # Argument Settings
    parser = argparse.ArgumentParser(
        description='Image Tagging Classification from Naver Shopping Reviews')
    parser.add_argument('--sess_name',
                        default='example',
                        type=str,
                        help='Session name that is loaded')
    parser.add_argument('--checkpoint',
                        default='best',
                        type=str,
                        help='Checkpoint')
    parser.add_argument('--batch_size',
                        default=256,
                        type=int,
                        help='batch size')
    parser.add_argument('--num_workers',
                        default=16,
                        type=int,
                        help='The number of workers')
    parser.add_argument('--num_epoch',
                        default=100,
                        type=int,
                        help='The number of epochs')
    parser.add_argument('--model_name',
                        default='mobilenet_v2',
                        type=str,
                        help='[resnet50, rexnet, dnet1244, dnet1222]')
    parser.add_argument('--weight_file', default='model.pth', type=str)
    parser.add_argument('--optimizer', default='SGD', type=str)
    parser.add_argument('--lr', default=1e-2, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--learning_anneal', default=1.1, type=float)
    parser.add_argument('--annealing_period', default=10, type=int)
    parser.add_argument('--num_gpu', default=1, type=int)
    parser.add_argument('--pretrain', action='store_true', default=False)
    parser.add_argument('--mode', default='train', help='Mode')
    parser.add_argument('--pause', default=0, type=int)
    parser.add_argument('--iteration', default=0, type=str)
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Model
    logger.info('Build Model')
    model = select_model(args.model_name, pretrain=args.pretrain, n_class=41)
    total_param = sum([p.numel() for p in model.parameters()])
    logger.info(f'Model size: {total_param} parameters')
    load_weight(model, args.weight_file)
    model = model.to(device)

    nu.bind_model(model)
    nsml.save('best')

    if args.pause:
        nsml.paused(scope=locals())

    if args.num_epoch == 0:
        return

    # Set the dataset
    logger.info('Set the dataset')
    df = pd.read_csv(f'{DATASET_PATH}/train/train_label')
    train_size = int(len(df) * 0.8)

    trainset = TagImageDataset(data_frame=df[:train_size],
                               root_dir=f'{DATASET_PATH}/train/train_data',
                               transform=train_transform)
    testset = TagImageDataset(data_frame=df[train_size:],
                              root_dir=f'{DATASET_PATH}/train/train_data',
                              transform=test_transform)

    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = select_optimizer(model.parameters(), args.optimizer, args.lr,
                                 args.weight_decay)

    criterion = criterion.to(device)

    if args.mode == 'train':
        logger.info('Start to train!')
        train_process(args=args,
                      model=model,
                      train_loader=train_loader,
                      test_loader=test_loader,
                      optimizer=optimizer,
                      criterion=criterion,
                      device=device)

    elif args.mode == 'test':
        nsml.load(args.checkpoint, session=args.sess_name)
        logger.info('[NSML] Model loaded from {}'.format(args.checkpoint))

        model.eval()
        logger.info('Start to test!')
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)
        logger.info(f'test loss: {test_loss:.4f} | test acc: {test_acc:.4f} | test F1: {test_f1:.4f}')
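select_optimizer is not defined on this page; a minimal sketch matching the call select_optimizer(model.parameters(), args.optimizer, args.lr, args.weight_decay). This is hypothetical; the original may support more optimizers or different momentum settings:

import torch.optim as optim

def select_optimizer(params, optimizer_name, lr, weight_decay):
    if optimizer_name == 'SGD':
        return optim.SGD(params, lr=lr, momentum=0.9, weight_decay=weight_decay)
    if optimizer_name == 'Adam':
        return optim.Adam(params, lr=lr, weight_decay=weight_decay)
    raise ValueError(f'Unsupported optimizer: {optimizer_name}')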
Example #13
from vaal.solver import VAE, Discriminator
from vaal.training_args import TrainingArguments

num_init_samples = 500
pool_batch_size = 128


@dataclass
class DataTrainingArguments:
    max_seq_length: int = field(default=200)


parser = HfArgumentParser((DataTrainingArguments, TrainingArguments))
data_args, training_args = parser.parse_args_into_dataclasses()

logger.info(f'n_gpu: {training_args.n_gpu}')


#######################################
# data prepare
#######################################
class IntentDataset(data.Dataset):
    def __init__(self, file_name):
        self.data, self.targets = zip(*[(_['text'], _['label'])
                                        for _ in json.load(file_name.open())
                                        if _['label'] != '负样本'])
        self._label2id()

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]
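The IntentDataset snippet is cut off before _label2id and __len__. A hedged completion consistent with how the class is used (label strings mapped to integer ids, length equal to the number of texts):

    # Hypothetical completion of the truncated class above.
    def _label2id(self):
        self.label2id = {label: idx for idx, label in enumerate(sorted(set(self.targets)))}
        self.targets = [self.label2id[label] for label in self.targets]

    def __len__(self):
        return len(self.data)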
Example #14
def train(train_generator, dev_generator, pool_generator, task_model, vae,
          discriminator, args):
    num_epoch = args.epoch_num
    device = args.device
    n_gpu = args.n_gpu
    beta = args.beta

    optim_task = optim.Adam(task_model.parameters(), lr=5e-5)
    optim_vae = optim.Adam(vae.parameters(), lr=5e-5)
    optim_discriminator = optim.Adam(discriminator.parameters(), lr=5e-5)

    task_model.zero_grad()

    best_epoch, best_acc = 0, 0
    for e in range(num_epoch):

        if args.check_debug and e > 0: break

        task_model.train()
        vae.train()
        discriminator.train()
        for idx, (labeled_batch, unlabeld_batch) in enumerate(
                zip(train_generator, pool_generator)):

            if args.check_debug and idx > 0:
                break

            raw_text = labeled_batch[-1]
            labeled_batch = [_.to(device) for _ in labeled_batch[:-1]]

            X_ids, Y_ids, V_ids, Mask = labeled_batch

            ####################
            # task model step
            ####################
            preds, task_loss = task_model(X_ids, Y_ids)
            if n_gpu > 0:
                task_loss = task_loss.mean()
            task_loss.backward()
            optim_task.step()
            task_model.zero_grad()

            #############
            # vae step
            #############
            recon, mu, logvar, z = vae(V_ids, Mask)
            vae_loss, mse_loss_value, kld_loss_value = vae_loss_func(
                V_ids, recon, mu, logvar, beta, Mask)
            if n_gpu > 0:
                vae_loss = vae_loss.mean()

            un_X_ids, _, un_V_ids, un_Mask = [
                _.to(device) for _ in unlabeld_batch[:-1]
            ]
            un_recon, un_mu, un_logvar, un_z = vae(un_V_ids, un_Mask)
            un_vae_loss, un_mse_loss_value, un_kld_loss_value = vae_loss_func(
                un_V_ids, un_recon, un_mu, un_logvar, beta, un_Mask)
            if n_gpu > 0:
                un_vae_loss = un_vae_loss.mean()

            labeled_pred = discriminator(mu)
            unlabeled_pred = discriminator(un_mu)
            labeled_real_target = torch.ones(X_ids.size()[0], device=device)
            unlabeled_real_target = torch.ones(un_X_ids.size()[0],
                                               device=device)
            dsc_loss_in_vae = bce_loss(
                labeled_pred, labeled_real_target) + bce_loss(
                    unlabeled_pred, unlabeled_real_target)
            if n_gpu > 0:
                dsc_loss_in_vae = dsc_loss_in_vae.mean()

            total_loss = vae_loss + un_vae_loss + dsc_loss_in_vae

            vae.zero_grad()
            total_loss.backward()
            optim_vae.step()

            #####################
            # discriminate step
            #####################
            mu_no_grad = mu.detach()
            un_mu_no_grad = un_mu.detach()

            labeled_pred = discriminator(mu_no_grad)
            unlabeled_pred = discriminator(un_mu_no_grad)

            labeled_real_target = torch.ones(X_ids.size()[0], device=device)
            unlabeled_fake_target = torch.zeros(un_X_ids.size()[0],
                                                device=device)

            dsc_loss = bce_loss(labeled_pred, labeled_real_target) + bce_loss(
                unlabeled_pred, unlabeled_fake_target)
            if n_gpu > 0:
                dsc_loss = dsc_loss.mean()

            discriminator.zero_grad()
            dsc_loss.backward()
            optim_discriminator.step()

            if idx % 10 == 0 and idx != 0:
                logger.info(
                    f'epoch: {e} - batch: {idx}/{len(train_generator)}')
                logger.info(f'task_model loss: {task_loss}')
                logger.info(f'labeled vae loss: {vae_loss}')
                logger.info(f'labeled mse loss: {mse_loss_value}')
                logger.info(f'labeled kld loss: {kld_loss_value}')
                logger.info(f'unlabeled vae loss: {un_vae_loss}')
                logger.info(f'unlabeled mse loss: {un_mse_loss_value}')
                logger.info(f'unlabeled kld loss: {un_kld_loss_value}')
                logger.info(f'dsc_loss_in_vae: {dsc_loss_in_vae}')
                logger.info(f'dsc_loss: {dsc_loss}')

        task_model.eval()

        correct = 0
        for idx, batch in enumerate(dev_generator):

            if args.check_debug and idx > 1:
                break

            raw_text = batch[-1]
            batch = [_.to(device) for _ in batch[:-1]]
            X_ids, Y_ids, _, _ = batch

            with torch.no_grad():
                logits, _ = task_model(X_ids, Y_ids)
                logits = torch.argmax(logits, dim=-1)
                correct += logits.eq(Y_ids).sum()

        acc = correct.item() / dev_generator.total_data_size

        if acc > best_acc:
            best_acc = acc
            best_epoch = e

        logger.info(
            f'epoch {e} - acc: {acc} - best_acc: {best_acc} - best_epoch: {best_epoch}'
        )

    return best_acc
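vae_loss_func is referenced but not shown. A hedged sketch of a masked VAE objective with the same return signature (total loss, then its MSE and KL components), assuming V_ids are continuous input vectors and Mask marks the valid positions:

import torch
import torch.nn.functional as F

def vae_loss_func(x, recon, mu, logvar, beta, mask):
    # Hedged sketch: masked MSE reconstruction plus a beta-weighted KL term.
    mask = mask.unsqueeze(-1).float()
    mse = F.mse_loss(recon * mask, x.float() * mask, reduction='mean')
    kld = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return mse + beta * kld, mse, kld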
Example #15
                X_ids, num_inference_samples).double(),
                                            non_blocking=True)

    with torch.no_grad():
        candidate_batch = batchbald.get_batchbald_batch(
            logits_N_K_C.exp_(),
            acquisition_batch_size,
            num_samples,
            dtype=torch.double,
            device=training_args.device)

    targets = get_targets(active_learning_data.pool_dataset)
    dataset_indices = active_learning_data.get_dataset_indices(
        candidate_batch.indices)

    logger.info(f"Dataset indices: {dataset_indices}")
    logger.info(f"Scores: {candidate_batch.scores}")
    logger.info(f"Labels: {targets[candidate_batch.indices]}")
    logger.info(
        f"Labels name: {[intent_labels[idx] for idx in targets[candidate_batch.indices].detach().cpu().numpy()]}"
    )
    logger.info('Metric: ')
    logger.info(metric)

    active_learning_data.acquire(candidate_batch.indices)
    added_indices.append(dataset_indices)
    pbar.update(len(dataset_indices))

    record.append({
        'added indices':
        dataset_indices,
Example #16
def binary_train(model, train_loader, optimizer, device, epoch, total_epochs):
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0

    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        pred = torch.zeros(xlabel.shape[0]).long().to(device)

        category_pos = data['category_possible']
        category_oneh = data['category_onehot']
        category = data['category']

        x = x.to(device)
        xlabel = xlabel.to(device)
        category = category.to(device)
        category_pos = category_pos.to(device)
        category_oneh = category_oneh.to(device)

        optimizer.zero_grad()  # think of step() and zero_grad() as a pair

        out = model(x,category_oneh, category if model.cat_embed else None)
        b_out, class_out, unclass_idx, class_idx = out
        
        if class_idx.shape[0] > 0:
            pred[class_idx] = torch.argmax(class_out[class_idx], dim=-1)
        pred[unclass_idx] = 4

        binary_label = (xlabel[unclass_idx] == 4).float()
        class_label = xlabel[class_idx]
        falpos_idx = (class_label == 4).nonzero().squeeze(1)
        trupos_idx = (class_label < 4).nonzero().squeeze(1)

        binary_label = torch.cat([binary_label, torch.ones(falpos_idx.shape[0]).to(device)])
        b_out = torch.cat([b_out[unclass_idx], b_out[falpos_idx]])

        class_out = class_out[trupos_idx]
        class_label = class_label[trupos_idx]

        bin_loss = model.criterion_1(b_out, binary_label)
        class_loss = model.criterion_2(class_out, class_label) 
        loss = bin_loss + class_loss

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        total_loss += loss.item()

        # category_pred = torch.argmax(logit*category_pos, dim=-1)
        # category_correct += torch.sum(category_pred == xlabel).item()

        correct += torch.sum(pred == xlabel).item()
        num_data += xlabel.size(0)

        if i % 100 == 0:  # print every 100 mini-batches
            logger.info("epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec \t binary_loss {:.4f} \t class_loss {:.4f}".format(epoch+1, total_epochs, i,
                                                                                              len(train_loader),
                                                                                              running_loss / 2000,
                                                                                              time.time() - start,
                                                                                              bin_loss,
                                                                                              class_loss))
            running_loss = 0.0

    logger.info(
        '[{}/{}]\tloss: {:.4f}\tacc: {:.4f}'.format(epoch+1, total_epochs, total_loss / (i + 1), correct / num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), correct / num_data
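criterion_1 and criterion_2 live on the model and are not defined on this page. Given how they are called here and in binary_evaluate (float 0/1 targets for the binary head, integer class labels for the 4-way head), a plausible but unconfirmed pairing would be:

import torch.nn as nn

criterion_1 = nn.BCEWithLogitsLoss()  # binary head: "does this sample belong to class 4?"
criterion_2 = nn.CrossEntropyLoss()   # 4-way head over classes 0..3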
Example #17
def train_main(p):

    in_file = Path(data_dir) / f'labeled_{p}.json' if isinstance(
        p, int) else Path(common_data_path) / 'intent_data' / p

    ###############################################
    # args
    ###############################################
    @dataclass
    class ModelArguments:
        model_path_or_name: str = field(default=str(bert_model_path))
        # model_path_or_name: str = field(default=str(roberta_model_path))
        # model_path_or_name: str = field(default=str(Path(data_dir)/'checkpoints'/'checkpoint-6000'))

    @dataclass
    class DataTrainingArguments:
        max_seq_length: int = field(default=200)

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    global_step = 0

    ###############################################
    # distant debug
    ###############################################
    if training_args.server_ip and training_args.server_port:
        import ptvsd
        print('Waiting for debugger attach')
        ptvsd.enable_attach(address='')

    ###############################################
    # model
    ###############################################
    num_labels = len(intent_labels)
    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path=model_args.model_path_or_name,
        num_labels=num_labels)
    model = BertForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_args.model_path_or_name,
        config=config,
        num_labels=num_labels)

    ###############################################
    # data process
    ###############################################
    train = [(_['text'], _['label']) for _ in json.load(in_file.open())]
    dev = [(_['text'], _['label'])
           for _ in json.load((Path(common_data_path) / 'intent_data' /
                               'dev_data.json').open())]

    vocabulary = load_vocab()
    # vocabulary = load_vocab(vocab_file=(Path(roberta_model_path) / 'vocab.txt'))

    train_loader = DataGenerator(train,
                                 training_args,
                                 data_args,
                                 vocabulary,
                                 intent_labels,
                                 shuffle=True)
    dev_loader = DataGenerator(dev, training_args, data_args, vocabulary,
                               intent_labels)

    ###############################################
    # optimizer
    ###############################################
    def get_optimizer(num_training_steps):
        no_decay = ['bias', 'LayerNorm.weight']
        optimize_group_params = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            training_args.weight_decay
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0
        }]

        optimizer = AdamW(optimize_group_params,
                          lr=training_args.learning_rate,
                          weight_decay=training_args.weight_decay)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=training_args.warmup_steps,
            num_training_steps=num_training_steps)

        return optimizer, scheduler

    optimizer, scheduler = get_optimizer(num_training_steps=len(train_loader) *
                                         training_args.epoch_num /
                                         training_args.batch_size)

    ###############################################
    # continue training from checkpoints
    ###############################################
    if ('checkpoint' in model_args.model_path_or_name and os.path.isfile(
            os.path.join(model_args.model_path_or_name, 'optimizer.pt'))
            and os.path.isfile(
                os.path.join(model_args.model_path_or_name, 'scheduler.pt'))):
        optimizer.load_state_dict(
            torch.load(
                os.path.join(model_args.model_path_or_name, "optimizer.pt"),
                map_location='cuda' if torch.cuda.is_available() else 'cpu'))
        scheduler.load_state_dict(
            torch.load(
                os.path.join(model_args.model_path_or_name, "scheduler.pt"),
                map_location='cuda' if torch.cuda.is_available() else 'cpu'))

    epoch_trained = 0
    step_trained_cur_epoch = 0
    if 'checkpoint' in model_args.model_path_or_name:
        global_step = int(
            str(Path(
                model_args.model_path_or_name)).split('-')[-1].split('/')[0])
        epoch_trained = global_step // (
            train_loader.steps // training_args.gradient_accumulation_steps)
        step_trained_cur_epoch = global_step % (
            train_loader.steps // training_args.gradient_accumulation_steps)

        logger.info(
            ' Continuing Training from checkpoint, will skip to saved global_step'
        )
        logger.info(f' Continuing Training from epoch {epoch_trained}')
        logger.info(f' Continuing Training from global step {global_step}')
        logger.info(
            f' Will skip the first {step_trained_cur_epoch} steps in the first epoch'
        )

    ###############################################
    # tensorboard
    ###############################################
    tb_writer = SummaryWriter(log_dir=Path(data_dir) / 'logs')

    def tb_log(logs):
        for k_, v_ in logs.items():
            tb_writer.add_scalar(k_, v_, global_step)

    tb_writer.add_text('args', training_args.to_json_string())
    tb_writer.add_hparams(training_args.to_sanitized_dict(), metric_dict={})

    ###############################################
    # save
    ###############################################
    def save_model(output_dir, model):
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f'Saving model checkpoint to {output_dir}')

        model_to_save = model.module if hasattr(model, 'module') else model

        model_to_save.config.architectures = [
            model_to_save.__class__.__name__
        ]  # what is architectures for?

        output_model_file = os.path.join(output_dir, 'pytorch.bin')
        torch.save(model_to_save.state_dict(), output_model_file)
        logger.info(f'Model weights saved in {output_model_file}')

        output_config_file = os.path.join(output_dir, 'config.json')
        model_to_save.config.to_json_file(output_config_file)
        logger.info(f'Configuration saved in {output_config_file}')

        torch.save(training_args, os.path.join(output_dir,
                                               'training_args.bin'))

    def sorted_checkpoints(checkpoint_prefix="checkpoint", use_mtime=False):
        ordering_and_checkpoint_path = []

        glob_checkpoints = [
            str(x) for x in Path(training_args.output_dir).glob(
                f"{checkpoint_prefix}-*")
        ]

        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append(
                    (os.path.getmtime(path), path))
            else:
                regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
                if regex_match and regex_match.groups():
                    ordering_and_checkpoint_path.append(
                        (int(regex_match.groups()[0]), path))

        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [
            checkpoint[1] for checkpoint in checkpoints_sorted
        ]
        return checkpoints_sorted

    def rotate_checkpoints(use_mtime=False) -> None:
        if training_args.save_total_limit is None or training_args.save_total_limit <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = sorted_checkpoints(use_mtime=use_mtime)
        if len(checkpoints_sorted) <= training_args.save_total_limit:
            return

        number_of_checkpoints_to_delete = max(
            0,
            len(checkpoints_sorted) - training_args.save_total_limit)
        checkpoints_to_be_deleted = checkpoints_sorted[:
                                                       number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info(
                "Deleting older checkpoint [{}] due to args.save_total_limit".
                format(checkpoint))
            shutil.rmtree(checkpoint)

    ###############################################
    # train
    ###############################################
    model.to(training_args.device)
    if training_args.n_gpu > 1:
        model = nn.DataParallel(model)

    best_acc = 0
    best_epoch = 0
    model.zero_grad()
    for e in range(epoch_trained, training_args.epoch_num):
        # for e in range(1):  # debug
        model.train()
        t_loss = 0
        logging_loss = 0
        for step, batch in enumerate(train_loader):

            # if step > 0: break  # debug

            if step_trained_cur_epoch > 0:
                step_trained_cur_epoch -= 1
                continue

            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids, Mask = batch
            if step < 5: logger.info(f'batch_size: {X_ids.size()}')
            loss, logits = model(X_ids, Y_ids, Mask)

            if training_args.n_gpu > 1:
                loss = loss.mean()
            loss.backward()
            t_loss += loss.item()

            if training_args.gradient_accumulation_steps > 1:
                loss = loss / training_args.gradient_accumulation_steps

            if ((step + 1) % training_args.gradient_accumulation_steps == 0
                    or (train_loader.steps <=
                        training_args.gradient_accumulation_steps)
                    and step + 1 == train_loader.steps):
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=training_args.max_gradient_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

                global_step += 1
                epoch = e + (step + 1) / train_loader.steps

                if global_step % training_args.logging_steps == 0:
                    train_logs = {
                        'loss':
                        (t_loss - logging_loss) / training_args.logging_steps,
                        'learning_rate': scheduler.get_lr()[0],
                        'epoch': epoch
                    }

                    logging_loss = t_loss
                    tb_log(train_logs)

                    logger.info(
                        f'epoch: {e} - batch: {step}/{train_loader.steps} - loss: {t_loss / (step + 1): 6f}'
                    )

                # if global_step % training_args.saving_steps == 0:
                #     output_dir = os.path.join(training_args.output_dir, f'checkpoint-{global_step}')
                #
                #     save_model(output_dir, model)
                #     rotate_checkpoints()
                #
                #     torch.save(optimizer.state_dict(), Path(output_dir)/'optimizer.pt')
                #     torch.save(scheduler.state_dict(), Path(output_dir)/'scheduler.pt')
                #     logger.info(f'Saving optimizer and scheduler states to {output_dir}')

        model.eval()
        dev_acc = 0
        eval_loss = 0
        err = []
        cat = defaultdict(lambda: 1e-10)
        for k, batch in enumerate(dev_loader):

            # if k > 0: break  # debug

            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids, Mask = batch
            with torch.no_grad():
                loss, logits = model(X_ids, Y_ids, Mask)
                if training_args.n_gpu > 1:
                    loss = loss.mean()
                eval_loss += loss.item()

            for logit, y_id, t in zip(logits, Y_ids, raw_text):
                logit = logit.detach().cpu().numpy()
                true_label = y_id.detach().cpu().numpy()

                pred_label = np.argmax(logit)

                # metric 1
                if true_label == pred_label:
                    dev_acc += 1
                else:
                    score = max(logit)
                    err.append({
                        'text': t,
                        'pred': intent_labels[pred_label],
                        'true': intent_labels[true_label],
                        'score': f'{score: .4f}'
                    })
                # metric 2
                cat[f'{intent_labels[true_label]}_A'] += int(
                    pred_label == true_label)
                cat[f'{intent_labels[true_label]}_B'] += 1
                cat[f'{intent_labels[pred_label]}_C'] += 1
        acc = dev_acc / len(dev_loader)

        eval_logs = {
            'eval_acc': acc,
            'eval_loss': eval_loss / dev_loader.steps,
        }
        tb_log(eval_logs)

        if acc > best_acc:
            # if acc >= best_acc:  # debug
            best_acc = acc
            best_epoch = e

            # save #
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),
                       Path(data_dir) / f'cls_model_{p}.pt')

            # save #
            json.dump(err, (Path(data_dir) / 'err.json').open('w'),
                      ensure_ascii=False,
                      indent=4)

        logger.info(
            f'epoch: {e} - dev_acc: {acc:.5f} {dev_acc}/{len(dev_loader)} - best_score: {best_acc:.5f} - best_epoch: {best_epoch} '
        )
        for t in intent_labels:
            logger.info(
                f'cat: {t} - '
                f'precision: {cat[t + "_A"] / cat[t + "_C"]:.5f} - '
                f'recall: {cat[t + "_A"] / cat[t + "_B"]:.5f} - '
                f'f1: {2 * cat[t + "_A"] / (cat[t + "_B"] + cat[t + "_C"]):.5f}'
            )

    tb_writer.close()
Example #18
def binary_evaluate(model, test_loader, device):
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    total_loss = 0.0

    label = []
    prediction = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            x = data['image']
            xlabel = data['label']
            pred = torch.zeros(xlabel.shape[0]).long().to(device)

            category_pos = data['category_possible']
            category_oneh = data['category_onehot']
            category = data['category']

            category = category.to(device)
            category_pos = category_pos.to(device)
            category_oneh = category_oneh.to(device)
            x = x.to(device)
            xlabel = xlabel.to(device)

            out = model(x,category_oneh, category if model.cat_embed else None)
            b_out, class_out, unclass_idx, class_idx = out
            
            if class_idx.shape[0] > 0:
                pred[class_idx] = torch.argmax(class_out[class_idx], dim=-1)
            pred[unclass_idx] = 4

            binary_label = (xlabel[unclass_idx] == 4).float()
            class_label = xlabel[class_idx]
            falpos_idx = (class_label == 4).nonzero().squeeze(1)
            trupos_idx = (class_label < 4).nonzero().squeeze(1)

            binary_label = torch.cat([binary_label, torch.ones(falpos_idx.shape[0]).to(device)])
            b_out = torch.cat([b_out[unclass_idx], b_out[falpos_idx]])

            class_out = class_out[trupos_idx]
            class_label = class_label[trupos_idx]

            bin_loss = model.criterion_1(b_out, binary_label)
            class_loss = model.criterion_2(class_out, class_label) 
            loss = bin_loss + class_loss

            # category_pred = torch.argmax(logit*category_pos, dim=-1)
            # category_correct += torch.sum(category_pred == xlabel).item()
            correct += torch.sum(pred == xlabel).item()

            num_data += xlabel.size(0)
            total_loss += loss.item()
            label = label + xlabel.tolist()
            prediction = prediction + pred.detach().cpu().tolist()
        del x, xlabel

    torch.cuda.empty_cache()

    confusion = confusion_matrix(label,prediction)
    confusion_norm = confusion_matrix(label,prediction, normalize='true')
    logger.info(f'\n{confusion}')
    logger.info(f'\n{confusion_norm}')
    
    f1_array = f1_score(label, prediction, average=None)
    
    logger.info(f"f1 score : {f1_array}")
    f1_mean = gmean(f1_array)
    logger.info('validation loss: {loss:.4f}\tvalidation acc: {acc:.4f}\tvalidation F1: {f1:.4f}'
                .format(loss=total_loss / (i + 1), acc=correct / num_data, f1=f1_mean))
    return total_loss / (i + 1), correct / num_data, f1_mean
def train_main(train_loader, vocabulary, id2embeddings):
    ###############################################
    # args
    ###############################################
    @dataclass
    class ModelArguments:
        model_path_or_name: str = field(default=str(bert_model_path))
        # model_path_or_name: str = field(default=str(roberta_model_path))
        # model_path_or_name: str = field(default=str(Path(data_dir)/'checkpoints'/'checkpoint-6000'))

    @dataclass
    class DataTrainingArguments:
        max_seq_length: int = field(default=200)

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    global_step = 0

    ###############################################
    # distant debug
    ###############################################
    if training_args.server_ip and training_args.server_port:
        import ptvsd
        print('Waiting for debugger attach')
        ptvsd.enable_attach(address='')

    ###############################################
    # model
    ###############################################
    id2embeddings = torch.tensor(id2embeddings,
                                 dtype=torch.float).to(training_args.device)

    num_labels = len(intent_labels)
    model = TextCnnMC(id2embeddings, num_labels)

    ###############################################
    # data process
    ###############################################
    dev = [(_['text'], _['label'])
           for _ in json.load((Path(common_data_path) / 'intent_data' /
                               'dev_data.json').open())]

    dev_loader = DataGeneratorW2V(dev, training_args.batch_size, data_args,
                                  vocabulary, intent_labels)

    ###############################################
    # optimizer
    ###############################################
    optimizer = torch.optim.Adam(
        [p for n, p in list(model.named_parameters())], lr=5e-5)

    ###############################################
    # train
    ###############################################
    model.to(training_args.device)
    logger.info(f'gpu num: {training_args.n_gpu}')
    if training_args.n_gpu > 1:
        model = nn.DataParallel(model)

    loss_func = LabelSmoothLoss(num_labels)
    best_acc = 0
    best_epoch = 0
    model.zero_grad()
    for e in range(training_args.epoch_num):
        # for e in range(1):  # debug
        model.train()
        t_loss = 0
        logging_loss = 0
        for step, batch in enumerate(train_loader):

            # if step > 0: break  # debug

            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids = batch
            if step < 1: logger.info(f'batch_size: {X_ids.size()[0]}')
            logits = model(X_ids, 1).squeeze(1)
            loss = loss_func(Y_ids, logits)

            if training_args.n_gpu > 1:
                loss = loss.mean()
            loss.backward()
            t_loss += loss.item()

            if training_args.gradient_accumulation_steps > 1:
                loss = loss / training_args.gradient_accumulation_steps

            if ((step + 1) % training_args.gradient_accumulation_steps == 0
                    or (train_loader.steps <=
                        training_args.gradient_accumulation_steps)
                    and step + 1 == train_loader.steps):
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(),
                    max_norm=training_args.max_gradient_norm)
                optimizer.step()
                model.zero_grad()

                global_step += 1

                # if global_step % training_args.logging_steps == 0:
            logger.info(
                f'epoch: {e} - batch: {step}/{train_loader.steps} - loss: {t_loss / (step + 1): 6f}'
            )

        model.eval()
        dev_acc = 0
        eval_loss = 0
        err = []
        cat = defaultdict(lambda: 1e-10)
        for k, batch in enumerate(dev_loader):

            # if k > 0: break  # debug

            raw_text = batch[-1]
            batch = [_.to(training_args.device) for _ in batch[:-1]]
            X_ids, Y_ids = batch
            with torch.no_grad():
                logits = model(X_ids, 10).squeeze(1)
                logits = torch.logsumexp(logits, dim=1) - math.log(10)
                loss = loss_func(Y_ids, logits)

                if training_args.n_gpu > 1:
                    loss = loss.mean()
                eval_loss += loss.item()

            for logit, y_id, t in zip(logits, Y_ids, raw_text):
                logit = logit.detach().cpu().numpy()
                true_label = y_id.detach().cpu().numpy()

                pred_label = np.argmax(logit)

                # metric 1
                if true_label == pred_label:
                    dev_acc += 1
                else:
                    score = max(logit)
                    err.append({
                        'text': t,
                        'pred': intent_labels[pred_label],
                        'true': intent_labels[true_label],
                        'score': f'{score: .4f}'
                    })
                # metric 2
                cat[f'{intent_labels[true_label]}_A'] += int(
                    pred_label == true_label)
                cat[f'{intent_labels[true_label]}_B'] += 1
                cat[f'{intent_labels[pred_label]}_C'] += 1
        acc = dev_acc / (len(dev_loader) * training_args.batch_size)

        if acc > best_acc:
            # if acc >= best_acc:  # debug
            best_acc = acc
            best_epoch = e

            metric = {'epoch': best_epoch, 'acc': best_acc}

            # save #
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(),
                       Path(data_dir) / f'cls_model.pt')

            # save #
            json.dump(err, (Path(data_dir) / 'err.json').open('w'),
                      ensure_ascii=False,
                      indent=4)

        logger.info(
            f'epoch: {e} - dev_acc: {acc:.5f} {dev_acc}/{len(dev_loader)*training_args.batch_size} - '
            f'best_score: {best_acc:.5f} - best_epoch: {best_epoch} ')
        # for t in intent_labels:
        #     logger.info(f'cat: {t} - '
        #                 f'precision: {cat[t + "_A"] / cat[t + "_C"]:.5f} - '
        #                 f'recall: {cat[t + "_A"] / cat[t + "_B"]:.5f} - '
        #                 f'f1: {2 * cat[t + "_A"] / (cat[t + "_B"] + cat[t + "_C"]):.5f}')

    return metric, model_to_save
def ensemble_training(model, train_loader, optimizer, criterion, device, epoch,
                      total_epochs):
    running_loss = 0.0
    total_loss = 0.0
    correct = 0.0
    category_correct = 0.0
    num_data = 0.0
    cat2correct = 0.0

    ytrues = []
    ypreds = []
    for i, data in enumerate(train_loader):
        start = time.time()
        x = data['image']
        xlabel = data['label']
        category_pos = data['category_possible']
        category_oneh = data['category_onehot']
        category = data['category']
        cat2possible = data['cat2possible']

        cat2possible = cat2possible.to(device)
        category = category.to(device)
        x = x.to(device)
        xlabel = xlabel.to(device)
        category_pos = category_pos.to(device)
        category_oneh = category_oneh.to(device)

        optimizer.zero_grad()  # think of step() and zero_grad() as a pair

        out = model(x, category_oneh, category)
        logit, pred = out
        num_data += xlabel.size(0)

        if model.mode == "xgb":
            ytrues.append(xlabel)
            ypreds.append(logit)
        else:
            if isinstance(criterion, torch.nn.CrossEntropyLoss):
                loss = criterion(logit, xlabel)
            elif isinstance(criterion, LabelSmoothingLoss):
                loss = criterion(logit, xlabel, category_pos)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            total_loss += loss.item()

            # category_pred = torch.argmax(logit*category_pos, dim=-1)
            # category_correct += torch.sum(category_pred == xlabel).item()

            cat2pred = torch.argmax(logit * cat2possible, dim=-1)
            cat2correct += torch.sum(cat2pred == xlabel).item()

            correct += torch.sum(pred == xlabel).item()

            if i % 100 == 0:  # print every 100 mini-batches
                logger.info(
                    "epoch: {}/{} | step: {}/{} | loss: {:.4f} | time: {:.4f} sec"
                    .format(epoch + 1, total_epochs, i, len(train_loader),
                            running_loss / 100,
                            time.time() - start))
                running_loss = 0.0

    if model.mode == "xgb":
        ypreds = torch.cat(ypreds, axis=0)
        ytrues = torch.cat(ytrues, axis=0)

        ypreds = ypreds.detach().cpu().numpy()
        ytrues = ytrues.detach().cpu().numpy()

        model.xgb_classifier.fit(ypreds, ytrues)
        ypreds = model.xgb_classifier.predict(ypreds)

        cat2correct = np.sum((ytrues == ypreds).astype(int))
    elif model.mode == "hard":
        logger.info(
            f"\n###############\nEnsemble {model.num_model} number of models weight = {model.w} \n##############"
        )

    logger.info(
        '[{}/{}]\tloss: {:.4f}\tacc: {:.4f} \tcategory_acc : {:.4f}'.format(
            epoch + 1, total_epochs, total_loss / (i + 1), correct / num_data,
            cat2correct / num_data))
    del x, xlabel
    torch.cuda.empty_cache()
    return total_loss / (i + 1), cat2correct / num_data
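In "xgb" mode, ensemble_training stacks the per-sample logits collected over the epoch and fits them with model.xgb_classifier. A minimal standalone illustration of that stacking step, assuming xgb_classifier is an XGBClassifier:

import numpy as np
from xgboost import XGBClassifier  # assumed implementation of model.xgb_classifier

logits = np.random.rand(32, 5)          # stacked base-model logits for one epoch
labels = np.random.randint(0, 5, 32)    # ground-truth labels
clf = XGBClassifier()
clf.fit(logits, labels)
accuracy = np.mean(clf.predict(logits) == labels)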