Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--test_data_path",
                        default='./data/test_stt.pkl',
                        type=str,
                        help="test data path")
    args_ = parser.parse_args()
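    # Note: pretrained_model_path, args_path, config_path, device, label_list,
    # test() and draw_cm() are module-level names defined elsewhere in the
    # source file; only this function is shown in the excerpt.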
    pretrained = torch.load(pretrained_model_path)
    args = torch.load(args_path)
    args.test_data_path = args_.test_data_path
    args.eval_batch_size = 64

    bert_config = BertConfig(config_path)
    bert_config.num_labels = 7

    model = BertForEmotionClassification(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)
    args.n_gpu = 2
    loss, acc, f1, total_y_hat, cm = test(model, args)
    print("loss : {} \nacc : {} \nf1 : {}".format(loss, acc, f1))

    draw_cm(cm)
    tmp = pd.read_pickle(args.test_data_path)

    # remove duplicates
    tmp = tmp[['Sentence', 'Emotion']].drop_duplicates().reset_index(drop=True)

    tmp['Pred'] = [label_list[i] for i in total_y_hat]
    tmp.to_csv('./result/test_result.csv')
    print("results are saved to result folder")
Example #2
def main():
    parser = argparse.ArgumentParser()
    # parser.add_argument("--test_data_path", default='./data/korean_single_test.csv', type=str,
    #                     help="test data path")
    parser.add_argument("--test_data_path",
                        default='./data/toon_test.csv',
                        type=str,
                        help="test data path")
    args_ = parser.parse_args()
    pretrained = torch.load(pretrained_model_path)
    args = torch.load(args_path)
    args.test_data_path = args_.test_data_path
    args.eval_batch_size = 64

    bert_config = BertConfig(config_path)
    bert_config.num_labels = 7

    model = BertForEmotionClassification(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)

    loss, acc, f1, total_y_hat, cm = test(model, args)
    print("loss : {} \nacc : {} \nf1 : {}".format(loss, acc, f1))

    draw_cm(cm)
    tmp = pd.read_csv(args.test_data_path)
    tmp['Pred'] = [label_list[i] for i in total_y_hat]
    tmp.to_csv('./result/test_result_toon.csv')
    print("results are saved to result folder")
Example #3
def train(args):
    set_seed(args)
    # Set device
    if args.device == 'cuda':
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Set label list for classification
    if args.num_label == 'multi':
        # fear, surprise, anger, sadness, neutral, happiness, disgust
        label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오']
    elif args.num_label == 'binary':
        # positive, negative
        label_list = ['긍정', '부정']
    logger.info('use {} labels for training'.format(len(label_list)))

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use the pretrained BERT model (etri/skt)
        pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)
    
    if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Prefix parameter names with 'bert.' for consistency, e.g.
        # 'encoder.layer.0.attention.self.query.weight'
        # -> 'bert.encoder.layer.0.attention.self.query.weight'
        pretrained = {'bert.' + k: v for k, v in pretrained.items()}

    bert_config = BertConfig(os.path.join(pretrained_path, 'bert_config.json'))
    bert_config.num_labels = len(label_list)
    model = BertForEmotionClassification(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      label_list=label_list,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx, tr_set.cls_idx, tr_set.sep_idx)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           collate_fn=collate_fn)

    dev_set = Datasets(file_path=args.dev_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type,
                       max_len=args.max_len)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=args.eval_batch_size,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=False,
                            collate_fn=collate_fn)
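
    # ClassificationBatchFunction is assumed to pad each sequence up to
    # max_len with pad_idx, add the cls_idx / sep_idx special tokens, and
    # return (input_ids, attention_mask, labels) tensors, matching how each
    # batch is unpacked in the training loop below.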

    # optimizer
    optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate, layerwise_decay=args.layerwise_decay)

    # lr scheduler
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
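    # Illustrative arithmetic: with 1,000 batches per epoch, gradient
    # accumulation of 1, 10 epochs and warmup_percent=0.1, this gives
    # t_total = 10,000 optimizer steps and warmup_steps = 1,000.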
    logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps))
    # Use gradual warmup and linear decay
    scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0)

    # tensorboard setting
    save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps, t_total,
        args.warmup_percent, args.max_len, args.pretrained_type)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    global_step = 0
    
    train_loss, train_acc, train_f1 = 0, 0, 0
    logging_loss, logging_acc, logging_f1 = 0, 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)):
            model.train()
            x_train, mask_train, y_train = map(lambda x: x.to(device), batch)

            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'classification_label': y_train,
            }

            output, loss = model(**inputs)
            y_max = output.max(dim=1)[1]

            cr = classification_report(y_train.tolist(),
                                       y_max.tolist(),
                                       labels=list(range(len(label_list))),
                                       target_names=label_list,
                                       output_dict=True)
            # Get accuracy (micro F1)
            if 'micro avg' not in cr.keys():
                batch_acc = list(cr.items())[len(label_list)][1]
            else:
                # If at least one label is missing from the mini-batch, use the micro average instead
                batch_acc = cr['micro avg']['f1-score']
            # Macro F1
            batch_macro_f1 = cr['macro avg']['f1-score']

            # Scale the loss and running metrics by the number of accumulation
            # steps so that gradients summed over grad_accu micro-batches
            # match those of a single large batch
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
                batch_macro_f1 /= grad_accu

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            train_acc += batch_acc
            train_f1 += batch_macro_f1

            if (global_step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                
                if global_step % args.logging_step == 0:
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    f1_ = (train_f1 - logging_f1) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('macro_f1', {'train': f1_}, global_step)

                    logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format(
                        global_step, t_total, loss_, acc_, f1_
                    ))
                    logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss

                    # Get f1 score for each label
                    f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items()) if i < len(label_list)]
                    f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results])
                    logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format(
                        confusion_matrix(y_train.tolist(), y_max.tolist())))

        # Validation
        val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device)
        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}, val macro f1 : {:.3f}'.format(
            global_step, t_total, val_loss, val_acc, val_macro_f1
        )

        writer.add_scalars('loss', {'val': val_loss}, global_step)
        writer.add_scalars('acc', {'val': val_acc}, global_step)
        writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step)
        logger.info(val_result)
        total_result.append(val_result)

        if val_loss < best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_macro_f1 = val_macro_f1

    # Save results in 'model_saved_finetuning/results.csv'
    results = {
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'val_macro_f1': best_val_macro_f1,
        'save_dir': save_path,
        'pretrained_path': pretrained_path,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result

# print(DIRNAME)

pretrained_model_path = os.path.join(MODEL_ABS_PATH, constant.MODEL_BIN_NAME)
# print(pretrained_model_path)

config_path = os.path.join(DIRNAME, constant.BERT_CONFIG_NAME)
# print(config_path)

pretrained = torch.load(pretrained_model_path, map_location='cpu')
bert_config = BertConfig(config_path)
bert_config.num_labels = 7

model = BertForEmotionClassification(bert_config)
model.load_state_dict(pretrained, strict=False)
model.eval()
softmax = torch.nn.Softmax(dim=1)

tokenizer, vocab = get_pretrained_model('etri')

# '공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오'
# i.e. 'scare', 'surprise', 'angry', 'sad', 'neutral', 'joy', 'disgust'
obj = dict()
emotion = ['scare', 'surprise', 'angry', 'sad', 'neutral', 'joy', 'disgust']


def get_prediction(sentence):
    sentence = Datasets.normalize_string(sentence)
    sentence = tokenizer.tokenize(sentence)