def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_model", default='bert-base-cased', type=str,
                        help="transformers中的模型都可: bert-base-uncased, roberta-base.")
    parser.add_argument("--output_dir",
                        default='output',
                        type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--output_file",
                        # default='output_batch4_gpu4_large_qo_lamda10_fp16.txt',
                        default='output_file.txt',
                        type=str,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--train_file",
                        default='data/sem/ntrain.tsv',
                        type=str)
    parser.add_argument("--test_file",
                        default='data/sem/ntest.tsv',
                        type=str)
    parser.add_argument("--dev_file",
                        default='data/sem/ndev.tsv',
                        type=str)
    parser.add_argument('--n_gpu',
                        type=int, default=2,
                        help='Number of GPUs to use.')
    parser.add_argument("--max_seq_length",
                        default=512,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--train_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=4,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-6,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=50.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",#用uncased无大小写模型时要这个
                        default=True,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=4,  # originally 4
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    # dev set additions
    parser.add_argument("--dev_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for dev.")
    parser.add_argument("--print_step",
                        default=50,
                        type=int,
                        help="多少步进行模型保存以及日志信息写入")
    parser.add_argument("--early_stop", type=int, default=50, help="提前终止,多少次dev acc 不再连续增大,就不再训练")

    parser.add_argument("--label_list",
                        default=["0", "1", "2", "3", "4"],
                        type=list,
                        help="我自己加的类别标签")
    parser.add_argument("--predict_test_file",
                        default='ntest_sg_label.tsv',
                        type=str)
    parser.add_argument("--log_dir",
                        default="log_dir",
                        type=str,
                        help="日志目录,主要用于 tensorboard 分析")


    args = parser.parse_args()
    logger.info(args)
    output_eval_file = os.path.join(args.output_dir, args.output_file)
    os.makedirs(args.output_dir, exist_ok=True)
    os.makedirs(args.log_dir, exist_ok=True)  # do not raise if the directory already exists

    with open(output_eval_file, "w") as writer:
        writer.write("%s\t\n" % args)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = args.n_gpu
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = args.n_gpu
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    # for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)  # set the random seed for all GPUs
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(args.seed)  # disable hash randomization so experiments are reproducible
    def seed_worker(worker_id):
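        # derive each DataLoader worker's seed from the main-process seed so data loading is reproducible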
        worker_seed = torch.initial_seed() % 2 ** 32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # read the data into DataFrames
    df_train = pd.read_csv(args.train_file, sep='\t')
    df_dev = pd.read_csv(args.dev_file, sep='\t')
    df_test = pd.read_csv(args.test_file, sep='\t')

    # Load the pretrained Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model = AutoModelForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,
                                                               output_attentions=False, output_hidden_states=False)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,
    #                                                            output_attentions=False, output_hidden_states=False)


    if args.fp16:
        model.half()

    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    # hack to remove the pooler, which is not used and thus produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'gamma', 'beta']
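    # Note: 'gamma' and 'beta' are the old TF-style names; current transformers models expose these
    # parameters as 'LayerNorm.weight'/'LayerNorm.bias', so LayerNorm parameters are not actually
    # excluded from weight decay by this list.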
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    def encode_fn(text_list):
        all_input_ids = []
        for text in text_list:
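            # Assumption: newer transformers releases deprecate pad_to_max_length=True in favour of
            # padding='max_length' (plus truncation=True); also note max_length is hard-coded to 128
            # here rather than taken from args.max_seq_length.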
            input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=128, return_tensors='pt', pad_to_max_length=True)  # this length should be changed!!!
            all_input_ids.append(input_ids)
        all_input_ids = torch.cat(all_input_ids, dim=0)
        return all_input_ids

    criterion = torch.nn.CrossEntropyLoss()  # note the explicit torch.nn prefix
    criterion = criterion.to(device)

    if args.do_train:
        # Create the data loader
        train_text_values = df_train['sentence'].values
        all_input_ids = encode_fn(train_text_values)
        labels = df_train['label'].values
        labels = torch.tensor(labels - 1)  # subtract 1 so labels start at 0
        train_data = TensorDataset(all_input_ids, labels)
        train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size, shuffle=True, worker_init_fn=seed_worker)

        dev_text_values = df_dev['sentence'].values
        dall_input_ids = encode_fn(dev_text_values)
        dlabels = df_dev['label'].values
        dlabels = torch.tensor(dlabels - 1)  # subtract 1 so labels start at 0
        dev_data = TensorDataset(dall_input_ids, dlabels)
        dev_dataloader = DataLoader(dev_data, batch_size=args.dev_batch_size, worker_init_fn=seed_worker)

        num_train_steps = int(
            len(df_train) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        # create optimizer and learning rate schedule
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)  # set correct_bias=False to reproduce BertAdam's behaviour
        #total_steps = len(train_dataloader) * args.epoch
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(args.warmup_proportion*num_train_steps), num_training_steps=num_train_steps)  # num_warmup_steps is a guess: warmup_proportion * num_train_steps

        logger.info("***** Running training *****transformers")
        logger.info("  Num examples = %d", len(df_train))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        logger.info("***** Running dev *****")
        logger.info("  Num examples = %d", len(df_dev))
        logger.info("  Batch size = %d", args.dev_batch_size)
        with open(output_eval_file, "a") as writer:###
            writer.write("\t\n***** Running training *****transformers\t\n")
            writer.write("  Num examples = %d\t\n" % len(df_train))
            writer.write("  Batch size = %d\t\n" % args.train_batch_size)
            writer.write("  Num steps = %d\t\n" % num_train_steps)
            writer.write("\t\n***** Running dev *****transformers\t\n")
            writer.write("  Num examples = %d\t\n" % len(df_dev))
            writer.write("  Batch size = %d\t\n" % args.dev_batch_size)

        global_step = 0
        best_acc = 0
        early_stop_times = 0

        writer = SummaryWriter(
            log_dir=args.log_dir + '/' + time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime(time.time())))
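        # Note: this SummaryWriter reuses the name `writer`, which the later `with open(...) as writer`
        # block rebinds, so the SummaryWriter handle is never explicitly closed.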

        num_model = 0
        num_bestacc=0
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):

            if early_stop_times >= args.early_stop:
                print('early_stop......')
                break

            print(f'---------------- Epoch: {epoch + 1:02} ----------')

            epoch_loss = 0
            all_preds = np.array([], dtype=int)
            all_labels = np.array([], dtype=int)
            train_steps = 0

            for step, batch in enumerate(tqdm(train_dataloader, ncols=50, desc="Iteration")):  # ncols fixes the progress bar width

                model.train()  # placed here so every batch runs with the model in train() mode

                ## A conventional training loop computes gradients and updates the network once per batch; here gradient accumulation is used instead.
                ## With gradient accumulation, each batch's gradients are computed but not cleared; they keep accumulating, and after a set number of batches the accumulated gradients update the parameters, the gradients are cleared, and the cycle repeats.
                # Gradient accumulation step 1: feed the text and labels, run the forward pass to get predictions, and compute the loss.
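                # Note: the attention mask below is (input_ids > 0), which assumes the padding token id
                # is 0; this holds for BERT vocabularies but not for RoBERTa, whose pad id is 1.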
                out1 = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0] > 0).to(device),
                             labels=batch[1].to(device))
                loss, logits = out1[:2]

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale

                # 2. loss.backward(): back-propagate to compute the current gradients; 2.1 loss regularization
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                train_steps += 1

                # 2.2 back propagation
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()  # back-propagate to compute gradients

                # data collected for plotting and analysis
                epoch_loss += loss.item()
                preds = logits.detach().cpu().numpy()
                outputs = np.argmax(preds, axis=1)
                all_preds = np.append(all_preds, outputs)
                label_ids = batch[1].to('cpu').numpy()
                all_labels = np.append(all_labels, label_ids)

                # 3. Repeat steps 1-2 without clearing the gradients so they accumulate on top of the existing ones, then update the network parameters.
                # After accumulating for gradient_accumulation_steps batches, optimizer.step() updates the parameters from the accumulated gradients and optimizer.zero_grad() clears them for the next round.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)  # alternatively optimizer_grouped_parameters
                    # gradient clipping is no longer built into AdamW; gradient norms above 1 are scaled down to prevent exploding gradients (used only during training, not at test time)
                    optimizer.step()  # update the weight parameters
                    scheduler.step()
                    optimizer.zero_grad()  # reset gradients
                    global_step += 1
                    # added: evaluate on the dev set for tuning
                    if global_step % args.print_step == 0 and global_step != 0:
                        num_model += 1
                        train_loss = epoch_loss / train_steps
                        train_acc, train_report = classifiction_metric(all_preds, all_labels, args.label_list)
                        dev_loss, dev_acc, dev_report, _, _, _ = evaluate(model, dev_dataloader, criterion, device, args.label_list)

                        c = global_step // args.print_step
                        writer.add_scalar("loss/train", train_loss, c)
                        writer.add_scalar("loss/dev", dev_loss, c)

                        writer.add_scalar("micro_f1/train", train_acc, c)##acc/train
                        writer.add_scalar("micro_f1/dev", dev_acc, c)##acc/dev

                        for label in args.label_list:
                            writer.add_scalar(label + "_" + "f1/train", train_report[label]['f1-score'], c)
                            writer.add_scalar(label + "_" + "f1/dev",
                                              dev_report[label]['f1-score'], c)

                        print_list = ['macro', 'weighted']
                        for label in print_list:
                            writer.add_scalar(label + "_avg_" +"f1/train",
                                              train_report[label+' avg']['f1-score'], c)
                            writer.add_scalar(label + "_avg_" + "f1/dev",
                                              dev_report[label+' avg']['f1-score'], c)

                        # keep the model with the best dev accuracy
                        if dev_acc > best_acc:
                            num_bestacc += 1
                            best_acc = dev_acc
                            # Save a trained model
                            model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                            output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin")
                            torch.save(model_to_save.state_dict(), output_model_file)
                            early_stop_times = 0
                        else:
                            early_stop_times += 1

        with open(output_eval_file, "a") as writer:###
            writer.write("\t\n***** Ending dev *****transformers\t\n")
            writer.write("  global_step : %d\t\n" % global_step)
            writer.write("  num_model : %d\t\n" % num_model)
            writer.write("  num_bestacc : %d\t\n" % num_bestacc)

    if args.do_eval:
        # save a labelled prediction file (ntest_label.tsv) via a DataFrame; format: id, text, label, predict_label
        df = pd.DataFrame(columns=['text', 'label', 'predict_label'])
        df['text']=df_test['sentence']

        # Create the test data loader
        test_text_values = df_test['sentence'].values
        tall_input_ids = encode_fn(test_text_values)
        tlabels = df_test['label'].values
        tlabels = torch.tensor(tlabels - 1)  # subtract 1 so labels start at 0
        pred_data = TensorDataset(tall_input_ids,tlabels)
        pred_dataloader = DataLoader(pred_data, batch_size=args.eval_batch_size, worker_init_fn=seed_worker)

        logger.info("***** Running evaluation *****transformers")
        logger.info("  Num examples = %d", len(df_test))
        logger.info("  Batch size = %d", args.eval_batch_size)

        output_eval_file = os.path.join(args.output_dir, "result.txt")
        output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = AutoModelForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,state_dict=model_state_dict,
                                                                   output_attentions=False, output_hidden_states=False)
        # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5,state_dict=model_state_dict,
        #                                                            output_attentions=False, output_hidden_states=False)

        model.to(device)
        logger.info("Start evaluating")

        print("=======================")
        print("test_total...")
        _,eval_accuracy, eval_report, all_logits, all_preds, all_labels = evaluate(model, pred_dataloader,criterion, device, args.label_list)

        df['predict_label'] = all_preds
        df['label'] = all_labels
        ntest_sg_label = os.path.join(args.output_dir, args.predict_test_file)
        df.to_csv(ntest_sg_label, sep='\t')

        eval_macro_f1 = eval_report['macro avg']['f1-score']
        result = {'eval_accuracy': eval_accuracy,'eval_macro_f1':eval_macro_f1}

        with open(output_eval_file, "a") as writer:
            writer.write("***** Running evaluation *****transformers\t\n")
            writer.write("  Num examples = %d\t\n" % df.shape[0])
            writer.write("  Batch size = %d\t\n" % args.eval_batch_size)

            logger.info("***** Eval results *****transformers")
            writer.write("\t\n***** Eval results   %s *****transformers\t\n" % (
                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\t" % (key, str(result[key])))
            writer.write("\t\n")

        np.savetxt(args.output_dir+'/all_logits_transf.txt', all_logits.reshape(-1,5))
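

# Example invocation of main() above (a sketch only; the script filename is a placeholder):
#   python run_classifier.py --bert_model bert-base-cased \
#       --train_file data/sem/ntrain.tsv --dev_file data/sem/ndev.tsv --test_file data/sem/ntest.tsv \
#       --output_dir output --output_file output_file.txt --do_train --do_eval
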
def train(args, config, tokenizer, model):
    """train model"""
    #Load and prepare data
    posterior_dic = pickle.load(
        open(os.path.join(args.posterior_dir, 'posterior-trn.pkl'), 'rb'))
    train_examples = read_examples(os.path.join(args.data_dir, 'ppl-trn.pkl'),
                                   posterior_dic=posterior_dic)
    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  args,
                                                  stage='training')
    all_event_ids = torch.tensor([f.event_ids for f in train_features],
                                 dtype=torch.long)
    all_context_ids = torch.tensor([f.context_ids for f in train_features],
                                   dtype=torch.long)
    all_target_ids = torch.tensor([f.target_ids for f in train_features],
                                  dtype=torch.long)
    all_posterior = torch.tensor([f.posterior for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_event_ids, all_context_ids, all_target_ids,
                               all_posterior)

    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    train_dataloader = cycle(train_dataloader)
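    # cycle() turns the dataloader into an infinite iterator, so training below is driven by a fixed
    # number of steps (args.train_steps) rather than by epochs.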

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.train_steps)

    #Running training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size= %d", args.train_batch_size)
    logger.info("  Batch size (including gradient_accumulation_steps)= %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Num steps = %d",
                args.train_steps * args.gradient_accumulation_steps)

    dev_dataset = {}
    model.train()
    global_step, tr_re_loss, tr_context_loss, tr_reward, tr_clf_loss, nb_tr_examples, nb_tr_steps, best_bleu, eval_flag = 0, 0, 0, 0, 0, 0, 0, 0, True
    bar = tqdm(range(args.train_steps * args.gradient_accumulation_steps),
               total=args.train_steps * args.gradient_accumulation_steps)
    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(args.device) for t in batch)
        event_ids, context_ids, target_ids, posterior = batch

        with torch.no_grad():
            context_ids, context_ids_random = model(context_ids=context_ids,
                                                    posterior=posterior)
        (re_loss, context_loss,
         reward), _, _ = model(event_ids=event_ids,
                               context_ids=context_ids,
                               context_ids_random=context_ids_random,
                               target_ids=target_ids,
                               posterior=posterior)

        # mean() to average on multi-gpu.
        if args.n_gpu > 1:
            re_loss = re_loss.mean()
            context_loss = context_loss.mean()
            reward = reward.mean()

        if args.fp16 and args.loss_scale != 1.0:
            re_loss = re_loss * args.loss_scale
            context_loss = context_loss * args.loss_scale
            reward = reward * args.loss_scale

        if args.gradient_accumulation_steps > 1:
            re_loss = re_loss / args.gradient_accumulation_steps
            context_loss = context_loss / args.gradient_accumulation_steps
            reward = reward / args.gradient_accumulation_steps

        #print loss information
        tr_re_loss += re_loss.item()
        tr_context_loss += context_loss.item()
        tr_reward += reward.item()
        train_re_loss = round(
            tr_re_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1),
            4)
        train_context_loss = round(
            tr_context_loss * args.gradient_accumulation_steps /
            (nb_tr_steps + 1), 4)
        train_reward = round(
            tr_reward * args.gradient_accumulation_steps / (nb_tr_steps + 1),
            4)
        bar.set_description("re_loss {}, context_loss {}, reward {}".format(
            train_re_loss, train_context_loss, train_reward))
        nb_tr_examples += event_ids.size(0)
        nb_tr_steps += 1

        #backward
        loss = re_loss + context_loss
        if args.fp16:
            optimizer.backward(loss)
        else:
            loss.backward()

        #update parameter
        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
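                # legacy fp16 branch: warmup_linear is assumed to be defined elsewhere in the original
                # script; the get_linear_schedule_with_warmup scheduler below already handles warmup.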
                lr_this_step = args.learning_rate * warmup_linear.get_lr(
                    global_step, args.warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            eval_flag = True

        #Running evaluation
        if ((global_step + 1) % args.eval_steps == 0) and eval_flag:
            tr_re_loss, tr_context_loss, tr_reward, tr_clf_loss, nb_tr_examples, nb_tr_steps, eval_flag = 0, 0, 0, 0, 0, 0, False
            result = evaluate(args,
                              config,
                              tokenizer,
                              model,
                              os.path.join(args.data_dir, 'ppl-dev.pkl'),
                              num_sample=10000)
            if 'event2mind' in args.data_dir:
                category = ["<oReact>", "<xIntent>", "<xReact>"]
            else:
                category = [
                    "<oEffect>", "<oReact>", "<oWant>", "<xAttr>", "<xEffect>",
                    "<xIntent>", "<xNeed>", "<xReact>", "<xWant>"
                ]
            overall_bleu = 0
            overall_dist = []
            for c in category:
                bleu, dist = test(args, config, tokenizer, model,
                                  os.path.join(args.data_dir, 'gen-dev.pkl'),
                                  c, 10, 3000 // len(category))
                result[c + ' (bleu,dist1,dist2)'] = [
                    bleu, dist1(dist), dist2(dist)
                ]
                result[c + ' (bleu,dist1,dist2)'] = ' '.join(
                    [str(x) for x in result[c + ' (bleu,dist1,dist2)']])
                overall_bleu += bleu
                overall_dist += dist
            overall_bleu = round(overall_bleu / len(category), 1)
            result['Overall (bleu-2,dist1,dist2)'] = [
                overall_bleu,
                dist1(overall_dist),
                dist2(overall_dist)
            ]
            result['Overall (bleu-2,dist1,dist2)'] = ' '.join(
                [str(x) for x in result['Overall (bleu-2,dist1,dist2)']])
            result['global_step'] = global_step + 1
            result['train_loss'] = round(train_re_loss, 5)
            logger.info("***** Result *****")
            #print result
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
            logger.info("  " + "*" * 20)

            if overall_bleu >= best_bleu:
                logger.info("  Best bleu:%s", overall_bleu)
                logger.info("  " + "*" * 20)
                best_bleu = overall_bleu
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
    # ------------------ Determine CUDA mode ----------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()  # multi-GPU
        # n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1

    model.to(device)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    # --------------------- Optimizer -------------------------

    t_total = num_train_steps

    # Prepare optimizer and scheduler (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_step, num_training_steps=t_total
    )

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # --------------------- Model initialization ----------------------
    model.zero_grad()
    set_seed(args)

    global_train_loss, global_eval_loss = [], []
    train_acc_obj_class_word, train_f1_obj_class_word = [], []
    train_acc_express_word, train_f1_express_word = [], []
    eval_acc_obj_class_word, eval_f1_obj_class_word  = [], []
    eval_acc_express_word, eval_f1_express_word = [], []

    history = {
        "train_loss": global_train_loss,
        "eval_loss": global_eval_loss,

        "train_acc_obj_class_word": train_acc_obj_class_word,
        "train_f1_obj_class_word": train_f1_obj_class_word,

        "train_acc_express_word": train_acc_express_word,
        "train_f1_express_word": train_f1_express_word,

        "eval_acc_obj_class_word": eval_acc_obj_class_word,
        "eval_f1_obj_class_word": eval_f1_obj_class_word,

        "eval_acc_express_word": eval_acc_express_word,
        "eval_f1_express_word": eval_f1_express_word
    }

    # ------------------------ Training ------------------------------
    start = time.time()
    best_obj_word_f1 = 0
    global_step = 0

    model.zero_grad()
    set_seed(args)
    for e in range(num_epoch):
        model.train()
        train_obj_predicts, train_obj_labels, train_express_predicts, train_express_labels = [], [], [], []
        loss_epoch = 0
        for step, batch in enumerate(training_iter):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, obj_ids, express_ids, _, start_poses, end_poses = batch
            obj_classify, express_classify, start_logits, end_logits, _, _= model(input_ids, segment_ids, input_mask)
            # predicted object-class words, gold object-class words, predicted expression words, gold expression words, start positions, end positions, predicted start positions, predicted end positions
            train_loss = loss_fn(obj_classify, obj_ids, express_classify, express_ids,
                                 start_poses, end_poses, start_logits, end_logits)

            if n_gpu > 1:
                train_loss = train_loss.mean()
            if args.gradient_accumulation_steps > 1:
                train_loss = train_loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                train_loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            obj_classify = obj_classify.cpu()
            obj_ids = obj_ids.cpu()
            train_obj_acc, train_obj_prf = evaluate(obj_classify, obj_ids)

            express_classify = express_classify.cpu()
            express_ids = express_ids.cpu()
            train_express_acc, train_express_prf = evaluate(express_classify, express_ids)

            start_poses = start_poses.cpu()
            start_logits = start_logits.cpu()
            end_poses = end_poses.cpu()
            end_logits = end_logits.cpu()
            acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos, start_or_end_crt_acc, start_and_end_crt_acc = \
                evaluate_pos(start_poses, start_logits, end_poses, end_logits)
            loss_epoch += train_loss.item()
            pbar.show_process(train_loss.item(), train_obj_acc, train_obj_prf[2], train_express_acc, train_express_prf[2],
                              acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos,
                              start_or_end_crt_acc, start_and_end_crt_acc,
                              time.time() - start, step)
            if global_step % 100 == 0:
                train_obj_predicts.append(obj_classify)
                train_obj_labels.append(obj_ids)
                train_express_predicts.append(express_classify)
                train_express_labels.append(express_ids)

        train_obj_predicted = torch.cat(train_obj_predicts, dim=0).cpu()
        train_obj_labeled = torch.cat(train_obj_labels, dim=0).cpu()
        train_express_predicted = torch.cat(train_express_predicts, dim=0).cpu()
        train_express_labeled = torch.cat(train_express_labels, dim=0).cpu()
        del train_obj_predicts, train_obj_labels, train_express_predicts, train_express_labels

        all_train_obj_acc, all_train_obj_prf = evaluate(train_obj_predicted, train_obj_labeled)
        all_train_express_acc, all_train_express_prf = evaluate(train_express_predicted, train_express_labeled)
        global_train_loss.append(loss_epoch / (step + 1))
        train_acc_obj_class_word.append(all_train_obj_acc)
        train_f1_obj_class_word.append(all_train_obj_prf[2])
        train_acc_express_word.append(all_train_express_acc)
        train_f1_express_word.append(all_train_express_prf[2])
        del all_train_obj_acc, all_train_obj_prf, all_train_express_acc, all_train_express_prf

        # ----------------------- Validation ----------------------------
        count = 0
        eval_obj_predicts, eval_obj_labels, eval_express_predicts, eval_express_labels = [], [], [], []
        eval_start_pos_preds, eval_start_pos_true = [], []
        eval_end_pos_preds, eval_end_pos_true = [], []
        eval_losses = 0

        model.eval()
        with torch.no_grad():
            for step, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, obj_ids, express_ids, _, start_poses, end_poses = batch
                obj_classify, express_classify, start_logits, end_logits, _, _ = model(input_ids, segment_ids, input_mask)
                # predicted object-class words, gold object-class words, predicted expression words, gold expression words, start positions, end positions, predicted start positions, predicted end positions
                eval_loss = loss_fn(obj_classify, obj_ids, express_classify, express_ids,
                                    start_poses, end_poses, start_logits, end_logits)

                eval_losses += eval_loss
                count += 1

                eval_obj_predicts.append(obj_classify)
                eval_obj_labels.append(obj_ids)
                eval_express_predicts.append(express_classify)
                eval_express_labels.append(express_ids)

                eval_start_pos_preds.append(start_logits)
                eval_start_pos_true.append(start_poses)
                eval_end_pos_preds.append(end_logits)
                eval_end_pos_true.append(end_poses)

            eval_obj_predicted = torch.cat(eval_obj_predicts, dim=0).cpu()
            eval_obj_labeled = torch.cat(eval_obj_labels, dim=0).cpu()
            eval_express_predicted = torch.cat(eval_express_predicts, dim=0).cpu()
            eval_express_labeled = torch.cat(eval_express_labels, dim=0).cpu()

            eval_obj_acc, eval_obj_prf = evaluate(eval_obj_predicted, eval_obj_labeled)
            eval_express_acc, eval_express_prf = evaluate(eval_express_predicted, eval_express_labeled)

            eval_acc_obj_class_word.append(eval_obj_acc)
            eval_f1_obj_class_word.append(eval_obj_prf[2])
            eval_acc_express_word.append(eval_express_acc)
            eval_f1_express_word.append(eval_express_prf[2])

            eval_start_pos_preds = torch.cat(eval_start_pos_preds, dim=0).cpu()
            eval_start_pos_true = torch.cat(eval_start_pos_true, dim=0).cpu()
            eval_end_pos_preds = torch.cat(eval_end_pos_preds, dim=0).cpu()
            eval_end_pos_true = torch.cat(eval_end_pos_true, dim=0).cpu()

            acc_start_pos, f1_start_pos, acc_end_pos, f1_end_pos, start_or_end_crt_acc, start_and_end_crt_acc = \
                evaluate_pos(eval_start_pos_true, eval_start_pos_preds, eval_end_pos_true, eval_end_pos_preds)

            avg_eval_loss = eval_losses.item() / count
            global_eval_loss.append(avg_eval_loss)
            logger.info(f"""\nEpoch {e + 1}/{num_epoch} - eval_loss: {avg_eval_loss:.4f} - 
                        eval_obj_acc: {eval_obj_acc:.4f} eval_obj_f1:{eval_obj_prf[2]:.4f} - 
                        eval_express_acc: {eval_express_acc:.4f} eval_express_f1: {eval_express_prf[2]:.4f} - 
                        eval_acc_start_pos: {acc_start_pos:.4f} eval_f1_start_pos: {f1_start_pos:.4f} - 
                        eval_acc_end_pos: {acc_end_pos:.4f} eval_f1_end_pos: {f1_end_pos:.4f} - 
                        any_crt_acc: {start_or_end_crt_acc:.4f} all_crt_acc {start_and_end_crt_acc:.4f}\n""")
            # save the best model
            if eval_obj_prf[2] > best_obj_word_f1:
                best_obj_word_f1 = eval_obj_prf[2]
                save_model(model, optimizer, scheduler, args.output_dir)

    #         if e % verbose == 0:
    #             train_losses.append(train_loss.item())
    #             train_f1.append(best_train_f1)
    #             eval_losses.append(eval_loss.item() / count)
    #             eval_f1.append(_eval_f1)
    logger.info(f"best object class word: {best_obj_word_f1:.4f}")
    loss_acc_f1_plot(history, path=os.path.join(args.output_dir, "loss_acc_f1_plot.png"))
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--pregenerated_data", type=Path, required=True)
    parser.add_argument("--teacher_model",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--student_model",
                        default=None,
                        type=str,
                        required=True)
    parser.add_argument("--output_dir", default=None, type=str, required=True)

    # Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")

    parser.add_argument(
        "--reduce_memory",
        action="store_true",
        help=
        "Store training data as on-disc memmaps to massively reduce memory usage"
    )
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--continue_train',
                        action='store_true',
                        help='Whether to train from checkpoints')

    # Additional arguments
    parser.add_argument('--eval_step', type=int, default=1000)

    args = parser.parse_args()
    logger.info('args:{}'.format(args))

    samples_per_epoch = []
    for i in range(int(args.num_train_epochs)):
        epoch_file = args.pregenerated_data / "epoch_{}.json".format(i)
        metrics_file = args.pregenerated_data / "epoch_{}_metrics.json".format(
            i)
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                "Warning! There are fewer epochs of pregenerated data ({}) than training epochs ({})."
                .format(i, args.num_train_epochs))
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.num_train_epochs

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.teacher_model,
                                              do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(int(args.num_train_epochs)):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples /
                                       args.train_batch_size /
                                       args.gradient_accumulation_steps)
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    if args.continue_train:
        student_model = TinyBertForPreTraining.from_pretrained(
            args.student_model)
    else:
        student_model = TinyBertForPreTraining.from_scratch(args.student_model)
    teacher_config = BertConfig.from_pretrained(args.teacher_model)
    teacher_config.output_hidden_states = True
    teacher_config.output_attentions = True
    teacher_model = BertModel.from_pretrained(args.teacher_model,
                                              config=teacher_config)

    # student_model = TinyBertForPreTraining.from_scratch(args.student_model, fit_size=teacher_model.config.hidden_size)
    student_model.to(device)
    teacher_model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        teacher_model = DDP(teacher_model)
    elif n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)
        teacher_model = torch.nn.DataParallel(teacher_model)

    size = 0
    for n, p in student_model.named_parameters():
        logger.info('n: {}'.format(n))
        logger.info('p: {}'.format(p.nelement()))
        size += p.nelement()

    logger.info('Total parameters: {}'.format(size))

    # Prepare optimizer
    param_optimizer = list(student_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    loss_mse = MSELoss()

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_train_optimization_steps *
                             args.warmup_proportion),
        num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info("  Num examples = {}".format(total_train_examples))
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        epoch_dataset = PregeneratedDataset(
            epoch=epoch,
            training_path=args.pregenerated_data,
            tokenizer=tokenizer,
            num_data_epochs=num_data_epochs,
            reduce_memory=args.reduce_memory)
        if args.local_rank == -1:
            train_sampler = RandomSampler(epoch_dataset)
        else:
            train_sampler = DistributedSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        tr_loss = 0.
        tr_att_loss = 0.
        tr_rep_loss = 0.
        student_model.train()
        nb_tr_examples, nb_tr_steps = 0, 0
        with tqdm(total=len(train_dataloader),
                  desc="Epoch {}".format(epoch)) as pbar:
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", ascii=True)):
                batch = tuple(t.to(device) for t in batch)

                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                if input_ids.size()[0] != args.train_batch_size:
                    continue

                att_loss = 0.
                rep_loss = 0.

                student_atts, student_reps = student_model(
                    input_ids, segment_ids, input_mask)
                teacher_outputs = teacher_model(input_ids=input_ids,
                                                attention_mask=input_mask,
                                                token_type_ids=segment_ids)
                teacher_reps = teacher_outputs[2]
                teacher_atts = teacher_outputs[3]

                teacher_reps = [
                    teacher_rep.detach() for teacher_rep in teacher_reps
                ]  # speedup 1.5x
                teacher_atts = [
                    teacher_att.detach() for teacher_att in teacher_atts
                ]

                teacher_layer_num = len(teacher_atts)
                student_layer_num = len(student_atts)
                assert teacher_layer_num % student_layer_num == 0
                layers_per_block = int(teacher_layer_num / student_layer_num)
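                # e.g. a 12-layer teacher distilled into a 4-layer student gives layers_per_block = 3:
                # the student's attention maps are matched to teacher layers 3, 6, 9, 12, and its hidden
                # states (including the embedding output) to teacher hidden states 0, 3, 6, 9, 12.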
                new_teacher_atts = [
                    teacher_atts[i * layers_per_block + layers_per_block - 1]
                    for i in range(student_layer_num)
                ]

                for student_att, teacher_att in zip(student_atts,
                                                    new_teacher_atts):
                    student_att = torch.where(
                        student_att <= -1e2,
                        torch.zeros_like(student_att).to(device), student_att)
                    teacher_att = torch.where(
                        teacher_att <= -1e2,
                        torch.zeros_like(teacher_att).to(device), teacher_att)
                    att_loss += loss_mse(student_att, teacher_att)

                new_teacher_reps = [
                    teacher_reps[i * layers_per_block]
                    for i in range(student_layer_num + 1)
                ]
                new_student_reps = student_reps

                for student_rep, teacher_rep in zip(new_student_reps,
                                                    new_teacher_reps):
                    rep_loss += loss_mse(student_rep, teacher_rep)

                loss = att_loss + rep_loss

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_att_loss += att_loss.item()
                tr_rep_loss += rep_loss.item()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)

                mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                mean_att_loss = tr_att_loss * args.gradient_accumulation_steps / nb_tr_steps
                mean_rep_loss = tr_rep_loss * args.gradient_accumulation_steps / nb_tr_steps

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if (global_step + 1) % args.eval_step == 0:
                        result = {}
                        result['global_step'] = global_step
                        result['loss'] = mean_loss
                        result['att_loss'] = mean_att_loss
                        result['rep_loss'] = mean_rep_loss
                        output_eval_file = os.path.join(
                            args.output_dir, "log.txt")
                        with open(output_eval_file, "a") as writer:
                            logger.info("***** Eval results *****")
                            for key in sorted(result.keys()):
                                logger.info("  %s = %s", key, str(result[key]))
                                writer.write("%s = %s\n" %
                                             (key, str(result[key])))

                        # Save a trained model
                        model_name = "step_{}_{}".format(
                            global_step, "pytorch_model.bin")
                        logging.info(
                            "** ** * Saving fine-tuned model ** ** * ")
                        # Only save the model itself
                        model_to_save = student_model.module if hasattr(
                            student_model, 'module') else student_model

                        output_model_file = os.path.join(
                            args.output_dir, model_name)
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

            model_name = "step_{}_{}".format(global_step, "pytorch_model.bin")
            logging.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = student_model.module if hasattr(
                student_model, 'module') else student_model

            output_model_file = os.path.join(args.output_dir, model_name)
            output_config_file = os.path.join(args.output_dir, "config.json")

            torch.save(model_to_save.state_dict(), output_model_file)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)
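
Each checkpoint above is written as three separate artifacts: the raw state_dict, config.json, and the tokenizer vocabulary. A minimal sketch of reloading one of them, assuming the saved student is a standard transformers BERT encoder (the real student class and the checkpoint name below are hypothetical):

import torch
from transformers import BertConfig, BertModel, BertTokenizer

# rebuild the architecture from the saved config, then restore the weights
config = BertConfig.from_json_file("output/config.json")
student = BertModel(config)
state_dict = torch.load("output/step_1000_pytorch_model.bin", map_location="cpu")  # hypothetical step
student.load_state_dict(state_dict, strict=False)  # strict=False tolerates distillation-only heads
tokenizer = BertTokenizer.from_pretrained("output")  # reads the saved vocab.txt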
Example n. 5
def train(args, config, tokenizer, model):
    """train model"""
    #Load and prepare data
    train_examples = read_examples(os.path.join(args.data_dir, 'gen-trn.pkl'))
    prior_dic = pickle.load(
        open(os.path.join(args.prior_distribution_dir, 'prior-trn.pkl'), 'rb'))
    train_features = convert_examples_to_features(train_examples,
                                                  tokenizer,
                                                  args,
                                                  stage='training',
                                                  prior_dic=prior_dic)
    all_event_ids = torch.tensor([f.event_ids for f in train_features],
                                 dtype=torch.long)
    all_prior = torch.tensor([f.prior for f in train_features],
                             dtype=torch.long)
    train_data = TensorDataset(all_event_ids, all_prior)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
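    # cycle() makes the dataloader an infinite iterator, so training below runs
    # for a fixed number of steps (args.train_steps) rather than by epoch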
    train_dataloader = cycle(train_dataloader)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove the pooler, which is not used and thus produces a None
    # grad that breaks apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
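    # parameters whose names match no_decay (biases and LayerNorm weights) use
    # zero weight decay; all others use args.weight_decay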
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.train_steps)

    #Running training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size= %d", args.train_batch_size)
    logger.info("  Batch size (including gradient_accumulation_steps)= %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Num steps = %d",
                args.train_steps * args.gradient_accumulation_steps)

    dev_dataset = {}
    model.train()
    global_step, tr_loss, nb_tr_examples, nb_tr_steps, best_loss, eval_flag = 0, 0, 0, 0, 1e4, True
    bar = tqdm(range(args.train_steps * args.gradient_accumulation_steps),
               total=args.train_steps * args.gradient_accumulation_steps)
    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(args.device) for t in batch)
        event_ids, prior = batch
        loss = model(event_ids=event_ids, prior=prior)

        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu.
        if args.fp16 and args.loss_scale != 1.0:
            loss = loss * args.loss_scale
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        #print loss information
        tr_loss += loss.item()
        train_loss = round(
            tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += event_ids.size(0)
        nb_tr_steps += 1

        #backward
        if args.fp16:
            optimizer.backward(loss)
        else:
            loss.backward()

        # update parameters
        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            # the linear-warmup scheduler already adjusts the learning rate on
            # every update, so no manual fp16 lr override is needed here
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            eval_flag = True

        #Running evaluation
        if ((global_step + 1) % args.eval_steps == 0) and eval_flag:
            tr_loss, nb_tr_examples, nb_tr_steps, eval_flag = 0, 0, 0, False
            prior_dic = pickle.load(
                open(
                    os.path.join(args.prior_distribution_dir, 'prior-dev.pkl'),
                    'rb'))
            result = test(args,
                          config,
                          tokenizer,
                          model,
                          os.path.join(args.data_dir, 'gen-dev.pkl'),
                          prior_dic=prior_dic)
            result['global_step'] = global_step + 1
            result['train_loss'] = round(train_loss, 5)
            #print result
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
            logger.info("  " + "*" * 20)
            if result['eval_loss'] < best_loss:
                logger.info("  Best loss:%s", round(result['eval_loss'], 5))
                logger.info("  " + "*" * 20)
                best_loss = result['eval_loss']
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
Example n. 6
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)
    json.dump(args.__dict__,
              open(
                  os.path.join(args.output_dir,
                               'opt_{}.json'.format(args.task_name)), 'w'),
              sort_keys=True,
              indent=2)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)

    amp_handle = None
    if args.fp16:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)

    # Prepare model
    if (args.model_recover_path is None) or len(args.model_recover_path) == 0:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    else:
        if not os.path.exists(args.model_recover_path):
            logger.info("Path does not exist: {0}".format(
                args.model_recover_path))
            sys.exit(0)
        logger.info("***** Recover model: {0} *****".format(
            args.model_recover_path))
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=torch.load(args.model_recover_path),
            num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    # note: args.train_batch_size has already been divided by
    # args.gradient_accumulation_steps above
    if args.do_train:
        t_total = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
    else:
        t_total = 1
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      correct_bias=False)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

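        # loss_scale == 0 enables dynamic loss scaling; any other value is used
        # as a fixed (static) scale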
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

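    # STS-B is a regression task, so labels are floats (half precision under
    # fp16); classification tasks use integer label ids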
    if args.task_name == 'sts-b':
        if args.fp16:
            lbl_type = torch.half
        else:
            lbl_type = torch.float
    else:
        lbl_type = torch.long

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", t_total)
        train_data = convert_features_to_dataset(train_features, lbl_type)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        best_result = 0.0

        for i_epoch in trange(1, int(args.num_train_epochs) + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
            for step, batch in enumerate(iter_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids,
                                attention_mask=input_mask,
                                token_type_ids=segment_ids,
                                labels=label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                    if amp_handle:
                        amp_handle._clear_cache()
                else:
                    loss.backward()

                tr_loss += loss.item()
                iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Perform validation
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            eval_segment = processor.get_dev_segments()[0]
            logger.info("***** Running evaluation: {0}-{1} *****".format(
                eval_segment, i_epoch))
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)

                eval_loss += tmp_eval_loss.mean().item()

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
            if eval_result > best_result:
                best_result = eval_result
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    args.output_dir, "{0}.pt".format(args.task_name))
                torch.save(model_to_save.state_dict(), output_model_file)
                logger.info(
                    "  Saved best model to {0}".format(output_model_file))

    # delete unused variables
    del optimizer
    del param_optimizer
    del optimizer_grouped_parameters

    # Load a trained model that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        logger.info("***** CUDA.empty_cache() *****")
        torch.cuda.empty_cache()
        del model

        output_model_file = os.path.join(args.output_dir,
                                         "{0}.pt".format(args.task_name))
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=model_state_dict,
            num_labels=num_labels)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_set_list = []
        for eval_segment in processor.get_dev_segments():
            eval_examples = processor.get_dev_examples(args.data_dir,
                                                       segment=eval_segment)
            eval_set_list.append((eval_segment, eval_examples))
            break

        for eval_segment, eval_examples in eval_set_list:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            logger.info("***** Running evaluation: %s *****", eval_segment)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=segment_ids,
                                    labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)

                eval_loss += tmp_eval_loss.mean().item()

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)
            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
Example n. 7
class BertModel(object):
    def __init__(self, config):
        self.config = config
        self.init_config()
        self.init_random_seeds()
        self.init_bert()

    def init_config(self):
        self.args = munchify(self.config)
        self.pretrained_model = self.args.pretrained_model
        self.device = self.args.device
        self.n_gpu = (len(self.args.gpu_ids.split(",")) if "gpu_ids"
                      in self.config else torch.cuda.device_count())
        if "gpu_ids" in self.config:
            os.environ["CUDA_VISIBLE_DEVICES"] = self.config["gpu_ids"]

    def init_bert(self):
        self.model = BertForSequenceClassification.from_pretrained(
            self.pretrained_model,
            num_labels=self.args.num_labels,
        )
        print_transformer(self.model)
        self.tokenizer = AutoTokenizer.from_pretrained(self.args.tokenizer)
        if self.args.fp16:
            self.model.half()
        self.model.to(self.device)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

    def init_optimizer(self, n_examples):
        num_train_optimization_steps = (
            int(n_examples / self.args.batch_size /
                self.args.gradient_accumulation_steps) * self.args.epochs)
        # Prepare optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        if self.args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer, FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use"
                    " distributed and fp16 training.")

            optimizer = FusedAdam(
                optimizer_grouped_parameters,
                lr=self.args.lr,
                bias_correction=False,
                max_grad_norm=1.0,
            )
            if self.args.loss_scale == 0:
                self.optimizer = FP16_Optimizer(optimizer,
                                                dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(
                    optimizer, static_loss_scale=self.args.loss_scale)
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=int(self.args.warmup_proportion *
                                     num_train_optimization_steps),
                num_training_steps=num_train_optimization_steps,
            )

        else:
            self.optimizer = AdamW(optimizer_grouped_parameters,
                                   lr=self.args.lr,
                                   correct_bias=False)
            self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=int(self.args.warmup_proportion *
                                     num_train_optimization_steps),
                num_training_steps=num_train_optimization_steps,
            )

    def init_random_seeds(self):
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        if self.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.seed)

    def save_pretrained(self, path):
        model_to_save = (self.model.module
                         if hasattr(self.model, "module") else self.model)
        model_to_save.save_pretrained(path)

    @timeit
    def train_an_epoch(self, train_dataloader):
        self.model.train()
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
            batch = tuple(t.to(self.args.device) for t in batch)
            input_ids, input_mask, label_ids = batch
            if self.args.is_multilabel:
                logits = self.model(
                    input_ids,
                    token_type_ids=None,
                    attention_mask=input_mask,
                )[0]
                loss = F.binary_cross_entropy_with_logits(
                    logits, label_ids.float())
            else:
                loss, logits = self.model(
                    input_ids,
                    token_type_ids=None,
                    attention_mask=input_mask,
                    labels=label_ids,
                )
            # loss handling and the backward pass apply to both the multilabel
            # and the single-label branches
            if self.n_gpu > 1:
                loss = loss.mean()
            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps

            if self.args.fp16:
                self.optimizer.backward(loss)
            else:
                loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.tr_loss += loss.item()
            self.nb_tr_steps += 1
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                # the warmup scheduler adjusts the learning rate on every
                # update, so no manual fp16 lr override is needed here
                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                self.iterations += 1

    def train(self, train_set, dev_set):
        self.iterations, self.nb_tr_steps, self.tr_loss = 0, 0, 0
        self.best_valid_metric, self.unimproved_iters = 0, 0
        self.early_stop = False
        if self.args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                .format(self.args.gradient_accumulation_steps))

        self.args.batch_size = (self.args.batch_size //
                                self.args.gradient_accumulation_steps)
        self.init_optimizer(len(train_set))

        train_dataset = convert_df_to_dataset(train_set, self.tokenizer,
                                              self.args.max_seq_length)
        dev_dataset = convert_df_to_dataset(dev_set, self.tokenizer,
                                            self.args.max_seq_length)

        train_dataloader = DataLoader(
            train_dataset,
            sampler=RandomSampler(train_dataset),
            batch_size=self.args.batch_size,
        )
        dev_dataloader = DataLoader(
            dev_dataset,
            sampler=SequentialSampler(dev_dataset),
            batch_size=self.args.batch_size,
        )

        for epoch in trange(int(self.args.epochs), desc="Epoch"):
            self.train_an_epoch(train_dataloader)
            tqdm.write(f"[Epoch {epoch}] loss: {self.tr_loss}")
            self.tr_loss = 0
            eval_result = self.eval(dev_dataloader)
            # Update validation results
            if eval_result[self.args.valid_metric] > self.best_valid_metric:
                self.unimproved_iters = 0
                self.best_valid_metric = eval_result[self.args.valid_metric]
                print_dict_as_table(
                    eval_result,
                    tag=f"[Epoch {epoch}]performance on validation set",
                    columns=["metrics", "values"],
                )
                ensureDir(self.args.model_save_dir)
                self.save_pretrained(self.args.model_save_dir)
            else:
                self.unimproved_iters += 1
                if self.unimproved_iters >= self.args.patience:
                    self.early_stop = True
                    tqdm.write(
                        "Early Stopping. Epoch: {}, best_valid_metric ({}): {}"
                        .format(epoch, self.args.valid_metric,
                                self.best_valid_metric))
                    break

    def test(self, test_set):
        """Get a evaluation result for a test set.

        Args:
            test_set:

        Returns:

        """
        test_dataset = convert_df_to_dataset(test_set, self.tokenizer,
                                             self.args.max_seq_length)

        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=self.args.batch_size,
        )
        return self.eval(test_dataloader)

    def scores(self, test_dataloader):
        """Get predicted label scores for a test_dataloader

        Args:
            test_dataloader:

        Returns:
            ndarray: An array of predicted label scores.

        """
        self.model.eval()
        predicted_labels, target_labels = list(), list()
        for input_ids, input_mask, label_ids in tqdm(test_dataloader,
                                                     desc="Evaluating"):
            input_ids = input_ids.to(self.args.device)
            input_mask = input_mask.to(self.args.device)
            label_ids = label_ids.to(self.args.device)
            with torch.no_grad():
                logits = self.model(input_ids,
                                    token_type_ids=None,
                                    attention_mask=input_mask)[0]
            if self.args.is_multilabel:
                predicted_labels.extend(
                    torch.sigmoid(logits).round().long().cpu().detach().numpy())
            else:
                predicted_labels.extend(
                    torch.argmax(logits, dim=1).cpu().detach().numpy())
            target_labels.extend(label_ids.cpu().detach().numpy())
        return np.array(predicted_labels), np.array(target_labels)

    @timeit
    def eval(self, test_dataloader):
        """Get the evaluation performance of a test_dataloader

        Args:
            test_dataloader:

        Returns:
            dict: A result dict containing result of "accuracy", "precision", "recall"
                and "F1".

        """
        # test loader tensor: input_ids, input_mask, segment_ids, label_ids
        predicted_labels, target_labels = self.scores(test_dataloader)

        if self.args.num_labels > 2:
            accuracy = metrics.accuracy_score(target_labels, predicted_labels)
            macro_precision = metrics.precision_score(target_labels,
                                                      predicted_labels,
                                                      average="macro")
            macro_recall = metrics.recall_score(target_labels,
                                                predicted_labels,
                                                average="macro")
            macro_f1 = metrics.f1_score(target_labels,
                                        predicted_labels,
                                        average="macro")
            micro_precision = metrics.precision_score(target_labels,
                                                      predicted_labels,
                                                      average="micro")
            micro_recall = metrics.recall_score(target_labels,
                                                predicted_labels,
                                                average="micro")
            micro_f1 = metrics.f1_score(target_labels,
                                        predicted_labels,
                                        average="micro")

            return {
                "accuracy": accuracy,
                "macro_precision": macro_precision,
                "macro_recall": macro_recall,
                "macro_f1": macro_f1,
                "micro_precision": micro_precision,
                "micro_recall": micro_recall,
                "micro_f1": micro_f1,
            }

        else:
            accuracy = metrics.accuracy_score(target_labels, predicted_labels)
            precision = metrics.precision_score(target_labels,
                                                predicted_labels,
                                                average="binary")
            recall = metrics.recall_score(target_labels,
                                          predicted_labels,
                                          average="binary")
            f1 = metrics.f1_score(target_labels,
                                  predicted_labels,
                                  average="binary")

            return {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1": f1,
            }

    @timeit
    def predict(self, test_set):
        """

        Args:
            test_set: list of :obj:InputExample

        Returns:
            ndarray: An array of predicted labels.
        """
        test_dataset = convert_df_to_dataset(test_set, self.tokenizer,
                                             self.args.max_seq_length)

        test_dataloader = DataLoader(
            test_dataset,
            sampler=SequentialSampler(test_dataset),
            batch_size=self.args.batch_size,
        )
        return self.scores(test_dataloader)[0]
Example n. 8
class BERTModel(BaseModel):
    def __init__(self, args):
        super().__init__()
        self.estimator = None
        self.label_mapping = None
        self.train_examples = None
        self.num_train_optimization_steps = None
        # Hyperparams
        self.max_seq_length = args.max_seq_length
        self.train_batch_size = args.train_batch_size
        self.eval_batch_size = args.eval_batch_size
        # Initial learning rate for Adam optimizer
        self.learning_rate = args.learning_rate
        self.num_epochs = args.epochs
        # Number of warmup steps for the linear learning rate warmup schedule
        self.warmup_steps = args.warmup_steps
        self.no_cuda = args.no_cuda
        # Number of updates steps to accumulate before performing a backward/update pass.
        self.gradient_accumulation_steps = args.gradient_accumulation_steps
        self.seed = args.seed
        # Use 16 bit float precision (instead of 32bit)
        self.fp16 = args.fp16
        # Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.
        # 0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.
        self.loss_scale = args.loss_scale
        # Meta params
        self.write_test_output = args.write_test_output
        self.output_attentions = args.output_attentions
        self.eval_after_epoch = args.eval_after_epoch
        self.username = args.username
        # model
        self.model_type = args.model_type
        # paths
        self.train_data_path = os.path.join(args.data_path, args.train_data, 'train.tsv')
        self.dev_data_path = os.path.join(args.data_path, args.dev_data, 'dev.tsv')
        self.test_data_path = os.path.join(args.data_path, args.test_data, 'test.tsv')
        self.other_path = args.other_path
        self.default_output_folder = 'output'
        self.output_path = self.generate_output_path(args.output_path)
        self.model_path = os.path.join(self.other_path, 'bert')
        self.all_args = vars(args)

    def generate_output_path(self, output_path):
        if output_path is None:
            output_path = os.path.join(self.default_output_folder, f"{time.strftime('%Y_%m_%d-%H_%M_%S')}-{str(uuid.uuid4())[:4]}-{self.username}")
        return output_path

    def create_dirs(self):
        for _dir in [self.output_path]:
            logger.info(f'Creating directory {_dir}')
            os.makedirs(_dir)

    def train(self):
        # Setup
        self._setup_bert()

        # Prepare optimizer
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        if self.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            self.optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if self.loss_scale == 0:
                self.optimizer = FP16_Optimizer(self.optimizer, dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(self.optimizer, static_loss_scale=self.loss_scale)
        else:
            self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate)
            self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=self.warmup_steps, num_training_steps=self.num_train_optimization_steps)

        # Run training
        global_step = 0
        tr_loss = 0
        train_features = self.convert_examples_to_features(self.train_examples)
        logger.debug("***** Running training *****")
        logger.debug("  Num examples = %d", len(self.train_examples))
        logger.debug("  Batch size = %d", self.train_batch_size)
        logger.debug("  Num steps = %d", self.num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.train_batch_size)
        loss_vs_time = []
        for epoch in range(int(self.num_epochs)):
            self.model.train()
            nb_tr_examples, nb_tr_steps = 0, 0
            epoch_loss = 0
            pbar = tqdm(train_dataloader)
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, logits = self.model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
                if self.n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps
                if self.fp16:
                    self.optimizer.backward(loss)
                else:
                    loss.backward()
                loss = loss.item()
                tr_loss += loss
                epoch_loss += loss
                if step > 0:
                    pbar.set_description("Loss: {:8.4f} | Average loss/it: {:8.4f}".format(loss, epoch_loss/step))
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % self.gradient_accumulation_steps == 0:
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.optimizer.step()
                    if not self.fp16:
                        # no scheduler is created on the fp16/FusedAdam path
                        self.scheduler.step()
                    self.optimizer.zero_grad()
                    global_step += 1
            # evaluate model
            if self.eval_after_epoch:
                self.model.eval()
                nb_train_steps, nb_train_examples = 0, 0
                train_accuracy, train_loss = 0, 0
                for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Evaluating"):
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_ids = label_ids.to(self.device)
                    with torch.no_grad():
                        loss, logits = self.model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
                    train_accuracy += self.accuracy(logits.to('cpu').numpy(), label_ids.to('cpu').numpy())
                    train_loss += loss.mean().item()
                    nb_train_examples += input_ids.size(0)
                    nb_train_steps += 1
                train_loss = train_loss / nb_train_steps
                train_accuracy = 100 * train_accuracy / nb_train_examples
                print("{bar}\nEpoch {}:\nTraining loss: {:8.4f} | Training accuracy: {:.2f}%\n{bar}".format(epoch+1, train_loss, train_accuracy, bar=80*'='))

        # Save model
        model_to_save = self.model.module if hasattr(self.model, 'module') else self.model  # Only save the model it-self
        output_model_file = os.path.join(self.output_path, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(self.output_path, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        args_output_file = os.path.join(self.output_path, 'args.json')
        with open(args_output_file, 'w') as f:
            json.dump(self.all_args, f)


    def test(self):
        # Setup
        self._setup_bert(setup_mode='test')
        # Run test
        eval_examples = self.processor.get_dev_examples(self.dev_data_path)
        eval_features = self.convert_examples_to_features(eval_examples)
        logger.debug("***** Running evaluation *****")
        logger.debug("  Num examples = %d", len(eval_examples))
        logger.debug("  Batch size = %d", self.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.eval_batch_size)
        self.model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        result = {'prediction': [], 'label': [], 'text': []}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            label_ids = label_ids.to(self.device)
            tmp_eval_loss, logits = self.model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            result['prediction'].extend(np.argmax(logits, axis=1).tolist())
            result['label'].extend(label_ids.tolist())
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        label_mapping = self.get_label_mapping()
        result_out = self.performance_metrics(result['label'], result['prediction'], label_mapping=label_mapping)
        if self.write_test_output:
            test_output = self.get_full_test_output(result['prediction'], result['label'], label_mapping=label_mapping,
                    test_data_path=self.dev_data_path)
            result_out = {**result_out, **test_output}
        return result_out

    def save_results(self, results):
        result_path = os.path.join(self.output_path, 'results.json')
        logger.info(f'Writing output results to {result_path}...')
        with open(result_path, 'w') as f:
            json.dump(results, f)

    def predict(self, data):
        """Predict data (list of strings)"""
        # Setup
        self._setup_bert(setup_mode='predict', data=data)
        # Run predict
        predict_examples = self.processor.get_test_examples(data)
        predict_features = self.convert_examples_to_features(predict_examples)
        all_input_ids = torch.tensor([f.input_ids for f in predict_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in predict_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in predict_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in predict_features], dtype=torch.long)
        predict_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        predict_sampler = SequentialSampler(predict_data)
        predict_dataloader = DataLoader(predict_data, sampler=predict_sampler, batch_size=self.eval_batch_size)
        self.model.eval()
        result = []
        for input_ids, input_mask, segment_ids, label_ids in predict_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            output = self.model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids)
            logits = output[0]
            probabilities = torch.nn.functional.softmax(logits, dim=1)
            probabilities = probabilities.detach().cpu().numpy()
            res = self.format_predictions(probabilities, label_mapping=self.label_mapping)
            result.extend(res)
        return result

    def fine_tune(self):
        raise NotImplementedError

    def _setup_bert(self, setup_mode='train', data=None):
        # Create necessary directory structure
        if setup_mode == 'train':
            self.create_dirs()

        # GPU config
        self.device = torch.device("cuda" if torch.cuda.is_available() and not self.no_cuda else "cpu")
        self.n_gpu = torch.cuda.device_count()
        if self.no_cuda:
            self.n_gpu = 0
        if setup_mode == 'train':
            logger.info("Initialize BERT: device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(self.device, self.n_gpu, False, self.fp16))
        if self.gradient_accumulation_steps < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(self.gradient_accumulation_steps))
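        # dividing by gradient_accumulation_steps keeps the effective batch
        # size per optimizer update at the configured value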
        self.train_batch_size = self.train_batch_size // self.gradient_accumulation_steps

        # seed
        random.seed(self.seed)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        if self.n_gpu > 0:
            torch.cuda.manual_seed_all(self.seed)

        # label mapping
        if setup_mode == 'train':
            self.label_mapping = self.set_label_mapping()
        elif setup_mode in ['test', 'predict']:
            self.label_mapping = self.get_label_mapping()

        # Build model
        self.processor = SentimentClassificationProcessor(self.train_data_path, self.label_mapping)
        num_labels = len(self.label_mapping)
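        # uncased checkpoints expect lower-cased input, so infer the flag from
        # the model name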
        self.do_lower_case = 'uncased' in self.model_type
        self.tokenizer = BertTokenizer.from_pretrained(self.model_type, do_lower_case=self.do_lower_case)
        if setup_mode == 'train':
            self.train_examples = self.processor.get_train_examples(self.train_data_path)
            self.num_train_optimization_steps = int(len(self.train_examples) / self.train_batch_size / self.gradient_accumulation_steps) * self.num_epochs

        # Prepare model
        if setup_mode == 'train':
            # if self.fine_tune_path:
            #     logger.info('Loading fine-tuned model {} of type {}...'.format(self.fine_tune_path, self.model_type))
            #     config = BertConfig(os.path.join(self.fine_tune_path, CONFIG_NAME))
            #     weights = torch.load(os.path.join(self.fine_tune_path, WEIGHTS_NAME))
            #     self.model = BertForSequenceClassification.from_pretrained(self.model_type, cache_dir=self.model_path, num_labels=num_labels, state_dict=weights)
            # else:
            #     logger.info('Loading pretrained model {}...'.format(self.model_type))
            #     self.model = BertForSequenceClassification.from_pretrained(self.model_type, cache_dir=self.model_path, num_labels = num_labels)
            self.model = BertForSequenceClassification.from_pretrained(self.model_type, cache_dir=self.model_path, num_labels=num_labels)
            if self.fp16:
                self.model.half()
        else:
            # Load a trained model and config that you have trained
            self.model = BertForSequenceClassification.from_pretrained(self.output_path)
        self.model.to(self.device)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def accuracy(self, out, labels):
        outputs = np.argmax(out, axis=1)
        return np.sum(outputs == labels)

    def convert_examples_to_features(self, examples):
        """Loads a data file into a list of `InputBatch`s."""
        features = []
        for (ex_index, example) in enumerate(examples):
            tokens_a = self.tokenizer.tokenize(str(example.text_a))
            tokens_b = None
            if example.text_b:
                tokens_b = self.tokenizer.tokenize(str(example.text_b))
                # Modifies `tokens_a` and `tokens_b` in place so that the total
                # length is less than the specified length.
                # Account for [CLS], [SEP], [SEP] with "- 3"
                self._truncate_seq_pair(tokens_a, tokens_b, self.max_seq_length - 3)
            else:
                # Account for [CLS] and [SEP] with "- 2"
                if len(tokens_a) > self.max_seq_length - 2:
                    tokens_a = tokens_a[:(self.max_seq_length - 2)]
            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids: 0   0   0   0  0     0 0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
            segment_ids = [0] * len(tokens)
            if tokens_b:
                tokens += tokens_b + ["[SEP]"]
                segment_ids += [1] * (len(tokens_b) + 1)
            input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            # Zero-pad up to the sequence length.
            padding = [0] * (self.max_seq_length - len(input_ids))
            input_ids += padding
            input_mask += padding
            segment_ids += padding
            assert len(input_ids) == self.max_seq_length
            assert len(input_mask) == self.max_seq_length
            assert len(segment_ids) == self.max_seq_length
            label_id = self.label_mapping[example.label]
            if ex_index < 5:
                logger.debug("*** Example ***")
                logger.debug("guid: %s" % (example.guid))
                logger.debug("tokens: %s" % " ".join(
                        [str(x) for x in tokens]))
                logger.debug("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.debug("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.debug(
                        "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
                logger.debug("label: %s (id = %d)" % (example.label, label_id))
            features.append(
                    InputFeatures(input_ids=input_ids,
                                  input_mask=input_mask,
                                  segment_ids=segment_ids,
                                  label_id=label_id))
        return features