Example #1
        # Check which device is currently in use
        print('current device:', torch.cuda.current_device())
        n_gpu = 1
        params.n_gpu = n_gpu

    # Set the random seed for reproducible experiments
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    params.seed = args.seed
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Set the logger
    utils.set_logger(save=True, log_path=os.path.join(params.params_path, 'train.log'))
    logging.info("Model type: ")
    logging.info("device: {}".format(params.device))

    logging.info('Init pre-trained model...')
    bert_config = BertConfig.from_json_file(os.path.join(params.bert_model_dir, 'bert_config.json'))
    model = BertForTokenClassification(config=bert_config, params=params)
    nezha_utils.torch_init_model(model, os.path.join(params.bert_model_dir, 'pytorch_model.bin'))
    # Save the bert config
    model.to(params.device)
    if params.n_gpu > 1 and args.multi_gpu:
        model = torch.nn.DataParallel(model)
    logging.info('-done')

    # Train and evaluate the model
    logging.info("Starting training for {} epoch(s)".format(args.epoch_num))
    train_and_evaluate(model, params, args.restore_file)
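
# ----------------------------------------------------------------------------
# `utils.set_logger` (used above) is a project helper that is not shown in
# this example. The sketch below is only an assumption of what such a helper
# might do: attach a console handler and, when save=True, also a file handler
# at log_path. It is illustrative, not the project's actual implementation.
# ----------------------------------------------------------------------------
def set_logger(save=False, log_path=None):
    logger = logging.getLogger()  # assumes `import logging` at the top of the file
    logger.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    # Always log to the console
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.addHandler(console)
    # Optionally also write the log to a file
    if save and log_path is not None:
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(fmt)
        logger.addHandler(file_handler)
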
def train(train_iter, test_iter, config):
    """"""
    # Prepare model
    # Prepare model
    # reload weights from restore_file if specified  如果指定就加载已经训练的权重
    if config.pretrainning_model == 'nezha':  # NeZha model
        Bert_config = BertConfig.from_json_file(config.bert_config_file)
        model = BertForTokenClassification(config=Bert_config, params=config)
        nezha_utils.torch_init_model(model, config.bert_file)
    elif config.pretrainning_model == 'albert':
        Bert_config = AlbertConfig.from_pretrained(config.model_path)
        model = BertForTokenClassification.from_pretrained(config.model_path,
                                                           config=Bert_config)
    else:
        Bert_config = RobertaConfig.from_pretrained(config.bert_config_file,
                                                    output_hidden_states=True)
        model = BertForTokenClassification.from_pretrained(
            config=Bert_config,
            params=config,
            pretrained_model_name_or_path=config.model_path)

    Bert_config.output_hidden_states = True  # expose the output of every layer

    model.to(device)
    """多卡训练"""
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Prepare the optimizer for fine-tuning
    # Collect the model's named parameters
    param_optimizer = list(model.named_parameters())
    # Pretrained encoder parameters
    param_pre = [(n, p) for n, p in param_optimizer
                 if 'bert' in n or 'electra' in n]  # NeZha parameters are named 'bert'
    # Task-specific (middle) parameters
    param_middle = [
        (n, p) for n, p in param_optimizer
        if not any([s in n for s in ('bert', 'crf', 'electra',
                                     'albert')]) or 'dym_weight' in n
    ]
    # crf param
    # Weights that are excluded from weight decay
    no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm']
    # Group the parameters
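    # Separate learning rates are used: embed_learning_rate for the pretrained
    # encoder and learning_rate for the remaining task layers; bias/LayerNorm-style
    # weights additionally receive no weight decay.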
    optimizer_grouped_parameters = [
        # Pretrained encoder parameters, with weight decay
        {
            'params': [p for n, p in param_pre if not any(nd in n for nd in no_decay)],
            'weight_decay': config.decay_rate,
            'lr': config.embed_learning_rate
        },
        # Pretrained encoder parameters, without weight decay
        {
            'params': [p for n, p in param_pre if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.embed_learning_rate
        },
        # Middle (task-specific) parameters, with weight decay
        {
            'params': [p for n, p in param_middle if not any(nd in n for nd in no_decay)],
            'weight_decay': config.decay_rate,
            'lr': config.learning_rate
        },
        # Middle (task-specific) parameters, without weight decay
        {
            'params': [p for n, p in param_middle if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': config.learning_rate
        },
    ]
    num_train_optimization_steps = train_iter.num_records // config.gradient_accumulation_steps * config.train_epoch
    optimizer = BertAdam(optimizer_grouped_parameters,
                         warmup=config.warmup_proportion,
                         schedule="warmup_cosine",
                         t_total=num_train_optimization_steps)
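    # Note: assuming BertAdam follows the pytorch_pretrained_bert-style
    # optimizer, "warmup_cosine" increases the learning rate linearly for the
    # first warmup_proportion of t_total steps and then decays it along a
    # cosine curve; each parameter group above supplies its own lr and decay.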
    logger.info("***** Running training *****")
    logger.info("  Batch size = %d", config.batch_size)
    logger.info("  Num epochs = %d", config.train_epoch)
    logger.info("  Learning rate = %f", config.learning_rate)

    cum_step = 0
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(
        os.path.join(config.save_model, "runs_" + str(gpu_id), timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print("Writing to {}\n".format(out_dir))

    draw_step_list = []
    draw_loss_list = []
    for i in range(config.train_epoch):
        model.train()
        for input_ids_list, input_mask_list, segment_ids_list, label_ids_list, tokens_list in tqdm(
                train_iter):
            # Convert the lists to tensors and move them to the device
            loss = model(input_ids=list2ts2device(input_ids_list),
                         token_type_ids=list2ts2device(segment_ids_list),
                         attention_mask=list2ts2device(input_mask_list),
                         labels=list2ts2device(label_ids_list))
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            # Gradient accumulation
            if config.gradient_accumulation_steps > 1:
                loss = loss / config.gradient_accumulation_steps

            if cum_step % 10 == 0:
                # Record a Python float so the computation graph is not kept alive
                draw_step_list.append(cum_step)
                draw_loss_list.append(loss.item())
                if cum_step % 100 == 0:
                    format_str = 'step {}, loss {:.4f} lr {:.5f}'
                    print(
                        format_str.format(cum_step, loss.item(),
                                          config.learning_rate))

            loss.backward()  # Backpropagate to compute gradients
            if (cum_step + 1) % config.gradient_accumulation_steps == 0:
                # performs updates using calculated gradients
                optimizer.step()
                model.zero_grad()
            cum_step += 1
        p, r, f1 = set_test(model, test_iter)
        # lr_scheduler step (learning-rate decay)

        print('dev set : step_{},precision_{}, recall_{}, F1_{}'.format(
            cum_step, p, r, f1))

        # Save the model
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(
            out_dir, 'model_{:.4f}_{:.4f}_{:.4f}_{}.bin'.format(
                p, r, f1, str(cum_step)))
        torch.save(model_to_save, output_model_file)

    with open(Config().processed_data + 'step_loss_data.pickle', 'wb') as mf:
        draw_dict = {'step': draw_step_list, 'loss': draw_loss_list}
        pickle.dump(draw_dict, mf)
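
# ----------------------------------------------------------------------------
# `list2ts2device` (used in the training loop above) is not defined in this
# example. The sketch below is only an assumption based on how it is called:
# turn a Python list into a LongTensor and move it to the training device.
# ----------------------------------------------------------------------------
def list2ts2device(target_list):
    return torch.tensor(target_list, dtype=torch.long).to(device)


# The pickled step/loss history written at the end of train() can later be
# reloaded to plot a loss curve, for example (illustrative, not original code):
#
#     with open(Config().processed_data + 'step_loss_data.pickle', 'rb') as mf:
#         draw_dict = pickle.load(mf)
#     plt.plot(draw_dict['step'], draw_dict['loss'])  # matplotlib.pyplot as plt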