Example #1
def main():
    args = get_argparse().parse_args()
    # Model output directory
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # Append the model type to the output path; os.path.join avoids a missing separator
    args.output_dir = os.path.join(args.output_dir, args.model_type)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    # CUDA, GPU
    if torch.cuda.is_available() and not args.no_cuda:
        args.device = torch.device("cuda:0")
    else:
        args.device = torch.device("cpu")
    # Log all arguments
    time_ = time.strftime("%Y-%m-%d", time.localtime())
    init_logger(log_file=args.output_dir + f'/{args.model_type}-{args.task_name}-{time_}.log')
    logger.info("="*20+" args "+"="*20)
    for para in args.__dict__:
        msg = para + " = " + str(args.__dict__[para])
        logger.info(msg)
    # Set seed
    seed_everything(args.seed)
    # Prepare NER task
    processor = NerProcessor()
    label_list = processor.get_labels()
    args.id2label = {i: label for i, label in enumerate(label_list)}
    args.label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    num_labels = int((num_labels + 1) / 2)  # B-/I- tag pairs collapse to one span label each, plus O
    pad_token_label_id = CrossEntropyLoss().ignore_index  # -100

    # Load pretrained model and tokenizer
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                          num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, )    
    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
                                                do_lower_case=args.do_lower_case,
                                                cache_dir=args.cache_dir if args.cache_dir else None, )
    setattr(config, 'soft_label', args.soft_label)
    model = model_class(config=config)
 
    # Training
    if args.do_train:
        if args.continue_train:
            model = model_class.from_pretrained(args.continue_train_checkpoint, config=config)
            print(f"Continue training from {args.continue_train_checkpoint}")
        # Load weights for the base pretrained encoder
        elif args.model_type == "electra":
            model.BaseModel = ElectraModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading ELECTRA from {args.model_name_or_path}...")
        elif args.model_type == "bert":
            model.BaseModel = BertModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading BERT from {args.model_name_or_path}...")
        elif args.model_type == "albert":
            model.BaseModel = AlbertModel.from_pretrained(args.model_name_or_path)
            logger.info(f"Loading ALBERT from {args.model_name_or_path}...")

        print(model)
        model.to(args.device)
        
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='train')
        global_step, lr_loss = train(args, train_dataset, model, tokenizer, label_list, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step, lr_loss)
        # Save the trained model, vocabulary, and training args
        logger.info("Saving model checkpoint to %s", args.output_dir)
        model.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Prediction on the test set
    if args.do_predict:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        logger.info("Predict the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = model_class.from_pretrained(checkpoint, config=config)
            model.to(args.device)
            predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=prefix)
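
main() above leans on a few helpers that are defined elsewhere in the repository (init_logger, NerProcessor, seed_everything). A minimal sketch of what seed_everything plausibly does, assuming its only job is to make runs reproducible; the repository's version may differ:

import os
import random

import numpy as np
import torch

def seed_everything(seed=42):
    # Seed every RNG the training loop touches so runs are repeatable.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op when CUDA is unavailable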
Example #2
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1, collate_fn=collate_fn)
    # Collapse B-/I- labels into plain span labels (e.g. B-PER / I-PER -> PER)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []   # all test outputs
    error_results = []   # outputs with at least one wrong prediction
    true_labels = []   # gold labels
    predict_labels = []   # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix, "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")

    if isinstance(model, torch.nn.DataParallel):  # unwrap the multi-GPU wrapper
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":   # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]

        start_preds = start_logits.detach().cpu().numpy() 
        end_preds = end_logits.detach().cpu().numpy()

        start_preds = np.argmax(start_preds, axis=2)
        end_preds = np.argmax(end_preds, axis=2)

        # batch size is 1: take the first sequence and strip [CLS]/[SEP]
        start_preds_list = [span_map[j] for j in start_preds[0][1:-1]]
        end_preds_list = [span_map[j] for j in end_preds[0][1:-1]]

        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        batch_true_labels = [args.id2label.get(i) for i in batch_true_labels]
        true_labels.append(batch_true_labels)
        
        batch_predict_labels = convert_span_to_bio([start_preds_list], [end_preds_list])
        predict_labels.extend(batch_predict_labels)
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]
        sent = ""

        if_error = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels[0], batch_true_labels):
            sent += " ".join([tokenizer.ids_to_tokens[input_id], lab, pre]) + "\n"
            if lab != pre:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)

    # Test-set metrics: accuracy, precision, recall, f1
    logger.info("\nTest set results:")
    logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)

    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
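
Example #2 turns span-style predictions (a start label and an end label per token) back into BIO tags via convert_span_to_bio, which is not shown. A plausible sketch, assuming a span opens at a start label and closes at the nearest matching end label, with "O" marking non-entity tokens; the repository's implementation may differ:

def convert_span_to_bio(start_preds, end_preds):
    # start_preds / end_preds: per-sentence lists of span labels, e.g.
    # [["PER", "O", "O", "LOC"]] and [["O", "PER", "O", "LOC"]].
    bio_sequences = []
    for starts, ends in zip(start_preds, end_preds):
        bio = ["O"] * len(starts)
        for i, label in enumerate(starts):
            if label == "O":
                continue
            bio[i] = "B-" + label
            # Extend with I- tags up to the nearest matching end label.
            for j in range(i + 1, len(ends)):
                if starts[j] != "O":
                    break  # a new span opens before this one closes
                if ends[j] == label:
                    for k in range(i + 1, j + 1):
                        bio[k] = "I-" + label
                    break
        bio_sequences.append(bio)
    return bio_sequences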
Example #3
def predict(args, model, tokenizer, label_list, pad_token_label_id, prefix=""):
    pred_output_dir = args.output_dir
    if not os.path.exists(pred_output_dir):
        os.makedirs(pred_output_dir)
    test_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           label_list,
                                           pad_token_label_id,
                                           data_type='test')
    # Note that DistributedSampler samples randomly
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=1,
                                 collate_fn=collate_fn)  # one example per batch
    # Eval
    logger.info("***** Running prediction %s *****", prefix)
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", 1)
    results = []  # all test outputs
    error_results = []  # outputs with at least one wrong prediction
    true_labels = []  # gold labels
    predict_labels = []  # predicted labels
    output_predict_file = os.path.join(pred_output_dir, prefix,
                                       "test_prediction.txt")
    error_predict_file = os.path.join(pred_output_dir, prefix,
                                      "Error_test_prediction.txt")
    pbar = ProgressBar(n_total=len(test_dataloader), desc="Predicting")

    if isinstance(model, torch.nn.DataParallel):  # unwrap the multi-GPU wrapper
        model = model.module
    for step, batch in enumerate(test_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": None,
                "input_lens": batch[4]
            }
            if args.model_type != "distilbert":  # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type
                                            in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            logits = outputs[0]
            batch_predict_labels = model.crf.decode(logits,
                                                    inputs["attention_mask"])

        batch_predict_labels = batch_predict_labels[0][
            1:-1]  # batch size is 1: strip [CLS] and [SEP]
        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()[1:-1]
        input_ids = inputs["input_ids"].squeeze(0).cpu().numpy().tolist()[1:-1]

        sent = ""

        if_error = False
        for input_id, pre, lab in zip(input_ids, batch_predict_labels,
                                      batch_true_labels):
            sent += " ".join([
                tokenizer.ids_to_tokens[input_id], args.id2label[lab],
                args.id2label[pre]
            ]) + "\n"
            if args.id2label[lab] != args.id2label[pre]:
                if_error = True
        sent += "\n"
        results.append(sent)
        if if_error:
            error_results.append(sent)
        pbar(step)
        # accumulate gold / predicted sequences for test-set metrics (acc, recall, f1)
        batch_true = [args.id2label.get(i) for i in batch_true_labels]
        batch_predict = [args.id2label.get(i) for i in batch_predict_labels]
        assert len(batch_true) == len(batch_predict)
        true_labels.append(batch_true)
        predict_labels.append(batch_predict)

    logger.info("\n测试集结果统计:")
    logger.info("accuary: %s", str(accuracy_score(true_labels,
                                                  predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(
        str(
            classification_report(true_labels,
                                  predict_labels,
                                  mode='strict',
                                  scheme=IOB2)))
    logger.info("\n")

    with open(output_predict_file, "w", encoding="utf-8") as writer:
        for record in results:
            writer.write(record)

    with open(error_predict_file, "w", encoding="utf-8") as writer:
        for record in error_results:
            writer.write(record)
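
Unlike Example #2, Example #3 decodes with a CRF head (model.crf.decode) instead of a per-token argmax. The model's own CRF class is not shown; as an illustration only, here is how the same decode step looks with the pytorch-crf package (torchcrf.CRF), whose interface the code above resembles:

import torch
from torchcrf import CRF  # pip install pytorch-crf

num_tags = 9  # hypothetical BIO tag count
crf = CRF(num_tags, batch_first=True)

emissions = torch.randn(1, 11, num_tags)     # per-token logits: (batch, seq_len, num_tags)
mask = torch.ones(1, 11, dtype=torch.bool)   # attention mask, True for real tokens
tags = torch.zeros(1, 11, dtype=torch.long)  # gold tag ids (dummy values here)

loss = -crf(emissions, tags, mask=mask)        # negative log-likelihood for training
best_paths = crf.decode(emissions, mask=mask)  # Viterbi decoding -> List[List[int]]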
Example #4
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args, args.task_name, tokenizer, label_list, pad_token_label_id, data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # Collapse B-/I- labels into plain span labels (e.g. B-PER / I-PER -> PER)
    span_labels = []
    for label in label_list:
        label = label.split('-')[-1]
        if label not in span_labels:
            span_labels.append(label)
    span_map = {i: label for i, label in enumerate(span_labels)}

    # Eval
    logger.info("***** Running evaluation %s *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1],
                      "start_positions": batch[5], "end_positions": batch[6]}
            if args.model_type != "distilbert":   # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, start_logits, end_logits = outputs[:3]
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1
        
        start_preds = start_logits.detach().cpu().numpy()  # shape (batch_size, seq_len, num_span_labels), e.g. [64, 128, 5]
        end_preds = end_logits.detach().cpu().numpy()

        start_preds = np.argmax(start_preds, axis=2)  # shape (batch_size, seq_len), e.g. [64, 128]
        end_preds = np.argmax(end_preds, axis=2)

        start_preds_list = []
        end_preds_list = []

        batch_true_labels = batch[4].squeeze(0).cpu().numpy().tolist()
        for index, input_length in enumerate(batch[3]):  # batch[3] holds each sentence's true length
            start_preds_list.append([span_map[j] for j in start_preds[index][:input_length]][1:-1])
            end_preds_list.append([span_map[j] for j in end_preds[index][:input_length]][1:-1])
            batch_true = [args.id2label.get(i) for i in batch_true_labels[index][:input_length]][1:-1]
            true_labels.append(batch_true)
        
        batch_predict_labels = convert_span_to_bio(start_preds_list, end_preds_list)
        predict_labels.extend(batch_predict_labels)

        pbar(step)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss/nb_eval_steps))
    logger.info("accuary: %s", str(accuracy_score(true_labels, predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(str(classification_report(true_labels, predict_labels, mode='strict', scheme=IOB2)))
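
The metric helpers used throughout (accuracy_score, precision_score, recall_score, f1_score, and classification_report with mode='strict' and scheme=IOB2) come from seqeval, which scores entire entities rather than individual tokens and expects nested lists of BIO tags. A quick usage sketch:

from seqeval.metrics import accuracy_score, classification_report, f1_score
from seqeval.scheme import IOB2

y_true = [["B-PER", "I-PER", "O", "B-LOC"]]
y_pred = [["B-PER", "I-PER", "O", "O"]]

print(accuracy_score(y_true, y_pred))  # token-level accuracy: 3/4 = 0.75
print(f1_score(y_true, y_pred))        # entity-level F1: 1 of 2 gold entities found -> ~0.67
print(classification_report(y_true, y_pred, mode='strict', scheme=IOB2))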
Example #5
def evaluate(args, model, tokenizer, label_list, pad_token_label_id):
    eval_output_dir = args.output_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)
    eval_dataset = load_and_cache_examples(args,
                                           args.task_name,
                                           tokenizer,
                                           label_list,
                                           pad_token_label_id,
                                           data_type='dev')
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=collate_fn)
    # Eval
    logger.info("***** Running evaluation %s *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    true_labels = []
    predict_labels = []
    model.eval()
    pbar = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    for step, batch in enumerate(eval_dataloader):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
                "input_lens": batch[4]
            }
            if args.model_type != "distilbert":  # XLM and RoBERTa don"t use segment_ids
                inputs["token_type_ids"] = (batch[2] if args.model_type
                                            in ["bert", "xlnet"] else None)
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]
            batch_predict_labels = model.crf.decode(logits,
                                                    inputs["attention_mask"])
        if args.n_gpu > 1:
            tmp_eval_loss = tmp_eval_loss.mean()  # average the loss across GPUs under DataParallel
        eval_loss += tmp_eval_loss.item()
        nb_eval_steps += 1

        batch_true_labels = batch[3].squeeze(0).cpu().numpy().tolist()
        pbar(step)
        for index, input_length in enumerate(batch[4]):
            batch_true = [
                args.id2label.get(i)
                for i in batch_true_labels[index][:input_length]
            ][1:-1]
            batch_predict = [
                args.id2label.get(i)
                for i in batch_predict_labels[index][:input_length]
            ][1:-1]
            true_labels.append(batch_true)
            predict_labels.append(batch_predict)

    logger.info("\n")
    logger.info("average eval_loss: %s", str(eval_loss / nb_eval_steps))
    logger.info("accuary: %s", str(accuracy_score(true_labels,
                                                  predict_labels)))
    logger.info("p: %s", str(precision_score(true_labels, predict_labels)))
    logger.info("r: %s", str(recall_score(true_labels, predict_labels)))
    logger.info("f1: %s", str(f1_score(true_labels, predict_labels)))
    logger.info("classification report: ")
    logger.info(
        str(
            classification_report(true_labels,
                                  predict_labels,
                                  mode='strict',
                                  scheme=IOB2)))
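
Examples #3 and #5 pass a custom collate_fn that is not shown. Judging from the batch indices used above (input_ids, attention_mask, token_type_ids, labels, input_lens), a plausible sketch that trims each batch to its longest real sequence before handing it to the model:

import torch

def collate_fn(batch):
    # Each dataset item is assumed to be a tuple of fixed-length tensors:
    # (input_ids, attention_mask, token_type_ids, labels, input_len).
    all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_lens = map(
        torch.stack, zip(*batch))
    max_len = int(all_lens.max().item())  # longest real sequence in this batch
    return (all_input_ids[:, :max_len], all_attention_mask[:, :max_len],
            all_token_type_ids[:, :max_len], all_labels[:, :max_len], all_lens)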