def one(id, train_i, args):
    """Fine-tune a reading-comprehension task on one DuReader split and predict.

    Builds a PaddleHub ``ReadingComprehensionTask`` on top of the pre-trained
    module named by ``args.model``, runs ``finetune_and_eval()``, predicts on
    ``dataset.predict_examples``, writes the predictions to
    ``./work/result/submit{train_i}_{id}.json`` and appends one
    comma-separated row to ``./work/log/MRC_log.txt``.

    Args:
        id: Identifier passed to ``DuReader`` and ``change_task``; it is also
            appended to ``args.checkpoint_dir`` and used in the result file
            name. (The name shadows the builtin but is kept for callers.)
        train_i: Value used only in the result file name.
        args: Namespace-like object with a ``__dict__``. The attributes read
            here are ``model``, ``max_seq_len``, ``max_que_len``,
            ``weight_decay``, ``learning_rate``, ``warmup_proportion``,
            ``use_data_parallel``, ``use_gpu``, ``num_epoch``, ``batch_size``,
            ``checkpoint_dir`` and ``seed``.

    Returns:
        A tuple ``(best_score, arg_values)`` where ``best_score`` is the
        task's ``best_score`` attribute and ``arg_values`` is the list of
        every value in ``args.__dict__`` converted to ``str``.
    """
    # Local import so this function does not depend on a module-level import.
    import os

    # Load the pre-trained PaddleHub module (e.g. ERNIE).
    module = hub.Module(name=args.model)

    # Input variables, output variables and program of the pre-trained model.
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the competition dataset and read it with ReadingComprehensionReader.
    dataset = DuReader(id)
    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        doc_stride=128,
        max_query_length=args.max_que_len)

    # Token-level output of the pre-trained model.
    seq_output = outputs["sequence_output"]

    # Names of the variables that must be fed when running the program.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Fine-tune optimisation strategy.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion)

    # Run configuration; each id gets its own checkpoint directory.
    config = hub.RunConfig(eval_interval=200,
                           use_pyreader=False,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir + str(id),
                           strategy=strategy)

    # Define the reading-comprehension fine-tune task.
    # The competition dataset's format is similar to cmrc2018, so sub_task
    # should be "cmrc2018" here; otherwise the run may fail.
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config,
        sub_task="cmrc2018",
    )
    reading_comprehension_task.main_program.random_seed = args.seed
    change_task(reading_comprehension_task, id)

    # finetune_and_eval trains, evaluates and saves the best model.
    reading_comprehension_task.finetune_and_eval()

    # Examples from the competition dataset reserved for prediction.
    data = dataset.predict_examples
    # return_result=True makes predict() return the predictions.
    all_prediction = reading_comprehension_task.predict(data=data,
                                                        return_result=True)

    # Write the predictions. The file is opened in a context manager so it is
    # flushed and closed deterministically, and with an explicit UTF-8
    # encoding because ensure_ascii=False emits raw non-ASCII characters.
    result_path = './work/result/submit{}_{}.json'.format(train_i, id)
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w', encoding='utf-8') as result_file:
        json.dump(all_prediction, result_file, ensure_ascii=False)

    # One log row: id, best score, then every argument value, all as strings.
    best_score = reading_comprehension_task.best_score
    value = [str(x) for x in [id, best_score, *vars(args).values()]]

    log_path = './work/log/MRC_log.txt'
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    with open(log_path, 'a', encoding='utf-8') as f:
        f.write(','.join(value) + ',-\n')

    # value[2:] drops the id and score, leaving only the stringified args.
    return best_score, value[2:]
Esempio n. 2
0
    # NOTE(review): this is the tail of a function whose header is not visible
    # in this excerpt; `outputs`, `inputs`, `reader`, `dataset` and `args` are
    # defined before this point.

    # Use "sequence_output" for token-level output.
    seq_output = outputs["sequence_output"]

    # Names of the input variables the data feeder must supply.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Fine-tune API.
    # The strategy is constructed with no arguments, i.e. library defaults.
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.AdamWeightDecayStrategy())

    # Define a reading comprehension fine-tune task with PaddleHub's API.
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config)

    # Predict on the first 10 dev examples; the return value is not used here.
    data = dataset.dev_examples[:10]
    reading_comprehension_task.predict(data=data)
Esempio n. 3
0
    # NOTE(review): this is the tail of a function whose header is not visible
    # in this excerpt; `tokenizer`, `outputs` and `args` are defined before
    # this point.

    # Build the SQuAD dataset with version_2_with_negative=False.
    dataset = hub.dataset.SQUAD(version_2_with_negative=False,
                                tokenizer=tokenizer,
                                max_seq_len=args.max_seq_len)
    # Alternative kept from the original (version_2_with_negative=True):
    # dataset = hub.dataset.SQUAD(version_2_with_negative=True)

    # Select the fine-tune strategy from the command-line hyperparameters.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion)

    # Set up RunConfig for the PaddleHub Fine-tune API.
    config = hub.RunConfig(eval_interval=100,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a reading comprehension fine-tune task with PaddleHub's API,
    # using the token-level "sequence_output" as the feature.
    reading_comprehension_task = hub.ReadingComprehensionTask(
        dataset=dataset,
        feature=outputs["sequence_output"],
        config=config,
        sub_task="squad",
    )

    # Fine-tune and evaluate with PaddleHub's API.
    reading_comprehension_task.finetune_and_eval()