def one(id, train_i, args):
    """Fine-tune an ERNIE reading-comprehension model on one data split and predict.

    Args:
        id: Index of the data split / fold (also embedded in checkpoint and
            output paths). Note: shadows the builtin ``id`` — kept for
            backward compatibility with existing callers.
        train_i: Run tag used in the submission filename.
        args: Namespace providing model, max_seq_len, max_que_len,
            weight_decay, learning_rate, warmup_proportion,
            use_data_parallel, use_gpu, num_epoch, batch_size,
            checkpoint_dir and seed.

    Returns:
        Tuple of (best evaluation score, list of the hyper-parameter values
        as strings, i.e. the logged row minus the id and score columns).
    """
    # Load the PaddleHub ERNIE pre-trained module.
    module = hub.Module(name=args.model)
    # Input variables, output variables and the program of the model.
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Load the competition dataset and read it with ReadingComprehensionReader.
    dataset = DuReader(id)
    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        doc_stride=128,
        max_query_length=args.max_que_len)

    # Token-level (character-level for Chinese) pre-trained output of ERNIE.
    seq_output = outputs["sequence_output"]

    # feed_list required to run the program.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Fine-tune optimization strategy.
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion)

    # Run configuration; checkpoints are kept per split (suffix str(id)).
    config = hub.RunConfig(
        eval_interval=200,
        use_pyreader=False,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir + str(id),
        strategy=strategy)

    # Define the reading-comprehension fine-tune task.
    # The competition data format is close to the cmrc2018 dataset, so
    # sub_task must be "cmrc2018"; other values may fail at runtime.
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config,
        sub_task="cmrc2018",
    )
    reading_comprehension_task.main_program.random_seed = args.seed
    change_task(reading_comprehension_task, id)

    # Train, evaluate and save the best model automatically.
    reading_comprehension_task.finetune_and_eval()

    # Predict on the test portion of the competition dataset.
    data = dataset.predict_examples
    # return_result=True makes predict() return the decoded answers directly.
    all_prediction = reading_comprehension_task.predict(
        data=data, return_result=True)

    # Write predictions. Fix: use a context manager so the handle is closed
    # (the original passed a bare open() to json.dump and leaked it), and
    # write UTF-8 explicitly since ensure_ascii=False emits non-ASCII text.
    with open('./work/result/submit{}_{}.json'.format(train_i, id),
              'w', encoding='utf-8') as out_f:
        json.dump(all_prediction, out_f, ensure_ascii=False)

    # Append one CSV-style log row: id, best score, then every arg value.
    value = [id, reading_comprehension_task.best_score] + list(
        args.__dict__.values())
    value = [str(x) for x in value]
    with open('./work/log/MRC_log.txt', 'a', encoding='utf-8') as f:
        f.write(','.join(value) + ',-\n')
    return reading_comprehension_task.best_score, value[2:]
# Token-level output of the pre-trained encoder ("sequence_output").
seq_output = outputs["sequence_output"]

# Placeholder names handed to the data feeder, in the order it feeds them.
feed_list = [
    inputs[key].name
    for key in ("input_ids", "position_ids", "segment_ids", "input_mask")
]

# Run configuration for the PaddleHub fine-tune API (default AdamW strategy,
# single device).
config = hub.RunConfig(
    use_data_parallel=False,
    use_cuda=args.use_gpu,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=hub.AdamWeightDecayStrategy())

# Reading-comprehension task built on top of the encoder's sequence output.
reading_comprehension_task = hub.ReadingComprehensionTask(
    data_reader=reader,
    feature=seq_output,
    feed_list=feed_list,
    config=config)

# Run prediction on the first ten dev-set examples.
data = dataset.dev_examples[:10]
reading_comprehension_task.predict(data=data)
# SQuAD v1.1 (no unanswerable questions); the v2.0 variant is kept for reference:
# dataset = hub.dataset.SQUAD(version_2_with_negative=True)
dataset = hub.dataset.SQUAD(
    version_2_with_negative=False,
    tokenizer=tokenizer,
    max_seq_len=args.max_seq_len)

# AdamW-style fine-tune strategy with linear warmup.
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=args.weight_decay,
    learning_rate=args.learning_rate,
    warmup_proportion=args.warmup_proportion)

# Run configuration for the PaddleHub fine-tune API; collect the settings
# first, then expand them into RunConfig.
run_settings = dict(
    eval_interval=100,
    use_data_parallel=args.use_data_parallel,
    use_cuda=args.use_gpu,
    num_epoch=args.num_epoch,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=strategy)
config = hub.RunConfig(**run_settings)

# Reading-comprehension fine-tune task over the token-level encoder output.
reading_comprehension_task = hub.ReadingComprehensionTask(
    dataset=dataset,
    feature=outputs["sequence_output"],
    config=config,
    sub_task="squad",
)

# Train and evaluate, keeping the best checkpoint automatically.
reading_comprehension_task.finetune_and_eval()