def generate(self, texts):
    # Add 0x02 between characters to match the format of the training data;
    # otherwise the length of the prediction results will not match the input
    # string if the input string contains non-Chinese characters.
    formatted_text_a = list(map("\002".join, texts))

    # Use the appropriate tokenizer to preprocess the data.
    # For ernie_tiny, it uses BertTokenizer too.
    tokenizer = hub.BertTokenizer(vocab_file=self.module.get_vocab_path())
    encoded_data = [
        tokenizer.encode(text=text, max_seq_len=128)
        for text in formatted_text_a
    ]
    results = self.gen_task.generate(
        data=encoded_data, label_list=self.label_list, accelerate_mode=False)
    results = [["".join(sample_result) for sample_result in sample_results]
               for sample_results in results]
    return results
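# A standalone sketch of what the "\002".join preprocessing above does to an
# input string (the sample text is made up, not from the source):
texts = ["Hello, 世界"]
formatted_text_a = list(map("\002".join, texts))
# Each character is now separated by the 0x02 control byte, so splitting on it
# recovers the original characters one by one:
assert formatted_text_a[0].split("\002") == list(texts[0])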
# yapf: enable.

if __name__ == '__main__':
    # Load the PaddleHub ERNIE pretrained model.
    module = hub.Module(name="ernie")

    # A pairwise task needs three slots: query, title_left and title_right.
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len, num_slots=3)

    # The tokenizer tokenizes the text data and encodes it as the model expects.
    # If you use a transformer module (ernie, bert, roberta and so on), the
    # tokenizer should be hub.BertTokenizer; otherwise use hub.CustomTokenizer.
    tokenizer = hub.BertTokenizer(
        vocab_file=module.get_vocab_path(), tokenize_chinese_chars=True)

    # Load the dataset.
    dataset = hub.dataset.DuEL(tokenizer=tokenizer,
                               max_seq_len=args.max_seq_len)

    # Construct the transfer learning network from sequence-level outputs.
    query = outputs["sequence_output"]
    left = outputs['sequence_output_2']
    right = outputs['sequence_output_3']

    # Select the fine-tune strategy.
    strategy = hub.AdamWeightDecayStrategy()

    # Setup RunConfig for the PaddleHub Fine-tune API.
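    # The snippet above stops at the RunConfig comment. A plausible
    # continuation, modeled on the pointwise matching demo later in this
    # section; the hub.PairwiseTextMatchingTask constructor and its
    # left_feature/right_feature parameter names are assumptions, not
    # confirmed by the source.
    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    pairwise_matching_task = hub.PairwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        left_feature=left,
        right_feature=right,
        tokenizer=tokenizer,
        config=config)

    pairwise_matching_task.finetune_and_eval()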
if __name__ == '__main__':
    # Load the PaddleHub ERNIE Tiny pretrained model.
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    # For ernie_tiny, it performs word segmentation to obtain subwords.
    # More details: https://www.jiqizhixin.com/articles/2019-11-06-9
    if module.name == "ernie_tiny":
        tokenizer = hub.ErnieTinyTokenizer(
            vocab_file=module.get_vocab_path(),
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
    else:
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

    dataset = hub.dataset.ChnSentiCorp(
        tokenizer=tokenizer, max_seq_len=args.max_seq_len)

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Select the fine-tune strategy, then set up the config and fine-tune.
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)
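    # The demo ends after the strategy is chosen. A sketch of the remaining
    # setup, assuming the standard hub.RunConfig / hub.TextClassifierTask flow
    # used elsewhere in these examples:
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    cls_task = hub.TextClassifierTask(
        dataset=dataset,
        feature=pooled_output,
        num_classes=dataset.num_labels,
        config=config)

    cls_task.finetune_and_eval()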
def get_task(args, schema_labels, id):
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    if args.do_model == 'mcls' or args.do_model == 'mcls_onlysentence':
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
        # Load the data and read it with MultiLabelClassifyReader.
        dataset = CCksDataset(args.data_dir,
                              schema_labels,
                              model=args.do_model,
                              tokenizer=tokenizer,
                              max_seq_len=args.max_seq_len)
        reader = MultiLabelClassifyReader(
            dataset=dataset,
            vocab_path=module.get_vocab_path(),
            max_seq_len=args.max_seq_len,
            sp_model_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
        # Build the multi-label classification transfer network on top of the
        # sentence-level pooled_output of the model.
        output = outputs["pooled_output"]
    elif args.do_model == 'mrc_relation':
        print(schema_labels)
        dataset = MRCrelationDataset(args.data_dir,
                                     schema_labels,
                                     model=args.do_model)
        reader = ClassifyReader(dataset=dataset,
                                vocab_path=module.get_vocab_path(),
                                max_seq_len=args.max_seq_len,
                                sp_model_path=module.get_spm_path(),
                                word_dict_path=module.get_word_dict_path())
        # Build the classification transfer network on top of the
        # sentence-level pooled_output of the model.
        output = outputs["pooled_output"]
    else:
        # Load the data and read it with SequenceLabelReader.
        dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
        reader = SequenceLabelReader(
            dataset=dataset,
            vocab_path=module.get_vocab_path(),
            max_seq_len=args.max_seq_len,
            sp_model_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path())
        # Build the sequence labeling transfer network on top of the
        # character-level sequence_output of the model.
        output = outputs["sequence_output"]

    # A disabled variant that applied dropout to sequence_output:
    # else:
    #     sequence_output = outputs["sequence_output"]
    #     sequence_output = fluid.layers.dropout(
    #         x=sequence_output,
    #         dropout_prob=args.dropout,
    #         dropout_implementation="upscale_in_train")

    # Set the feed_list of input variables for the model program.
    # They must be given in exactly this order.
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name
    ]

    # Select the optimization strategy.
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings.
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the transfer task.
    if args.do_model == 'mcls' or args.do_model == 'mcls_onlysentence':
        task = MultiLabelClassifierTask(data_reader=reader,
                                        feature=output,
                                        feed_list=feed_list,
                                        num_classes=dataset.num_labels,
                                        config=config)
    elif args.do_model == 'mrc_relation':
        print(dataset.num_labels)
        task = TextClassifierTask(data_reader=reader,
                                  feature=output,
                                  feed_list=feed_list,
                                  num_classes=dataset.num_labels,
                                  config=config,
                                  metrics_choices=['acc'])
    else:
        task = SequenceLabelTask(data_reader=reader,
                                 feature=output,
                                 feed_list=feed_list,
                                 max_seq_len=args.max_seq_len,
                                 num_classes=dataset.num_labels,
                                 config=config,
                                 add_crf=args.add_crf)
    task.main_program.random_seed = args.random_seed
    add_hook(args, task, id)
    return task, reader
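# A hypothetical invocation of get_task (parse_args and load_schema_labels are
# made-up helpers standing in for the script's own setup; the id value is
# illustrative):
if __name__ == '__main__':
    args = parse_args()  # hypothetical helper building the argparse namespace
    schema_labels = load_schema_labels(args.data_dir)  # hypothetical helper
    task, reader = get_task(args, schema_labels, id=0)
    task.finetune_and_eval()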
def finetune(args):
    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    # For ernie_tiny, it performs word segmentation to obtain subwords.
    # More details: https://www.jiqizhixin.com/articles/2019-11-06-9
    if module.name == "ernie_tiny":
        tokenizer = hub.ErnieTinyTokenizer(
            vocab_file=module.get_vocab_path(),
            spm_path=module.get_spm_path(),
            word_dict_path=module.get_word_dict_path(),
        )
    else:
        tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

    dataset = hub.dataset.ChnSentiCorp(tokenizer=tokenizer,
                                       max_seq_len=args.max_seq_len)

    with fluid.dygraph.guard():
        tc = TransformerClassifier(num_classes=dataset.num_labels,
                                   transformer=module)
        adam = AdamOptimizer(learning_rate=1e-5,
                             parameter_list=tc.parameters())
        state_dict_path = os.path.join(args.checkpoint_dir,
                                       'dygraph_state_dict')
        if os.path.exists(state_dict_path + '.pdparams'):
            state_dict, _ = fluid.load_dygraph(state_dict_path)
            tc.load_dict(state_dict)

        loss_sum = acc_sum = cnt = 0
        for epoch in range(args.num_epoch):
            for batch_id, data in enumerate(
                    dataset.batch_records_generator(
                        phase="train",
                        batch_size=args.batch_size,
                        shuffle=True,
                        pad_to_batch_max_seq_len=False)):
                batch_size = len(data["input_ids"])
                input_ids = np.array(data["input_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                position_ids = np.array(data["position_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                segment_ids = np.array(data["segment_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                input_mask = np.array(data["input_mask"]).astype(
                    np.float32).reshape([batch_size, -1, 1])
                labels = np.array(data["label"]).astype(np.int64).reshape(
                    [batch_size, 1])

                pred = tc(input_ids, position_ids, segment_ids, input_mask)

                acc = fluid.layers.accuracy(pred, to_variable(labels))
                loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                adam.minimize(avg_loss)

                loss_sum += avg_loss.numpy() * labels.shape[0]
                acc_sum += acc.numpy() * labels.shape[0]
                cnt += labels.shape[0]
                if batch_id % args.log_interval == 0:
                    print('epoch {}: loss {}, acc {}'.format(
                        epoch, loss_sum / cnt, acc_sum / cnt))
                    loss_sum = acc_sum = cnt = 0

                if batch_id % args.save_interval == 0:
                    state_dict = tc.state_dict()
                    fluid.save_dygraph(state_dict, state_dict_path)
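# A minimal entry point for the classifier fine-tuning loop above; every flag
# name is inferred from the args.* accesses in finetune, and the defaults are
# illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_epoch", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--max_seq_len", type=int, default=128)
parser.add_argument("--checkpoint_dir", type=str, default="./ckpt")
parser.add_argument("--log_interval", type=int, default=10)
parser.add_argument("--save_interval", type=int, default=100)

if __name__ == '__main__':
    finetune(parser.parse_args())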
def finetune(args):
    module = hub.Module(name="ernie", max_seq_len=args.max_seq_len)

    # Use the appropriate tokenizer to preprocess the dataset.
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())
    dataset = hub.dataset.MSRA_NER(tokenizer=tokenizer,
                                   max_seq_len=args.max_seq_len)

    with fluid.dygraph.guard():
        ts = TransformerSeqLabeling(num_classes=dataset.num_labels,
                                    transformer=module)
        adam = AdamOptimizer(learning_rate=1e-5,
                             parameter_list=ts.parameters())
        state_dict_path = os.path.join(args.checkpoint_dir,
                                       'dygraph_state_dict')
        if os.path.exists(state_dict_path + '.pdparams'):
            state_dict, _ = fluid.load_dygraph(state_dict_path)
            ts.load_dict(state_dict)

        loss_sum = total_infer = total_label = total_correct = cnt = 0
        for epoch in range(args.num_epoch):
            for batch_id, data in enumerate(
                    dataset.batch_records_generator(
                        phase="train",
                        batch_size=args.batch_size,
                        shuffle=True,
                        pad_to_batch_max_seq_len=False)):
                batch_size = len(data["input_ids"])
                input_ids = np.array(data["input_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                position_ids = np.array(data["position_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                segment_ids = np.array(data["segment_ids"]).astype(
                    np.int64).reshape([batch_size, -1, 1])
                input_mask = np.array(data["input_mask"]).astype(
                    np.float32).reshape([batch_size, -1, 1])
                labels = np.array(data["label"]).astype(np.int64).reshape(
                    -1, 1)
                seq_len = np.array(data["seq_len"]).astype(np.int64).reshape(
                    -1, 1)

                pred, ret_infers = ts(input_ids, position_ids, segment_ids,
                                      input_mask)
                loss = fluid.layers.cross_entropy(pred, to_variable(labels))
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                adam.minimize(avg_loss)

                loss_sum += avg_loss.numpy() * labels.shape[0]
                label_num, infer_num, correct_num = chunk_eval(
                    labels, ret_infers.numpy(), seq_len, dataset.num_labels,
                    1)
                cnt += labels.shape[0]
                total_infer += infer_num
                total_label += label_num
                total_correct += correct_num

                if batch_id % args.log_interval == 0:
                    precision, recall, f1 = calculate_f1(
                        total_label, total_infer, total_correct)
                    print('epoch {}: loss {}, f1 {} recall {} precision {}'.
                          format(epoch, loss_sum / cnt, f1, recall,
                                 precision))
                    loss_sum = total_infer = total_label = total_correct = cnt = 0

                if batch_id % args.save_interval == 0:
                    state_dict = ts.state_dict()
                    fluid.save_dygraph(state_dict, state_dict_path)
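# The loop above relies on two helpers, chunk_eval and calculate_f1, defined
# alongside the demo. For reference, a minimal sketch of the F1 computation
# consistent with how it is called (an assumed implementation, not the
# source's):
def calculate_f1_sketch(num_label, num_infer, num_correct):
    # Precision over predicted chunks, recall over gold chunks.
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = 2 * precision * recall / (precision + recall) if num_correct else 0.0
    return precision, recall, f1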
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # matching type: whether it is pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")
    # A pointwise task needs two slots: query and title.
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=cnf.max_seq_len, num_slots=2)

    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer,
                               max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                           warmup_proportion=0.1,
                                           learning_rate=1e-5)

    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)

    # Build the transfer network from ERNIE's token-level outputs.
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']

    # Create the pointwise text matching task.
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)

    run_states = pointwise_matching_task.finetune_and_eval()

    # # Sample prediction data:
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]

    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']

    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({'id': ids[i], 'label': results[i]},
                                   index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []

    # Predict any remaining pairs that did not fill a full batch.
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True,
                   sep=',',
                   index=False)
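# A hypothetical driver for the six Souhu task variants; cnf is assumed to be
# a simple config object, and the sequence length and batch sizes are
# illustrative only.
from types import SimpleNamespace

if __name__ == '__main__':
    cnf = SimpleNamespace(max_seq_len=128,
                          train_and_eval_batch=32,
                          test_batch=32)
    for task_type in ['ssA', 'slA', 'llA', 'ssB', 'slB', 'llB']:
        main(task_type, cnf)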