# Note: MyDataset, MyClassifyReader and GRUTextClassifierTask are custom classes
# assumed to be defined elsewhere in this project.
import numpy as np
import paddlehub as hub


def run_predict(data):
    ds = MyDataset()
    module = hub.Module(name="ernie", version="1.1.0")
    inputs, outputs, program = module.context(max_seq_len=512)
    # Token-level output feeds the GRU-based classifier task.
    sequence_output = outputs["sequence_output"]
    reader = MyClassifyReader(
        dataset=ds,
        vocab_path=module.get_vocab_path(),
        max_seq_len=512)
    strategy = hub.AdamWeightDecayStrategy(
        learning_rate=5e-5,
        lr_scheduler="linear_decay",
        warmup_proportion=0.1,
        weight_decay=0.01,
        optimizer_name="adam")
    config = hub.RunConfig(
        use_cuda=False,
        enable_memory_optim=True,
        num_epoch=3,
        batch_size=16,
        strategy=strategy,
        checkpoint_dir="./models/Product")
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
    cls_task = GRUTextClassifierTask(
        data_reader=reader,
        feature=sequence_output,
        feed_list=feed_list,
        num_classes=ds.num_labels,
        config=config)

    # label id -> funding-round name
    label_map = {3: '天使轮', 1: 'B轮', 4: '战略融资', 0: 'A轮', 2: 'C轮'}
    predictions = []
    run_states = cls_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            print(result)
            predictions.append(result)

    # Map the 5-way class probabilities to a single score; the score of the
    # last predicted example is returned.
    score = None
    for batch_result in results:
        for single_result in batch_result[0]:
            print("=====")
            print(single_result)
            score = (1 * single_result[0] + 2 * single_result[1] +
                     3 * single_result[2] + 4 * single_result[3] +
                     5 * single_result[4]) / 15 * 100
    return score
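# Worked example of the scoring formula above, assuming single_result is a
# 5-way probability distribution over the funding-round classes (hypothetical
# values, shown only to illustrate the arithmetic):
probs = [0.1, 0.1, 0.2, 0.3, 0.3]
score = sum((i + 1) * p for i, p in enumerate(probs)) / 15 * 100
print(score)  # weighted sum is 3.6, so the score is 3.6 / 15 * 100 = 24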
def _initialize(self, use_gpu=False):
    # Load the PaddleHub ERNIE Tiny pretrained model
    self.module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = self.module.context(trainable=True,
                                                   max_seq_len=128)

    # Download the dataset and get its label list and label count.
    # If you only need the label information, you can omit the tokenizer
    # parameter to avoid preprocessing the training set.
    dataset = hub.dataset.Couplet()
    self.label_list = dataset.get_labels()

    # Set up RunConfig for the PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=use_gpu,
        batch_size=1,
        checkpoint_dir=os.path.join(self.directory, "assets", "ckpt"),
        strategy=hub.AdamWeightDecayStrategy())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]
    sequence_output = outputs["sequence_output"]

    # Define a text generation fine-tune task with PaddleHub's API
    self.gen_task = hub.TextGenerationTask(feature=pooled_output,
                                           token_feature=sequence_output,
                                           max_seq_len=128,
                                           num_classes=dataset.num_labels,
                                           config=config,
                                           metrics_choices=["bleu"])
def _initialize(self,
                ckpt_dir="ckpt_chnsenticorp",
                num_class=2,
                max_seq_len=128,
                use_gpu=False,
                batch_size=1):
    self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
    self.num_class = num_class
    self.MAX_SEQ_LEN = max_seq_len

    # Load the PaddleHub ERNIE Tiny pretrained model
    self.module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = self.module.context(trainable=True,
                                                   max_seq_len=max_seq_len)
    self.vocab_path = self.module.get_vocab_path()

    # Use accuracy as the metric.
    # Available datasets: GLUE / XNLI / ChineseGLUE / NLPCC-DBQA / LCQMC;
    # the metric should be acc, f1 or matthews.
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-word tokenization for Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        vocab_path=self.module.get_vocab_path(),
        max_seq_len=max_seq_len,
        sp_model_path=self.module.get_spm_path(),
        word_dict_path=self.module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All tensors the module needs must be fed, in this order.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=False,
                           use_cuda=use_gpu,
                           batch_size=batch_size,
                           checkpoint_dir=self.ckpt_dir,
                           strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task with PaddleHub's API
    self.cls_task = hub.TextClassifierTask(data_reader=reader,
                                           feature=pooled_output,
                                           feed_list=feed_list,
                                           num_classes=self.num_class,
                                           config=config,
                                           metrics_choices=metrics_choices)
def predict_tag(model_name, data):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name

    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

    # Load the dataset (other options: GLUE / XNLI / ChineseGLUE / NLPCC-DBQA / LCQMC)
    dataset = ViolateDataset(dataset_dir=dataset_dir)

    # ernie_tiny uses sub-word tokenization for Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All tensors the module needs must be fed, in this order.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(use_data_parallel=True,
                           use_cuda=False,
                           batch_size=24,
                           checkpoint_dir=checkpoint_dir,
                           strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task with PaddleHub's API
    cls_task = hub.TextClassifierTask(data_reader=reader,
                                      feature=pooled_output,
                                      feed_list=feed_list,
                                      num_classes=dataset.num_labels,
                                      config=config)

    # Data to be predicted, e.g.:
    # data = [["有保障"],
    #         ["无风险"],
    #         ["基金过往数据并不代表未来趋势"],
    #         ["为什么"],
    #         ["周杰伦"],
    #         ["吴东瀛"],
    #        ]
    return cls_task.predict(data=data, return_result=True)
def train(train_i, args):
    dataset = MyDataset()
    module = hub.Module(name=args.model)
    reader = hub.reader.MultiLabelClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler=args.lr_scheduler,
        learning_rate=args.learning_rate)

    # eval_interval and log_interval are assumed to be defined at module level.
    config = hub.RunConfig(use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           checkpoint_dir=args.checkpoint_dir + str(train_i),
                           batch_size=args.batch_size,
                           eval_interval=eval_interval,
                           log_interval=log_interval,
                           strategy=strategy)

    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # The order of the tensors in feed_list must not be changed.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    cls_task = hub.MultiLabelClassifierTask(data_reader=reader,
                                            feature=pooled_output,
                                            feed_list=feed_list,
                                            num_classes=dataset.num_labels,
                                            config=config)
    cls_task.main_program.random_seed = args.seed
    change_task(cls_task, train_i)
    return cls_task, reader
def get_task(args, schema_labels, id):
    # Load a PaddleHub pretrained model (ERNIE Tiny / RoBERTa large).
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the data and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = SequenceLabelReader(dataset=dataset,
                                 vocab_path=module.get_vocab_path(),
                                 max_seq_len=args.max_seq_len,
                                 sp_model_path=module.get_spm_path(),
                                 word_dict_path=module.get_word_dict_path())

    # Build the transfer network for the sequence labeling task.
    # Use ERNIE's character-level output sequence_output as its input.
    sequence_output = outputs["sequence_output"]
    # sequence_output = fluid.layers.dropout(
    #     x=sequence_output,
    #     dropout_prob=args.dropout,
    #     dropout_implementation="upscale_in_train")

    # Set feed_list, the input variables the model program needs.
    # They must be given in the following order.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name
    ]

    # Choose the optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the sequence labeling transfer task
    seq_label_task = SequenceLabelTask(data_reader=reader,
                                       feature=sequence_output,
                                       feed_list=feed_list,
                                       max_seq_len=args.max_seq_len,
                                       num_classes=dataset.num_labels,
                                       config=config,
                                       add_crf=args.add_crf)
    seq_label_task.main_program.random_seed = args.random_seed
    add_hook(args, seq_label_task, id)
    return seq_label_task, reader
# Add the ELMo embedding
input_feature = fluid.layers.concat(input=[elmo_embedding, word_embedding],
                                    axis=1)

# Choose the network you would like: bow, cnn, gru, bilstm, lstm.
# The gru_net is recommended.
fc = gru_net(program, input_feature)

# Set up the feed list for the data feeder.
# All tensors the Senta module needs must be fed.
feed_list = [word_ids.name]

# Step 4: Select the finetune strategy and set up the config
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=args.weight_decay,
    learning_rate=args.learning_rate,
    lr_scheduler="linear_decay",
    warmup_proportion=args.warmup_proportion)

# Step 5: Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(use_cuda=args.use_gpu,
                       use_data_parallel=True,
                       num_epoch=args.num_epoch,
                       batch_size=args.batch_size,
                       checkpoint_dir=args.checkpoint_dir,
                       strategy=strategy)

# Step 6: Define a classification finetune task with PaddleHub's API
elmo_task = hub.TextClassifierTask(data_reader=reader,
                                   feature=fc,
                                   feed_list=feed_list,
    '12',
    '13'
])

dataset = ThuNews()
module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
reader = hub.reader.ClassifyReader(dataset=dataset,
                                   vocab_path=module.get_vocab_path(),
                                   sp_model_path=module.get_spm_path(),
                                   word_dict_path=module.get_word_dict_path(),
                                   max_seq_len=128)
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    lr_scheduler="linear_decay",
    learning_rate=5e-5)
config = hub.RunConfig(use_cuda=True,
                       use_data_parallel=True,
                       num_epoch=1,
                       checkpoint_dir="module",
                       batch_size=20,
                       eval_interval=400,
                       strategy=strategy)
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]
# Set up the feed list for the data feeder.
# All tensors the ERNIE module needs must be fed.
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]
if args.use_taskid:
    feed_list.append(inputs["task_ids"].name)

# Select the finetune strategy and set up the config
strategy = hub.AdamWeightDecayStrategy(weight_decay=args.weight_decay,
                                       learning_rate=args.learning_rate,
                                       lr_scheduler="linear_decay")

# Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(use_data_parallel=args.use_data_parallel,
                       use_pyreader=args.use_pyreader,
                       use_cuda=args.use_gpu,
                       num_epoch=args.num_epoch,
                       batch_size=args.batch_size,
                       checkpoint_dir=args.checkpoint_dir,
                       strategy=strategy)

# Define a regression finetune task with PaddleHub's API
reg_task = hub.RegressionTask(data_reader=reader,
                              feature=pooled_output,
                              feed_list=feed_list,
        spm_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())
else:
    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path())

dataset = hub.dataset.ChnSentiCorp(
    tokenizer=tokenizer, max_seq_len=args.max_seq_len)

# Construct the transfer learning network.
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]

# Select the fine-tune strategy and set up the config
strategy = hub.AdamWeightDecayStrategy(
    warmup_proportion=args.warmup_proportion,
    weight_decay=args.weight_decay,
    learning_rate=args.learning_rate)

# Set up RunConfig for the PaddleHub Fine-tune API
config = hub.RunConfig(
    use_data_parallel=args.use_data_parallel,
    use_cuda=args.use_gpu,
    num_epoch=args.num_epoch,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=strategy)

# Define a classification fine-tune task with PaddleHub's API
cls_task = hub.TextClassifierTask(
    dataset=dataset,
    feature=pooled_output,
def one(id, train_i, args):
    # Load the PaddleHub ERNIE pretrained model
    module = hub.Module(name=args.model)
    # ERNIE's input variables, output variables and model program
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Load the competition dataset and read it with ReadingComprehensionReader
    dataset = DuReader(id)
    reader = hub.reader.ReadingComprehensionReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        doc_stride=128,
        max_query_length=args.max_que_len)

    # Take ERNIE's character-level pretrained output
    seq_output = outputs["sequence_output"]

    # Set the feed_list needed to run the program
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Choose the fine-tune optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion)

    # Set the run configuration
    config = hub.RunConfig(eval_interval=200,
                           use_pyreader=False,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir + str(id),
                           strategy=strategy)

    # Define the reading comprehension fine-tune task.
    # The competition data is formatted like the cmrc2018 dataset, so sub_task
    # should be "cmrc2018"; otherwise running it may fail.
    reading_comprehension_task = hub.ReadingComprehensionTask(
        data_reader=reader,
        feature=seq_output,
        feed_list=feed_list,
        config=config,
        sub_task="cmrc2018",
    )
    reading_comprehension_task.main_program.random_seed = args.seed
    change_task(reading_comprehension_task, id)

    # finetune_and_eval trains, evaluates and saves the best model automatically
    reading_comprehension_task.finetune_and_eval()

    # Part of the competition test set is used for prediction
    data = dataset.predict_examples
    # With return_result=True, predict returns the prediction results directly
    all_prediction = reading_comprehension_task.predict(data=data,
                                                        return_result=True)

    # Write out the prediction results
    json.dump(all_prediction,
              open('./work/result/submit{}_{}.json'.format(train_i, id), 'w'),
              ensure_ascii=False)

    value = [id, reading_comprehension_task.best_score] + list(
        args.__dict__.values())
    value = [str(x) for x in value]
    with open('./work/log/MRC_log.txt', 'a', encoding='utf-8') as f:
        f.write(','.join(value) + ',-\n')
    return reading_comprehension_task.best_score, value[2:]
'''
@time: 2020/6/5 1:01
@desc: Sentiment classification
'''
import paddlehub as hub

if __name__ == '__main__':
    module = hub.Module(name="ernie")
    dataset = hub.dataset.ChnSentiCorp()
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128)

    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01,
        warmup_proportion=0.1,
        learning_rate=5e-5)

    config = hub.RunConfig(
        use_cuda=False,
        num_epoch=1,
        checkpoint_dir="ernie_txt_cls_turtorial_demo",
        batch_size=100,
        eval_interval=50,
        strategy=strategy)

    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=128)

    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]
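    # A minimal sketch of the remaining steps (not part of the original
    # snippet), mirroring the other classification examples in this collection:
    # build the feed_list, define a TextClassifierTask and fine-tune it.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
    cls_task = hub.TextClassifierTask(data_reader=reader,
                                      feature=pooled_output,
                                      feed_list=feed_list,
                                      num_classes=dataset.num_labels,
                                      config=config)
    cls_task.finetune_and_eval()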
inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

ds = hub.dataset.ChnSentiCorp()
reader = hub.reader.ClassifyReader(dataset=ds,
                                   vocab_path=module.get_vocab_path(),
                                   max_seq_len=128)

for e in ds.get_train_examples():
    print(e.text_a, e.label)

strategy = hub.AdamWeightDecayStrategy(learning_rate=1e-4,
                                       lr_scheduler="linear_decay",
                                       warmup_proportion=0.0,
                                       weight_decay=0.01)
config = hub.RunConfig(use_cuda=False,
                       num_epoch=3,
                       batch_size=32,
                       strategy=strategy)

feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name
]

cls_task = hub.TextClassifierTask(data_reader=reader,
                                  feature=pooled_output,
                                  feed_list=feed_list,
                                  num_classes=ds.num_labels,
# Define the dataset
dataset = TextClassification(dataset_dir)

# Define the data reader
reader = hub.reader.ClassifyReader(dataset=dataset,
                                   vocab_path=module.get_vocab_path(),
                                   max_seq_len=args.max_seq_len,
                                   do_lower_case=True,
                                   sp_model_path=module.get_spm_path(),
                                   word_dict_path=module.get_word_dict_path())

# Set the optimization strategy
strategy = hub.AdamWeightDecayStrategy(
    learning_rate=args.learning_rate,
    lr_scheduler="linear_decay",
    warmup_proportion=args.warmup_proportion,
    weight_decay=args.weight_decay,
    optimizer_name="adam")

# Set the training parameters
config = hub.RunConfig(log_interval=20,
                       eval_interval=500,
                       use_pyreader=True,
                       use_data_parallel=True,
                       save_ckpt_interval=1000,
                       use_cuda=True,
                       checkpoint_dir="%s_TextClassification" % dataset_name,
                       num_epoch=args.num_epoch,
                       batch_size=args.batch_size,
                       strategy=strategy)
reader = hub.reader.MultiLabelClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=max_seq_len,
    use_task_id=False)
metrics_choices = ['acc', 'f1']

# Construct the transfer learning network.
# Use "pooled_output" for classification tasks on an entire sentence.

# Optimizer settings:
# select the finetune strategy and set up the config
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    warmup_proportion=warmup_proportion,
    lr_scheduler="linear_decay")

# Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(
    use_cuda=use_gpu,
    num_epoch=num_epoch,
    batch_size=batch_size,
    checkpoint_dir=checkpoint_dir,
    strategy=strategy)

# Run the model
pooled_output = outputs["pooled_output"]
feed_list = [
    inputs["input_ids"].name,
import paddlehub as hub

module = hub.Module(name="ernie", version="1.0.2")
dataset = DemoDataset()
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=1e-5,
    lr_scheduler="linear_decay",
    optimizer_name="adam")

config = hub.RunConfig(
    # Whether to use the GPU
    use_cuda=True,
    num_epoch=50,
    # Directory where model checkpoints are saved
    checkpoint_dir="ernie_turtorial_demo",
    batch_size=64,
    log_interval=10,
    eval_interval=500,
    strategy=strategy)

inputs, outputs, program = module.context(
    trainable=True, max_seq_len=128)
def main():
    # Load a PaddleHub pretrained model.
    # More pretrained models:
    # https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=args.max_seq_len)

    # Download the dataset and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "sequence_output" for token-level output.
    sequence_output = outputs["sequence_output"]

    # Set up the feed list for the data feeder.
    # All tensors the module needs must be fed, in this order.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name
    ]

    # Select a finetune strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(eval_interval=args.eval_step,
                           save_ckpt_interval=args.model_save_step,
                           use_data_parallel=args.use_data_parallel,
                           use_cuda=args.use_gpu,
                           num_epoch=args.num_epoch,
                           batch_size=args.batch_size,
                           checkpoint_dir=args.checkpoint_dir,
                           strategy=strategy)

    # Define a sequence labeling finetune task with PaddleHub's API.
    # If add_crf is set, the network uses a CRF as the decoder.
    seq_label_task = hub.SequenceLabelTask(data_reader=reader,
                                           feature=sequence_output,
                                           feed_list=feed_list,
                                           max_seq_len=args.max_seq_len,
                                           num_classes=dataset.num_labels,
                                           config=config,
                                           add_crf=args.add_crf)

    # Finetune and evaluate the model with PaddleHub's API.
    # Training, evaluation, testing and model saving are handled automatically.
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()

    if args.do_predict:
        print("start predict process")
        ret = []
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)

        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines("{}.{}.pred".format(args.predict_data, args.do_model),
                       ret)
# Construct the transfer learning network.
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output = outputs["pooled_output"]

# Set up the feed list for the data feeder.
# All tensors the module needs must be fed.
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

# Select the finetune strategy and set up the config
strategy = hub.AdamWeightDecayStrategy(learning_rate=args.learning_rate)

# Set up the running config for the PaddleHub Finetune API.
# Use a larger batch size for the smaller (12-layer / 3-layer / ERNIE base) models.
if "L_12" in name or name in [
        "ernie", "ernie_tiny", "ernie_v2_eng_base"
] or "L_3" in name:
    batch_size = 16
else:
    batch_size = 8
config = hub.RunConfig(use_data_parallel=True,
                       use_cuda=True,
                       num_epoch=2,
                       batch_size=batch_size,
                       checkpoint_dir="ckpt_%s" % name,
                       strategy=strategy,
# Otherwise, tokenizer should be hub.CustomTokenizer.
tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                              tokenize_chinese_chars=True)

# Load the dataset
dataset = hub.dataset.DuEL(tokenizer=tokenizer, max_seq_len=args.max_seq_len)

# Construct the transfer learning network.
# Use sequence-level output.
query = outputs["sequence_output"]
left = outputs['sequence_output_2']
right = outputs['sequence_output_3']

# Select the fine-tune strategy.
strategy = hub.AdamWeightDecayStrategy()

# Set up RunConfig for the PaddleHub Fine-tune API.
config = hub.RunConfig(use_data_parallel=False,
                       use_cuda=args.use_gpu,
                       num_epoch=args.num_epoch,
                       batch_size=args.batch_size,
                       checkpoint_dir=args.checkpoint_dir,
                       strategy=strategy)

# Define a pairwise text matching task with PaddleHub's API.
pairwise_matching_task = hub.PairwiseTextMatchingTask(query_feature=query,
                                                      left_feature=left,
                                                      right_feature=right,
                                                      tokenizer=tokenizer,
                                                      dataset=dataset,
            text = row["text_a"]
            labels = [int(value) for value in row[2:]]
            example = InputExample(guid=guid, label=labels, text_a=text)
            examples.append(example)
        return examples


dataset = MultiMydatas()
module = hub.Module(name='ernie')
reader = hub.reader.MultiLabelClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                       warmup_proportion=0.1,
                                       learning_rate=5e-5,
                                       lr_scheduler='linear_decay',
                                       optimizer_name='adam')

config = hub.RunConfig(
    use_data_parallel=False,
    use_pyreader=False,
    use_cuda=False,
    batch_size=32,
    enable_memory_optim=False,
    checkpoint_dir='ernie_txt_cls_turtorial_demo',
    num_epoch=100,
    strategy=strategy,
)

inputs, outputs, program = module.context(trainable=True, max_seq_len=128)
pooled_output = outputs['pooled_output']
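# A minimal sketch of the remaining steps for this multi-label example (not part
# of the original snippet); the feed_list order and hub.MultiLabelClassifierTask
# call follow the multi-label training snippet elsewhere in this collection.
feed_list = [
    inputs['input_ids'].name,
    inputs['position_ids'].name,
    inputs['segment_ids'].name,
    inputs['input_mask'].name,
]
multi_label_task = hub.MultiLabelClassifierTask(data_reader=reader,
                                                feature=pooled_output,
                                                feed_list=feed_list,
                                                num_classes=dataset.num_labels,
                                                config=config)
multi_label_task.finetune_and_eval()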
def train_model(model_name):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name

    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

    # Load the dataset and use accuracy as the metric.
    # Other options: GLUE / XNLI / ChineseGLUE / NLPCC-DBQA / LCQMC;
    # the metric should be acc, f1 or matthews.
    # dataset = hub.dataset.ChnSentiCorp()
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-word tokenization for Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All tensors the module needs must be fed, in this order.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select the finetune strategy and set up the config
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=0.1,
        weight_decay=0.01,
        learning_rate=5e-5,
        lr_scheduler="linear_decay")

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=True,
        use_cuda=False,
        num_epoch=3,
        batch_size=24,
        checkpoint_dir=checkpoint_dir,
        # model_dir="./models",
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task with PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)

    # with cls_task.phase_guard(phase="train"):
    #     cls_task.init_if_necessary()
    #     cls_task.load_parameters("./models/model")

    # Finetune with PaddleHub's API (finetune_and_eval would also handle
    # evaluation, testing and model saving automatically).
    # cls_task.finetune_and_eval()
    cls_task.finetune()

    # Evaluate with PaddleHub's API
    run_states = cls_task.eval()
    # Get the accuracy score on the dev set
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(
        run_states)

    # The accuracy on dev is used by auto fine-tune
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # text matching type: not pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")
    # A pointwise task needs two slots: query and title_left
    inputs, outputs, program = module.context(trainable=True,
                                              max_seq_len=cnf.max_seq_len,
                                              num_slots=2)

    tokenizer = hub.BertTokenizer(vocab_file=module.get_vocab_path(),
                                  tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer,
                               max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(weight_decay=0.01,
                                           warmup_proportion=0.1,
                                           learning_rate=1e-5)

    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)

    # Build the transfer network using ERNIE's token-level output
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']

    # Create the pointwise text matching task
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)

    run_states = pointwise_matching_task.finetune_and_eval()

    # # Sample data to predict
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]

    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']

    # Predict the test set in batches of cnf.test_batch pairs
    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({'id': ids[i], 'label': results[i]},
                                   index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True,
                   sep=',',
                   index=False)