def _initialize(self, use_gpu=False):
    # Load the PaddleHub ERNIE Tiny pretrained model
    self.module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = self.module.context(
        trainable=True, max_seq_len=128)

    # Download the dataset and get its label list and label count.
    # If you only need the label information, you can omit the tokenizer
    # parameter to avoid preprocessing the training set.
    dataset = hub.dataset.Couplet()
    self.label_list = dataset.get_labels()

    # Set up RunConfig for the PaddleHub Fine-tune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=use_gpu,
        batch_size=1,
        checkpoint_dir=os.path.join(self.directory, "assets", "ckpt"),
        strategy=hub.AdamWeightDecayStrategy())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]
    sequence_output = outputs["sequence_output"]

    # Define a text generation fine-tune task by PaddleHub's API
    self.gen_task = hub.TextGenerationTask(
        feature=pooled_output,
        token_feature=sequence_output,
        max_seq_len=128,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=["bleu"])
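
# Usage sketch (an assumption, not part of the original module): once
# _initialize has built self.gen_task, prediction follows the same
# task.predict(...) pattern used throughout these examples.
def generate_couplet(model, texts):
    # texts: e.g. [["风吹云动天不动"]]; label_list supplies the decoding vocabulary.
    return model.gen_task.predict(
        data=texts, label_list=model.label_list, return_result=True)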
def run_predict(data):
    ds = MyDataset()
    module = hub.Module(name="ernie", version="1.1.0")
    inputs, outputs, program = module.context(max_seq_len=512)
    # Token-level features feed the GRU classifier head below.
    sequence_output = outputs["sequence_output"]
    reader = MyClassifyReader(
        dataset=ds, vocab_path=module.get_vocab_path(), max_seq_len=512)
    strategy = hub.AdamWeightDecayStrategy(
        learning_rate=5e-5,
        lr_scheduler="linear_decay",
        warmup_proportion=0.1,
        weight_decay=0.01,
        optimizer_name="adam")
    config = hub.RunConfig(
        use_cuda=False,
        enable_memory_optim=True,
        num_epoch=3,
        batch_size=16,
        strategy=strategy,
        checkpoint_dir="./models/Product")
    feed_list = [
        inputs["input_ids"].name, inputs["position_ids"].name,
        inputs["segment_ids"].name, inputs["input_mask"].name,
    ]
    cls_task = GRUTextClassifierTask(
        data_reader=reader,
        feature=sequence_output,
        feed_list=feed_list,
        num_classes=ds.num_labels,
        config=config)

    # Label index to funding-round name (renamed so it no longer shadows the
    # built-in map)
    label_map = {3: '天使轮', 1: 'B轮', 4: '战略融资', 0: 'A轮', 2: 'C轮'}

    predictions = []
    run_states = cls_task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            print(result)
            predictions.append(result)

    # Weight the five class probabilities into a single 0-100 score; the early
    # return means only the first sample is scored.
    for batch_result in results:
        for single_result in batch_result[0]:
            score = (1 * single_result[0] + 2 * single_result[1] +
                     3 * single_result[2] + 4 * single_result[3] +
                     5 * single_result[4]) / 15 * 100
            return score
def load(self):
    inputs, outputs, program = self.module.context(
        trainable=True, max_seq_len=128)
    reader = hub.reader.ClassifyReader(
        dataset=self.dataset,
        vocab_path=self.module.get_vocab_path(),
        max_seq_len=128,
        use_task_id=False)
    pooled_output = outputs["pooled_output"]
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
    config = hub.RunConfig(
        use_pyreader=False,
        use_cuda=True,
        batch_size=30,
        enable_memory_optim=False,
        checkpoint_dir=self.module_in,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=self.dataset.num_labels,
        config=config,
        metrics_choices=self.metrics_choices)
    return cls_task
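
# Usage sketch (an assumption): the surrounding class supplies self.module,
# self.dataset, self.module_in and self.metrics_choices; given such an
# instance, prediction uses the standard task.predict pattern.
def classify(model, texts):
    cls_task = model.load()
    return cls_task.predict(data=texts, return_result=True)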
def finetune(args):
    # Load a PaddleHub pretrained model (MobileNet by default)
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    # Download the dataset and read it with ImageClassificationReader
    dataset = hub.dataset.Flowers()
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    # The last feature map layer of the network
    feature_map = output_dict["feature_map"]
    img = input_dict["image"]
    feed_list = [img.name]

    # Select the finetune strategy, set up the config and finetune
    strategy = hub.DefaultFinetuneStrategy(learning_rate=args.learning_rate)
    config = hub.RunConfig(
        use_cuda=True,
        num_epoch=args.epochs,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Construct the transfer learning network
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    # Load the model from the defined model path, if one is given
    if args.model_path != "":
        with task.phase_guard(phase="train"):
            task.init_if_necessary()
            task.load_parameters(args.model_path)
            logger.info("PaddleHub has loaded model from %s" % args.model_path)

    # Finetune by PaddleHub's API
    task.finetune()
    # Evaluate by PaddleHub's API
    run_states = task.eval()
    # Get the accuracy score on dev
    eval_avg_score, eval_avg_loss, eval_run_speed = task._calculate_metrics(
        run_states)

    # Move ckpt/best_model to the defined saved-parameters directory
    best_model_dir = os.path.join(config.checkpoint_dir, "best_model")
    if is_path_valid(args.saved_params_dir) and os.path.exists(best_model_dir):
        shutil.copytree(best_model_dir, args.saved_params_dir)
        shutil.rmtree(config.checkpoint_dir)

    # The accuracy on dev will be used by auto fine-tune
    hub.report_final_result(eval_avg_score["acc"])
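
# Hypothetical CLI wiring for finetune() above (an assumption, not the
# original script's argparse block; flag names are taken from the attributes
# the function reads, defaults are illustrative only):
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--module", type=str, default="mobilenet_v2_imagenet")
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=16)
    parser.add_argument("--checkpoint_dir", type=str, default="./ckpt")
    parser.add_argument("--model_path", type=str, default="")
    parser.add_argument("--saved_params_dir", type=str, default="")
    finetune(parser.parse_args())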
def get_embedding(self, texts, use_gpu=False, batch_size=1):
    """
    Get pooled_output and sequence_output for the input texts.

    Warnings: this method depends on the Paddle Inference Library; it may
    not work properly with PaddlePaddle <= 1.6.2.

    Args:
        texts (list): each element is a text sample consisting of text_a
            and an optional text_b, for example:
            [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
        use_gpu (bool): whether to use the GPU, default False.
        batch_size (int): the data batch size, default 1.

    Returns:
        pooled_outputs (list): each element is a numpy array, the first
            feature of each text sample.
        sequence_outputs (list): each element is a numpy array, the whole
            features of each text sample.
    """
    if not hasattr(self, "emb_job") or self.emb_job[
            "batch_size"] != batch_size or self.emb_job["use_gpu"] != use_gpu:
        inputs, outputs, program = self.context(
            trainable=True, max_seq_len=self.MAX_SEQ_LEN)
        reader = hub.reader.ClassifyReader(
            dataset=None,
            vocab_path=self.get_vocab_path(),
            max_seq_len=self.MAX_SEQ_LEN,
            sp_model_path=self.get_spm_path()
            if hasattr(self, "get_spm_path") else None,
            word_dict_path=self.get_word_dict_path()
            if hasattr(self, "get_word_dict_path") else None)
        feed_list = [
            inputs["input_ids"].name,
            inputs["position_ids"].name,
            inputs["segment_ids"].name,
            inputs["input_mask"].name,
        ]
        pooled_feature, seq_feature = outputs["pooled_output"], outputs[
            "sequence_output"]
        config = hub.RunConfig(
            use_data_parallel=False, use_cuda=use_gpu, batch_size=batch_size)

        self.emb_job = {}
        self.emb_job["task"] = _TransformerEmbeddingTask(
            pooled_feature=pooled_feature,
            seq_feature=seq_feature,
            feed_list=feed_list,
            data_reader=reader,
            config=config,
        )
        self.emb_job["batch_size"] = batch_size
        self.emb_job["use_gpu"] = use_gpu

    return self.emb_job["task"].predict(
        data=texts, return_result=True, accelerate_mode=True)
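
# Usage sketch (an assumption; module name and return unpacking follow the
# docstring above, which promises one pooled vector and one token-level array
# per text sample):
module = hub.Module(name="ernie_tiny")
pooled_outputs, sequence_outputs = module.get_embedding(
    texts=[["今天天气真好"], ["这部电影很精彩", "这部电影不好看"]],
    use_gpu=False, batch_size=1)
print(pooled_outputs[0].shape, sequence_outputs[0].shape)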
def predict(args):
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]
    img = input_dict["image"]
    feed_list = [img.name]

    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.ClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    data = ["./test/test_img_daisy.jpg", "./test/test_img_roses.jpg"]
    label_map = dataset.label_dict()
    index = 0

    # Get the classification result
    run_states = task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        # Get the predicted index
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            index += 1
            result = label_map[result]
            print("input %i is %s, and the predict result is %s" %
                  (index, data[index - 1], result))
def _initialize(self,
                ckpt_dir="ckpt_chnsenticorp",
                num_class=2,
                max_seq_len=128,
                use_gpu=False,
                batch_size=1):
    self.ckpt_dir = os.path.join(self.directory, ckpt_dir)
    self.num_class = num_class
    self.MAX_SEQ_LEN = max_seq_len

    # Load the PaddleHub ERNIE Tiny pretrained model
    self.module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = self.module.context(
        trainable=True, max_seq_len=max_seq_len)

    self.vocab_path = self.module.get_vocab_path()

    # Download the dataset and use accuracy as the metric
    # Choose a dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    # The metric should be acc, f1 or matthews
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-words to tokenize Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        vocab_path=self.module.get_vocab_path(),
        max_seq_len=max_seq_len,
        sp_model_path=self.module.get_spm_path(),
        word_dict_path=self.module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All the tensors the module needs must be fed.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=use_gpu,
        batch_size=batch_size,
        checkpoint_dir=self.ckpt_dir,
        strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task by PaddleHub's API
    self.cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=self.num_class,
        config=config,
        metrics_choices=metrics_choices)
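
# Usage sketch (an assumption, mirroring the task.predict pattern used
# elsewhere in these examples): score texts with the task built above.
def classify_texts(model, texts):
    # texts: e.g. [["这家餐厅的菜很好吃"], ["服务态度太差了"]]
    return model.cls_task.predict(data=texts, return_result=True)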
def predict_tag(model_name, data):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name

    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=128)

    # Download the dataset and use accuracy as the metric
    # Choose a dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    dataset = ViolateDataset(dataset_dir=dataset_dir)

    # ernie_tiny uses sub-words to tokenize Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All the tensors the module needs must be fed.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=True,
        use_cuda=False,
        batch_size=24,
        checkpoint_dir=checkpoint_dir,
        strategy=hub.AdamWeightDecayStrategy())

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config)

    # Data to be predicted, e.g.:
    # data = [["有保障"],
    #         ["无风险"],
    #         ["基金过往数据并不代表未来趋势"],
    #         ["为什么"],
    #         ["周杰伦"],
    #         ["吴东瀛"]]
    # print(cls_task.predict(data=data, return_result=True))
    return cls_task.predict(data=data, return_result=True)
def recognize():
    global flag
    module = hub.Module(name="resnet_v2_50_imagenet")
    dataset = DemoDataset()
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)
    config = hub.RunConfig(
        use_cuda=False,  # whether to train on GPU, default False
        num_epoch=5,  # number of fine-tune epochs
        checkpoint_dir="cv_finetune_turtorial_demo",  # checkpoint path; auto-generated if unset
        batch_size=10,  # training batch size; adjust to the GPU if one is used
        eval_interval=10,  # evaluation interval; by default every 100 steps
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())  # fine-tune strategy
        # strategy=hub.finetune.strategy.AdamWeightDecayStrategy())
    input_dict, output_dict, program = module.context(trainable=True)
    img = input_dict["image"]
    feature_map = output_dict["feature_map"]
    feed_list = [img.name]
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)
    label_map = dataset.label_dict()
    # run_states = task.finetune_and_eval()
    while True:
        if flag == 1:
            data = []
            data.append("/home/xmy/PycharmProjects/test/paddle/proj3_recognizeMyself/temp_out/cap.jpg")
            index = 0
            run_states = task.predict(data=data)
            results = [run_state.run_results for run_state in run_states]
            for batch_result in results:
                batch_result = np.argmax(batch_result, axis=2)[0]
                for result in batch_result:
                    index += 1
                    result = label_map[result]
                    # print("input %i is %s, and the predict result is %s" %
                    #       (index, data[index - 1], result))
                    if "科比" in result:
                        os.system("wmctrl -a \"pycharm\"")
                    elif "库里" in result:
                        os.system("wmctrl -a \"chrome\"")
            flag = 0
def predict(args):
    # Load a PaddleHub pretrained model
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    # Download the dataset
    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    # Use ImageClassificationReader to read the dataset
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]

    # Set up the feed list for the data feeder
    feed_list = [input_dict["image"].name]

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=False,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    # Define an image classification finetune task by PaddleHub's API
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)

    data = ["./test/test_img_daisy.jpg", "./test/test_img_roses.jpg"]
    print(task.predict(data=data, return_result=True))
def human_classfication(data):
    '''
    Classify faces with the model trained earlier.
    :param data: paths of the images to check
    :return: the face label (who it is)
    '''
    module = hub.Module(name="resnet_v2_18_imagenet")
    dataset = DemoDataset()
    # Build the model
    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)
    config = hub.RunConfig(
        use_cuda=False,  # whether to train on GPU, default False
        num_epoch=4,  # number of fine-tune epochs
        checkpoint_dir="cv_finetune",  # checkpoint path; auto-generated if unset
        batch_size=10,  # training batch size; adjust to the GPU if one is used
        eval_interval=10,  # evaluation interval; by default every 100 steps
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())  # fine-tune strategy

    # Assemble the FinetuneTask
    input_dict, output_dict, program = module.context(trainable=True)
    img = input_dict["image"]
    feature_map = output_dict["feature_map"]
    feed_list = [img.name]
    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)
    task.load_checkpoint()

    # -------- Run prediction
    label_map = dataset.label_dict()
    run_states = task.predict(data=data)
    results = [run_state.run_results for run_state in run_states]
    for batch_result in results:
        batch_result = np.argmax(batch_result, axis=2)[0]
        for result in batch_result:
            # Map the class index to its label, as the docstring promises
            return label_map[result]
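
# Usage sketch (hypothetical path): classify one captured frame and print who
# the model thinks it is.
label = human_classfication(["./temp_out/cap.jpg"])
print(label)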
def finetune(args):
    module = hub.Module(name=args.module)
    input_dict, output_dict, program = module.context(trainable=True)

    if args.dataset.lower() == "flowers":
        dataset = hub.dataset.Flowers()
    elif args.dataset.lower() == "dogcat":
        dataset = hub.dataset.DogCat()
    elif args.dataset.lower() == "indoor67":
        dataset = hub.dataset.Indoor67()
    elif args.dataset.lower() == "food101":
        dataset = hub.dataset.Food101()
    elif args.dataset.lower() == "stanforddogs":
        dataset = hub.dataset.StanfordDogs()
    else:
        raise ValueError("%s dataset is not defined" % args.dataset)

    data_reader = hub.reader.ImageClassificationReader(
        image_width=module.get_expected_image_width(),
        image_height=module.get_expected_image_height(),
        images_mean=module.get_pretrained_images_mean(),
        images_std=module.get_pretrained_images_std(),
        dataset=dataset)

    feature_map = output_dict["feature_map"]
    img = input_dict["image"]
    feed_list = [img.name]

    config = hub.RunConfig(
        use_data_parallel=args.use_data_parallel,
        use_pyreader=args.use_pyreader,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.ImageClassifierTask(
        data_reader=data_reader,
        feed_list=feed_list,
        feature=feature_map,
        num_classes=dataset.num_labels,
        config=config)
    task.finetune_and_eval()
def train(train_i, args):
    dataset = MyDataset()
    module = hub.Module(name=args.model)
    reader = hub.reader.MultiLabelClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)
    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=args.weight_decay,
        warmup_proportion=args.warmup_proportion,
        lr_scheduler=args.lr_scheduler,
        learning_rate=args.learning_rate)
    # eval_interval and log_interval are module-level globals here
    config = hub.RunConfig(
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        checkpoint_dir=args.checkpoint_dir + str(train_i),
        batch_size=args.batch_size,
        eval_interval=eval_interval,
        log_interval=log_interval,
        strategy=strategy)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Use "pooled_output" for classification tasks on an entire sentence.
    pooled_output = outputs["pooled_output"]

    # The order of tensors in feed_list must not be changed
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]
    cls_task = hub.MultiLabelClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config)
    cls_task.main_program.random_seed = args.seed
    change_task(cls_task, train_i)
    return cls_task, reader
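
# Usage sketch (an assumption; `args` comes from the script's argument parser,
# and change_task / eval_interval / log_interval are assumed to be defined at
# module level, as the function above implies):
cls_task, reader = train(0, args)
cls_task.finetune_and_eval()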
dataset = hub.dataset.DuEL(tokenizer=tokenizer, max_seq_len=args.max_seq_len)

# Construct the transfer learning network.
# Use token-level output.
query = outputs["emb"]
left = outputs['emb_2']
right = outputs['emb_3']

# Select the fine-tune strategy
strategy = hub.DefaultStrategy(optimizer_name="sgd")

# Set up RunConfig for the PaddleHub Fine-tune API
config = hub.RunConfig(
    use_data_parallel=False,
    use_cuda=False,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=strategy)

# Define a text matching task by PaddleHub's API
# network choice: bow, cnn, gru, lstm (PaddleHub pre-defined networks)
pairwise_matching_task = hub.PairwiseTextMatchingTask(
    dataset=dataset,
    query_feature=query,
    left_feature=left,
    right_feature=right,
    tokenizer=tokenizer,
    network=args.network,
    config=config)

# Prediction data sample.
    max_seq_len=args.max_seq_len,
    use_task_id=args.use_taskid)

# Construct the transfer learning network.
# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]

# Select the finetune strategy, set up the config and finetune
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=args.weight_decay,
    learning_rate=args.learning_rate,
    lr_scheduler="linear_decay")

# Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(
    use_cuda=args.use_gpu,
    num_epoch=args.num_epoch,
    batch_size=args.batch_size,
    checkpoint_dir=args.checkpoint_dir,
    strategy=strategy)

# Define a multi-label classification finetune task by PaddleHub's API
multi_label_cls_task = hub.MultiLabelClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config)

# Finetune and evaluate by PaddleHub's API:
# training, evaluation, testing and model saving run automatically
multi_label_cls_task.finetune_and_eval()
module = hub.Module(name="chinese-roberta-wwm-ext-large") inputs, outputs, program = module.context(trainable=True, max_seq_len=128) program.random_seed = 1 reader = hub.reader.ClassifyReader( dataset=dataset, vocab_path=module.get_vocab_path(), max_seq_len=128, random_seed=1) print("learning rate: ", eval(args.lr)) print("max epoch: ", args.max_epoch) strategy = hub.DefaultFinetuneStrategy(learning_rate=eval(args.lr), optimizer_name="sgd") config = hub.RunConfig(use_cuda=True, num_epoch=args.max_epoch, batch_size=32, strategy=strategy, log_interval=100, eval_interval=1400,save_ckpt_interval=1400, checkpoint_dir='./checkpoint_aug') pooled_output = outputs["pooled_output"] feed_list = [ inputs["input_ids"].name, inputs["position_ids"].name, inputs["segment_ids"].name, inputs["input_mask"].name ] cls_task = hub.TextClassifierTask( data_reader=reader, feature=pooled_output, feed_list=feed_list,
def predict(args):
    module_name = args.module  # 'yolov3_darknet53_coco2017'
    model_type = get_model_type(module_name)  # 'yolo'

    # Define the data
    ds = hub.dataset.Coco10(model_type)
    print("ds.num_labels", ds.num_labels)
    data_reader = ObjectDetectionReader(dataset=ds, model_type=model_type)

    # Define the model (program)
    module = hub.Module(name=module_name)
    if model_type == 'rcnn':
        input_dict, output_dict, program = module.context(
            trainable=True, phase='train')
        input_dict_pred, output_dict_pred, program_pred = module.context(
            trainable=False)
    else:
        input_dict, output_dict, program = module.context(trainable=True)
        input_dict_pred = output_dict_pred = None

    feed_list, pred_feed_list = get_feed_list(module_name, input_dict,
                                              input_dict_pred)
    feature, pred_feature = get_mid_feature(module_name, output_dict,
                                            output_dict_pred)

    config = hub.RunConfig(
        use_data_parallel=False,
        use_pyreader=True,
        use_cuda=args.use_gpu,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

    task = hub.DetectionTask(
        data_reader=data_reader,
        num_classes=ds.num_labels,
        feed_list=feed_list,
        feature=feature,
        predict_feed_list=pred_feed_list,
        predict_feature=pred_feature,
        model_type=model_type,
        config=config)

    data = [
        "./test/test_img_bird.jpg",
        "./test/test_img_cat.jpg",
    ]
    label_map = ds.label_dict()
    run_states = task.predict(data=data, accelerate_mode=False)
    results = [run_state.run_results for run_state in run_states]
    for outs in results:
        keys = ['im_shape', 'im_id', 'bbox']
        res = {
            k: (np.array(v), v.recursive_sequence_lengths())
            for k, v in zip(keys, outs)
        }
        print("im_id", res['im_id'])
        is_bbox_normalized = dconf.conf[model_type]['is_bbox_normalized']
        clsid2catid = {k: k for k in label_map}
        bbox_results = bbox2out([res], clsid2catid, is_bbox_normalized)
        print(bbox_results)
inputs["input_mask"].name, ] # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( weight_decay=args.weight_decay, learning_rate=args.learning_rate, warmup_proportion=args.warmup_proportion, lr_scheduler="linear_decay") # Setup runing config for PaddleHub Finetune API config = hub.RunConfig(eval_interval=300, use_pyreader=args.use_pyreader, use_data_parallel=args.use_data_parallel, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, enable_memory_optim=True, strategy=strategy) # Define a reading comprehension finetune task by PaddleHub's API reading_comprehension_task = hub.ReadingComprehensionTask( data_reader=reader, feature=seq_output, feed_list=feed_list, config=config, sub_task=args.dataset, ) # Finetune by PaddleHub's API
inputs["segment_ids"].name, inputs["input_mask"].name, ] if args.use_taskid: feed_list.append(inputs["task_ids"].name) # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy(weight_decay=args.weight_decay, learning_rate=args.learning_rate, lr_scheduler="linear_decay") # Setup runing config for PaddleHub Finetune API config = hub.RunConfig(use_data_parallel=args.use_data_parallel, use_pyreader=args.use_pyreader, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=strategy) # Define a regression finetune task by PaddleHub's API reg_task = hub.RegressionTask(data_reader=reader, feature=pooled_output, feed_list=feed_list, config=config) # Finetune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically reg_task.finetune_and_eval()
def main():
    # Load a PaddleHub pretrained model
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = "chinese-roberta-wwm-ext-large"
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Download the dataset and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "sequence_output" for token-level output.
    sequence_output = outputs["sequence_output"]

    # Set up the feed list for the data feeder.
    # All the tensors the module needs must be fed.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name
    ]

    # Select a finetune strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Define a sequence labeling finetune task by PaddleHub's API.
    # If add_crf is set, the network uses CRF as its decoder.
    seq_label_task = hub.SequenceLabelTask(
        data_reader=reader,
        feature=sequence_output,
        feed_list=feed_list,
        max_seq_len=args.max_seq_len,
        num_classes=dataset.num_labels,
        config=config,
        add_crf=args.add_crf)

    # Finetune and evaluate the model by PaddleHub's API:
    # training, evaluation, testing and model saving run automatically
    if args.do_train:
        print("start finetune and eval process")
        seq_label_task.finetune_and_eval()

    if args.do_predict:
        print("start predict process")
        id2label = {val: key for key, val in reader.label_map.items()}
        input_data = [[d] for d in predict_data]
        run_states = seq_label_task.predict(data=input_data[1:])
        results = []
        for batch_states in run_states:
            batch_results = batch_states.run_results
            batch_infers = batch_results[0].reshape([-1]).astype(
                np.int32).tolist()
            seq_lens = batch_results[1].reshape([-1]).astype(np.int32).tolist()
            current_id = 0
            for length in seq_lens:
                seq_infers = batch_infers[current_id:current_id + length]
                seq_result = list(map(id2label.get, seq_infers[1:-1]))
                current_id += length if args.add_crf else args.max_seq_len
                results.append(seq_result)
        ret = []
        for sent, r_label in zip(predict_sents, results):
            sent["labels"] = r_label
            ret.append(json.dumps(sent, ensure_ascii=False))
        write_by_lines("{}.{}.pred".format(args.predict_data, args.do_model),
                       ret)
def main(type, cnf):
    class SouhuCompetition(TextMatchingDataset):
        def __init__(self, tokenizer=None, max_seq_len=None):
            base_path = './data'
            if type in ['ssA', 'slA', 'llA']:
                train_file = 'data78383/{}_train.tsv'.format(type)
                dev_file = 'data78383/{}_valid.tsv'.format(type)
            elif type in ['ssB', 'slB', 'llB']:
                train_file = 'data78384/{}_train.tsv'.format(type)
                dev_file = 'data78384/{}_valid.tsv'.format(type)
            super(SouhuCompetition, self).__init__(
                is_pair_wise=False,  # whether the matching task is pairwise
                base_path=base_path,
                train_file=train_file,  # file path relative to base_path
                dev_file=dev_file,  # file path relative to base_path
                train_file_with_header=True,
                dev_file_with_header=True,
                label_list=["0", "1"],
                tokenizer=tokenizer,
                max_seq_len=max_seq_len)

    module = hub.Module(name="ernie")
    # A pointwise task needs two slots: query and title_left
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=cnf.max_seq_len, num_slots=2)

    tokenizer = hub.BertTokenizer(
        vocab_file=module.get_vocab_path(), tokenize_chinese_chars=True)
    dataset = SouhuCompetition(tokenizer=tokenizer, max_seq_len=cnf.max_seq_len)

    strategy = hub.AdamWeightDecayStrategy(
        weight_decay=0.01, warmup_proportion=0.1, learning_rate=1e-5)

    config = hub.RunConfig(
        eval_interval=300,
        use_cuda=True,
        num_epoch=10,
        batch_size=cnf.train_and_eval_batch,
        checkpoint_dir='./ckpt_ernie_pointwise_matching_{}'.format(type),
        strategy=strategy)

    # Build the transfer network from ERNIE's token-level output
    query = outputs["sequence_output"]
    title = outputs['sequence_output_2']

    # Create the pointwise text matching task
    pointwise_matching_task = hub.PointwiseTextMatchingTask(
        dataset=dataset,
        query_feature=query,
        title_feature=title,
        tokenizer=tokenizer,
        config=config)

    run_states = pointwise_matching_task.finetune_and_eval()

    # # Prediction data sample
    # text_pairs = [
    #     [
    #         "小孩吃了百令胶囊能打预防针吗",  # query
    #         "小孩吃了百令胶囊能不能打预防针",  # title
    #     ],
    #     [
    #         "请问呕血与咯血有什么区别?",  # query
    #         "请问呕血与咯血异同?",  # title
    #     ]
    # ]

    save_df = pd.DataFrame(columns=['id', 'label'])

    def predict(text_pairs):
        results = pointwise_matching_task.predict(
            data=text_pairs,
            max_seq_len=cnf.max_seq_len,
            label_list=dataset.get_labels(),
            return_result=True,
            accelerate_mode=False)
        return results

    if type in ['ssA', 'slA', 'llA']:
        test_file = './data/data78383/{}_test.tsv'.format(type)
    elif type in ['ssB', 'slB', 'llB']:
        test_file = './data/data78384/{}_test.tsv'.format(type)
    test_df = pd.read_csv(test_file, sep='\t')
    test_df.columns = ['text_a', 'text_b', 'id']

    text_pairs = []
    ids = []
    for index, row in test_df.iterrows():
        text_pairs.append([row['text_a'], row['text_b']])
        ids.append(row['id'])
        if len(text_pairs) == cnf.test_batch:
            results = predict(text_pairs)
            for i in range(len(ids)):
                new = pd.DataFrame({'id': ids[i], 'label': results[i]},
                                   index=[0])
                save_df = save_df.append(new, ignore_index=True)
            text_pairs = []
            ids = []
    if len(text_pairs) != 0:
        results = predict(text_pairs)
        for i in range(len(ids)):
            new = pd.DataFrame({'id': ids[i], 'label': results[i]}, index=[0])
            save_df = save_df.append(new, ignore_index=True)

    save_df.to_csv('./results/{}.csv'.format(type),
                   header=True, sep=',', index=False)
# * `eval_interval`: evaluate on the validation set every 50 steps;
#
# * `checkpoint_dir`: save the trained parameters and data to the cv_finetune_turtorial_demo directory;
#
# * `strategy`: finetune with the DefaultFinetuneStrategy;
#
# For more run configurations, see [RunConfig](https://github.com/PaddlePaddle/PaddleHub/wiki/PaddleHub-API:-RunConfig)
#
# PaddleHub also provides many optimization strategies, such as `AdamWeightDecayStrategy`, `ULMFiTStrategy` and `DefaultFinetuneStrategy`; see [Strategy](https://github.com/PaddlePaddle/PaddleHub/wiki/PaddleHub-API:-Strategy) for details

# In[15]:

config = hub.RunConfig(
    use_cuda=False,  # whether to train on GPU, default False
    num_epoch=3,  # number of fine-tune epochs
    checkpoint_dir="cv_finetune_turtorial_demo",  # checkpoint path; auto-generated if unset
    batch_size=3,  # training batch size; adjust to the GPU if one is used
    eval_interval=10,  # evaluation interval; by default every 100 steps
    strategy=hub.finetune.strategy.DefaultFinetuneStrategy())  # fine-tune strategy

# ### Step 6: Assemble the Finetune Task
# With a suitable pretrained model and the dataset to transfer to, we assemble a Task.
#
# The data here is a binary classification task, while the downloaded classification module was trained as a 1000-class model on ImageNet, so we lightly adapt the model into a binary classifier:
#
# 1. Get the module's context, including the input and output variables and the Paddle Program;
# 2. Find the feature-map extraction layer feature_map among the output variables;
# 3. Append a fully connected layer after feature_map to create the Task (a sketch of this cell follows);

# In[16]:
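
# A sketch of the cell described above, reconstructed from the matching
# image-classification examples elsewhere in this collection (an assumption,
# not the original notebook cell); it relies on the module, data_reader,
# dataset and config objects created in the earlier steps.
input_dict, output_dict, program = module.context(trainable=True)
img = input_dict["image"]
feature_map = output_dict["feature_map"]
feed_list = [img.name]

task = hub.ImageClassifierTask(
    data_reader=data_reader,
    feed_list=feed_list,
    feature=feature_map,
    num_classes=dataset.num_labels,
    config=config)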
inputs["position_ids"].name, inputs["segment_ids"].name, inputs["input_mask"].name, ] # Select finetune strategy, setup config and finetune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_proportion, weight_decay=args.weight_decay, learning_rate=args.learning_rate) # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( eval_interval=300, use_data_parallel=args.use_data_parallel, use_cuda=args.use_gpu, num_epoch=args.num_epoch, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=strategy) # Define a regression finetune task by PaddleHub's API reg_task = hub.RegressionTask( data_reader=reader, feature=pooled_output, feed_list=feed_list, config=config) # Finetune and evaluate by PaddleHub's API # will finish training, evaluation, testing, save model automatically reg_task.finetune_and_eval()
def get_task(args, schema_labels, id):
    # Load a PaddleHub pretrained model: ERNIE Tiny / RoBERTa large
    # More pretrained models: https://www.paddlepaddle.org.cn/hublist?filter=en_category&value=SemanticModel
    # model_name = "ernie_tiny"
    model_name = args.model_name
    module = hub.Module(name=model_name)
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=args.max_seq_len)

    # Load the data and read it with SequenceLabelReader
    dataset = EEDataset(args.data_dir, schema_labels, model=args.do_model)
    reader = SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Build the sequence labeling transfer network.
    # Use ERNIE's character-level output sequence_output as its input.
    sequence_output = outputs["sequence_output"]
    # sequence_output = fluid.layers.dropout(
    #     x=sequence_output,
    #     dropout_prob=args.dropout,
    #     dropout_implementation="upscale_in_train")

    # Set the feed_list of variables the program needs.
    # The following order is mandatory.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name
    ]

    # Select the optimization strategy
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=args.warmup_proportion,
        weight_decay=args.weight_decay,
        learning_rate=args.learning_rate)

    # Configure the run settings
    config = hub.RunConfig(
        log_interval=100,
        eval_interval=args.eval_step,
        save_ckpt_interval=args.model_save_step,
        use_data_parallel=args.use_data_parallel,
        use_cuda=args.use_gpu,
        # enable_memory_optim=True,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        checkpoint_dir=args.checkpoint_dir,
        strategy=strategy)

    # Build the sequence labeling transfer task
    seq_label_task = SequenceLabelTask(
        data_reader=reader,
        feature=sequence_output,
        feed_list=feed_list,
        max_seq_len=args.max_seq_len,
        num_classes=dataset.num_labels,
        config=config,
        add_crf=args.add_crf)
    seq_label_task.main_program.random_seed = args.random_seed
    add_hook(args, seq_label_task, id)
    return seq_label_task, reader
# Use "sequence_output" for token-level output. seq_output = outputs["sequence_output"] # Setup feed list for data feeder feed_list = [ inputs["input_ids"].name, inputs["position_ids"].name, inputs["segment_ids"].name, inputs["input_mask"].name, ] # Setup runing config for PaddleHub Finetune API config = hub.RunConfig( use_data_parallel=False, use_cuda=args.use_gpu, batch_size=args.batch_size, checkpoint_dir=args.checkpoint_dir, strategy=hub.AdamWeightDecayStrategy()) # Define a reading comprehension finetune task by PaddleHub's API reading_comprehension_task = hub.ReadingComprehensionTask( data_reader=reader, feature=seq_output, feed_list=feed_list, config=config) # Data to be predicted data = dataset.dev_examples[:10] reading_comprehension_task.predict(data=data)
def finetune(args):
    module_name = args.module  # 'yolov3_darknet53_coco2017'
    model_type = get_model_type(module_name)  # 'yolo'

    # Define the dataset
    ds = hub.dataset.Coco10(model_type)
    # base_path = '/home/local3/zhaopenghao/data/detect/paddle-job-84942-0'
    # train_dir = 'train_data/images'
    # train_list = 'train_data/coco/instances_coco.json'
    # val_dir = 'eval_data/images'
    # val_list = 'eval_data/coco/instances_coco.json'
    # ds = ObjectDetectionDataset(base_path, train_dir, train_list, val_dir,
    #                             val_list, val_dir, val_list,
    #                             model_type=model_type)
    # print(ds.label_dict())
    print("ds.num_labels", ds.num_labels)

    # Define the batch reader
    data_reader = ObjectDetectionReader(dataset=ds, model_type=model_type)

    # Define the model (program)
    module = hub.Module(name=module_name)
    if model_type == 'rcnn':
        input_dict, output_dict, program = module.context(
            trainable=True, phase='train')
        input_dict_pred, output_dict_pred, program_pred = module.context(
            trainable=False)
    else:
        input_dict, output_dict, program = module.context(trainable=True)
        input_dict_pred = output_dict_pred = None

    print("input_dict keys", input_dict.keys())
    print("output_dict keys", output_dict.keys())
    feed_list, pred_feed_list = get_feed_list(module_name, input_dict,
                                              input_dict_pred)

    print("output_dict length:", len(output_dict))
    print(output_dict.keys())
    if output_dict_pred is not None:
        print(output_dict_pred.keys())
    feature, pred_feature = get_mid_feature(module_name, output_dict,
                                            output_dict_pred)

    config = hub.RunConfig(
        log_interval=10,
        eval_interval=100,
        use_data_parallel=args.use_data_parallel,
        use_pyreader=True,
        use_cuda=args.use_gpu,
        num_epoch=args.num_epoch,
        batch_size=args.batch_size,
        enable_memory_optim=False,
        checkpoint_dir=args.checkpoint_dir,
        strategy=hub.finetune.strategy.DefaultFinetuneStrategy(
            learning_rate=0.00025, optimizer_name="adam"))

    task = hub.DetectionTask(
        data_reader=data_reader,
        num_classes=ds.num_labels,
        feed_list=feed_list,
        feature=feature,
        predict_feed_list=pred_feed_list,
        predict_feature=pred_feature,
        model_type=model_type,
        config=config)
    task.finetune_and_eval()
sequence_output = outputs["sequence_output"]

# Set up the feed list for the data feeder.
# All the tensors ERNIE's module needs must be fed.
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]

# Set up the running config for the PaddleHub Finetune API
config = hub.RunConfig(
    use_data_parallel=False,
    use_pyreader=args.use_pyreader,
    use_cuda=args.use_gpu,
    batch_size=args.batch_size,
    enable_memory_optim=False,
    checkpoint_dir=args.checkpoint_dir,
    strategy=hub.finetune.strategy.DefaultFinetuneStrategy())

# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task = hub.SequenceLabelTask(
    data_reader=reader,
    feature=sequence_output,
    feed_list=feed_list,
    max_seq_len=args.max_seq_len,
    num_classes=dataset.num_labels,
    config=config)

# Test data
data = [
    ["我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"],
    vocab_path=module.get_vocab_path(),
    sp_model_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path(),
    max_seq_len=128)

strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    lr_scheduler="linear_decay",
    learning_rate=5e-5)

config = hub.RunConfig(
    use_cuda=True,
    use_data_parallel=True,
    num_epoch=1,
    checkpoint_dir="module",
    batch_size=20,
    eval_interval=400,
    strategy=strategy)

inputs, outputs, program = module.context(trainable=True, max_seq_len=128)

# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]
def train_model(model_name):
    checkpoint_dir = "model/" + model_name
    dataset_dir = "data/" + model_name

    # Load the PaddleHub ERNIE Tiny pretrained model
    module = hub.Module(name="ernie_tiny")
    inputs, outputs, program = module.context(
        trainable=True, max_seq_len=128)

    # Download the dataset and use accuracy as the metric
    # Choose a dataset: GLUE/XNLI/ChineseGLUE/NLPCC-DBQA/LCQMC
    # The metric should be acc, f1 or matthews
    # dataset = hub.dataset.ChnSentiCorp()
    dataset = ViolateDataset(dataset_dir=dataset_dir)
    metrics_choices = ["acc"]

    # ernie_tiny uses sub-words to tokenize Chinese sentences.
    # For other models, sp_model_path and word_dict_path should be set to None.
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=128,
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

    # Construct the transfer learning network.
    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_output" for token-level output.
    pooled_output = outputs["pooled_output"]

    # Set up the feed list for the data feeder.
    # All the tensors the module needs must be fed.
    feed_list = [
        inputs["input_ids"].name,
        inputs["position_ids"].name,
        inputs["segment_ids"].name,
        inputs["input_mask"].name,
    ]

    # Select the finetune strategy, set up the config and finetune
    strategy = hub.AdamWeightDecayStrategy(
        warmup_proportion=0.1,
        weight_decay=0.01,
        learning_rate=5e-5,
        lr_scheduler="linear_decay")

    # Set up the running config for the PaddleHub Finetune API
    config = hub.RunConfig(
        use_data_parallel=True,
        use_cuda=False,
        num_epoch=3,
        batch_size=24,
        checkpoint_dir=checkpoint_dir,
        # model_dir="./models",
        enable_memory_optim=True,
        strategy=strategy)

    # Define a classification finetune task by PaddleHub's API
    cls_task = hub.TextClassifierTask(
        data_reader=reader,
        feature=pooled_output,
        feed_list=feed_list,
        num_classes=dataset.num_labels,
        config=config,
        metrics_choices=metrics_choices)

    # with cls_task.phase_guard(phase="train"):
    #     cls_task.init_if_necessary()
    #     cls_task.load_parameters("./models/model")

    # finetune_and_eval would run training, evaluation, testing and model
    # saving automatically; here the steps are run separately.
    # cls_task.finetune_and_eval()
    cls_task.finetune()

    # Evaluate by PaddleHub's API
    run_states = cls_task.eval()
    # Get the accuracy score on dev
    eval_avg_score, eval_avg_loss, eval_run_speed = cls_task._calculate_metrics(
        run_states)
    # The accuracy on dev will be used by auto fine-tune
    print("AutoFinetuneEval" + "\t" + str(float(eval_avg_score["acc"])))
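
# Usage sketch (hypothetical model name): expects training data under
# data/<model_name> and writes checkpoints to model/<model_name>.
train_model("violate")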
inputs["segment_ids"].name, inputs["input_mask"].name, ] # Select fine-tune strategy, setup config and fine-tune strategy = hub.AdamWeightDecayStrategy( warmup_proportion=args.warmup_prop, learning_rate=args.learning_rate, weight_decay=args.weight_decay, lr_scheduler="linear_decay") # Setup RunConfig for PaddleHub Fine-tune API config = hub.RunConfig( checkpoint_dir=args.checkpoint_dir, use_cuda=True, num_epoch=args.epochs, batch_size=args.batch_size, enable_memory_optim=True, strategy=strategy) # Define a classfication fine-tune task by PaddleHub's API cls_task = hub.TextClassifierTask( data_reader=reader, feature=pooled_output, feed_list=feed_list, num_classes=dataset.num_labels, config=config, metrics_choices=metrics_choices) # Load model from the defined model path or not if args.model_path != "":