def predict_cls(self, args, ext_results):
    """Classify extraction results with the Paddle Inference predictor.

    Converts ``ext_results`` into classification features, slices them
    into batches by hand (no DataLoader), runs each batch through the
    inference engine, and returns the arg-max class id per example.
    """
    dataset = MapDataset(ext_results)
    to_feature = partial(convert_example_to_feature_cls,
                         tokenizer=self.tokenizer,
                         label2id=self.cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    dataset = dataset.map(to_feature, lazy=False)

    # Manual fixed-size batching over the mapped dataset.
    batches = [
        dataset[start:start + args.batch_size]
        for start in range(0, len(dataset), args.batch_size)
    ]

    # Collate: pad token ids and segment ids, stack the third field
    # (which is discarded below).
    collate = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64")): fn(samples)

    results = []
    for batch in batches:
        input_ids, token_type_ids, _ = collate(batch)
        # Feed the inference engine and execute one forward pass.
        self.cls_input_handles[0].copy_from_cpu(input_ids)
        self.cls_input_handles[1].copy_from_cpu(token_type_ids)
        self.cls_predictor.run()
        # NOTE(review): "cls_output_hanle" looks misspelled, but the
        # attribute is defined outside this block — keep the name as-is.
        logits = self.cls_output_hanle.copy_to_cpu()
        results.extend(logits.argmax(axis=1).tolist())
    return results
def predict_cls(args, ext_results):
    """Predict sentiment classes for extraction results with a SKEP model.

    Loads the label dictionary, tokenizer and fine-tuned classification
    weights, builds a (non-shuffled) DataLoader over ``ext_results`` and
    returns the predicted label string for every example.
    """
    model_name = "skep_ernie_1.0_large_ch"
    # Label mappings for the classification task.
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)
    tokenizer = SkepTokenizer.from_pretrained(model_name)

    dataset = MapDataset(ext_results)
    to_feature = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    dataset = dataset.map(to_feature, lazy=False)

    collate = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    ): fn(samples)

    # Keep evaluation order deterministic: shuffle is disabled.
    sampler = paddle.io.BatchSampler(dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False)
    loader = paddle.io.DataLoader(dataset,
                                  batch_sampler=sampler,
                                  collate_fn=collate)
    print("test data loaded.")

    # Restore the fine-tuned classifier weights on top of the pretrained base.
    state_dict = paddle.load(args.cls_model_path)
    model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    model.load_dict(state_dict)
    print("classification model loaded.")

    model.eval()
    pred_ids = []
    for batch in loader:
        input_ids, token_type_ids, _ = batch
        logits = model(input_ids, token_type_ids=token_type_ids)
        pred_ids.extend(logits.argmax(axis=1).numpy().tolist())
    # Map numeric class ids back to their label strings.
    return [cls_id2label[pred_id] for pred_id in pred_ids]
def train(args):
    """Fine-tune an XLNet sequence classifier on IMDB.

    Builds train/dev data loaders, configures the LR schedule and AdamW
    optimizer, then runs the training loop with periodic logging,
    evaluation and checkpointing. Exits the process once
    ``num_training_steps`` is reached.
    """
    # Load the raw datasets.
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)
    # Wrap them as MapDataset so they can be .map()-ed.
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])
    # XLNet tokenizer matching the pretrained checkpoint.
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)
    # Build train_data_loader and dev_data_loader.
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    # XLNet pads on the LEFT (pad_right=False), unlike most BERT-style models.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False
            ),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)
    # Training configuration.
    # Fix the random seed for reproducibility.
    set_seed(args)
    # Select the execution device.
    use_gpu = True if paddle.get_device().startswith("gpu") else False
    if use_gpu:
        paddle.set_device('gpu:0')
    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    #paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)
    # Configure the lr_scheduler.
    # max_steps > 0 overrides num_train_epochs; derive epochs from steps.
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs
    # warmup_steps takes precedence over warmup_proportion when positive.
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)
    # Configure the optimizer.
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    # Weight decay applies to all parameters except biases and layer norms.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)
    # Model training.
    metric = Accuracy()
    # Choose the loss: cross-entropy for classification, MSE for regression.
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()
    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                # Only rank 0 (or the single process) saves checkpoints.
                if (not paddle.distributed.get_world_size() > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                if global_step == num_training_steps:
                    exit(0)
                # Exclude evaluation time from the step-speed measurement.
                tic_train += time.time() - tic_eval