def __init__(self, args):
    self.args = args
    self.ext_predictor, self.ext_input_handles, self.ext_output_handle = self.create_predictor(
        args.ext_model_path)
    print(f"ext_model_path: {args.ext_model_path}, {self.ext_predictor}")
    self.cls_predictor, self.cls_input_handles, self.cls_output_handle = self.create_predictor(
        args.cls_model_path)
    self.ext_label2id, self.ext_id2label = load_dict(args.ext_label_path)
    self.cls_label2id, self.cls_id2label = load_dict(args.cls_label_path)
    self.tokenizer = SkepTokenizer.from_pretrained(args.base_model_name)
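# NOTE: hedged sketch of the `create_predictor` method referenced above; its real
# implementation is not part of this snippet. It assumes the exported static-graph
# files "<path>.pdmodel" / "<path>.pdiparams" and uses the standard Paddle Inference
# Python API (Config, create_predictor, input/output handles).
from paddle import inference

def create_predictor(self, model_path):
    config = inference.Config(model_path + ".pdmodel", model_path + ".pdiparams")
    if self.args.device == "gpu":
        config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory pool, device id 0
    else:
        config.disable_gpu()
    predictor = inference.create_predictor(config)
    # one input handle per graph input (e.g. input_ids, token_type_ids)
    input_handles = [
        predictor.get_input_handle(name) for name in predictor.get_input_names()
    ]
    # a single output handle for the logits
    output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
    return predictor, input_handles, output_handle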
def predict_cls(args, ext_results):
    # load label dict and tokenizer
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)
    tokenizer = SkepTokenizer.from_pretrained(model_name)

    test_ds = MapDataset(ext_results)
    trans_func = partial(
        convert_example_to_feature_cls,
        tokenizer=tokenizer,
        label2id=cls_label2id,
        max_seq_len=args.cls_max_seq_len,
        is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    ): fn(samples)

    # keep shuffle off so predictions stay aligned with the input order
    test_batch_sampler = paddle.io.BatchSampler(
        test_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = paddle.io.DataLoader(
        test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn)
    print("test data loaded.")

    # load the classification model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()
    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results
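# NOTE: hedged sketch of the `load_dict` helper used throughout these snippets; its
# real implementation is not shown here. It assumes a plain-text label file with one
# label per line (e.g. "B-Aspect", "I-Aspect", "O").
def load_dict(dict_path):
    with open(dict_path, "r", encoding="utf-8") as f:
        labels = [line.strip() for line in f if line.strip()]
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    return label2id, id2label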
if __name__ == "__main__": paddle.set_device(args.device) # These data samples is in Chinese. # If you use the english model, you should change the test data in English. data = [ '这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般', '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', ] label_map = {0: 'negative', 1: 'positive'} model = SkepForSequenceClassification.from_pretrained( args.model_name, num_classes=len(label_map)) tokenizer = SkepTokenizer.from_pretrained(args.model_name) if args.params_path and os.path.isfile(args.params_path): state_dict = paddle.load(args.params_path) model.set_dict(state_dict) print("Loaded parameters from %s" % args.params_path) results = predict(model, data, tokenizer, label_map, batch_size=args.batch_size) for idx, text in enumerate(data): print('Data: {} \t Label: {}'.format(text, results[idx]))
paddle.set_device(args.device)
rank = paddle.distributed.get_rank()
if paddle.distributed.get_world_size() > 1:
    paddle.distributed.init_parallel_env()

train_ds = load_dataset("cote", "dp", splits=['train'])

# The COTE_DP dataset is labeled with the "BIO" scheme.
label_map = {label: idx for idx, label in enumerate(train_ds.label_list)}
# `no_entity_label` marks tokens that are not part of any entity.
no_entity_label_idx = label_map.get("O", 2)

set_seed(args.seed)

skep = SkepModel.from_pretrained('skep_ernie_1.0_large_ch')
model = SkepCrfForTokenClassification(
    skep, num_classes=len(train_ds.label_list))
tokenizer = SkepTokenizer.from_pretrained('skep_ernie_1.0_large_ch')

trans_func = partial(
    convert_example_to_feature,
    tokenizer=tokenizer,
    max_seq_len=args.max_seq_length,
    no_entity_label=no_entity_label_idx,
    is_test=False)
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input ids
    Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # token type ids
    Stack(dtype='int64'),  # sequence lens
    Pad(axis=0, pad_val=no_entity_label_idx)  # labels
): [data for data in fn(samples)]

# NOTE: the arguments below are an assumption; they match the `create_dataloader`
# helper sketched after this block.
train_data_loader = create_dataloader(
    train_ds,
    mode='train',
    batch_size=args.batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func)
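# NOTE: hedged sketch of the assumed `create_dataloader` helper; it mirrors the
# utility commonly shipped with PaddleNLP example scripts and is not part of the
# original snippet.
def create_dataloader(dataset, mode='train', batch_size=1, batchify_fn=None, trans_fn=None):
    if trans_fn:
        dataset = dataset.map(trans_fn)
    shuffle = mode == 'train'
    if mode == 'train':
        # distributed-aware sampler so multi-GPU training shards the data
        batch_sampler = paddle.io.DistributedBatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(
            dataset, batch_size=batch_size, shuffle=shuffle)
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True)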
def predict_ext(args):
    # load label dict and tokenizer
    model_name = "skep_ernie_1.0_large_ch"
    ext_label2id, ext_id2label = load_dict(args.ext_label_path)
    tokenizer = SkepTokenizer.from_pretrained(model_name)

    ori_test_ds = load_dataset(
        read_test_file, data_path=args.test_path, lazy=False)
    trans_func = partial(
        convert_example_to_feature_ext,
        tokenizer=tokenizer,
        label2id=ext_label2id,
        max_seq_len=args.ext_max_seq_len,
        is_test=True)
    test_ds = copy.copy(ori_test_ds).map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
    ): fn(samples)

    test_batch_sampler = paddle.io.BatchSampler(
        test_ds, batch_size=args.batch_size, shuffle=False)
    test_loader = paddle.io.DataLoader(
        test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn)
    print("test data loaded.")

    # load the extraction model
    ext_state_dict = paddle.load(args.ext_model_path)
    ext_model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(ext_label2id))
    ext_model.load_dict(ext_state_dict)
    print("extraction model loaded.")

    ext_model.eval()
    results = []
    for bid, batch_data in enumerate(test_loader):
        input_ids, token_type_ids, seq_lens = batch_data
        logits = ext_model(input_ids, token_type_ids=token_type_ids)

        predictions = logits.argmax(axis=2).numpy()
        for eid, (seq_len, prediction) in enumerate(zip(seq_lens, predictions)):
            idx = bid * args.batch_size + eid
            # strip the [CLS]/[SEP] positions before mapping ids back to tags
            tag_seq = [
                ext_id2label[label_id]
                for label_id in prediction[:seq_len][1:-1]
            ]
            text = ori_test_ds[idx]["text"]
            aps = decoding(text[:args.ext_max_seq_len - 2], tag_seq)
            for aid, ap in enumerate(aps):
                aspect, opinions = ap[0], list(set(ap[1:]))
                aspect_text = concate_aspect_and_opinion(text, aspect, opinions)
                results.append({
                    "id": str(idx) + "_" + str(aid),
                    "aspect": aspect,
                    "opinions": opinions,
                    "text": text,
                    "aspect_text": aspect_text
                })

    return results
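# NOTE: illustrative wiring of the two-stage pipeline built from `predict_ext` and
# `predict_cls` above; the final step that attaches each predicted polarity to its
# aspect is an assumption, not code from the original example.
if __name__ == "__main__":
    ext_results = predict_ext(args)               # 1) extract aspects and opinions
    cls_results = predict_cls(args, ext_results)  # 2) classify sentiment per aspect
    for ext_result, polarity in zip(ext_results, cls_results):
        print(ext_result["aspect"], ext_result["opinions"], polarity)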
def train():
    # set the running environment
    model_name = "skep_ernie_1.0_large_ch"
    paddle.set_device(args.device)
    set_seed(args.seed)
    if not os.path.exists(args.checkpoints):
        os.mkdir(args.checkpoints)

    # load and process data
    label2id, id2label = load_dict(args.label_path)
    train_ds = load_dataset(read, data_path=args.train_path, lazy=False)
    dev_ds = load_dataset(read, data_path=args.dev_path, lazy=False)

    tokenizer = SkepTokenizer.from_pretrained(model_name)
    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        label2id=label2id,
        max_seq_len=args.max_seq_len)
    train_ds = train_ds.map(trans_func, lazy=False)
    dev_ds = dev_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64"),
        Pad(axis=0, pad_val=-1, dtype="int64")
    ): fn(samples)

    train_batch_sampler = paddle.io.BatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_ds, batch_size=args.batch_size, shuffle=False)
    train_loader = paddle.io.DataLoader(
        train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn)

    # configure model training
    model = SkepForTokenClassification.from_pretrained(
        model_name, num_classes=len(label2id))

    num_training_steps = len(train_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(
        learning_rate=args.learning_rate,
        total_steps=num_training_steps,
        warmup=args.warmup_proportion)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=grad_clip)

    metric = ChunkEvaluator(label2id.keys())

    # start to train the model
    global_step, best_f1 = 1, 0.
    model.train()
    for epoch in range(1, args.num_epochs + 1):
        for batch_data in train_loader():
            input_ids, token_type_ids, _, labels = batch_data
            # logits: [batch_size, seq_len, num_tags]
            logits = model(input_ids, token_type_ids=token_type_ids)

            loss = F.cross_entropy(
                logits.reshape([-1, len(label2id)]),
                labels.reshape([-1]),
                ignore_index=-1)

            loss.backward()
            lr_scheduler.step()
            optimizer.step()
            optimizer.clear_grad()

            if global_step > 0 and global_step % args.log_steps == 0:
                print(
                    f"epoch: {epoch} - global_step: {global_step}/{num_training_steps} - loss: {loss.numpy().item():.6f}"
                )
            if (global_step > 0 and global_step % args.eval_steps == 0
                ) or global_step == num_training_steps:
                precision, recall, f1 = evaluate(model, dev_loader, metric)
                model.train()
                if f1 > best_f1:
                    print(
                        f"best F1 performance has been updated: {best_f1:.5f} --> {f1:.5f}"
                    )
                    best_f1 = f1
                    paddle.save(model.state_dict(),
                                f"{args.checkpoints}/best.pdparams")
                print(
                    f'evaluation result: precision: {precision:.5f}, recall: {recall:.5f}, F1: {f1:.5f}'
                )

            global_step += 1

    paddle.save(model.state_dict(), f"{args.checkpoints}/final.pdparams")
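# NOTE: hedged sketch of the `evaluate` helper used in the training loop above; it is
# not shown in the snippet. It follows the usual ChunkEvaluator pattern: compute chunk
# counts per batch, accumulate them, and return precision / recall / F1.
def evaluate(model, data_loader, metric):
    model.eval()
    metric.reset()
    with paddle.no_grad():
        for batch_data in data_loader:
            input_ids, token_type_ids, seq_lens, labels = batch_data
            logits = model(input_ids, token_type_ids=token_type_ids)
            predictions = logits.argmax(axis=2)
            n_infer, n_label, n_correct = metric.compute(seq_lens, predictions, labels)
            metric.update(n_infer.numpy(), n_label.numpy(), n_correct.numpy())
    precision, recall, f1 = metric.accumulate()
    return precision, recall, f1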