def __init__(self, args):
    """Build a SER predictor: tokenizer + model, OCR engine, and label maps."""
    self.args = args
    self.max_seq_length = args.max_seq_length

    # SER tokenizer and model, restored from the given checkpoint directory.
    tokenizer_cls, _base_cls, model_cls = MODELS[args.ser_model_type]
    self.tokenizer = tokenizer_cls.from_pretrained(args.model_name_or_path)
    self.model = model_cls.from_pretrained(args.model_name_or_path)
    self.model.eval()

    # OCR engine for detection + recognition (angle classifier off, quiet logs).
    from paddleocr import PaddleOCR
    self.ocr_engine = PaddleOCR(
        rec_model_dir=args.rec_model_dir,
        det_model_dir=args.det_model_dir,
        use_angle_cls=False,
        show_log=False)

    # Label maps. For drawing, every "I-x" label is folded onto the id of its
    # "B-x" counterpart so a whole entity is rendered with a single color.
    label2id_map, self.id2label_map = get_bio_label_maps(args.label_map_path)
    self.label2id_map_for_draw = {
        name: (label2id_map["B" + name[1:]]
               if name.startswith("I-") else label2id_map[name])
        for name in label2id_map
    }
def eval(args):
    """Evaluate a LayoutXLM relation-extraction model on the XFUND eval split."""
    logger = get_logger()

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    # Use the loss's ignore_index to mark padding token labels.
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForRelationExtraction.from_pretrained(
        args.model_name_or_path)

    dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    loader = paddle.io.DataLoader(
        dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        shuffle=False,
        collate_fn=DataCollator())

    results = evaluate(model, loader, logger)
    logger.info("eval results: {}".format(results))
def postprocess(attention_mask, preds, label_map_path):
    """Turn per-token logits into BIO label strings, one list per sample.

    Only positions whose attention_mask entry equals 1 are kept, so padding
    tokens are dropped while the batch structure is preserved.
    """
    if isinstance(preds, paddle.Tensor):
        preds = preds.numpy()
    pred_ids = np.argmax(preds, axis=2)

    _, id2label = get_bio_label_maps(label_map_path)

    n_rows, n_cols = pred_ids.shape[0], pred_ids.shape[1]
    batch_labels = []
    for i in range(n_rows):
        batch_labels.append([
            id2label[pred_ids[i][j]]
            for j in range(n_cols)
            if attention_mask[i][j] == 1
        ])
    return batch_labels
def merge_preds_list_with_ocr_info(label_map_path, ocr_info, segment_offset_id,
                                   preds_list):
    """Attach a majority-vote prediction to each OCR segment.

    Args:
        label_map_path: path passed to get_bio_label_maps.
        ocr_info: list of per-segment OCR dicts; mutated in place with
            "pred_id" and "pred" keys and also returned.
        segment_offset_id: cumulative end offsets of each segment in the
            flattened token-prediction stream.
        preds_list: per-sample lists of BIO label strings; must originate
            from the same image as ocr_info.

    Returns:
        ocr_info with "pred_id" / "pred" filled in for every segment.
    """
    # Flatten the per-sample predictions into one token stream.
    preds = [p for pred in preds_list for p in pred]

    label2id_map, _ = get_bio_label_maps(label_map_path)
    # Fold every "I-x" label onto the id of its "B-x" counterpart so begin
    # and inside votes for the same entity type are counted together.
    for key in label2id_map:
        if key.startswith("I-"):
            label2id_map[key] = label2id_map["B" + key[1:]]

    # Reverse map from id to the bare entity name (BIO prefix stripped).
    # Fix: the original code first assigned id2label_map[val] for key == "O"
    # and then unconditionally overwrote it in the if/else below — that
    # redundant branch is removed; the else covers "O" already.
    id2label_map = dict()
    for key, val in label2id_map.items():
        if key.startswith("B-") or key.startswith("I-"):
            id2label_map[val] = key[2:]
        else:
            id2label_map[val] = key

    for idx in range(len(segment_offset_id)):
        start_id = 0 if idx == 0 else segment_offset_id[idx - 1]
        end_id = segment_offset_id[idx]

        segment_ids = [label2id_map[p] for p in preds[start_id:end_id]]
        if len(segment_ids) <= 0:
            # Empty segment: default to id 0 (presumably the "O" label —
            # verify against get_bio_label_maps).
            pred_id = 0
        else:
            counts = np.bincount(segment_ids)
            pred_id = np.argmax(counts)

        ocr_info[idx]["pred_id"] = int(pred_id)
        ocr_info[idx]["pred"] = id2label_map[pred_id]
    return ocr_info
def eval(args):
    """Evaluate a SER (semantic entity recognition) model on the eval split."""
    logger = get_logger()
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    # Padding token labels share the loss's ignore_index.
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer_cls, _base_cls, model_cls = MODELS[args.ser_model_type]
    tokenizer = tokenizer_cls.from_pretrained(args.model_name_or_path)
    model = model_cls.from_pretrained(args.model_name_or_path)

    dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    loader = paddle.io.DataLoader(
        dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None)

    loss_class = SERLoss(len(label2id_map))
    results, _ = evaluate(args, model, tokenizer, loss_class, loader,
                          label2id_map, id2label_map, pad_token_label_id,
                          logger)
    logger.info(results)
def train(args):
    """Train a SER model (LayoutLM or LayoutXLM) on XFUND-style data.

    Builds tokenizer/model (from scratch or resumed), train/eval datasets and
    loaders, a warmup + linear-decay LR schedule and AdamW, then runs the
    training loop with periodic evaluation and best/latest checkpointing.
    Returns (global_step, average training loss).
    """
    os.makedirs(args.output_dir, exist_ok=True)
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1
    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    print_arguments(args, logger)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    loss_class = SERLoss(len(label2id_map))
    # Padding token labels are ignored by the loss.
    pad_token_label_id = loss_class.ignore_index

    # dist mode
    if distributed:
        paddle.distributed.init_parallel_env()

    tokenizer_class, base_model_class, model_class = MODELS[args.ser_model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    if not args.resume:
        # Fresh head on a pretrained backbone.
        base_model = base_model_class.from_pretrained(args.model_name_or_path)
        model = model_class(
            base_model, num_classes=len(label2id_map), dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = model_class.from_pretrained(args.model_name_or_path)

    # dist mode
    if distributed:
        model = paddle.DataParallel(model)

    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        pad_token_label_id=pad_token_label_id,
        contains_re=False,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )
    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=None, )

    # Total optimizer steps: one step per batch per epoch (no grad accumulation).
    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed) = %d",
        args.per_gpu_train_batch_size * paddle.distributed.get_world_size(), )
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    set_seed(args.seed)
    best_metrics = None
    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()
    print_step = 1
    model.train()

    for epoch_id in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start
            # LayoutLM takes no image input; drop it if present.
            if args.ser_model_type == 'LayoutLM':
                if 'image' in batch:
                    batch.pop('image')
            labels = batch.pop('labels')

            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            # LayoutXLM returns a tuple; the logits are its first element.
            if args.ser_model_type == 'LayoutXLM':
                outputs = outputs[0]
            loss = loss_class(labels, outputs, batch['attention_mask'])
            # model outputs are always tuple in ppnlp (see doc)
            loss = loss.mean()
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            lr_scheduler.step()  # Update learning rate schedule
            optimizer.clear_grad()
            global_step += 1
            total_samples += batch['input_ids'].shape[0]

            if rank == 0 and step % print_step == 0:
                logger.info(
                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch_id, args.num_train_epochs, step,
                           len(train_dataloader), global_step,
                           loss.numpy()[0],
                           lr_scheduler.get_lr(), train_reader_cost /
                           print_step, (train_reader_cost + train_run_cost) /
                           print_step, total_samples / print_step,
                           total_samples / (train_reader_cost + train_run_cost
                                            )))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                results, _ = evaluate(args, model, tokenizer, loss_class,
                                      eval_dataloader, label2id_map,
                                      id2label_map, pad_token_label_id, logger)
                # Keep the checkpoint with the best (>= ties included) f1.
                if best_metrics is None or results["f1"] >= best_metrics["f1"]:
                    best_metrics = copy.deepcopy(results)
                    output_dir = os.path.join(args.output_dir, "best_model")
                    os.makedirs(output_dir, exist_ok=True)
                    if distributed:
                        # DataParallel wraps the model; save the inner layers.
                        model._layers.save_pretrained(output_dir)
                    else:
                        model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))
                logger.info("[epoch {}/{}][iter: {}/{}] results: {}".format(
                    epoch_id, args.num_train_epochs, step,
                    len(train_dataloader), results))
                if best_metrics is not None:
                    logger.info("best metrics: {}".format(best_metrics))
            reader_start = time.time()

        if rank == 0:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "latest_model")
            os.makedirs(output_dir, exist_ok=True)
            if distributed:
                model._layers.save_pretrained(output_dir)
            else:
                model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to {}".format(output_dir))
    return global_step, tr_loss / global_step
def train(args):
    """Train a LayoutXLM relation-extraction model on XFUND-style data.

    Builds the model (from scratch on a pretrained backbone, or resumed),
    train/eval loaders with relation collation, Adam with gradient clipping,
    then runs the training loop with periodic evaluation and best/latest
    checkpointing.
    """
    logger = get_logger(log_file=os.path.join(args.output_dir, "train.log"))
    rank = paddle.distributed.get_rank()
    distributed = paddle.distributed.get_world_size() > 1
    print_arguments(args, logger)

    # Added here for reproducibility (even between python 2 and 3)
    set_seed(args.seed)

    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    # Padding token labels are ignored by the loss.
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    # dist mode
    if distributed:
        paddle.distributed.init_parallel_env()

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    if not args.resume:
        # Fresh RE head wrapped around a pretrained LayoutXLM backbone.
        model = LayoutXLMModel.from_pretrained(args.model_name_or_path)
        model = LayoutXLMForRelationExtraction(model, dropout=None)
        logger.info('train from scratch')
    else:
        logger.info('resume from {}'.format(args.model_name_or_path))
        model = LayoutXLMForRelationExtraction.from_pretrained(
            args.model_name_or_path)

    # dist mode
    if distributed:
        model = paddle.DataParallel(model)

    train_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.train_data_dir,
        label_path=args.train_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')

    train_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.per_gpu_train_batch_size, shuffle=True)
    train_dataloader = paddle.io.DataLoader(
        train_dataset,
        batch_sampler=train_sampler,
        num_workers=args.num_workers,
        use_shared_memory=True,
        collate_fn=DataCollator())
    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        num_workers=args.num_workers,
        shuffle=False,
        collate_fn=DataCollator())

    # Total optimizer steps: one step per batch per epoch (no grad accumulation).
    t_total = len(train_dataloader) * args.num_train_epochs

    # build linear decay with warmup lr sch
    # NOTE(review): lr_scheduler.step() is commented out in the loop below, so
    # this schedule is built but never advanced; Adam runs at the fixed
    # args.learning_rate — confirm whether that is intentional.
    lr_scheduler = paddle.optimizer.lr.PolynomialDecay(
        learning_rate=args.learning_rate,
        decay_steps=t_total,
        end_lr=0.0,
        power=1.0)
    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.LinearWarmup(
            lr_scheduler,
            args.warmup_steps,
            start_lr=0,
            end_lr=args.learning_rate, )

    grad_clip = paddle.nn.ClipGradByNorm(clip_norm=10)
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        epsilon=args.adam_epsilon,
        grad_clip=grad_clip,
        weight_decay=args.weight_decay)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = {}".format(len(train_dataset)))
    logger.info(" Num Epochs = {}".format(args.num_train_epochs))
    logger.info(" Instantaneous batch size per GPU = {}".format(
        args.per_gpu_train_batch_size))
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = {}".
        format(args.per_gpu_train_batch_size *
               paddle.distributed.get_world_size()))
    logger.info(" Total optimization steps = {}".format(t_total))

    global_step = 0
    model.clear_gradients()
    train_dataloader_len = len(train_dataloader)
    # NOTE: "best_metirc" is a pre-existing typo kept for compatibility (it
    # also appears in the runtime log strings below).
    best_metirc = {'f1': 0}
    model.train()

    train_reader_cost = 0.0
    train_run_cost = 0.0
    total_samples = 0
    reader_start = time.time()
    print_step = 1

    for epoch in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            outputs = model(**batch)
            train_run_cost += time.time() - train_start
            # model outputs are always tuple in ppnlp (see doc)
            loss = outputs['loss']
            loss = loss.mean()
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            # lr_scheduler.step()  # Update learning rate schedule
            global_step += 1
            total_samples += batch['image'].shape[0]

            if rank == 0 and step % print_step == 0:
                logger.info(
                    "epoch: [{}/{}], iter: [{}/{}], global_step:{}, train loss: {:.6f}, lr: {:.6f}, avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, args.num_train_epochs, step,
                           train_dataloader_len, global_step,
                           np.mean(loss.numpy()),
                           optimizer.get_lr(), train_reader_cost / print_step, (
                               train_reader_cost + train_run_cost) / print_step,
                           total_samples / print_step, total_samples / (
                               train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0

            if rank == 0 and args.eval_steps > 0 and global_step % args.eval_steps == 0 and args.evaluate_during_training:
                # Log metrics
                # Only evaluate when single GPU otherwise metrics may not average well
                results = evaluate(model, eval_dataloader, logger)
                # Keep the checkpoint with the best (>= ties included) f1.
                if results['f1'] >= best_metirc['f1']:
                    best_metirc = results
                    output_dir = os.path.join(args.output_dir, "best_model")
                    os.makedirs(output_dir, exist_ok=True)
                    if distributed:
                        # DataParallel wraps the model; save the inner layers.
                        model._layers.save_pretrained(output_dir)
                    else:
                        model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(args,
                                os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to {}".format(
                        output_dir))
                logger.info("eval results: {}".format(results))
                logger.info("best_metirc: {}".format(best_metirc))
            reader_start = time.time()

        if rank == 0:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir, "latest_model")
            os.makedirs(output_dir, exist_ok=True)
            if distributed:
                model._layers.save_pretrained(output_dir)
            else:
                model.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            paddle.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to {}".format(output_dir))
    logger.info("best_metirc: {}".format(best_metirc))
def infer(args):
    """Run relation-extraction inference over the eval set and draw results.

    For each image: run the RE model, filter background OCR entries, pair
    head/tail OCR entries for each predicted relation, and write an annotated
    "*_re.jpg" into args.output_dir.

    NOTE(review): each dataloader batch index is matched 1:1 with an entry of
    ocr_info_list, which is only correct when per_gpu_eval_batch_size == 1 —
    confirm before raising the batch size.
    """
    os.makedirs(args.output_dir, exist_ok=True)
    logger = get_logger()
    label2id_map, id2label_map = get_bio_label_maps(args.label_map_path)
    pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index

    tokenizer = LayoutXLMTokenizer.from_pretrained(args.model_name_or_path)
    model = LayoutXLMForRelationExtraction.from_pretrained(
        args.model_name_or_path)

    eval_dataset = XFUNDataset(
        tokenizer,
        data_dir=args.eval_data_dir,
        label_path=args.eval_label_path,
        label2id_map=label2id_map,
        img_size=(224, 224),
        max_seq_len=args.max_seq_length,
        pad_token_label_id=pad_token_label_id,
        contains_re=True,
        add_special_ids=False,
        return_attention_mask=True,
        load_mode='all')
    eval_dataloader = paddle.io.DataLoader(
        eval_dataset,
        batch_size=args.per_gpu_eval_batch_size,
        # Consistency fix: honor the configured worker count like the other
        # loaders in this file; the old hard-coded 8 stays as the fallback.
        num_workers=getattr(args, 'num_workers', 8),
        shuffle=False,
        collate_fn=DataCollator())

    # Load the ground-truth OCR info for every eval image.
    ocr_info_list = load_ocr(args.eval_data_dir, args.eval_label_path)

    for idx, batch in enumerate(eval_dataloader):
        ocr_info = ocr_info_list[idx]
        image_path = ocr_info['image_path']
        ocr_info = ocr_info['ocr_info']
        save_img_path = os.path.join(
            args.output_dir,
            os.path.splitext(os.path.basename(image_path))[0] + "_re.jpg")
        logger.info("[Infer] process: {}/{}, save result to {}".format(
            idx, len(eval_dataloader), save_img_path))

        with paddle.no_grad():
            outputs = model(**batch)
        pred_relations = outputs['pred_relations']

        # Decode the entity tokens and drop background entries from ocr_info.
        ocr_info = filter_bg_by_txt(ocr_info, batch, tokenizer)

        # Convert predicted relations into (head, tail) OCR-entry pairs;
        # each tail id is consumed at most once.
        result = []
        used_tail_id = []
        for relations in pred_relations:
            for relation in relations:
                if relation['tail_id'] in used_tail_id:
                    continue
                # presumably ocr_info is keyed by entity id after
                # filter_bg_by_txt — verify against that helper
                if relation['head_id'] not in ocr_info or relation[
                        'tail_id'] not in ocr_info:
                    continue
                used_tail_id.append(relation['tail_id'])
                ocr_info_head = ocr_info[relation['head_id']]
                ocr_info_tail = ocr_info[relation['tail_id']]
                result.append((ocr_info_head, ocr_info_tail))

        img = cv2.imread(image_path)
        img_show = draw_re_results(img, result)
        cv2.imwrite(save_img_path, img_show)