import copy
import json
import math
import os
import os.path as osp
import warnings

import mmcv
import numpy as np
import pytest

# Project-internal helpers (crop_img, warp_img, list_to_file, model_inference,
# split_train_val_list, stitch_boxes_into_lines, KIEDataset) are assumed to be
# importable from the surrounding MMOCR codebase.


def test_min_rect_crop():
    # A 100x20 axis-aligned box inside a 600x600 dummy image.
    dummy_img = np.ones((600, 600, 3), dtype=np.uint8)
    dummy_box = [20, 20, 120, 20, 120, 40, 20, 40]

    cropped_img = crop_img(dummy_img, dummy_box)

    # Empty and malformed boxes must be rejected.
    with pytest.raises(AssertionError):
        crop_img(dummy_img, [])
    with pytest.raises(AssertionError):
        crop_img(dummy_img, [20, 40, 40, 20])

    # The crop covers the box's minimal rectangle: height 20, width 100.
    assert math.isclose(cropped_img.shape[0], 20)
    assert math.isclose(cropped_img.shape[1], 100)
def generate_ann(root_path, split, image_infos):
    """Generate cropped annotations and label txt file.

    Args:
        root_path (str): The root path of the TotalText dataset
        split (str): The split of dataset. Namely: training or test
        image_infos (list[dict]): A list of dicts of the img and annotation
            information
    """
    dst_image_root = osp.join(root_path, 'dst_imgs', split)
    if split == 'training':
        dst_label_file = osp.join(root_path, 'train_label.txt')
    elif split == 'test':
        dst_label_file = osp.join(root_path, 'test_label.txt')
    os.makedirs(dst_image_root, exist_ok=True)

    lines = []
    for image_info in image_infos:
        index = 1
        src_img_path = osp.join(root_path, 'imgs', image_info['file_name'])
        image = mmcv.imread(src_img_path)
        # file_name looks like '<split>/<name>.<ext>'; keep only '<name>'.
        src_img_root = osp.splitext(image_info['file_name'])[0].split('/')[1]

        for anno in image_info['anno_info']:
            word = anno['word']
            dst_img = crop_img(image, anno['bbox'])
            dst_img_name = f'{src_img_root}_{index}.png'
            index += 1
            dst_img_path = osp.join(dst_image_root, dst_img_name)
            mmcv.imwrite(dst_img, dst_img_path)
            lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                         f'{word}')

    list_to_file(dst_label_file, lines)
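# A minimal, hedged usage sketch for generate_ann above. The directory layout
# ('<root>/imgs/<split>/<name>.jpg') and the shape of image_infos are inferred
# from the function body; the paths and annotation values are illustrative
# placeholders, not files shipped with the dataset.
image_infos = [{
    'file_name': 'training/img1.jpg',
    'anno_info': [{
        'word': 'hello',
        'bbox': [20, 20, 120, 20, 120, 40, 20, 40]
    }]
}]
generate_ann('data/totaltext', 'training', image_infos)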
def generate_ann(root_path, split, image_infos, preserve_vertical, format):
    """Generate cropped annotations and label txt file.

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset. Namely: training or val
        image_infos (list[dict]): A list of dicts of the img and annotation
            information
        preserve_vertical (bool): Whether to preserve vertical texts
        format (str): Annotation format, should be either 'txt' or 'jsonl'
    """
    dst_image_root = osp.join(root_path, 'crops', split)
    ignore_image_root = osp.join(root_path, 'ignores', split)
    if split == 'training':
        dst_label_file = osp.join(root_path, f'train_label.{format}')
    elif split == 'val':
        dst_label_file = osp.join(root_path, f'val_label.{format}')
    mmcv.mkdir_or_exist(dst_image_root)
    mmcv.mkdir_or_exist(ignore_image_root)

    lines = []
    for image_info in image_infos:
        index = 1
        src_img_path = osp.join(root_path, 'imgs', split,
                                image_info['file_name'])
        image = mmcv.imread(src_img_path)
        src_img_root = image_info['file_name'].split('.')[0]

        for anno in image_info['anno_info']:
            word = anno['word']
            dst_img = crop_img(image, anno['bbox'], 0, 0)
            h, w, _ = dst_img.shape
            dst_img_name = f'{src_img_root}_{index}.png'
            index += 1

            # Skip invalid annotations
            if min(dst_img.shape) == 0 or len(word) == 0:
                continue
            # Filter out vertical texts
            if not preserve_vertical and h / w > 2 and split == 'training':
                dst_img_path = osp.join(ignore_image_root, dst_img_name)
            else:
                dst_img_path = osp.join(dst_image_root, dst_img_name)
            mmcv.imwrite(dst_img, dst_img_path)

            if format == 'txt':
                lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                             f'{word}')
            elif format == 'jsonl':
                lines.append(
                    json.dumps({
                        'filename':
                        f'{osp.basename(dst_image_root)}/{dst_img_name}',
                        'text': word
                    }))
            else:
                raise NotImplementedError

    list_to_file(dst_label_file, lines)
def det_and_recog_inference(args, det_model, recog_model):
    image_path = args.img
    end2end_res = {'filename': image_path}
    end2end_res['result'] = []

    image = mmcv.imread(image_path)
    det_result = model_inference(det_model, image)
    bboxes = det_result['boundary_result']

    for bbox in bboxes:
        box_res = {}
        box_res['box'] = [round(x) for x in bbox[:-1]]
        box_res['box_score'] = float(bbox[-1])
        box = bbox[:8]
        # Reduce polygons with more than 4 corners to their axis-aligned
        # bounding rectangle before cropping.
        if len(bbox) > 9:
            min_x = min(bbox[0:-1:2])
            min_y = min(bbox[1:-1:2])
            max_x = max(bbox[0:-1:2])
            max_y = max(bbox[1:-1:2])
            box = [min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]
        box_img = crop_img(image, box)

        recog_result = model_inference(recog_model, box_img)
        text = recog_result['text']
        text_score = recog_result['score']
        # Per-character scores are averaged into a single confidence.
        if isinstance(text_score, list):
            text_score = sum(text_score) / max(1, len(text))
        box_res['text'] = text
        box_res['text_score'] = text_score

        end2end_res['result'].append(box_res)

    return end2end_res
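# Hedged usage sketch for det_and_recog_inference above. The models are
# assumed to be MMOCR 0.x detector/recognizer instances built through
# mmdet's init_detector, as the demo scripts do; the config and checkpoint
# paths are illustrative placeholders.
from argparse import Namespace

from mmdet.apis import init_detector

det_model = init_detector('det_config.py', 'det_ckpt.pth', device='cpu')
recog_model = init_detector('recog_config.py', 'recog_ckpt.pth', device='cpu')
args = Namespace(img='demo/demo_text_det.jpg')
result = det_and_recog_inference(args, det_model, recog_model)
print(result['filename'], len(result['result']), 'text instances')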
def generate_ann(root_path, split, image_infos, preserve_vertical, format):
    """Generate cropped annotations and label txt file.

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset. Namely: training or test
        image_infos (list[dict]): A list of dicts of the img and annotation
            information
        preserve_vertical (bool): Whether to preserve vertical texts
        format (str): Annotation format, should be either 'txt' or 'jsonl'
    """
    dst_image_root = osp.join(root_path, 'dst_imgs', split)
    if split == 'training':
        dst_label_file = osp.join(root_path, f'train_label.{format}')
    elif split == 'test':
        dst_label_file = osp.join(root_path, f'test_label.{format}')
    os.makedirs(dst_image_root, exist_ok=True)

    lines = []
    for image_info in image_infos:
        index = 1
        src_img_path = osp.join(root_path, 'imgs', image_info['file_name'])
        image = mmcv.imread(src_img_path)
        src_img_root = image_info['file_name'].split('.')[0]

        for anno in image_info['anno_info']:
            word = anno['word']
            dst_img = crop_img(image, anno['bbox'])
            h, w, _ = dst_img.shape

            # Skip invalid annotations
            if min(dst_img.shape) == 0:
                continue
            # Skip vertical texts
            if not preserve_vertical and h / w > 2:
                continue

            dst_img_name = f'{src_img_root}_{index}.png'
            index += 1
            dst_img_path = osp.join(dst_image_root, dst_img_name)
            mmcv.imwrite(dst_img, dst_img_path)
            if format == 'txt':
                lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                             f'{word}')
            elif format == 'jsonl':
                lines.append(
                    json.dumps(
                        {
                            'filename':
                            f'{osp.basename(dst_image_root)}/{dst_img_name}',
                            'text': word
                        },
                        ensure_ascii=False))
            else:
                raise NotImplementedError

    list_to_file(dst_label_file, lines)
def det_and_recog_inference(args, det_model, recog_model):
    image_path = args.img
    end2end_res = {'filename': image_path}
    end2end_res['result'] = []

    image = mmcv.imread(image_path)
    det_result = model_inference(det_model, image)
    bboxes = det_result['boundary_result']

    box_imgs = []
    for bbox in bboxes:
        box_res = {}
        box_res['box'] = [round(x) for x in bbox[:-1]]
        box_res['box_score'] = float(bbox[-1])
        box = bbox[:8]
        if len(bbox) > 9:
            min_x = min(bbox[0:-1:2])
            min_y = min(bbox[1:-1:2])
            max_x = max(bbox[0:-1:2])
            max_y = max(bbox[1:-1:2])
            box = [min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]
        box_img = crop_img(image, box)
        if args.batch_mode:
            # Defer recognition: collect crops and run them in chunks below.
            box_imgs.append(box_img)
        else:
            recog_result = model_inference(recog_model, box_img)
            text = recog_result['text']
            text_score = recog_result['score']
            if isinstance(text_score, list):
                text_score = sum(text_score) / max(1, len(text))
            box_res['text'] = text
            box_res['text_score'] = text_score
        end2end_res['result'].append(box_res)

    if args.batch_mode:
        batch_size = args.batch_size
        for chunk_idx in range(len(box_imgs) // batch_size + 1):
            start_idx = chunk_idx * batch_size
            end_idx = (chunk_idx + 1) * batch_size
            chunk_box_imgs = box_imgs[start_idx:end_idx]
            if len(chunk_box_imgs) == 0:
                continue
            recog_results = model_inference(
                recog_model, chunk_box_imgs, batch_mode=True)
            # Write recognition results back onto the boxes of this chunk.
            for i, recog_result in enumerate(recog_results):
                text = recog_result['text']
                text_score = recog_result['score']
                if isinstance(text_score, list):
                    text_score = sum(text_score) / max(1, len(text))
                end2end_res['result'][start_idx + i]['text'] = text
                end2end_res['result'][start_idx + i]['text_score'] = text_score

    return end2end_res
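# Hedged usage sketch for the batched variant above: the same call shape as
# the single-image version, with batch_mode/batch_size switching the
# recognizer to chunked inference. det_model/recog_model are assumed built as
# in the earlier sketch; the batch size is an illustrative choice.
from argparse import Namespace

args = Namespace(img='demo/demo_text_det.jpg', batch_mode=True, batch_size=4)
batched_result = det_and_recog_inference(args, det_model, recog_model)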
def generate_ann(root_path, split, image_infos, format):
    """Generate cropped annotations and label file.

    Args:
        root_path (str): The root path of the dataset
        split (str): The split of dataset. Namely: training or test
        image_infos (list[dict]): A list of dicts of the img and annotation
            information
        format (str): Annotation format, should be either 'txt' or 'jsonl'
    """
    dst_image_root = osp.join(root_path, 'crops', split)
    dst_label_file = osp.join(root_path, f'{split}_label.{format}')
    os.makedirs(dst_image_root, exist_ok=True)

    lines = []
    for image_info in image_infos:
        index = 1
        src_img_path = osp.join(root_path, 'imgs', image_info['file_name'])
        image = mmcv.imread(src_img_path)
        src_img_root = image_info['file_name'].split('.')[0]

        for anno in image_info['anno_info']:
            word = anno['word']
            dst_img = crop_img(image, anno['bbox'], 0, 0)

            # Skip invalid annotations
            if min(dst_img.shape) == 0:
                continue

            dst_img_name = f'{src_img_root}_{index}.png'
            index += 1
            dst_img_path = osp.join(dst_image_root, dst_img_name)
            mmcv.imwrite(dst_img, dst_img_path)
            if format == 'txt':
                lines.append(f'{osp.basename(dst_image_root)}/{dst_img_name} '
                             f'{word}')
            elif format == 'jsonl':
                lines.append(
                    json.dumps({
                        'filename':
                        f'{osp.basename(dst_image_root)}/{dst_img_name}',
                        'text': word
                    }))
            else:
                raise NotImplementedError

    list_to_file(dst_label_file, lines)
def parse_labelme_json(json_file,
                       img_dir,
                       out_dir,
                       tasks,
                       ignore_marker='###',
                       recog_format='jsonl',
                       warp_flag=False):
    invalid_res = [[], [], []]

    json_obj = mmcv.load(json_file)

    img_file = osp.basename(json_obj['imagePath'])
    img_full_path = osp.join(img_dir, img_file)

    img_width = json_obj['imageWidth']
    img_height = json_obj['imageHeight']
    if 'recog' in tasks:
        src_img = mmcv.imread(img_full_path)
        img_basename = osp.splitext(img_file)[0]
        sub_dir = osp.join(out_dir, 'crops', img_basename)
        mmcv.mkdir_or_exist(sub_dir)

    det_line_json_list = []
    recog_crop_line_str_list = []
    recog_warp_line_str_list = []

    shape_info = json_obj['shapes']
    idx = 0
    annos = []
    for box_info in shape_info:
        shape = box_info['shape_type']
        if shape not in ['rectangle', 'polygon']:
            msg = 'Only \'rectangle\' and \'polygon\' boxes are supported. '
            msg += f'Boxes with {shape} will be discarded.'
            warnings.warn(msg)
            return invalid_res
        poly = []
        box_points = box_info['points']
        for point in box_points:
            poly.extend([int(x) for x in point])
        x_list = poly[0::2]
        y_list = poly[1::2]
        quad = []
        if shape == 'rectangle':
            warp_flag = False
            quad = [
                poly[0], poly[1], poly[2], poly[1], poly[2], poly[3],
                poly[0], poly[3]
            ]
        else:
            if len(poly) < 8 or len(poly) % 2 != 0:
                msg = f'Invalid polygon {poly}. '
                msg += 'The polygon is expected to have an even number of '
                msg += 'coordinates (at least 8) in MMOCR.'
                warnings.warn(msg)
                return invalid_res
            if len(poly) == 8:
                quad = poly
            else:
                # Fall back to the axis-aligned bounding box; warping is
                # only meaningful for quadrilaterals.
                warp_flag = False
                x_min, x_max, y_min, y_max = min(x_list), max(x_list), min(
                    y_list), max(y_list)
                quad = [
                    x_min, y_min, x_max, y_min, x_max, y_max, x_min, y_max
                ]

        text_label = box_info['label']

        # for textdet
        anno = {}
        anno['iscrowd'] = 0 if text_label != ignore_marker else 1
        anno['category_id'] = 1
        w = max(x_list) - min(x_list)
        h = max(y_list) - min(y_list)
        anno['bbox'] = [min(x_list), min(y_list), w, h]
        if shape == 'rectangle':
            anno['segmentation'] = [quad]
        else:
            anno['segmentation'] = [poly]
        anno['text'] = text_label
        annos.append(anno)

        # for textrecog
        if 'recog' in tasks:
            if text_label == ignore_marker or len(text_label) == 0:
                continue
            cropped_img = crop_img(src_img, quad)
            img_path_cropped_img = osp.join(sub_dir, f'crop_{idx}.jpg')
            mmcv.imwrite(cropped_img, img_path_cropped_img)
            if recog_format == 'txt':
                recog_crop_line_str_list.append(
                    f'{img_path_cropped_img} {text_label}')
            elif recog_format == 'jsonl':
                recog_crop_line_str_list.append(
                    json.dumps({
                        'filename': img_path_cropped_img,
                        'text': text_label
                    }))
            else:
                raise NotImplementedError

            if warp_flag:
                warpped_img = warp_img(src_img, quad)
                img_path_warpped_img = osp.join(sub_dir, f'warp_{idx}.jpg')
                mmcv.imwrite(warpped_img, img_path_warpped_img)
                if recog_format == 'txt':
                    recog_warp_line_str_list.append(
                        f'{img_path_warpped_img} {text_label}')
                elif recog_format == 'jsonl':
                    recog_warp_line_str_list.append(
                        json.dumps({
                            'filename': img_path_warpped_img,
                            'text': text_label
                        }))
        idx += 1

    line_json = {
        'file_name': img_file,
        'height': img_height,
        'width': img_width,
        'annotations': annos
    }
    det_line_json_list.append(json.dumps(line_json, ensure_ascii=False))

    return [
        det_line_json_list, recog_crop_line_str_list,
        recog_warp_line_str_list
    ]
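# Hedged usage sketch for parse_labelme_json above: converting a single
# LabelMe file for both tasks and writing the three result lists with
# list_to_file. All paths are illustrative placeholders.
det_lines, crop_lines, warp_lines = parse_labelme_json(
    'annotations/img1.json',
    img_dir='imgs',
    out_dir='out',
    tasks=['det', 'recog'],
    warp_flag=True)
list_to_file('out/det_label.jsonl', det_lines)
list_to_file('out/recog_crop_label.jsonl', crop_lines)
list_to_file('out/recog_warp_label.jsonl', warp_lines)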
def det_recog_kie_inference(self, det_model, recog_model, kie_model=None):
    end2end_res = []
    # Find bounding boxes in the images (text detection)
    det_result = self.single_inference(det_model, self.args.arrays,
                                       self.args.batch_mode,
                                       self.args.det_batch_size)
    bboxes_list = [res['boundary_result'] for res in det_result]

    if kie_model:
        kie_dataset = KIEDataset(
            dict_file=kie_model.cfg.data.test.dict_file)

    # For each bounding box, the image is cropped and
    # sent to the recognition model either one by one
    # or all together depending on the batch_mode
    for filename, arr, bboxes, out_file in zip(self.args.filenames,
                                               self.args.arrays, bboxes_list,
                                               self.args.output):
        img_e2e_res = {}
        img_e2e_res['filename'] = filename
        img_e2e_res['result'] = []
        box_imgs = []
        for bbox in bboxes:
            box_res = {}
            box_res['box'] = [round(x) for x in bbox[:-1]]
            box_res['box_score'] = float(bbox[-1])
            box = bbox[:8]
            if len(bbox) > 9:
                min_x = min(bbox[0:-1:2])
                min_y = min(bbox[1:-1:2])
                max_x = max(bbox[0:-1:2])
                max_y = max(bbox[1:-1:2])
                box = [
                    min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y
                ]
            box_img = crop_img(arr, box)
            if self.args.batch_mode:
                box_imgs.append(box_img)
            else:
                if recog_model == 'Tesseract_recog':
                    recog_result = self.single_inference(
                        recog_model, box_img, batch_mode=True)
                else:
                    recog_result = model_inference(recog_model, box_img)
                text = recog_result['text']
                text_score = recog_result['score']
                if isinstance(text_score, list):
                    text_score = sum(text_score) / max(1, len(text))
                box_res['text'] = text
                box_res['text_score'] = text_score
            img_e2e_res['result'].append(box_res)

        if self.args.batch_mode:
            recog_results = self.single_inference(
                recog_model, box_imgs, True, self.args.recog_batch_size)
            for i, recog_result in enumerate(recog_results):
                text = recog_result['text']
                text_score = recog_result['score']
                if isinstance(text_score, (list, tuple)):
                    text_score = sum(text_score) / max(1, len(text))
                img_e2e_res['result'][i]['text'] = text
                img_e2e_res['result'][i]['text_score'] = text_score

        if self.args.merge:
            img_e2e_res['result'] = stitch_boxes_into_lines(
                img_e2e_res['result'], self.args.merge_xdist, 0.5)

        if kie_model:
            annotations = copy.deepcopy(img_e2e_res['result'])
            # Customized for kie_dataset, which
            # assumes that boxes are represented by only 4 points
            for i, ann in enumerate(annotations):
                min_x = min(ann['box'][::2])
                min_y = min(ann['box'][1::2])
                max_x = max(ann['box'][::2])
                max_y = max(ann['box'][1::2])
                annotations[i]['box'] = [
                    min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y
                ]
            ann_info = kie_dataset._parse_anno_info(annotations)
            ann_info['ori_bboxes'] = ann_info.get('ori_bboxes',
                                                  ann_info['bboxes'])
            ann_info['gt_bboxes'] = ann_info.get('gt_bboxes',
                                                 ann_info['bboxes'])
            kie_result, data = model_inference(
                kie_model,
                arr,
                ann=ann_info,
                return_data=True,
                batch_mode=self.args.batch_mode)
            # visualize KIE results
            self.visualize_kie_output(
                kie_model,
                data,
                kie_result,
                out_file=out_file,
                show=self.args.imshow)
            gt_bboxes = data['gt_bboxes'].data.numpy().tolist()
            labels = self.generate_kie_labels(kie_result, gt_bboxes,
                                              kie_model.class_list)
            for i in range(len(gt_bboxes)):
                img_e2e_res['result'][i]['label'] = labels[i][0]
                img_e2e_res['result'][i]['label_score'] = labels[i][1]
        end2end_res.append(img_e2e_res)
    return end2end_res
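# Hedged sketch of the argument surface det_recog_kie_inference above relies
# on. `ocr` stands in for an instance of the surrounding reader class (not
# constructed here), and det_model/recog_model for already-built models; the
# Namespace mirrors the attributes the method reads, with illustrative values.
from argparse import Namespace

import mmcv

img = mmcv.imread('demo/demo_text_det.jpg')
ocr.args = Namespace(
    filenames=['demo/demo_text_det.jpg'],
    arrays=[img],
    output=[None],
    batch_mode=False,
    det_batch_size=0,
    recog_batch_size=0,
    merge=False,
    merge_xdist=20,
    imshow=False)
results = ocr.det_recog_kie_inference(det_model, recog_model)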
def generate_ann(root_path, image_infos, preserve_vertical, val_ratio,
                 format):
    """Generate cropped annotations and label txt file.

    Args:
        root_path (str): The root path of the dataset
        image_infos (list[dict]): A list of dicts of the img and annotation
            information
        preserve_vertical (bool): Whether to preserve vertical texts
        val_ratio (float): Split ratio for val set
        format (str): Annotation format, should be either 'txt' or 'jsonl'
    """
    assert val_ratio <= 1.

    if val_ratio:
        image_infos = split_train_val_list(image_infos, val_ratio)
        splits = ['training', 'val']
    else:
        image_infos = [image_infos]
        splits = ['training']

    for i, split in enumerate(splits):
        dst_image_root = osp.join(root_path, 'crops', split)
        ignore_image_root = osp.join(root_path, 'ignores', split)
        dst_label_file = osp.join(root_path, f'{split}_label.{format}')
        os.makedirs(dst_image_root, exist_ok=True)

        lines = []
        for image_info in image_infos[i]:
            index = 1
            src_img_path = osp.join(root_path, 'imgs',
                                    image_info['file_name'])
            image = mmcv.imread(src_img_path)
            src_img_root = image_info['file_name'].split('.')[0]

            for anno in image_info['anno_info']:
                word = anno['word']
                dst_img = crop_img(image, anno['bbox'], 0, 0)
                h, w, _ = dst_img.shape
                dst_img_name = f'{src_img_root}_{index}.png'
                index += 1

                # Skip invalid annotations
                if min(dst_img.shape) == 0:
                    continue
                # Route vertical texts to the ignore set for training
                if (not preserve_vertical and h / w > 2
                        and split == 'training'):
                    dst_img_path = osp.join(ignore_image_root, dst_img_name)
                else:
                    dst_img_path = osp.join(dst_image_root, dst_img_name)
                mmcv.imwrite(dst_img, dst_img_path)

                filename = f'{osp.basename(dst_image_root)}/{dst_img_name}'
                if format == 'txt':
                    lines.append(f'{filename} {word}')
                elif format == 'jsonl':
                    lines.append(
                        json.dumps({
                            'filename': filename,
                            'text': word
                        }, ensure_ascii=False))
                else:
                    raise NotImplementedError

        list_to_file(dst_label_file, lines)
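# Hedged usage sketch for the val-split variant above: a 10% validation split
# with vertical crops routed to 'ignores/'. The dataset path and the single
# annotation are illustrative placeholders.
image_infos = [{
    'file_name': 'img_0.jpg',
    'anno_info': [{
        'word': 'TEXT',
        'bbox': [0, 0, 50, 0, 50, 20, 0, 20]
    }]
}]
generate_ann(
    'data/mydataset',
    image_infos,
    preserve_vertical=False,
    val_ratio=0.1,
    format='jsonl')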