def process_file(gts_path, new_path, overwrite=False, remove_text=True):
    """Swap the text lines of a ground-truth PageXML for those of another file.

    The PageXML at ``new_path`` is resized to the page sizes of the one at
    ``gts_path``; the ground truth's own TextLine elements are removed and
    ``copyTextLinesAssignByOverlap`` is called with the resized document. Each
    resulting TextLine is passed to ``add_baseline``.

    Args:
        gts_path (str): Path of the ground-truth PageXML (printed to stdout).
        new_path (str): Path of the PageXML providing the new text lines.
        overwrite (bool): Write back to ``gts_path`` when true; otherwise
            write to ``gts_path`` with every "page" replaced by "page_gts".
        remove_text (bool): When true, remove the Word and TextEquiv
            descendants of every TextLine.
    """
    gt_doc = pagexml.PageXML(gts_path)
    print(gts_path)
    page_sizes = gt_doc.getPagesSize()
    entity_doc = pagexml.PageXML(new_path)
    entity_doc.resize(page_sizes)

    gt_doc.rmElems(gt_doc.select(".//_:TextLine"))
    # Text lines are copied before any text region is touched so that
    # removing regions cannot take the entities with it.
    gt_doc.copyTextLinesAssignByOverlap(entity_doc)

    stripped = (".//_:Word", ".//_:TextEquiv") if remove_text else ()
    for text_line in gt_doc.select(".//_:TextLine"):
        add_baseline(gt_doc, text_line)
        for xpath in stripped:
            gt_doc.rmElems(gt_doc.select(xpath, text_line))

    out_path = gts_path if overwrite else gts_path.replace("page", "page_gts")
    gt_doc.write(out_path)
def test_groups(self):
    # Groups can be created with or without an id, extended afterwards and
    # nested; the second argument of selectGroupElements controls whether
    # nested groups are expanded into their members.
    doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
    lines_r2 = doc.select('//*[@id="r2"]/_:TextLine')
    lines_r4 = doc.select('//*[@id="r4"]/_:TextLine')

    def member_ids(*select_args):
        return [
            doc.getAttr(member, 'id')
            for member in doc.selectGroupElements(*select_args)
        ]

    anonymous_group = doc.addGroup(None, lines_r2)
    named_group = doc.addGroup('grX')
    added = doc.addToGroup(named_group, lines_r4)
    self.assertEqual(added, len(lines_r4))

    anonymous_ids = member_ids(anonymous_group)
    named_ids = member_ids(named_group)
    self.assertEqual(anonymous_ids, ['r2_l1', 'r2_l2'])
    self.assertEqual(named_ids, ['r4_l1', 'r4_l2'])

    # Nest the named group inside the anonymous one.
    doc.addToGroup(anonymous_group, (named_group, ))
    flat_ids = member_ids(anonymous_group, False)
    expanded_ids = member_ids(anonymous_group, True)
    self.assertEqual(flat_ids, ['r2_l1', 'r2_l2', 'grX'])
    self.assertEqual(expanded_ids, ['r2_l1', 'r2_l2', 'r4_l1', 'r4_l2'])
def read_pagexml(xml_file):
    """Build a PageXML object from a file path or from XML text.

    Args:
        xml_file (str): Path of an existing PageXML file, or the XML
            content itself. It is treated as a path only if it exists on
            disk.

    Returns:
        pxml: The loaded PageXML object.
    """
    document = pagexml.PageXML()
    is_path = os.path.exists(xml_file)
    loader = document.loadXml if is_path else document.loadXmlString
    loader(xml_file)
    return document
def start_processing(thread, process_queue):
    """Worker loop that runs tesseract-recognize for queued requests.

    Polls ``process_queue`` forever. Each item is a ``(done_queue, req_dict)``
    pair; the outcome is put on ``done_queue`` as
    ``(thread, num_requests, result)`` where ``result`` is either the
    resulting PageXML object or the exception describing the failure.

    Args:
        thread: Identifier of this worker, echoed back in every result.
        process_queue: Queue supplying ``(done_queue, req_dict)`` pairs.
            ``req_dict`` is read for the keys 'options', 'pagexml' and
            'images'.
    """
    num_requests = 0
    tmpdir = None
    while True:
        try:
            done_queue, req_dict = process_queue.get(True, 0.05)
            num_requests += 1
            tmpdir = write_to_tmpdir(req_dict)

            opts = list(req_dict['options'])
            # A single option starting with '[' is a JSON-encoded option list.
            # startswith() also copes with an empty option string, which the
            # previous opts[0][0] indexing turned into an IndexError.
            if len(opts) == 1 and opts[0].startswith('['):
                opts = json.loads(opts[0])

            # The input is either the uploaded page xml or the uploaded images.
            if req_dict['pagexml'] is not None:
                opts.append(
                    os.path.join(
                        tmpdir,
                        os.path.basename(req_dict['pagexml']['filename'])))
            elif req_dict['images'] is not None:
                for image in req_dict['images']:
                    opts.append(
                        os.path.join(tmpdir, os.path.basename(image.filename)))
            else:
                raise KeyError('No images found in request.')

            output_xml = os.path.join(tmpdir, 'output.xml')
            opts.extend(['-o', output_xml])
            rc, out = run_tesseract_recognize(*opts)
            if rc != 0:
                raise RuntimeError(
                    'tesseract-recognize execution failed :: opts: ' +
                    str(opts) + ' :: ' + str(out))

            pxml = pagexml.PageXML(output_xml)
            done_queue.put((thread, num_requests, pxml))
        except queue.Empty:
            # Nothing queued within the poll timeout; poll again.
            continue
        except json.decoder.JSONDecodeError as ex:
            done_queue.put((thread, num_requests,
                            RuntimeError('JSONDecodeError: ' + str(ex) +
                                         ' while parsing ' + opts[0])))
        except Exception as ex:
            # Report any failure to the requester instead of killing the worker.
            done_queue.put((thread, num_requests, ex))
        finally:
            # Temporary files are kept only in debug mode.
            if not cfg.debug and tmpdir is not None:
                shutil.rmtree(tmpdir)
                tmpdir = None
def test_coords_resize(self):
    # Resizing by a factor and by explicit page sizes both update the page
    # dimensions; the factor resize also scales region coordinates.
    doc = pagexml.PageXML()
    doc.newXml('test', 'test1.jpg', 24, 32)
    page_sizes = doc.getPagesSize()
    region = doc.addTextRegion(doc.selectNth('//_:Page'))
    doc.setCoordsBBox(region, 2, 10, 5, 20, 0.8)

    doc.resize(2.0)
    self.assertEqual(doc.getPageWidth(0), 48)
    self.assertEqual(doc.getPageHeight(0), 64)
    corners = doc.getPoints(region)
    xs = [corner.x for corner in corners]
    ys = [corner.y for corner in corners]
    self.assertListEqual(xs, [4.0, 12.0, 12.0, 4.0])
    self.assertListEqual(ys, [20.0, 20.0, 58.0, 58.0])

    # Resize again, this time from a sizes object edited in place.
    page_sizes[0].width = 12
    page_sizes[0].height = 16
    doc.resize(page_sizes)
    self.assertEqual(doc.getPageWidth(0), 12)
    self.assertEqual(doc.getPageHeight(0), 16)
def TypePageXML(value):
    """Parse Page XML request type.

    Args:
        value: The raw type value; must be a FileStorage (or subclass)
            whose content is UTF-8 encoded Page XML.

    Returns:
        dict[str, {str,PageXML}]: Dictionary including the page xml
        'filename', the 'string' representation and the PageXML 'object'.

    Raises:
        ValueError: If ``value`` is not a FileStorage instance.
    """
    # isinstance() rather than an exact type comparison, so FileStorage
    # subclasses are accepted as well.
    if not isinstance(value, FileStorage):
        raise ValueError('Expected pagexml to be of type FileStorage.')
    spxml = value.read().decode('utf-8')
    pxml = pagexml.PageXML()
    pxml.loadXmlString(spxml)
    return {'filename': value.filename, 'object': pxml, 'string': spxml}
def test_pages_sizes(self):
    # Page dimensions and image filenames are reported per page, both by
    # the individual getters and by getPagesSize().
    doc = pagexml.PageXML()
    doc.newXml('test', 'test1.jpg', 24, 32)
    doc.addPage("test2.jpg", 48, 64)
    page_sizes = doc.getPagesSize()

    expected_dims = ((24, 32), (48, 64))
    for page_idx, (width, height) in enumerate(expected_dims):
        self.assertEqual(doc.getPageWidth(page_idx), width)
        self.assertEqual(doc.getPageHeight(page_idx), height)
    for page_idx, (width, height) in enumerate(expected_dims):
        self.assertEqual(page_sizes[page_idx].width, width)
        self.assertEqual(page_sizes[page_idx].height, height)

    self.assertEqual(len(page_sizes), 2)
    self.assertEqual(doc.count('//_:Page'), 2)
    for page_idx, filename in enumerate(('test1.jpg', 'test2.jpg')):
        self.assertEqual(doc.getPageImageFilename(page_idx), filename)
def forward(self, rois):
    """Compute the reading order of a set of ROI boxes.

    The boxes are added as Word elements of a single TextLine in an
    in-memory PageXML with a fixed 1200x800 page, and ordered with
    ``getLeftRightTopBottomReadingOrder``. Each word is tagged with its
    position in the order ('word_idx') and its group number ('line').

    Args:
        rois: Tensor indexed as ``rois[i, 0..3]`` = x1, y1, x2, y2 of box i.

    Returns:
        The ``order`` sequence returned by the reading-order call, i.e.
        indices into the words in reading order.
    """
    pxml = pagexml.PageXML()
    pxml.newXml('retinanet_dets', 'image', 1200, 800)
    page = pxml.selectNth("//_:Page", 0)
    reg = pxml.addTextRegion(page)
    pxml.setCoordsBBox(reg, 0, 0, 1200, 800)
    line = pxml.addTextLine(reg)
    pxml.setCoordsBBox(line, 0, 0, 1200, 800)

    for roi in range(rois.shape[0]):
        word = pxml.addWord(line)
        # int() truncates the coordinate values to whole pixels.
        x1, y1, x2, y2 = (int(rois[roi, k].cpu().detach()) for k in range(4))
        # setCoordsBBox takes x, y, width, height.
        pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1)

    words = pxml.select('//_:Word')
    order, groups = pxml.getLeftRightTopBottomReadingOrder(
        words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
    line = pxml.selectNth('//_:TextLine')

    # groups[g] is the number of consecutive entries of `order` belonging to
    # group g. enumerate() yields the position directly instead of the
    # previous order.index(n) lookup, which made the loop quadratic.
    group_idx = 0
    idx_in_group = 0
    for word_idx, n in enumerate(order):
        if idx_in_group >= groups[group_idx]:
            group_idx += 1
            idx_in_group = 0
        pxml.setProperty(words[n], 'word_idx', str(word_idx))
        pxml.setProperty(words[n], "line", str(group_idx))
        # Re-append in reading order so the XML child order matches.
        pxml.moveElem(words[n], line)
        idx_in_group += 1
    return order
def test_multiple_typed_textequivs(self):
    # TextEquivs with distinct types coexist with the untyped one, keep
    # their own confidence, and can be overwritten individually.
    doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
    line = doc.selectByID('r1_l1')
    typed = (
        ('type1', 'first typed text', 0.9,
         '_:TextEquiv[@type="type1"]/@conf'),
        ('type2', 'second typed text', 0.4,
         '_:TextEquiv[@type="type2"]/@conf'),
    )

    for kind, text, conf, _ in typed:
        doc.setTextEquiv(line, text, conf, kind)

    # The untyped text is untouched.
    self.assertEqual('Lorem ipsum', doc.getTextEquiv(line, type=''))
    for kind, text, _, _ in typed:
        self.assertEqual(text, doc.getTextEquiv(line, type=kind))
    for _, _, conf, conf_xpath in typed:
        self.assertEqual(conf, float(doc.getValue(conf_xpath, line)))

    # Overwriting without a confidence leaves the conf attribute empty.
    replacement = 'new type1 text'
    doc.setTextEquiv(line, replacement, None, 'type1')
    self.assertEqual(replacement, doc.getTextEquiv(line, type='type1'))
    self.assertEqual(
        '', doc.getValue('_:TextEquiv[@type="type1"]/@conf', line))
def process_pdf_pagexml(input):
    """Processing of non-ground truth pdf page xmls.

    - For each page in a page xml
      * Reorder all TextLines in each page ignoring parent TextRegion.
      * Create a full page TextRegion.
      * Move all TextLines in new order to page TextRegion.
    - Relabel line IDs to ease preservation of their order.

    Any exception is printed and then re-raised unchanged.

    Args:
        input (str): The input page xml file path.

    Returns:
        PageXML object.
    """
    # The docstring used to sit inside the try block, where it was a plain
    # expression statement and left __doc__ empty.
    try:
        pxml = pagexml.PageXML()
        pxml.loadXml(input)
        pagexmltools.process.page_region_with_ordered_textlines(
            pxml, fake_baseline=True)
        return pxml
    except Exception as e:
        print(e)
        raise
def main(args=None):
    """Convert PageXML files into a RetinaNet CSV ground truth.

    Walks ``--pxml_dir`` (joined to the current working directory), loads
    every file whose name contains '.xml' and writes one CSV row per Word
    to ``--fout``: image path, x0, y0, x1, y1, class label, transcription.
    Words whose box has no positive width or height are skipped. When
    ``--classes_out`` is given, every distinct word tag seen (including
    those of skipped words) is written there as ``tag,index`` rows.

    Args:
        args (list[str] | None): Command line arguments; ``None`` makes
            argparse read ``sys.argv``.
    """
    import contextlib

    parser = argparse.ArgumentParser(
        description=
        'Convert pagexml files to RetinaNet network csv groundtruth.')
    parser.add_argument('--pxml_dir',
                        help='Path of directory with pagexml files.',
                        default=".")
    parser.add_argument('--fout',
                        help='Path of gt file to be read by the model.',
                        default="train.csv")
    parser.add_argument('--classes_out',
                        help='Path to save text category classes.')
    parser.add_argument(
        '--seg_lev',
        help='segmentation level of the boxes to get (Word/TextLine)',
        default="Word")
    # The help text used to be a copy of the --seg_lev one. The option has
    # no type, so any non-empty value enables it.
    parser.add_argument(
        '--get_property',
        help='if given a non-empty value, label each box with its property '
        'value (falling back to the region property) instead of "text"',
        default=False)
    parser.add_argument('--property_key',
                        help='key to get property from pagexml',
                        default='category')
    opts = parser.parse_args(args)

    pagexml.set_omnius_schema()
    pxml = pagexml.PageXML()

    # tag -> index in order of first appearance (dicts keep insertion order).
    tag_index = {}

    with contextlib.ExitStack() as stack:
        # Open both outputs up front so an unwritable path fails early; the
        # classes file is optional and used to cause a NameError when omitted.
        classes_file = None
        if opts.classes_out is not None:
            classes_file = stack.enter_context(
                open(opts.classes_out, 'w', newline=''))
        csv_out = stack.enter_context(open(opts.fout, 'w', newline=''))
        writer = csv.writer(csv_out, delimiter=',')

        for root, dirs, files in os.walk(
                os.path.join(os.getcwd(), opts.pxml_dir)):
            for f in files:
                if '.xml' not in f:
                    continue
                pxml.loadXml(os.path.join(root, f))
                for page in pxml.select('_:Page'):
                    page_im_file = pxml.getPageImageFilename(page)
                    page_im_file = os.path.join(os.getcwd(), root,
                                                page_im_file)
                    for region in pxml.select('_:TextRegion', page):
                        reg_tag = pxml.getPropertyValue(
                            region, key=opts.property_key)
                        for textLine in pxml.select('_:TextLine', region):
                            for word in pxml.select('_:Word', textLine):
                                (x0, y0, x1, y1, transcription,
                                 tag) = get_coords_and_transcript(
                                     pxml, word, opts.property_key)
                                tag_index.setdefault(tag, len(tag_index))
                                if x0 >= x1 or y0 >= y1:
                                    continue
                                if not opts.get_property:
                                    label = 'text'
                                elif len(tag) > 0:
                                    label = tag
                                else:
                                    # Word has no tag of its own.
                                    label = reg_tag
                                writer.writerow([
                                    page_im_file, x0, y0, x1, y1, label,
                                    transcription
                                ])

        if classes_file is not None:
            writer_classes = csv.writer(classes_file, delimiter=',')
            for tag, idx in tag_index.items():
                writer_classes.writerow([tag, idx])
import os
import pagexml
import glob

# One-off script: point every page of <doc_dir>/page.xml at an image named
# image_<page index>.jpeg and resize that image file to the page size
# recorded in the XML by running the `mogrify -resize` shell command.
doc_dir = 'fed90951-622d-47cb-957d-fe95b2addb3'
print(doc_dir)
xml_file = os.path.join(doc_dir, 'page.xml')
pxml = pagexml.PageXML(xml_file)
page_list = pxml.select('//_:Page')
for p, page in enumerate(page_list):
    width = pxml.getPageWidth(page)
    height = pxml.getPageHeight(page)
    img_file = 'image_%d.jpeg' % p
    pxml.setPageImageFilename(page, img_file)
    #cmd = 'mogrify -resize %dx%d %s' % (width, height, img_file)
    # The image path is relative to doc_dir, unlike the filename stored in
    # the XML above.
    cmd = 'mogrify -resize {}x{} {}'.format(width, height, os.path.join(doc_dir, img_file))
    print(cmd)
    # The exit status of the command is not checked.
    os.system(cmd)
# NOTE: the write below is disabled, so the image filenames set in the loop
# are not saved back to page.xml.
# pxml.write(xml_file)
def test_get_textequiv(self):
    # The text of a line is returned by getTextEquiv with default arguments.
    doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
    line = doc.selectByID('r1_l1')
    self.assertEqual(doc.getTextEquiv(line), 'Lorem ipsum')
cfg = {} if opts.cfg is not None: with open(opts.cfg) as json_config: cfg = json.load(json_config)['feats'] logger.info('Loaded configuration.') if opts.legacy_feats: if opts.savexml: cfg['savexml'] = opts.outdir extractor = textfeat.LegacyTextFeatExtractor(**cfg) else: extractor = textfeat.TextFeatExtractor(**cfg) ### Loop through input page xmls ### for fpxml in opts.input: logger.info('Processing page xml: ' + fpxml) pxml = pagexml.PageXML(fpxml) if opts.legacy_feats: feats = extractor.extractFeats(fpxml, xpath=opts.xpath) else: cropped = pxml.crop(opts.xpath) feats = [] for crop in cropped: image = crop.image extractor.preprocess(image) slope = textfeat.ptr_float() slant = textfeat.ptr_float() extractor.estimateAngles(image, slope, slant, crop.rotation) image = extractor.extractFeats(image, slope.value(), slant.value(),
def get_transcript(image_id, data, retinanet, score_threshold, nms_threshold,
                   dataset_val, alphabet):
    """Run the model on one sample and return its transcript in reading order.

    Detections scoring above ``score_threshold`` are added as Word elements
    of an in-memory PageXML, ordered with
    ``getLeftRightTopBottomReadingOrder``, and their texts are joined with
    single spaces. Nothing is written to disk. As a side effect
    ``retinanet.training`` is set to False.

    Args:
        image_id (str): Image identifier; '.jpg' is appended to form the
            page image name.
        data (dict): Sample with key 'img' and, when
            ``retinanet.module.htr_gt_box`` is set, key 'annot'.
        retinanet: Model returning scores, classification, boxes and
            transcriptions.
        score_threshold: Minimum score for a detection to be kept.
        nms_threshold: Unused; kept for interface compatibility.
        dataset_val: Object whose ``labels`` maps class indices to names.
        alphabet: Passed to ``labels_to_text`` to decode label sequences.

    Returns:
        str: Space-separated word transcriptions in reading order.
    """
    image_name = image_id + '.jpg'
    retinanet.training = False
    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        im = data['img']
        im = im.cuda().float()
        if retinanet.module.htr_gt_box:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                [im, data['annot']])
        else:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                im)
        kept = np.where(scores.cpu() > score_threshold)[0]

        # The image is rebuilt only to obtain the page width and height.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img[img < 0] = 0
        img[img > 255] = 255
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]

        # NOTE(review): conf is passed to setCoordsBBox before its first
        # assign(); confirm what value an unassigned ptr_double holds.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)

        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for j, det in enumerate(kept):
            # Detections without a transcription row are dropped.
            if det >= transcriptions.shape[0]:
                continue
            bbox = transformed_anchors[det, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            label_name = dataset_val.labels[int(classification[det])]

            word = pxml.addWord(line, "ID" + str(j))
            # setCoordsBBox takes x, y, width, height.
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)
            transcription = labels_to_text(transcriptions[det, :], alphabet)
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)
            pxml.setProperty(word, "category", label_name)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')

        # groups[g] is the number of consecutive entries of `order` in group
        # g. enumerate() replaces the former order.index(n) lookup, which
        # made this loop quadratic.
        group_idx = 0
        idx_in_group = 0
        transcript_pred = []
        for word_idx, n in enumerate(order):
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0
            transcript_pred.append(pxml.getTextEquiv(words[n]))
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1
    return " ".join(transcript_pred)
def blend_single_xml(xmlfile, loc, img_list, xpath):
    """Blend randomly chosen images into the page image of an xml document.

    Args:
        xmlfile (str): path of the xml file whose page image is the target
        loc (str): where to blend inside the target region: 'left', 'right',
            'top', 'bottom', 'centre', or 'left_right' / 'top_bottom' to
            blend two independently chosen images into both halves
        img_list (list): image files to choose from at random
        xpath (str): xpath selecting the target region in the xml file

    Returns:
        numpy.ndarray: the blended document; for any other ``loc`` a message
        is printed and the page image is returned unchanged
    """

    def pick_image():
        # Choose a random image and return it with its rectangle.
        picked = cv2.imread(random.choice(img_list))
        return picked, get_img_rect(picked)

    image, image_rect = pick_image()
    pxml = pagexml.PageXML(xmlfile)
    doc_rect = coords2rect(elem_bbox(pxml, xpath))
    page_image = pxml.getPageImageFilename(0)
    # The page image path is relative to the xml file's directory.
    document = cv2.imread(os.path.join(os.path.dirname(xmlfile), page_image))

    # Which halves of the target region each location covers, in blend order.
    halves_for = {
        'left': ('left', ),
        'right': ('right', ),
        'top': ('top', ),
        'bottom': ('bottom', ),
        'left_right': ('left', 'right'),
        'top_bottom': ('top', 'bottom'),
    }

    def half_rect(half):
        if half == 'left':
            return Rect(doc_rect.l, doc_rect.t,
                        int((doc_rect.l + doc_rect.r) / 2), doc_rect.b)
        if half == 'right':
            return Rect(int((doc_rect.l + doc_rect.r) / 2), doc_rect.t,
                        doc_rect.r, doc_rect.b)
        if half == 'top':
            return Rect(doc_rect.l, doc_rect.t, doc_rect.r,
                        int((doc_rect.t + doc_rect.b) / 2))
        return Rect(doc_rect.l, int((doc_rect.t + doc_rect.b) / 2),
                    doc_rect.r, doc_rect.b)

    if loc == 'centre':
        return blend(document, doc_rect, image_rect, image)
    if loc not in halves_for:
        print('Insert location')
        return document

    for position, half in enumerate(halves_for[loc]):
        if position > 0:
            # The second half gets its own randomly chosen image.
            image, image_rect = pick_image()
        document = blend(document, half_rect(half), image_rect, image)
    return document
def generate_pagexml(image_id, data, retinanet, score_threshold, nms_threshold,
                     dataset_val):
    """Write the model's predictions for one sample as PageXML and images.

    Every predicted box is stored as a Word (with its decoded transcription
    and class label) in a single TextLine; the words are then put in reading
    order. Outputs:

    - 'pagexmls/<image_id>.jpg': the unnormalized input image,
    - 'pagexmls/<name>.xml': the PageXML,
    - 'pred_sample_ims/<image_id>.jpg': the image with boxes, captions and a
      class legend drawn on it.

    Args:
        image_id (str): Image identifier used to build the output names.
        data (dict): Sample with key 'img' and, when ``retinanet.htr_gt_box``
            is set, key 'annot'.
        retinanet: Model exposing ``alphabet`` and ``htr_gt_box``; returns
            scores, classification, boxes and transcriptions.
        score_threshold: Not applied; all predicted boxes are written.
        nms_threshold: Unused; kept for interface compatibility.
        dataset_val: Object whose ``labels`` maps class indices to names.
    """
    image_name = image_id + '.jpg'
    im_file_out = 'pagexmls/' + image_name
    alphabet = retinanet.alphabet
    colors = get_n_random_colors(len(dataset_val.labels))
    # File name without directory and extension.
    gtxml_name = image_name.split('/')[-1].split('.')[-2]
    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        im = data['img']
        im = im.cuda().float()
        print(retinanet.htr_gt_box)
        if retinanet.htr_gt_box:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                [im, data['annot']])
        else:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                im)
        n_boxes_predicted = transformed_anchors.shape[0]
        print(n_boxes_predicted, "BOXES PREDICTED")

        # Undo the input normalization and convert to an 8-bit HxWxC image.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img[img < 0] = 0
        img[img > 255] = 255
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]
        # Saved before anything is drawn on it.
        cv2.imwrite(im_file_out, img)

        # NOTE(review): conf is passed to setCoordsBBox before its first
        # assign(); confirm what value an unassigned ptr_double holds.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)

        # Class legend in the top-left corner.
        for k, label in enumerate(dataset_val.labels):
            cv2.putText(img, label, (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)

        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for box_id in range(n_boxes_predicted):
            bbox = transformed_anchors[box_id, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            class_id = int(classification[box_id])
            label_name = dataset_val.labels[class_id]
            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[class_id],
                          thickness=2)

            word = pxml.addWord(line, "ID" + str(box_id))
            # setCoordsBBox takes x, y, width, height.
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)
            transcription = labels_to_text(transcriptions[box_id, :],
                                           alphabet)
            draw_caption(img, (x1, y1, x2, y2), transcription)
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)
            pxml.setProperty(word, "category", label_name)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')

        # groups[g] is the number of consecutive entries of `order` in group
        # g. enumerate() replaces the former order.index(n) lookup, which
        # made this loop quadratic.
        group_idx = 0
        idx_in_group = 0
        for word_idx, n in enumerate(order):
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write XML to file
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(os.path.join('pred_sample_ims',
                                 str(image_id) + '.jpg'), img)
""" convert = lambda text: int(text) if text.isdigit() else text alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)] return sorted(l, key=alphanum_key) GT_DIR = 'datasets/esposalles/data-esposalles/test' re.sub("[^0-9]", "", "sdkjh987978asd098as0980a98sd") dir_out = 'preds_txt' if not os.path.exists(dir_out): os.mkdir(dir_out) for f in sorted_nicely(os.listdir('pagexmls')): gt_xml = f[:24] gt_xml = os.path.join(GT_DIR, gt_xml + '.xml') pxml = pagexml.PageXML() pxml.loadXml(gt_xml) page = pxml.select('_:Page') page_id = pxml.getPropertyValue(page[0], key='idPage') record_id = f.split('.')[0][-2:] record_id = str(int(re.sub("[^0-9]", "", record_id)) + 1) pred_xml_file = f.split('.')[0] + '.xml' pred_xml = pagexml.PageXML() pred_xml.loadXml(os.path.join('pagexmls', pred_xml_file)) words = pred_xml.select('//_:Word') fout = open( os.path.join( dir_out, "idPage" + page_id + "_Record" + record_id + "_output.txt"), "w")
def test_get_textequiv_grandchildren(self):
    # With an explicit xpath the texts of a region's lines are joined using
    # the given separator.
    doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
    region = doc.selectByID('r2')
    joined = doc.getTextEquiv(region,
                              xpath='_:TextLine/_:TextEquiv',
                              separator='\n')
    self.assertEqual(joined, 'dolor sit amet, in per iusto\nnostrud,')
def test_set_textequiv(self):
    # Text written with setTextEquiv is read back by getTextEquiv.
    doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
    line = doc.selectByID('r1_l1')
    replacement = 'new text'
    doc.setTextEquiv(line, replacement)
    self.assertEqual(replacement, doc.getTextEquiv(line))
import sys
import numpy as np
import os
import pdb
import cv2
import pagexml

# Script: read a comma-separated ground-truth file (path given as the first
# command line argument) whose rows are
#   image file, x0, y0, x1, y1, tag, text
# and derive per-row box geometry and image names.
fin = sys.argv[1]
# Read all rows up front; the context manager closes the file, which was
# previously left open. A leftover pdb.set_trace() breakpoint that halted
# every run here has been removed.
with open(fin, 'r') as f:
    flines = f.readlines()
all_wh = np.zeros((len(flines), 2))
prev_im = ""
# Output directory for the generated page xmls.
os.makedirs('pagexmls', exist_ok=True)
pxml = pagexml.PageXML()
for line_idx, line in enumerate(flines):
    vals = line.split(',')
    x0 = int(vals[1])
    x1 = int(vals[3])
    y0 = int(vals[2])
    y1 = int(vals[4])
    width = x1 - x0
    height = y1 - y0
    tag = vals[5]
    text = vals[6].strip()
    im_file = vals[0]
    # File name without directory and extension.
    image_id = im_file.split('.')[-2].split('/')[-1]
    gt_im_path = im_file.split('.')[0] + '_gt.jpg'
def get_parser():
    """Build the command line parser of this script.

    Returns:
        ArgumentParser: Parser with a positional 'xml' argument (Page XML
        file to process) and a '--with_conf' option (default False).
    """
    parser = ArgumentParser()
    parser.add_argument('xml', action=ActionPath(mode='fr'), help='Page XML file to process.')
    parser.add_argument('--with_conf', type=bool, default=False, help='Whether to include confidences.')
    return parser


if __name__ == '__main__':
    cfg = get_parser().parse_args()
    pxml = pagexml.PageXML(cfg.xml())
    # Page identifier: image filename up to its first dot.
    page_id = pxml.getValue('//@imageFilename').split('.')[0]
    # Each region id is written to stdout as '<page_id>.<region id>',
    # followed by a tab.
    for xreg in pxml.select('//_:TextRegion'):
        reg_id = page_id + '.' + pxml.getValue('@id', xreg)
        sys.stdout.write(reg_id + '\t')
        if cfg.with_conf:
            # Region confidence is the lowest TextEquiv confidence inside it.
            # NOTE(review): min() raises ValueError for a region without any
            # conf attribute - confirm inputs always carry one.
            conf = min([
                float(pxml.getValue(e))
                for e in pxml.select('.//_:TextEquiv/@conf', xreg)
            ])
            sys.stdout.write(str(conf) + '\t')
        reg_text = ''
def generate_pagexml(image_id, data, retinanet, score_threshold, dataset_val,
                     nms_threshold):
    """Write the model's detections for one sample as PageXML and images.

    Detections scoring above ``score_threshold`` are stored as Words (with
    the text decoded from the box vector and the class label) in a single
    TextLine, then put in reading order. Outputs:

    - 'pagexmls/<image_id>.jpg': the unnormalized input image,
    - 'pagexmls/<name>.xml': the PageXML,
    - '<image_id>.jpg' in the working directory: the image with boxes,
      captions and a class legend drawn on it.

    Args:
        image_id (str): Image identifier used to build the output names.
        data (dict): Sample with key 'img'.
        retinanet: Model called with ``[image, nms_threshold]``; returns
            scores, classification and box vectors.
        score_threshold: Minimum score for a detection to be kept.
        dataset_val: Object whose ``labels`` maps class indices to names.
        nms_threshold: Forwarded to the model.
    """
    image_name = image_id + '.jpg'
    file = 'pagexmls/' + image_name
    # NOTE(review): this alphabet has 28 characters (there is a space
    # between 'y' and 'z') while the class slices below are 27 wide, so
    # index 26 decodes to ' ' and 'z' is unreachable - confirm intended.
    alphabet = " abcdefghijklmnopqrstuvwxy z"
    n_classes = 27  # width of each per-character score slice
    colors = get_n_random_colors(len(dataset_val.labels))
    # File name without directory and extension.
    gtxml_name = image_name.split('/')[-1].split('.')[-2]
    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        st = time.time()
        im = data['img']
        im = im.cuda().float()
        scores, classification, transformed_anchors = retinanet(
            [im, nms_threshold])
        print('Elapsed time: {}'.format(time.time() - st))
        kept = np.where(scores > score_threshold)[0]

        # Undo the input normalization and convert to an 8-bit HxWxC image.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img[img < 0] = 0
        img[img > 255] = 255
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]
        # Saved before anything is drawn on it.
        cv2.imwrite(file, img)

        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height)

        # Class legend in the top-left corner.
        for k, label in enumerate(dataset_val.labels):
            cv2.putText(img, label, (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)

        for j, det in enumerate(kept):
            bbox = transformed_anchors[det, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            class_id = int(classification[det])
            label_name = dataset_val.labels[class_id]
            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[class_id],
                          thickness=2)

            word = pxml.addWord(line, "ID" + str(j))
            # setCoordsBBox takes x, y, width, height.
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1)

            # Box vector layout as read here: [0:4] corners, [4] sequence
            # length, then one n_classes-wide score slice per character.
            seq_len = int(bbox[4])
            char_ids = np.array([
                np.argmax(bbox[(5 + k * n_classes):(5 + (k + 1) * n_classes)])
                for k in range(seq_len + 1)
            ])
            # Decode once and reuse for both the caption and the XML text.
            text = "".join([alphabet[char_id] for char_id in char_ids])
            draw_caption(img, (x1, y1, x2, y2), text)
            pxml.setTextEquiv(word, text)
            pxml.setProperty(word, "category", label_name)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine', 0)

        # groups[g] is the number of consecutive entries of `order` in group
        # g. enumerate() replaces the former order.index(n) lookup, which
        # made this loop quadratic.
        group_idx = 0
        idx_in_group = 0
        for word_idx, n in enumerate(order):
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write XML to file
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(str(image_id) + '.jpg', img)