Ejemplo n.º 1
0
def process_file(gts_path, new_path, overwrite=False, remove_text=True):
    """Replace the text lines of a ground-truth PageXML with those of another file.

    The document at ``new_path`` is resized to the page sizes of the document
    at ``gts_path``; every TextLine of the ground truth is removed and the
    text lines of the resized document are copied in with
    ``copyTextLinesAssignByOverlap``. Each resulting line is passed to
    ``add_baseline``.

    Args:
        gts_path (str): Path of the ground-truth PageXML (also printed).
        new_path (str): Path of the PageXML providing the new text lines.
        overwrite (bool): When true the result is written back to
            ``gts_path``; otherwise to ``gts_path`` with "page" replaced by
            "page_gts".
        remove_text (bool): When true, Word and TextEquiv elements below each
            copied text line are removed.
    """
    gt_doc = pagexml.PageXML(gts_path)
    print(gts_path)
    page_sizes = gt_doc.getPagesSize()

    entity_doc = pagexml.PageXML(new_path)
    entity_doc.resize(page_sizes)

    # Drop the ground-truth lines first ...
    gt_doc.rmElems(gt_doc.select(".//_:TextLine"))

    # ... then bring in the lines of the other document (lines are copied
    # before any region handling so that removals do not take the entities
    # with them).
    gt_doc.copyTextLinesAssignByOverlap(entity_doc)

    for text_line in gt_doc.select(".//_:TextLine"):
        add_baseline(gt_doc, text_line)
        if not remove_text:
            continue
        gt_doc.rmElems(gt_doc.select(".//_:Word", text_line))
        gt_doc.rmElems(gt_doc.select(".//_:TextEquiv", text_line))

    # NOTE(review): str.replace rewrites every "page" occurrence in the path,
    # directory names included -- confirm this is intended.
    out_path = gts_path if overwrite else gts_path.replace("page", "page_gts")

    gt_doc.write(out_path)
Ejemplo n.º 2
0
 def test_groups(self):
     """Exercise group creation, adding members and nested-group selection."""
     doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))

     def member_ids(group, *extra):
         # 'id' attribute of every element selectGroupElements returns.
         return [
             doc.getAttr(member, 'id')
             for member in doc.selectGroupElements(group, *extra)
         ]

     lines_r2 = doc.select('//*[@id="r2"]/_:TextLine')
     lines_r4 = doc.select('//*[@id="r4"]/_:TextLine')
     auto_group = doc.addGroup(None, lines_r2)
     named_group = doc.addGroup('grX')
     added = doc.addToGroup(named_group, lines_r4)
     self.assertEqual(added, len(lines_r4))

     first_ids = member_ids(auto_group)
     second_ids = member_ids(named_group)
     self.assertEqual(first_ids, ['r2_l1', 'r2_l2'])
     self.assertEqual(second_ids, ['r4_l1', 'r4_l2'])

     # Nest the named group inside the first one; the boolean argument
     # switches between listing the nested group itself and its members.
     doc.addToGroup(auto_group, (named_group, ))
     direct_ids = member_ids(auto_group, False)
     expanded_ids = member_ids(auto_group, True)
     self.assertEqual(direct_ids, ['r2_l1', 'r2_l2', 'grX'])
     self.assertEqual(expanded_ids, ['r2_l1', 'r2_l2', 'r4_l1', 'r4_l2'])
Ejemplo n.º 3
0
def read_pagexml(xml_file):
    """Build a PageXML object from a file path or from an XML string.

    Args:
        xml_file (str): Path of an existing PageXML file, or the XML content
            itself. It is treated as a path only when ``os.path.exists``
            reports that it exists.

    Returns:
        pxml: The loaded PageXML object.
    """
    document = pagexml.PageXML()
    # Existing path -> load from disk; anything else is parsed as XML text.
    loader = (document.loadXml
              if os.path.exists(xml_file) else document.loadXmlString)
    loader(xml_file)
    return document
Ejemplo n.º 4
0
    def start_processing(thread, process_queue):
        """Worker loop serving recognition requests taken from ``process_queue``.

        Never returns. Each queue item is a ``(done_queue, req_dict)`` pair.
        For every request the options in ``req_dict['options']`` are extended
        with the input file(s) and an output path, ``run_tesseract_recognize``
        is called, and a ``(thread, num_requests, result)`` tuple is put on
        the request's ``done_queue``. ``result`` is the PageXML loaded from
        the output file, or the exception describing the failure.

        Args:
            thread: Value echoed back as the first element of every result
                tuple (presumably a worker identifier -- confirm with caller).
            process_queue: Queue from which ``(done_queue, req_dict)`` pairs
                are taken. ``req_dict`` is read at keys 'options', 'pagexml'
                and 'images'.
        """

        num_requests = 0
        tmpdir = None
        while True:
            try:
                # Block for at most 0.05 s; queue.Empty sends us back to the
                # top of the loop.
                done_queue, req_dict = process_queue.get(True, 0.05)
                num_requests += 1
                tmpdir = write_to_tmpdir(req_dict)

                opts = list(req_dict['options'])
                # A single option starting with '[' is parsed as a JSON list
                # holding the real options.
                if len(opts) == 1 and opts[0][0] == '[':
                    opts = json.loads(opts[0])
                # Input: the page xml when given, otherwise every image, all
                # addressed by base name inside tmpdir.
                if req_dict['pagexml'] is not None:
                    opts.append(
                        os.path.join(
                            tmpdir,
                            os.path.basename(req_dict['pagexml']['filename'])))
                elif req_dict['images'] is not None:
                    for image in req_dict['images']:
                        opts.append(
                            os.path.join(tmpdir,
                                         os.path.basename(image.filename)))
                else:
                    raise KeyError('No images found in request.')
                opts.extend(['-o', os.path.join(tmpdir, 'output.xml')])

                rc, out = run_tesseract_recognize(*opts)
                if rc != 0:
                    raise RuntimeError(
                        'tesseract-recognize execution failed :: opts: ' +
                        str(opts) + ' :: ' + str(out))

                pxml = pagexml.PageXML(os.path.join(tmpdir, 'output.xml'))
                done_queue.put((thread, num_requests, pxml))

            except queue.Empty:
                # Nothing to do yet; poll again.
                continue
            except json.decoder.JSONDecodeError as ex:
                # Report malformed JSON options together with the raw text.
                done_queue.put((thread, num_requests,
                                RuntimeError('JSONDecodeError: ' + str(ex) +
                                             ' while parsing ' + opts[0])))
            except Exception as ex:
                # Any other failure is handed to the requester as-is.
                done_queue.put((thread, num_requests, ex))
            finally:
                # Remove the request's temporary directory unless debugging;
                # resetting tmpdir prevents a second removal on idle polls.
                if not cfg.debug and tmpdir is not None:
                    shutil.rmtree(tmpdir)
                    tmpdir = None
Ejemplo n.º 5
0
 def test_coords_resize(self):
     """Check that resize() rescales page dimensions and region coordinates."""
     doc = pagexml.PageXML()
     doc.newXml('test', 'test1.jpg', 24, 32)
     page_sizes = doc.getPagesSize()
     region = doc.addTextRegion(doc.selectNth('//_:Page'))
     doc.setCoordsBBox(region, 2, 10, 5, 20, 0.8)

     # Resize by a scale factor.
     doc.resize(2.0)
     self.assertEqual(doc.getPageWidth(0), 48)
     self.assertEqual(doc.getPageHeight(0), 64)
     corners = doc.getPoints(region)
     xs = [corner.x for corner in corners]
     self.assertListEqual(xs, [4.0, 12.0, 12.0, 4.0])
     ys = [corner.y for corner in corners]
     self.assertListEqual(ys, [20.0, 20.0, 58.0, 58.0])

     # Resize to explicit per-page sizes.
     page_sizes[0].width = 12
     page_sizes[0].height = 16
     doc.resize(page_sizes)
     self.assertEqual(doc.getPageWidth(0), 12)
     self.assertEqual(doc.getPageHeight(0), 16)
Ejemplo n.º 6
0
def TypePageXML(value):
    """Parse Page XML request type.

    Args:
        value: The raw type value; must be a FileStorage (subclasses are
            accepted).

    Returns:
        dict[str, {str,PageXML}]: Dictionary including the page xml 'filename', the 'string' representation and the PageXML 'object'.

    Raises:
        ValueError: If ``value`` is not a FileStorage instance.
    """
    # isinstance instead of an exact type comparison, so that FileStorage
    # subclasses are not rejected.
    if not isinstance(value, FileStorage):
        raise ValueError('Expected pagexml to be of type FileStorage.')

    # The upload is read in full and decoded as UTF-8.
    spxml = value.read().decode('utf-8')
    pxml = pagexml.PageXML()
    pxml.loadXmlString(spxml)

    return {'filename': value.filename, 'object': pxml, 'string': spxml}
Ejemplo n.º 7
0
 def test_pages_sizes(self):
     """Check page sizes, page count and image names of a two-page document."""
     doc = pagexml.PageXML()
     doc.newXml('test', 'test1.jpg', 24, 32)
     doc.addPage("test2.jpg", 48, 64)
     page_sizes = doc.getPagesSize()

     # (image filename, width, height) per page, in page order.
     expected = [('test1.jpg', 24, 32), ('test2.jpg', 48, 64)]

     for num, (_, width, height) in enumerate(expected):
         self.assertEqual(doc.getPageWidth(num), width)
         self.assertEqual(doc.getPageHeight(num), height)
     for num, (_, width, height) in enumerate(expected):
         self.assertEqual(page_sizes[num].width, width)
         self.assertEqual(page_sizes[num].height, height)
     self.assertEqual(len(page_sizes), 2)
     self.assertEqual(doc.count('//_:Page'), 2)
     for num, (image_name, _, _) in enumerate(expected):
         self.assertEqual(doc.getPageImageFilename(num), image_name)
Ejemplo n.º 8
0
    def forward(self, rois):
        """Compute a reading order for the given boxes via a temporary PageXML.

        A 1200x800 page with one region and one text line is created, one
        Word is added per row of ``rois`` and the words are ordered with
        ``getLeftRightTopBottomReadingOrder``.

        Args:
            rois: Indexable as ``rois[i, 0..3]`` giving x1, y1, x2, y2 of box
                ``i``; each entry must support ``.cpu().detach()``
                (presumably a 2-D torch tensor -- confirm with caller).

        Returns:
            The ``order`` sequence returned by
            ``getLeftRightTopBottomReadingOrder`` (indices into the words).
        """
        pxml = pagexml.PageXML()

        pxml.newXml('retinanet_dets', 'image', 1200, 800)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, 1200, 800)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, 1200, 800)
        for roi in range(rois.shape[0]):
            word = pxml.addWord(line)
            x1 = int(rois[roi, 0].cpu().detach())
            y1 = int(rois[roi, 1].cpu().detach())
            x2 = int(rois[roi, 2].cpu().detach().int())
            y2 = int(rois[roi, 3].cpu().detach().int())

            # Word bounding box given as x, y, width, height.
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')
        group_idx = 0
        idx_in_group = 0
        # enumerate() yields each word's position directly; the previous
        # order.index(n) lookup rescanned the list on every iteration. The
        # two agree because a reading order lists each word index once.
        for word_idx, n in enumerate(order):
            # groups[i] is the number of words in the i-th group (line).
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0

            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        return order
Ejemplo n.º 9
0
 def test_multiple_typed_textequivs(self):
     """Check that typed TextEquivs coexist with the untyped one."""
     doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
     line = doc.selectByID('r1_l1')

     # (type, text, confidence, xpath of the stored confidence)
     typed = [
         ('type1', 'first typed text', 0.9,
          '_:TextEquiv[@type="type1"]/@conf'),
         ('type2', 'second typed text', 0.4,
          '_:TextEquiv[@type="type2"]/@conf'),
     ]
     for kind, text, conf, _ in typed:
         doc.setTextEquiv(line, text, conf, kind)

     # The untyped text is untouched and each typed text is retrievable.
     self.assertEqual('Lorem ipsum', doc.getTextEquiv(line, type=''))
     for kind, text, _, _ in typed:
         self.assertEqual(text, doc.getTextEquiv(line, type=kind))
     for _, _, conf, conf_xpath in typed:
         self.assertEqual(conf, float(doc.getValue(conf_xpath, line)))

     # Re-setting a type without confidence replaces text and drops @conf.
     replacement = 'new type1 text'
     doc.setTextEquiv(line, replacement, None, 'type1')
     self.assertEqual(replacement, doc.getTextEquiv(line, type='type1'))
     self.assertEqual(
         '', doc.getValue('_:TextEquiv[@type="type1"]/@conf', line))
Ejemplo n.º 10
0
def process_pdf_pagexml(input):
    """Processing of non-ground truth pdf page xmls.

    Loads the page xml and hands it to
    ``pagexmltools.process.page_region_with_ordered_textlines`` with
    ``fake_baseline=True``, which is expected to:

    - For each page in a page xml
    * Reorder all TextLines in each page ignoring parent TextRegion.
    * Create a full page TextRegion.
    * Move all TextLines in new order to page TextRegion.
    - Relabel line IDs to ease preservation of their order.

    Args:
        input (str): The input page xml file path.

    Returns:
        PageXML object.

    Raises:
        Exception: Any error raised while loading or processing is printed
            and then re-raised unchanged.
    """
    # The docstring used to live inside the try block, where Python treats it
    # as a plain expression and the function ended up without documentation.
    try:
        pxml = pagexml.PageXML()
        pxml.loadXml(input)
        pagexmltools.process.page_region_with_ordered_textlines(pxml, fake_baseline=True)
        return pxml
    except Exception as e:
        print(e)
        # Bare raise keeps the original traceback intact.
        raise
def main(args=None):
    """Convert the PageXML files below a directory into a RetinaNet CSV.

    Every file whose name contains '.xml' under ``--pxml_dir`` is loaded and
    one CSV row ``image, x0, y0, x1, y1, label, transcription`` is written
    per Word with a non-degenerate box. When ``--classes_out`` is given, the
    distinct word tags are written to that file as ``tag, index`` rows in
    order of first appearance.

    Args:
        args (list[str] | None): Command-line arguments; ``None`` lets
            argparse read ``sys.argv``.

    Note:
        ``--seg_lev`` is parsed but not used by this function. Any non-empty
        ``--get_property`` value (including "False") enables property labels,
        because the option is read as a plain string.
    """
    parser = argparse.ArgumentParser(
        description=
        'Convert pagexml files to RetinaNet network csv groundtruth.')

    parser.add_argument('--pxml_dir',
                        help='Path of directory with pagexml files.',
                        default=".")
    parser.add_argument('--fout',
                        help='Path of gt file to be read by the model.',
                        default="train.csv")
    parser.add_argument('--classes_out',
                        help='Path to save text category classes.')
    parser.add_argument(
        '--seg_lev',
        help='segmentation level of the boxes to get (Word/TextLine)',
        default="Word")
    # The help text used to be a copy of the --seg_lev one.
    parser.add_argument(
        '--get_property',
        help='if set to a non-empty value, label each box with its property '
        'value (falling back to the region property) instead of "text"',
        default=False)
    parser.add_argument('--property_key',
                        help='key to get property from pagexml',
                        default='category')

    opts = parser.parse_args(args)
    pagexml.set_omnius_schema()
    pxml = pagexml.PageXML()
    all_tags = []

    # newline='' is what the csv module asks for on files it writes to.
    with open(opts.fout, 'w', newline='') as csv_out:
        writer = csv.writer(csv_out, delimiter=',')
        for root, dirs, files in os.walk(
                os.path.join(os.getcwd(), opts.pxml_dir)):
            for f in files:
                if '.xml' not in f:
                    continue
                pxml.loadXml(os.path.join(root, f))
                for page in pxml.select('_:Page'):
                    page_im_file = os.path.join(
                        os.getcwd(), root, pxml.getPageImageFilename(page))
                    for region in pxml.select('_:TextRegion', page):
                        reg_tag = pxml.getPropertyValue(
                            region, key=opts.property_key)
                        for text_line in pxml.select('_:TextLine', region):
                            for word in pxml.select('_:Word', text_line):
                                x0, y0, x1, y1, transcription, tag = get_coords_and_transcript(
                                    pxml, word, opts.property_key)
                                # Tags are recorded even for skipped boxes.
                                if tag not in all_tags:
                                    all_tags.append(tag)
                                # Skip empty or inverted boxes.
                                if x0 >= x1 or y0 >= y1:
                                    continue
                                if opts.get_property:
                                    # Word tag, else the region's value.
                                    label = tag if len(tag) > 0 else reg_tag
                                else:
                                    label = 'text'
                                writer.writerow([
                                    page_im_file, x0, y0, x1, y1, label,
                                    transcription
                                ])

    # The classes file is optional; it used to be referenced even when
    # --classes_out was absent, which raised a NameError.
    if opts.classes_out is not None:
        with open(opts.classes_out, 'w', newline='') as classes_out:
            writer_classes = csv.writer(classes_out, delimiter=',')
            for idx, tag in enumerate(all_tags):
                writer_classes.writerow([tag, idx])
Ejemplo n.º 12
0
import os
import pagexml
import glob

# Script: resize the page images of one document directory to the page
# dimensions recorded in its page.xml, using ImageMagick's `mogrify`.
doc_dir = 'fed90951-622d-47cb-957d-fe95b2addb3'
print(doc_dir)
xml_file = os.path.join(doc_dir, 'page.xml')
pxml = pagexml.PageXML(xml_file)

page_list = pxml.select('//_:Page')
for p, page in enumerate(page_list):
    # Target size comes from the Page element itself.
    width = pxml.getPageWidth(page)
    height = pxml.getPageHeight(page)

    # Images are expected to be named by zero-based page position.
    img_file = 'image_%d.jpeg' % p

    # Only changes the in-memory document: the write at the end of the
    # script is commented out.
    pxml.setPageImageFilename(page, img_file)

    #cmd = 'mogrify -resize %dx%d %s' % (width, height, img_file)
    cmd = 'mogrify -resize {}x{} {}'.format(width, height,
                                            os.path.join(doc_dir, img_file))

    print(cmd)
    # NOTE(review): the command goes through the shell as one string, so a
    # path containing spaces or shell metacharacters would break it; the
    # exit status is ignored.
    os.system(cmd)
# pxml.write(xml_file)
Ejemplo n.º 13
0
 def test_get_textequiv(self):
     """Check that the text of line 'r1_l1' in the example file is read."""
     doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
     line = doc.selectByID('r1_l1')
     self.assertEqual(doc.getTextEquiv(line), 'Lorem ipsum')
Ejemplo n.º 14
0
    cfg = {}
    if opts.cfg is not None:
        with open(opts.cfg) as json_config:
            cfg = json.load(json_config)['feats']
        logger.info('Loaded configuration.')
    if opts.legacy_feats:
        if opts.savexml:
            cfg['savexml'] = opts.outdir
        extractor = textfeat.LegacyTextFeatExtractor(**cfg)
    else:
        extractor = textfeat.TextFeatExtractor(**cfg)

    ### Loop through input page xmls ###
    for fpxml in opts.input:
        logger.info('Processing page xml: ' + fpxml)
        pxml = pagexml.PageXML(fpxml)

        if opts.legacy_feats:
            feats = extractor.extractFeats(fpxml, xpath=opts.xpath)
        else:
            cropped = pxml.crop(opts.xpath)
            feats = []
            for crop in cropped:
                image = crop.image
                extractor.preprocess(image)
                slope = textfeat.ptr_float()
                slant = textfeat.ptr_float()
                extractor.estimateAngles(image, slope, slant, crop.rotation)
                image = extractor.extractFeats(image,
                                               slope.value(),
                                               slant.value(),
Ejemplo n.º 15
0
def get_transcript(image_id, data, retinanet, score_threshold, nms_threshold,
                   dataset_val, alphabet):
    """Run the model on one image and return its transcript in reading order.

    Detections scoring above ``score_threshold`` are added as Words to an
    in-memory PageXML, ordered with ``getLeftRightTopBottomReadingOrder`` and
    their texts joined with single spaces. Nothing is written to disk.

    Args:
        image_id (str): Image identifier; ``image_id + '.jpg'`` is used as
            the page image name.
        data: Mapping read at 'img' and, when ``retinanet.module.htr_gt_box``
            is set, at 'annot'.
        retinanet: Model; called with the image moved to CUDA, it must return
            ``(scores, classification, transformed_anchors, transcriptions)``.
            Its ``training`` attribute is set to False.
        score_threshold: Minimum score for a detection to be kept.
        nms_threshold: Not used by this function.
        dataset_val: Object whose ``labels`` maps class indices to names.
        alphabet: Passed to ``labels_to_text`` to decode label sequences.

    Returns:
        str: Space-separated text of the kept words in reading order.
    """
    image_name = image_id + '.jpg'
    retinanet.training = False

    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        im = data['img']
        im = im.cuda().float()
        if retinanet.module.htr_gt_box:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                [im, data['annot']])
        else:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                im)
        idxs = np.where(scores.cpu() > score_threshold)

        # The un-normalized image is only needed for its width and height.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img[img < 0] = 0
        img[img > 255] = 255
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]

        # NOTE(review): conf is handed to setCoordsBBox before its first
        # assign() -- confirm what value an unassigned ptr_double carries.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)

        # Most likely label per position of every transcription.
        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for j in range(idxs[0].shape[0]):
            det = idxs[0][j]
            bbox = transformed_anchors[det, :]
            # Skip detections that have no transcription row.
            if det >= transcriptions.shape[0]:
                continue
            transcription = transcriptions[det, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            label_name = dataset_val.labels[int(classification[det])]

            # One Word per detection, box given as x, y, width, height.
            word = pxml.addWord(line, "ID" + str(j))
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)

            transcription = labels_to_text(transcription, alphabet)

            # Decoded text is stored with a fixed confidence of 0.9.
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)

            pxml.setProperty(word, "category", label_name)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')
        group_idx = 0
        idx_in_group = 0
        transcript_pred = []
        # enumerate() replaces the former order.index(n) lookup, which
        # rescanned the list for every word; a reading order lists each word
        # index once, so both give the same position.
        for word_idx, n in enumerate(order):
            # groups[i] is the number of words in the i-th group (line).
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0
            transcript_pred.append(pxml.getTextEquiv(words[n]))
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        return " ".join(transcript_pred)
Ejemplo n.º 16
0
def blend_single_xml(xmlfile, loc, img_list, xpath):
    """Blend randomly chosen images into the page image of a PageXML file.

    Args:
        xmlfile (str): path to the xml file whose page image is the target
        loc (str): 'left', 'right', 'top', 'bottom', 'top_bottom',
            'left_right' or 'centre'. The two-part locations blend a second,
            independently chosen image into the other half. Any other value
            prints 'Insert location' and leaves the page image unchanged.
        img_list (list): a list of image files to choose from
        xpath (str): xpath selecting the element whose bounding box is the
            blending target region

    Returns:
        numpy.ndarray: blended document
    """

    def _random_overlay():
        # Load a random image from img_list together with its rectangle.
        overlay = cv2.imread(random.choice(img_list))
        return overlay, get_img_rect(overlay)

    image, image_rect = _random_overlay()

    pxml = pagexml.PageXML(xmlfile)
    doc_rect = coords2rect(elem_bbox(pxml, xpath))
    page_image = pxml.getPageImageFilename(0)
    document = cv2.imread(os.path.join(os.path.dirname(xmlfile), page_image))

    def _target(part):
        # Part of the target region that one placement covers.
        if part == 'left':
            return Rect(doc_rect.l, doc_rect.t,
                        int((doc_rect.l + doc_rect.r) / 2), doc_rect.b)
        if part == 'right':
            return Rect(int((doc_rect.l + doc_rect.r) / 2), doc_rect.t,
                        doc_rect.r, doc_rect.b)
        if part == 'top':
            return Rect(doc_rect.l, doc_rect.t, doc_rect.r,
                        int((doc_rect.t + doc_rect.b) / 2))
        if part == 'bottom':
            return Rect(doc_rect.l, int((doc_rect.t + doc_rect.b) / 2),
                        doc_rect.r, doc_rect.b)
        return doc_rect  # 'centre': the whole target region

    # Location name -> parts to fill, in blending order.
    placements = (
        ('left', ('left', )),
        ('right', ('right', )),
        ('top', ('top', )),
        ('bottom', ('bottom', )),
        ('centre', ('centre', )),
        ('left_right', ('left', 'right')),
        ('top_bottom', ('top', 'bottom')),
    )
    for name, parts in placements:
        if loc == name:
            for position, part in enumerate(parts):
                if position:
                    # Every part after the first gets a fresh random image.
                    image, image_rect = _random_overlay()
                document = blend(document, _target(part), image_rect, image)
            return document

    print('Insert location')
    return document
Ejemplo n.º 17
0
def generate_pagexml(image_id, data, retinanet, score_threshold, nms_threshold,
                     dataset_val):
    """Run the model on one image and save its predictions as PageXML.

    Three files are written: the un-normalized input image to
    ``pagexmls/<image_id>.jpg``, the predictions to ``pagexmls/<name>.xml``
    (``<name>`` being the image base name without extension) and an annotated
    copy of the image to ``pred_sample_ims/<image_id>.jpg``. Every predicted
    box becomes a Word; no score filtering is applied here.

    Args:
        image_id (str): Image identifier used to build the file names.
        data: Mapping read at 'img' and, when ``retinanet.htr_gt_box`` is
            set, at 'annot'.
        retinanet: Model; called with the image moved to CUDA, it must return
            ``(scores, classification, transformed_anchors, transcriptions)``.
            Its ``alphabet`` is used to decode transcriptions.
        score_threshold: Not applied to the predictions by this function.
        nms_threshold: Not used by this function.
        dataset_val: Object whose ``labels`` maps class indices to names.
    """
    image_name = image_id + '.jpg'
    im_file_out = 'pagexmls/' + image_name
    alphabet = retinanet.alphabet
    colors = get_n_random_colors(len(dataset_val.labels))
    gtxml_name = os.path.join(image_name.split('/')[-1].split('.')[-2])

    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        im = data['img']

        im = im.cuda().float()
        print(retinanet.htr_gt_box)
        if retinanet.htr_gt_box:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                [im, data['annot']])
            # No effect: the threshold is never applied below.
            score_threshold = 0
        else:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                im)

        n_boxes_predicted = transformed_anchors.shape[0]
        print(n_boxes_predicted, "BOXES PREDICTED")

        # Rebuild a displayable uint8 image from the normalized input.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img[img < 0] = 0
        img[img > 255] = 255
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]
        cv2.imwrite(im_file_out, img)

        # NOTE(review): conf is handed to setCoordsBBox before its first
        # assign() -- confirm what value an unassigned ptr_double carries.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)

        # Colour legend: one label name per row in the top-left corner.
        for k in range(len(dataset_val.labels)):
            cv2.putText(img, dataset_val.labels[k], (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)

        # Most likely label per position of every transcription.
        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for box_id in range(n_boxes_predicted):
            bbox = transformed_anchors[box_id, :]
            transcription = transcriptions[box_id, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            label_name = dataset_val.labels[int(classification[box_id])]

            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[int(classification[box_id])],
                          thickness=2)

            # One Word per box, given as x, y, width, height.
            word = pxml.addWord(line, "ID" + str(box_id))
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)

            transcription = labels_to_text(transcription, alphabet)
            draw_caption(img, (x1, y1, x2, y2), transcription)

            # Decoded text is stored with a fixed confidence of 0.9.
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)

            pxml.setProperty(word, "category", label_name)

        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')
        group_idx = 0
        idx_in_group = 0
        # enumerate() replaces the former order.index(n) lookup, which
        # rescanned the list for every word; a reading order lists each word
        # index once, so both give the same position.
        for word_idx, n in enumerate(order):
            # groups[i] is the number of words in the i-th group (line).
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0

            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write XML to file
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(os.path.join('pred_sample_ims',
                                 str(image_id) + '.jpg'), img)
    """
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
    return sorted(l, key=alphanum_key)


# Directory holding the ground-truth page xml files.
GT_DIR = 'datasets/esposalles/data-esposalles/test'
# NOTE(review): the result of this call is discarded; it looks like a
# leftover experiment with the digit-extraction pattern used below.
re.sub("[^0-9]", "", "sdkjh987978asd098as0980a98sd")

# Output directory for the per-record text predictions.
dir_out = 'preds_txt'
if not os.path.exists(dir_out): os.mkdir(dir_out)

for f in sorted_nicely(os.listdir('pagexmls')):
    # The first 24 characters of the file name identify the ground-truth xml.
    gt_xml = f[:24]
    gt_xml = os.path.join(GT_DIR, gt_xml + '.xml')
    pxml = pagexml.PageXML()

    pxml.loadXml(gt_xml)
    page = pxml.select('_:Page')
    page_id = pxml.getPropertyValue(page[0], key='idPage')
    # Record number: digits among the last two characters of the name
    # (before the first '.'), plus one.
    record_id = f.split('.')[0][-2:]
    record_id = str(int(re.sub("[^0-9]", "", record_id)) + 1)

    # Predicted page xml with the same base name.
    pred_xml_file = f.split('.')[0] + '.xml'
    pred_xml = pagexml.PageXML()
    pred_xml.loadXml(os.path.join('pagexmls', pred_xml_file))
    words = pred_xml.select('//_:Word')
    # NOTE(review): no close() for this handle is visible in this excerpt.
    fout = open(
        os.path.join(
            dir_out,
            "idPage" + page_id + "_Record" + record_id + "_output.txt"), "w")
Ejemplo n.º 19
0
 def test_get_textequiv_grandchildren(self):
     """Check joining the TextEquivs of a region's lines with a separator."""
     doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
     region = doc.selectByID('r2')
     joined = doc.getTextEquiv(region,
                               xpath='_:TextLine/_:TextEquiv',
                               separator='\n')
     self.assertEqual(joined, 'dolor sit amet, in per iusto\nnostrud,')
Ejemplo n.º 20
0
 def test_set_textequiv(self):
     """Check that a text set on a line is read back unchanged."""
     doc = pagexml.PageXML(os.path.join(TESTSDIR, 'examples/lorem.xml'))
     line = doc.selectByID('r1_l1')
     new_text = 'new text'
     doc.setTextEquiv(line, new_text)
     self.assertEqual(new_text, doc.getTextEquiv(line))
Ejemplo n.º 21
0
import sys
import numpy as np
import os
import pdb
import cv2

# Script: read a comma-separated ground-truth file (path given as the first
# command-line argument) with rows ``image, x0, y0, x1, y1, tag, text``.
fin = sys.argv[1]
# The context manager closes the input file; it used to stay open. A
# leftover pdb.set_trace() breakpoint that halted every run was removed.
with open(fin, 'r') as f:
    flines = f.readlines()
all_wh = np.zeros((len(flines), 2))
prev_im = ""
if not os.path.exists('pagexmls'):
    os.mkdir('pagexmls')

pxml = pagexml.PageXML()

for line_idx, line in enumerate(flines):
    vals = line.split(',')
    # Columns 1-4 hold the box corners in the order x0, y0, x1, y1.
    x0 = int(vals[1])
    x1 = int(vals[3])
    y0 = int(vals[2])
    y1 = int(vals[4])
    width = x1 - x0
    height = y1 - y0
    tag = vals[5]
    text = vals[6].strip()
    im_file = vals[0]
    # Base name of the image without directory and extension.
    image_id = im_file.split('.')[-2].split('/')[-1]
    gt_im_path = im_file.split('.')[0] + '_gt.jpg'
def get_parser():
    """Build the command-line parser of this script.

    Returns:
        ArgumentParser: Parser with a positional ``xml`` path argument
        (``ActionPath`` with mode 'fr') and a ``--with_conf`` boolean option
        defaulting to False.
    """
    cli = ArgumentParser()
    cli.add_argument(
        'xml',
        help='Page XML file to process.',
        action=ActionPath(mode='fr'),
    )
    cli.add_argument(
        '--with_conf',
        help='Whether to include confidences.',
        default=False,
        type=bool,
    )
    return cli


if __name__ == '__main__':
    # Entry point: for every TextRegion of the given Page XML, write
    # "<image name>.<region id>" and, optionally, a confidence to stdout,
    # separated by tabs.
    cfg = get_parser().parse_args()

    # cfg.xml is called to obtain the path of the input file.
    pxml = pagexml.PageXML(cfg.xml())
    # Page identifier: image file name up to its first '.'.
    page_id = pxml.getValue('//@imageFilename').split('.')[0]

    for xreg in pxml.select('//_:TextRegion'):

        reg_id = page_id + '.' + pxml.getValue('@id', xreg)
        sys.stdout.write(reg_id + '\t')

        if cfg.with_conf:
            # Region confidence is the lowest @conf among its TextEquivs.
            # NOTE(review): min() raises ValueError when the region has no
            # TextEquiv with a conf attribute.
            conf = min([
                float(pxml.getValue(e))
                for e in pxml.select('.//_:TextEquiv/@conf', xreg)
            ])
            sys.stdout.write(str(conf) + '\t')

        reg_text = ''
Ejemplo n.º 23
0
def generate_pagexml(image_id, data, retinanet, score_threshold, dataset_val,
                     nms_threshold):
    """Run the detector on one image and save its detections as PageXML.

    Three files are written: the un-normalized input image to
    ``pagexmls/<image_id>.jpg``, the detections to ``pagexmls/<name>.xml``
    (``<name>`` being the image base name without extension) and an annotated
    copy of the image to ``<image_id>.jpg`` in the current directory.

    Args:
        image_id (str): Image identifier used to build the file names.
        data: Mapping read at 'img'.
        retinanet: Model; called with ``[image, nms_threshold]``, it must
            return ``(scores, classification, transformed_anchors)``.
        score_threshold: Minimum score for a detection to be kept.
        dataset_val: Object whose ``labels`` maps class indices to names.
        nms_threshold: Forwarded to the model.
    """
    image_name = image_id + '.jpg'
    file = 'pagexmls/' + image_name
    # NOTE(review): this string has 28 characters ("y z" near the end) while
    # the slices decoded below are 27 wide, so the final 'z' can never be
    # selected -- looks like a typo, confirm.
    alphabet = " abcdefghijklmnopqrstuvwxy z"

    colors = get_n_random_colors(len(dataset_val.labels))
    gtxml_name = os.path.join(image_name.split('/')[-1].split('.')[-2])

    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        st = time.time()
        im = data['img']

        im = im.cuda().float()
        scores, classification, transformed_anchors = retinanet(
            [im, nms_threshold])
        print('Elapsed time: {}'.format(time.time() - st))
        # Indices of the detections scoring above the threshold.
        idxs = np.where(scores > score_threshold)
        # Rebuild a displayable uint8 image from the normalized input.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        #img = np.array(255 * unnormalize(im)).copy()

        img[img < 0] = 0
        img[img > 255] = 255

        img = np.transpose(img, (1, 2, 0))

        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]
        cv2.imwrite(file, img)

        # NOTE(review): conf is created and assigned below but never passed
        # to any call in this function.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height)
        # NOTE(review): this list is filled in the loop but replaced by the
        # select() result before it is read.
        words = []
        # Colour legend: one label name per row in the top-left corner.
        for k in range(len(dataset_val.labels)):
            cv2.putText(img, dataset_val.labels[k], (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)

        for j in range(idxs[0].shape[0]):

            # NOTE(review): box is never used.
            box = {}
            bbox = transformed_anchors[idxs[0][j], :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            label_name = dataset_val.labels[int(classification[idxs[0][j]])]

            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[int(classification[idxs[0][j]])],
                          thickness=2)

            # Add a Word for this detection to the text line
            word = pxml.addWord(line, "ID" + str(j))

            # Set the word bounding box as x, y, width, height
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1)

            #pxml.setCoordsBBox( reg,x1, y1, x2-x1, y2-y1, conf )

            # Decode the text carried by the box vector: element 4 is read as
            # the sequence length and each following run of 27 values is
            # reduced to its argmax (presumably per-character class scores --
            # confirm against the model).
            transcripts = []
            # NOTE(review): confs is never used.
            confs = []
            seq_len = int(bbox[4])
            for k in range(seq_len + 1):
                transcripts.append(
                    np.argmax(bbox[(5 + k * 27):((5 + (k + 1) * 27))]))
            transcripts = np.array(transcripts)
            # NOTE(review): transcript is computed but not used; the text is
            # rebuilt from the alphabet below.
            transcript = labels_to_text(transcripts, alphabet)
            draw_caption(
                img, (x1, y1, x2, y2), "".join([
                    alphabet[transcripts[k]] for k in range(len(transcripts))
                ]))

            # Set the text for the word (no confidence is passed)
            conf.assign(1)
            pxml.setTextEquiv(
                word, "".join([
                    alphabet[transcripts[k]] for k in range(len(transcripts))
                ]))

            # Add the class label as a property of the word
            pxml.setProperty(word, "category", label_name)

            words.append(word)
        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine', 0)
        group_idx = 0
        idx_in_group = 0
        #line= pxml.addTextLine(reg,"ID"+str(group_idx+1))
        for n in order:
            # Position of this word in the reading order.
            # NOTE(review): index() rescans the list on every iteration.
            word_idx = order.index(n)

            # groups[i] is compared against the number of words already
            # placed in the i-th group; move on once the group is full.
            if idx_in_group >= groups[group_idx]:
                #line = pxml.selectNth('//_:TextLine',group_idx,reg)
                #line= pxml.selectNth(reg)
                group_idx += 1
                idx_in_group = 0

            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write XML to file
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(str(image_id) + '.jpg', img)