Example 1
def test_orderedgroup_export_order():
    """
    See https://github.com/OCR-D/core/issues/475
    """
    # arrange
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [c.index for c in children] == list(range(0, 22))
    # mix up the indexes
    children[0].index = 11
    children[11].index = 3
    children[3].index = 0
    assert [c.index for c in children] == [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    assert [c.index for c in og.get_AllIndexed()] == list(range(0, 22))
    assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType
    # serialize and make sure the correct order was serialized
    new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [c.index for c in new_og.get_AllIndexed()] == list(range(0, 22))

    xml_after = to_xml(new_og)
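
Note: all examples on this page follow the same round trip: parse PAGE-XML into a PcGtsType object tree (with parseString or page_from_file), modify that tree, and serialize it back with to_xml. A minimal sketch of the round trip, assuming the ocrd_models.ocrd_page imports used throughout these examples and a placeholder input path:

from ocrd_models.ocrd_page import parseString, to_xml

# hypothetical input file; any valid PAGE-XML document will do
with open('PAGE_example.xml', 'rb') as f:
    pcgts = parseString(f.read(), silence=True)

# modify the object tree, e.g. set a new document identifier
pcgts.set_pcGtsId('EXAMPLE_0001')

# serialize back to a PAGE-XML string (with pc: namespace prefixes, cf. Examples 11 and 18)
xml_out = to_xml(pcgts)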
Example 2
 def test_to_xml(self):
     #  with open('/tmp/test.xml', 'w') as f:
     #  f.write(to_xml(self.pcgts))
     self.assertIn(
         ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"',
         to_xml(self.pcgts)[:1000])
     self.assertIn('</TextRegion', to_xml(self.pcgts))
Example 3
 def process(self):
     LOG = getLogger('ocrd.dummy')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for input_file in self.input_files:
         input_file = self.workspace.download_file(input_file)
         file_id = make_file_id(input_file, self.output_file_grp)
         ext = MIME_TO_EXT.get(input_file.mimetype, '')
         local_filename = join(self.output_file_grp, file_id + ext)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         pcgts.set_pcGtsId(file_id)
         self.add_metadata(pcgts)
         LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
         if input_file.mimetype == MIMETYPE_PAGE:
             # Source file is PAGE-XML: Write out in-memory PcGtsType
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=to_xml(pcgts).encode('utf-8'))
         else:
             # Source file is not PAGE-XML: Copy byte-by-byte
             with open(input_file.local_filename, 'rb') as f:
                 content = f.read()
                 self.workspace.add_file(
                     ID=file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=content)
             if input_file.mimetype.startswith('image/'):
                 # write out the PAGE-XML representation for this image
                 page_file_id = file_id + '_PAGE'
                 pcgts.set_pcGtsId(page_file_id)
                 pcgts.get_Page().set_imageFilename(local_filename)
                 page_filename = join(self.output_file_grp, file_id + '.xml')
                 LOG.info("Add PAGE-XML %s generated for %s at %s",
                          page_file_id, file_id, page_filename)
                 self.workspace.add_file(
                     ID=page_file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename=page_filename,
                     content=to_xml(pcgts).encode('utf-8'))
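
Note: most of the processor examples here end with the same write-back step: the modified PcGtsType is serialized with to_xml, UTF-8 encoded, and passed as content to workspace.add_file. A minimal sketch of that recurring pattern, reusing the names the examples themselves use (self.workspace, self.output_file_grp, input_file, make_file_id, MIMETYPE_PAGE) and assuming the surrounding OCR-D Processor context and imports:

# inside a Processor.process() implementation, after pcgts has been modified
file_id = make_file_id(input_file, self.output_file_grp)
pcgts.set_pcGtsId(file_id)
self.workspace.add_file(
    ID=file_id,
    file_grp=self.output_file_grp,
    pageId=input_file.pageId,
    mimetype=MIMETYPE_PAGE,
    local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
    content=to_xml(pcgts).encode('utf-8'))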
Example 4
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example 5
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example 6
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='deskewed')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 7
    def process(self):
        """
        Segment with ocropy
        """

        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            page_width = pcgts.get_Page().get_imageWidth()
            page_height = pcgts.get_Page().get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            binary = ocrolib.read_image_binary(
                self.workspace.download_url(image_url))
            binary = 1 - binary

            scale = self.parameter['scale'] if self.parameter[
                'scale'] != 0 else psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            dummyRegion = TextRegionType(
                id="dummy",
                Coords=CoordsType(
                    points="0,0 %s,0 %s,%s 0,%s" %
                    (page_width, page_width, page_height, page_height)))
            pcgts.get_Page().add_TextRegion(dummyRegion)

            for lineno in range(1, regions.length()):
                log.debug("id=%s bbox=%s", regions.id(lineno),
                          regions.bbox(lineno))
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(
                        points=points_from_y0x0y1x1(regions.bbox(lineno))))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
Example 8
    def process(self):
        network_file = self.parameter['network']
        stride = self.parameter['stride']
        classifier = TypegroupsClassifier.load(network_file)

        ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                       'Empty Pages', 'Woodcuts - Engravings')

        self.log.debug('Processing: %s', self.input_files)
        for (_, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            pil_image = self.workspace.resolve_image_as_pil(image_url)
            result = classifier.run(pil_image, stride)
            score_sum = 0
            for typegroup in classifier.classMap.cl2id:
                if not typegroup in ignore_type:
                    score_sum += max(0, result[typegroup])

            script_highscore = 0
            noise_highscore = 0
            result_map = {}
            output = ''
            for typegroup in classifier.classMap.cl2id:
                score = result[typegroup]
                if typegroup in ignore_type:
                    noise_highscore = max(noise_highscore, score)
                else:
                    script_highscore = max(script_highscore, score)
                    normalised_score = max(0, score / score_sum)
                    result_map[normalised_score] = typegroup
            if noise_highscore > script_highscore:
                pcgts.get_Page().set_primaryScript(None)
                self.log.debug(
                    'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                    noise_highscore, script_highscore)
            else:
                for k in sorted(result_map, reverse=True):
                    intk = round(100 * k)
                    if intk <= 0:
                        continue
                    if output != '':
                        output = '%s, ' % output
                    output = '%s%s:%d' % (output, result_map[k], intk)
                self.log.debug('Detected %s' % output)
                page = pcgts.get_Page()
                textStyle = page.get_TextStyle()
                if not textStyle:
                    textStyle = TextStyleType()
                    page.set_TextStyle(textStyle)
                textStyle.set_fontFamily(output)
                ID = concat_padded(self.output_file_grp, input_file.ID)
                self.workspace.add_file(ID=ID,
                                        file_grp=self.output_file_grp,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename="%s/%s" %
                                        (self.output_file_grp, ID),
                                        content=to_xml(pcgts))
Example 9
    def process(self):
        """
        Performs the cropping.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Cropping with tesseract")
                tessapi.SetImage(image)
                
                #
                # helper variables for saving the box coordinates
                #
                min_x = image.width
                min_y = image.height
                max_x = 0
                max_y = 0

                # iterate over all boxes and compare their extent
                # to the min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)

                    for pair in points.split(' '):
                        x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                        # track min/max extremes independently for each coordinate
                        if x < min_x:
                            min_x = x
                        if y < min_y:
                            min_y = y
                        if x > max_x:
                            max_x = x
                        if y > max_y:
                            max_y = y
                    log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))

                #
                # set the identified page border
                #
                brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
                pcgts.get_Page().set_Border(brd)

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example 10
def test_serialize_no_empty_readingorder():
    """
    https://github.com/OCR-D/core/issues/602
    """
    pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()
    pcgts = parseString(to_xml(pcgts, skip_declaration=True))
    assert not pcgts.get_Page().get_ReadingOrder()
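
Note: Example 10 serializes with skip_declaration=True before re-parsing. In these examples that keyword drops the leading XML declaration from the output, which is convenient when the string is parsed again or embedded in another document. A minimal sketch of the difference, assuming a parsed pcgts object as above:

full_document = to_xml(pcgts)                      # starts with the XML declaration
body_only = to_xml(pcgts, skip_declaration=True)   # starts directly at the root element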
Example 11
 def test_to_xml_unicode_nsprefix(self):
     with open(assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml'), 'rb') as f:
         from_xml = f.read()
         self.assertIn('<Unicode>', from_xml.decode('utf-8'), 'without NS prefix')
         self.assertIn('<Created', from_xml.decode('utf-8'), 'without NS prefix')
         pcgts = parseString(from_xml, silence=True)
         as_xml = to_xml(pcgts)
         self.assertIn('<pc:Unicode>', as_xml, 'with NS prefix')
         self.assertIn('<pc:Created>', as_xml, 'with NS prefix')
Example 12
    def process(self):
        """
        Performs the (text) recognition.
        """

        # print(self.parameter)
        self.maxlevel = self.parameter['textequiv_level']
        linesdir = self.parameter['linesdir']

        if self.maxlevel not in ['line', 'word', 'glyph']:
            raise Exception(
                "currently only implemented at the line/word/glyph level")

        root, _, files = os.walk(linesdir).__next__()
        self.root = root
        predfiles = []
        for file in files:
            if '.pred' in file:
                predfiles.append(file[:-9])

########################################################################################

# self.log.info("Using model %s in %s for recognition", model)
        for (n, input_file) in enumerate(self.input_files):
            # self.log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId())
            page = pcgts.get_Page()

            index = input_file.url.rfind('/') + 1
            fgrp = input_file.url[index:-4]

            # region, line, word, or glyph level:
            regions = page.get_TextRegion()
            if not regions:
                self.log.warning("Page contains no text regions")

            self.process_regions(regions, predfiles, fgrp)

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)

            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Example 13
 def redraw(self) -> None:
     if self.current:
         self.text_view.set_tooltip_text(self.page_id)
         if self.current.file:
             with self.document.path(self.current.file).open('r') as f:
                 text = f.read()
         else:
             text = to_xml(self.current.pc_gts)
         self.buffer.set_text(text)
     else:
         self.buffer.set_text('')
Example 14
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            if oplevel == "page":
                self._process_segment(page, page_image.filename, page_id,
                                      file_id + ".ds")

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 15
    def process(self):
        """Segment pages into regions using a Mask R-CNN model."""
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        if not tf.test.is_gpu_available():
            LOG.warning(
                "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
            )

        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_id = input_file.pageId or input_file.ID

            # todo rs: why not cropped?
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized,deskewed,cropped,clipped,non_text')
            # try to load pixel masks
            try:
                # todo rs: this combination only works for tiseg with use_deeplr=true
                mask_image, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='clipped',
                    feature_filter='binarized,deskewed,cropped,non_text')
            except:
                mask_image = None
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None

            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, mask_image, dpi)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 16
 def test_simpletypes(self):
     pcgts = parseString(simple_page, silence=True)
     self.assertTrue(isinstance(pcgts.get_Page().imageWidth, int))
     el = pcgts.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[0].get_TextEquiv()[0]
     self.assertTrue(isinstance(el.conf, float))
     # XXX no validation on setting attributes :-(
     # c.f. https://www.davekuhlman.org/generateDS.html#simpletype
     #  el.set_conf('2.0987')
     #  self.assertTrue(isinstance(el.conf, float))
     with self.assertRaisesRegex(TypeError, ''):
         el.set_conf('I AM NOT A FLOAT DEAL WITH IT')
         parseString(to_xml(pcgts).encode('utf8'))
Example 17
    def process(self):
        """
        Performs the (text) recognition.
        """

        mainIndex = self.parameter['mainIndex']

        for (n, input_file) in enumerate(self.input_files):

            alignurl = input_file.url
            pcgts = parse(alignurl, True)
            page = pcgts.get_Page()
            regions = page.get_TextRegion()

            pagecontent = ''
            for region in regions:
                regioncontent = ''

                lines = region.get_TextLine()
                for line in lines:
                    linecontent = ''

                    words = line.get_Word()
                    for word in words:
                        wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                        word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                        linecontent += ' ' + wordunicode

                    line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                    regioncontent += '\n' + linecontent

                region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
                pagecontent += '\n' + regioncontent

            page.add_TextEquiv(TextEquivType(Unicode=pagecontent))

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)
            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Example 18
def test_to_xml_unicode_nsprefix():
    """see https://github.com/OCR-D/core/pull/474#issuecomment-621477590"""

    # arrange
    with open(assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml'), 'rb') as f:
        from_xml = f.read()

    # assert
    assert '<Unicode>' in from_xml.decode('utf-8'), 'without NS prefix'
    assert '<Created' in from_xml.decode('utf-8'), 'without NS prefix'
    pcgts = parseString(from_xml, silence=True)
    as_xml = to_xml(pcgts)
    assert '<pc:Unicode>' in as_xml, 'with NS prefix'
    assert '<pc:Created>' in as_xml, 'with NS prefix'
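
Note: Examples 11 and 18 are two versions of the same check: even when the input PAGE-XML uses unprefixed element names such as <Unicode>, to_xml serializes them with the pc: namespace prefix. A minimal form of that assertion, assuming raw_xml holds such an unprefixed PAGE-XML document as bytes:

pcgts = parseString(raw_xml, silence=True)
assert '<pc:Unicode>' in to_xml(pcgts)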
Example 19
    def process(self):
        """
        Performs the recognition.
        """

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id)

            for region in pcgts.get_Page().get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)

                    line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    line_text = prediction.sentence
                    line_conf = prediction.avg_char_probability

                    line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)])

            _page_update_higher_textequiv_levels('line', pcgts)

            file_id = self._make_file_id(input_file, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
Example 20
    def process(self):
        LOG = getLogger('OcrdAnybaseocrTextline')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed')

            if oplevel == 'page':
                LOG.warning("Operation level should be region.")
                self._process_segment(page_image, page, None, page_xywh,
                                      page_id, input_file, n)

            else:
                regions = page.get_TextRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                    continue
                for (k, region) in enumerate(regions):

                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)

                    self._process_segment(region_image, page, region,
                                          region_xywh, region.id, input_file,
                                          k)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 21
def negative2zero(inputfile, outputfile):
    print("Setting negative coords to zero..")
    pcgts = parse(inputfile, silence=True)
    page = pcgts.get_Page()
    for attr in dir(page):
        if "get_" in attr and "Region" in attr:
            for regiondata in getattr(page,attr)():
                if attr == "get_TextRegion":
                    for textline in regiondata.get_TextLine():
                        textcoords = textline.get_Coords()
                        textcoords.set_points(update_points(textcoords.get_points()))
                regcoords = regiondata.get_Coords()
                regcoords.set_points(update_points(regcoords.get_points()))
    content = to_xml(pcgts)
    with open(outputfile,"w") as fout:
        fout.write(content)
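
Note: Example 21 writes the to_xml string straight to a text file, while most processor examples encode it to UTF-8 bytes for workspace.add_file. When writing to disk directly, pinning the encoding avoids locale-dependent defaults; a minimal variant of the write above, under that assumption:

with open(outputfile, 'w', encoding='utf-8') as fout:
    fout.write(to_xml(pcgts))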
Example 22
    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))

                    #
                    #  text region
                    #
                    pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points)))

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example 23
    def process(self):
        """Performs border detection on the workspace. """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('OcrdAnybaseocrCropper')

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped',
                feature_selector='binarized') # should also be deskewed

            if oplevel == "page":
                self._process_segment(
                    page_image, page, page_coords, page_id, input_file, n)
            else:
                raise Exception(
                    'Operation level %s, but should be "page".', oplevel)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8')
            )
Example 24
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file)
            local_input_file = self.workspace.download_file(input_file)
            pcgts = parse(local_input_file.url, silence=True)
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
            self._process_page(pcgts)

            # write back result
            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp, file_id),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
Example 25
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']
        LOG = getLogger('OcrdAnybaseocrBinarizer')

        for (n, input_file) in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            pcgts.set_pcGtsId(file_id)
            self.add_metadata(pcgts)

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter="binarized")
            LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for (k, region) in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    self._process_segment(region_image, page, region_xywh,
                                          region.id, input_file,
                                          str(n) + "_" + str(k))

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 26
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            regions = page.get_TextRegion()

            for region in regions:
                if region.readingDirection != 'left-to-right':
                    LOG.info('Not processing region "%s" (not left-to-right)',
                             region.id)
                    continue
                if len(region.get_TextLine()) > 1 and region.textLineOrder != 'top-to-bottom':
                    LOG.info('Not processing region "%s" (not top-to-bottom)',
                             region.id)
                    continue

                _fix_lines(region)

                lines = region.get_TextLine()
                for line in lines:
                    _fix_words(line)

                    words = line.get_Word()
                    for word in words:
                        _fix_glyphs(word)

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Example 27
 def test_empty_groups_to_regionrefindexed(self):
     """
     Corollary, see https://github.com/OCR-D/core/issues/475
     """
     with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'),
               'r') as f:
         pcgts = parseString(f.read().encode('utf8'), silence=True)
         og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
         children = og.get_AllIndexed()
         self.assertTrue(isinstance(children[1], OrderedGroupIndexedType))
         self.assertTrue(isinstance(children[21],
                                    UnorderedGroupIndexedType))
         # empty all the elements in the first OrderedGroupIndexed
         children[1].set_RegionRefIndexed([])
         # serialize and parse to see the empty group converted
         pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
         og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
         children = og.get_AllIndexed()
         self.assertTrue(isinstance(children[1], RegionRefIndexedType))
         self.assertTrue(isinstance(children[21], RegionRefIndexedType))
Example 28
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']

        LOG = getLogger('OcrdAnybaseocrDeskewer')

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='deskewed',
                feature_selector='binarized')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 29
    def process(self):
        """
        Segment with kraken
        """
        log = getLogger('processor.KrakenSegment')
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            im = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            res = segment(im, self.parameter['text_direction'],
                          self.parameter['scale'],
                          self.parameter['maxcolseps'],
                          self.parameter['black_colseps'])
            if self.parameter['script_detect']:
                res = detect_scripts(im, res)

            dummyRegion = TextRegionType()
            pcgts.get_Page().add_TextRegion(dummyRegion)
            #  print(res)
            for lineno, box in enumerate(res['boxes']):
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_x0y0x1y1(box)))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(self.output_file_grp,
                                    pageId=input_file.pageId,
                                    ID=ID,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts).encode('utf-8'))
Example 30
 def process(self):
     ifgs = self.input_file_grp.split(",")  # input file groups
     if len(ifgs) < 2:
         raise Exception("need at least two input file groups to align")
     ifts = self.zip_input_files(ifgs)  # input file tuples
     for _id, ift in enumerate(ifts):
         alignments = json.loads(self.run_java_aligner(ift))
         pcgts = self.align(alignments, ift)
         # keep the right part after OCR-D-...-filename
         # and prepend output_file_grp
         input_file = ift[0].input_file
         file_id = make_file_id(input_file, self.output_file_grp)
         pcgts.set_pcGtsId(file_id)
         out = self.workspace.add_file(
             ID=file_id,
             file_grp=self.output_file_grp,
             pageId=input_file.pageId,
             local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
             mimetype=MIMETYPE_PAGE,
             content=to_xml(pcgts),
         )
         self.log.info('created file %s', out)