Example #1
    def process(self):
        """
        Segment with ocropy
        """

        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            page_width = pcgts.get_Page().get_imageWidth()
            page_height = pcgts.get_Page().get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            binary = ocrolib.read_image_binary(
                self.workspace.download_url(image_url))
            binary = 1 - binary

            scale = self.parameter['scale'] if self.parameter[
                'scale'] != 0 else psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            dummyRegion = TextRegionType(
                id="dummy",
                Coords=CoordsType(
                    points="0,0 %s,0 %s,%s 0,%s" %
                    (page_width, page_width, page_height, page_height)))
            pcgts.get_Page().add_TextRegion(dummyRegion)

            for lineno in range(1, regions.length()):
                log.debug("id=%s bbox=%s", regions.id(lineno),
                          regions.bbox(lineno))
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(
                        points=points_from_y0x0y1x1(regions.bbox(lineno))))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
Example #2
 def process(self):
     """
     Performs the binarization.
     """
     log = getLogger('processor.KrakenBinarize')
     log.debug('Level of operation: "%s"',
               self.parameter['level-of-operation'])
     log.debug('Input file group %s', self.input_file_grp)
     log.debug('Input files %s', [str(f) for f in self.input_files])
     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         if self.parameter['level-of-operation'] == 'page':
             log.info("About to binarize page '%s'", pcgts.pcGtsId)
             image = self.workspace.resolve_image_as_pil(image_url)
             bin_image = kraken.binarization.nlbin(image)
             bin_image_bytes = io.BytesIO()
             bin_image.save(bin_image_bytes, format='PNG')
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(self.output_file_grp,
                                     pageId=input_file.pageId,
                                     ID=ID,
                                     mimetype='image/png',
                                     local_filename="%s/%s" %
                                     (self.output_file_grp, ID),
                                     content=bin_image_bytes.getvalue())
         else:
             for region in pcgts.get_Page().get_TextRegion():
                 if self.parameter['level-of-operation'] == 'block':
                     log.info("About to binarize region '%s'", region.id)
                     image = self.workspace.resolve_image_as_pil(
                         image_url,
                         polygon_from_points(region.get_Coords().points))
                 else:
                     textlines = region.get_TextLine()
                     log.info("About to binarize %i lines of region '%s'",
                              len(textlines), region.id)
                     for (line_no, line) in enumerate(textlines):
                         log.debug("Binarizing line '%s' in region '%s'",
                                   line_no, region.id)
                         image = self.workspace.resolve_image_as_pil(
                             image_url,
                             polygon_from_points(line.get_Coords().points))
                         bin_image = kraken.binarization.nlbin(image)
                         bin_image_bytes = io.BytesIO()
                         bin_image.save(bin_image_bytes, format='PNG')
                         ID = concat_padded(self.output_file_grp, n,
                                            region.id, line_no)
                         self.workspace.add_file(
                             self.output_file_grp,
                             pageId=input_file.pageId,
                             ID=ID,
                             local_filename="%s/%s" %
                             (self.output_file_grp, ID),
                             mimetype='image/png',
                             content=bin_image_bytes.getvalue())
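
Stripped of the OCR-D workspace bookkeeping, the kraken call that the binarization examples (#2 and #5) revolve around is small. A minimal sketch, assuming a local test image (the input path is a placeholder; only the in-memory PNG round-trip from the examples is shown):

import io

import kraken.binarization
from PIL import Image

# Binarize one image with kraken's nlbin and keep the resulting PNG in memory,
# mirroring what the processors above do per page, region or line.
image = Image.open('page.png')  # placeholder input path
bin_image = kraken.binarization.nlbin(image)
buf = io.BytesIO()
bin_image.save(buf, format='PNG')
png_bytes = buf.getvalue()  # ready to be passed as content= to workspace.add_file()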
Example #3
    def file_id(self, file_grp):
        file_id = self.input_file.ID.replace(self.input_file_grp, file_grp)

        if file_id == self.input_file.ID:
            file_id = concat_padded(file_grp, self.page_num)

        return file_id
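
Examples #3 and #11 show the ID-derivation idiom that recurs in nearly every snippet here: reuse the input file's ID with the file group swapped in, and fall back to a padded index when that substitution changes nothing. A standalone sketch of the same pattern (an illustration of the idiom, not the actual ocrd_utils implementation; the padding width is an assumption):

def derive_file_id(input_file_id, input_file_grp, output_file_grp, n):
    # Swap the file group inside the input ID, e.g.
    # 'OCR-D-IMG_0001' -> 'OCR-D-SEG-LINE_0001'.
    file_id = input_file_id.replace(input_file_grp, output_file_grp)
    if file_id == input_file_id:
        # The group name did not occur in the ID; fall back to a zero-padded
        # index (standing in for concat_padded(output_file_grp, n), whose exact
        # padding/offset is an assumption here).
        file_id = '%s_%04d' % (output_file_grp, n)
    return file_id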
Example #4
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(image)
                 offset = xywh_from_points(region.get_Coords().points)
                 for (line_no, component) in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_id = '%s_line%04d' % (region.id, line_no)
                     line_xywh = component[1]
                     line_xywh['x'] += offset['x']
                     line_xywh['y'] += offset['y']
                     line_points = points_from_xywh(line_xywh)
                     region.add_TextLine(
                         TextLineType(id=line_id,
                                      Coords=CoordsType(line_points)))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts).encode('utf-8'),
             )
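
The tesserocr idiom the segmentation examples (#4, #6, #10, #17) build on is GetComponentImages at a given layout level. Reduced to its core (a sketch; the input path is a placeholder and the default tessdata path is assumed):

from PIL import Image
from tesserocr import PyTessBaseAPI, RIL

with PyTessBaseAPI() as api:
    api.SetImage(Image.open('region.png'))  # placeholder input path
    # Each component is a tuple (image, bounding box, block id, paragraph id);
    # the bounding box is a dict with 'x', 'y', 'w', 'h' in image coordinates.
    for line_no, component in enumerate(api.GetComponentImages(RIL.TEXTLINE, True)):
        print(line_no, component[1])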
Example #5
 def process(self):
     """
     Performs the binarization.
     """
     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         for region in pcgts.get_Page().get_TextRegion():
             textlines = region.get_TextLine()
             log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
             for (line_no, line) in enumerate(textlines):
                 log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                 image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                 print(dir(kraken.binarization))
                 bin_image = kraken.binarization.nlbin(image)
                 bin_image_bytes = io.BytesIO()
                 bin_image.save(bin_image_bytes, format='PNG')
                 ID = concat_padded(self.output_file_grp, n)
                 self.workspace.add_file(
                     self.output_file_grp,
                     pageId=input_file.pageId,
                     ID=ID,
                     basename="%s.bin.png" % ID,
                     mimetype='image/png',
                     content=bin_image_bytes.getvalue())
Example #6
 def process(self):
     """
     Performs the line segmentation.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for (n, input_file) in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(input_file))
             image_url = pcgts.get_Page().imageFilename
             for region in pcgts.get_Page().get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(image)
                     offset = xywh_from_points(line.get_Coords().points)
                     for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_id = '%s_word%04d' % (line.id, word_no)
                         word_xywh = component[1]
                         word_xywh['x'] += offset['x']
                         word_xywh['y'] += offset['y']
                         line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh))))
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #7
    def _process_segment(self, model, dataset, page, page_xywh, page_id,
                         input_file, orig_img_size, n):
        for i, data in enumerate(dataset):
            w, h = orig_img_size
            generated = model.inference(data['label'], data['inst'],
                                        data['image'])
            dewarped = array(generated.data[0].permute(1, 2, 0).detach().cpu())
            bin_array = array(255 * (dewarped > ocrolib.midrange(dewarped)),
                              'B')
            dewarped = ocrolib.array2pil(bin_array)
            dewarped = dewarped.resize((w, h))

            page_xywh['features'] += ',dewarped'

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(
                dewarped,
                file_id,
                page_id=page_id,
                file_grp=self.image_grp,
                force=self.parameter['force'])
            page.add_AlternativeImage(
                AlternativeImageType(filename=file_path,
                                     comments=page_xywh['features']))
Example #8
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='deskewed')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #9
    def process(self):
        network_file = self.parameter['network']
        stride = self.parameter['stride']
        classifier = TypegroupsClassifier.load(network_file)

        ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                       'Empty Pages', 'Woodcuts - Engravings')

        self.log.debug('Processing: %s', self.input_files)
        for (_, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            pil_image = self.workspace.resolve_image_as_pil(image_url)
            result = classifier.run(pil_image, stride)
            score_sum = 0
            for typegroup in classifier.classMap.cl2id:
                if typegroup not in ignore_type:
                    score_sum += max(0, result[typegroup])

            script_highscore = 0
            noise_highscore = 0
            result_map = {}
            output = ''
            for typegroup in classifier.classMap.cl2id:
                score = result[typegroup]
                if typegroup in ignore_type:
                    noise_highscore = max(noise_highscore, score)
                else:
                    script_highscore = max(script_highscore, score)
                    normalised_score = max(0, score / score_sum)
                    result_map[normalised_score] = typegroup
            if noise_highscore > script_highscore:
                pcgts.get_Page().set_primaryScript(None)
                self.log.debug(
                    'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                    noise_highscore, script_highscore)
            else:
                for k in sorted(result_map, reverse=True):
                    intk = round(100 * k)
                    if intk <= 0:
                        continue
                    if output != '':
                        output = '%s, ' % output
                    output = '%s%s:%d' % (output, result_map[k], intk)
                self.log.debug('Detected %s', output)
                page = pcgts.get_Page()
                textStyle = page.get_TextStyle()
                if not textStyle:
                    textStyle = TextStyleType()
                    page.set_TextStyle(textStyle)
                textStyle.set_fontFamily(output)
                ID = concat_padded(self.output_file_grp, input_file.ID)
                self.workspace.add_file(ID=ID,
                                        file_grp=self.output_file_grp,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename="%s/%s" %
                                        (self.output_file_grp, ID),
                                        content=to_xml(pcgts))
Example #10
    def process(self):
        """
        Performs the cropping.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Cropping with tesseract")
                tessapi.SetImage(image)
                
                #
                # helper variables for saving the box coordinates
                #
                min_x = image.width
                min_y = image.height
                max_x = 0
                max_y = 0

                # iterate over all boxes and compare their extent
                # to the min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)

                    for pair in points.split(' '):
                        x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                        # update each bound independently; chaining these with
                        # elif would skip the max_x/max_y updates whenever an
                        # earlier branch matched
                        if x < min_x:
                            min_x = x
                        if y < min_y:
                            min_y = y
                        if x > max_x:
                            max_x = x
                        if y > max_y:
                            max_y = y
                    log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))

                #
                # set the identified page border
                #
                brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
                pcgts.get_Page().set_Border(brd)

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
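
The bounding-box merge inside Example #10's component loop can also be written compactly with min/max. A small sketch of the same computation, independent of the tesserocr/OCR-D types:

def merge_bounds(points_str, min_x, min_y, max_x, max_y):
    # Fold one region's 'x,y x,y ...' point string into the running page bounds.
    for pair in points_str.split(' '):
        x, y = (int(v) for v in pair.split(','))
        min_x, min_y = min(min_x, x), min(min_y, y)
        max_x, max_y = max(max_x, x), max(max_y, y)
    return min_x, min_y, max_x, max_y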
Example #11
    def file_id(self):
        file_id = self.input_file.ID.replace(self.input_file_grp,
                                             self.output_file_grp)

        if file_id == self.input_file.ID:
            file_id = ocrd_utils.concat_padded(self.output_file_grp,
                                               self.page_num)

        return file_id
Example #12
    def process(self):
        """
        Performs the (text) recognition.
        """

        # print(self.parameter)
        self.maxlevel = self.parameter['textequiv_level']
        linesdir = self.parameter['linesdir']

        if self.maxlevel not in ['line', 'word', 'glyph']:
            raise Exception(
                "currently only implemented at the line/glyph level")

        root, _, files = os.walk(linesdir).__next__()
        self.root = root
        predfiles = []
        for file in files:
            if '.pred' in file:
                predfiles.append(file[:-9])

########################################################################################

# self.log.info("Using model %s in %s for recognition", model)
        for (n, input_file) in enumerate(self.input_files):
            # self.log.info("INPUT FILE %i / %s", n, input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId())
            page = pcgts.get_Page()

            index = input_file.url.rfind('/') + 1
            fgrp = input_file.url[index:-4]

            # region, line, word, or glyph level:
            regions = page.get_TextRegion()
            if not regions:
                self.log.warning("Page contains no text regions")

            self.process_regions(regions, predfiles, fgrp)

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)

            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Example #13
    def process(self):
        """
        Segment with kraken
        """
        log = getLogger('processor.KrakenSegment')
        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            im = self.workspace.resolve_image_as_pil(image_url)

            log.info('Segmenting')
            log.info('Params %s', self.parameter)
            res = segment(im, self.parameter['text_direction'],
                          self.parameter['scale'],
                          self.parameter['maxcolseps'],
                          self.parameter['black_colseps'])
            if self.parameter['script_detect']:
                res = detect_scripts(im, res)

            dummyRegion = TextRegionType()
            pcgts.get_Page().add_TextRegion(dummyRegion)
            #  print(res)
            for lineno, box in enumerate(res['boxes']):
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(points=points_from_x0y0x1y1(box)))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(self.output_file_grp,
                                    pageId=input_file.pageId,
                                    ID=ID,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #14
    def process(self):
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            if oplevel == "page":
                self._process_segment(page, page_image.filename, page_id,
                                      file_id + ".ds")

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #15
    def process(self):
        """
        Performs the (text) recognition.
        """

        mainIndex = self.parameter['mainIndex']

        for (n, input_file) in enumerate(self.input_files):

            alignurl = input_file.url
            pcgts = parse(alignurl, True)
            page = pcgts.get_Page()
            regions = page.get_TextRegion()

            pagecontent = ''
            for region in regions:
                regioncontent = ''

                lines = region.get_TextLine()
                for line in lines:
                    linecontent = ''

                    words = line.get_Word()
                    for word in words:
                        wordunicode = word.get_TextEquiv()[mainIndex].Unicode
                        word.add_TextEquiv(TextEquivType(Unicode=wordunicode))
                        linecontent += ' ' + wordunicode

                    # attach the line's own aggregated word text
                    line.add_TextEquiv(TextEquivType(Unicode=linecontent))
                    regioncontent += '\n' + linecontent

                region.add_TextEquiv(TextEquivType(Unicode=regioncontent))
                pagecontent += '\n' + regioncontent

            page.add_TextEquiv(TextEquivType(Unicode=pagecontent))

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)
            # Use the input file's basename for the new file
            # this way the files retain the same basenames.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Example #16
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n, mrcnn_model, class_names):

        img_array = ocrolib.pil2array(page_image)
        results = mrcnn_model.detect([img_array], verbose=1)
        r = results[0]

        page_xywh['features'] += ',blksegmented'

        for i in range(len(r['rois'])):

            width, height, _ = img_array.shape
            min_x = r['rois'][i][0]
            min_y = r['rois'][i][1]
            max_x = r['rois'][i][2]
            max_y = r['rois'][i][3]

            # small post-processing for paragraphs (class id 2) so that the last
            # characters are not cut off: grow the box slightly while staying
            # inside the image (min_x/max_x index rows of img_array here)
            if (min_x - 5) > 0 and r['class_ids'][i] == 2:
                min_x -= 5
            if (max_x + 10) < width and r['class_ids'][i] == 2:
                max_x += 10

            # TODO: verify whether the adjustment above is actually needed

            region_img = img_array[min_x:max_x, min_y:
                                   max_y]  #extract from points and img_array
            region_img = ocrolib.array2pil(region_img)
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.image_grp, n)

            file_path = self.workspace.save_image_file(region_img,
                                                       file_id + "_" + str(i),
                                                       page_id=page_id,
                                                       file_grp=self.image_grp)

            ai = AlternativeImageType(filename=file_path,
                                      comments=page_xywh['features'])
            coords = CoordsType(
                "%i,%i %i,%i %i,%i %i,%i" %
                (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
            textregion = TextRegionType(Coords=coords,
                                        type_=class_names[r['class_ids'][i]])
            textregion.add_AlternativeImage(ai)
            page.add_TextRegion(textregion)
Example #17
    def process(self):
        """
        Performs the region segmentation.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            #  print(self.input_file_grp)
            for (n, input_file) in enumerate(self.input_files):
                #  print(input_file)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Detecting regions with tesseract")
                tessapi.SetImage(image)
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)
                    # <pg:ReadingOrder>
                    ro = pcgts.get_Page().get_ReadingOrder()
                    if ro is None:
                        ro = ReadingOrderType()
                        pcgts.get_Page().set_ReadingOrder(ro)
                    # <pg:OrderedGroup>
                    og = ro.get_OrderedGroup()
                    if og is None:
                        og = OrderedGroupType(id="reading-order")
                        ro.set_OrderedGroup(og)
                    # <pg:RegionRefIndexed>
                    og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index))

                    #
                    #  text region
                    #
                    pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points)))

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example #18
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file)
            local_input_file = self.workspace.download_file(input_file)
            pcgts = parse(local_input_file.url, silence=True)
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
            self._process_page(pcgts)

            # write back result
            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp, file_id),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
Example #19
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):

        I = ocrolib.pil2array(page_image)
        if len(I.shape) > 2:
            I = np.mean(I, 2)
        I = 1 - I / I.max()
        rows, cols = I.shape

        # Generate Mask and Seed Images
        Imask, Iseed = self.pixMorphSequence_mask_seed_fill_holes(I)

        # Iseedfill: Union of Mask and Seed Images
        Iseedfill = self.pixSeedfillBinary(Imask, Iseed)

        # Dilation of Iseedfill
        mask = ones((3, 3))
        Iseedfill = ndimage.binary_dilation(Iseedfill, mask)

        # Expansion of Iseedfill to become equal in size of I
        Iseedfill = self.expansion(Iseedfill, (rows, cols))

        # Write Text and Non-Text images
        image_part = array((1 - I * Iseedfill), dtype=int)
        image_part[0, 0] = 0  # only for visualisation purpose
        text_part = array((1 - I * (1 - Iseedfill)), dtype=int)
        text_part[0, 0] = 0  # only for visualisation purpose

        page_xywh['features'] += ',tiseged'

        bin_array = array(255 * (text_part > ocrolib.midrange(text_part)), 'B')
        bin_image = ocrolib.array2pil(bin_array)

        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(bin_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
Example #20
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            regions = page.get_TextRegion()

            for region in regions:
                if region.readingDirection != 'left-to-right':
                    LOG.info('Not processing region "%s" (not left-to-right)',
                             region.id)
                    continue
                if len(region.get_TextLine()
                       ) > 1 and region.textLineOrder != 'top-to-bottom':
                    LOG.info('Not processing region "%s" (not top-to-bottom)',
                             region.id)
                    continue

                _fix_lines(region)

                lines = region.get_TextLine()
                for line in lines:
                    _fix_words(line)

                    words = line.get_Word()
                    for word in words:
                        _fix_glyphs(word)

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Example #21
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            img = self.workspace.resolve_image_as_pil(fname)
            #fname = str(fname)
            print("Process file: ", fname)
            base, _ = ocrolib.allsplitext(fname)

            img_array = ocrolib.pil2array(img)
            img_array_bin = np.array(img_array > ocrolib.midrange(img_array),
                                     'i')

            lineDetectH = []
            lineDetectV = []
            img_array_rr = self.remove_rular(img_array)

            textarea, img_array_rr_ta, height, width = self.detect_textarea(
                img_array_rr)
            self.parameter['colSeparator'] = int(
                width * self.parameter['colSeparator'])

            if len(textarea) > 1:
                textarea = self.crop_area(textarea, img_array_bin,
                                          img_array_rr_ta)

                if len(textarea) == 0:
                    min_x, min_y, max_x, max_y = self.select_borderLine(
                        img_array_rr, lineDetectH, lineDetectV)
                else:
                    min_x, min_y, max_x, max_y = textarea[0]
            elif len(textarea) == 1 and (
                    height * width * 0.5 <
                (abs(textarea[0][2] - textarea[0][0]) *
                 abs(textarea[0][3] - textarea[0][1]))):
                x1, y1, x2, y2 = textarea[0]
                x1 = x1 - 20 if x1 > 20 else 0
                x2 = x2 + 20 if x2 < width - 20 else width
                y1 = y1 - 40 if y1 > 40 else 0
                y2 = y2 + 40 if y2 < height - 40 else height

                #self.save_pf(base, [x1, y1, x2, y2])
                min_x, min_y, max_x, max_y = textarea[0]
            else:
                min_x, min_y, max_x, max_y = self.select_borderLine(
                    img_array_rr, lineDetectH, lineDetectV)

            brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" %
                                               (min_x, min_y, max_x, min_y,
                                                max_x, max_y, min_x, max_y)))
            pcgts.get_Page().set_Border(brd)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #22
    def _process_segment(self, page_image, page, page_xywh, page_id,
                         input_file, n):

        raw = ocrolib.pil2array(page_image)
        flat = raw.astype("float64")

        # estimate skew angle and rotate
        if self.parameter['maxskew'] > 0:
            if self.parameter['parallel'] < 2:
                LOG.info("Estimating Skew Angle")
            d0, d1 = flat.shape
            o0, o1 = int(self.parameter['bignore'] * d0), int(
                self.parameter['bignore'] * d1)
            flat = amax(flat) - flat
            flat -= amin(flat)
            est = flat[o0:d0 - o0, o1:d1 - o1]
            ma = self.parameter['maxskew']
            ms = int(2 * self.parameter['maxskew'] *
                     self.parameter['skewsteps'])
            angle = self.estimate_skew_angle(est, linspace(-ma, ma, ms + 1))
            flat = interpolation.rotate(flat,
                                        angle,
                                        mode='constant',
                                        reshape=0)
            flat = amax(flat) - flat
        else:
            angle = 0

        # self.write_angles_to_pageXML(base,angle)
        # estimate low and high thresholds
        if self.parameter['parallel'] < 2:
            LOG.info("Estimating Thresholds")
        d0, d1 = flat.shape
        o0, o1 = int(self.parameter['bignore'] * d0), int(
            self.parameter['bignore'] * d1)
        est = flat[o0:d0 - o0, o1:d1 - o1]
        if self.parameter['escale'] > 0:
            # by default, we use only regions that contain
            # significant variance; this makes the percentile
            # based low and high estimates more reliable
            e = self.parameter['escale']
            v = est - filters.gaussian_filter(est, e * 20.0)
            v = filters.gaussian_filter(v**2, e * 20.0)**0.5
            v = (v > 0.3 * amax(v))
            v = morphology.binary_dilation(v, structure=ones((int(e * 50), 1)))
            v = morphology.binary_dilation(v, structure=ones((1, int(e * 50))))
            if self.parameter['debug'] > 0:
                imshow(v)
                ginput(1, self.parameter['debug'])
            est = est[v]
        lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
        hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])

        # rescale the image to get the gray scale image
        if self.parameter['parallel'] < 2:
            LOG.info("Rescaling")
        flat -= lo
        flat /= (hi - lo)
        flat = clip(flat, 0, 1)
        if self.parameter['debug'] > 0:
            imshow(flat, vmin=0, vmax=1)
            ginput(1, self.parameter['debug'])
        deskewed = 1 * (flat > self.parameter['threshold'])

        # output the normalized grayscale and the thresholded images
        #LOG.info("%s lo-hi (%.2f %.2f) angle %4.1f" %(lo, hi, angle))

        #TODO: needs clarification, as the results affect the following pre-processing steps.
        #orientation = -angle
        #orientation = 180 - ((180 - orientation) % 360)

        if angle is None:  # FIXME: quick fix to prevent angle of "none"
            angle = 0

        page.set_orientation(angle)

        page_xywh['features'] += ',deskewed'
        bin_array = array(255 * (deskewed > ocrolib.midrange(deskewed)), 'B')
        page_image = ocrolib.array2pil(bin_array)

        file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.image_grp, n)
        file_path = self.workspace.save_image_file(page_image,
                                                   file_id,
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        page.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=page_xywh['features']))
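
The thresholding part of Example #22 follows the usual ocropus recipe: estimate low and high percentiles on a border-trimmed (and optionally variance-masked) version of the image, rescale to [0, 1], then apply a global threshold. A minimal NumPy sketch of just that normalization step (the default parameter values are placeholders):

import numpy as np
from scipy import stats

def normalize_and_threshold(flat, lo_pct=5, hi_pct=90, threshold=0.5):
    # Percentile-based contrast normalization followed by a global threshold,
    # as in the deskewing example above; returns a 0/1 array.
    lo = stats.scoreatpercentile(flat.ravel(), lo_pct)
    hi = stats.scoreatpercentile(flat.ravel(), hi_pct)
    flat = np.clip((flat - lo) / (hi - lo), 0, 1)
    return (flat > threshold).astype(np.uint8)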
Example #23
 def process(self):
     """
     Performs the (text) recognition.
     """
     # print(self.parameter)
     log.debug("TESSDATA: %s, installed tesseract models: %s",
               *get_languages())
     maxlevel = self.parameter['textequiv_level']
     model = get_languages()[1][-1]  # last installed model
     if 'model' in self.parameter:
         model = self.parameter['model']
         if model not in get_languages()[1]:
             raise Exception("configured model " + model +
                             " is not installed")
     with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
         log.info("Using model '%s' in %s for recognition at the %s level",
                  model,
                  get_languages()[0], maxlevel)
         # todo: populate GetChoiceIterator() with LSTM models, too:
         #tessapi.SetVariable("lstm_choice_mode", "2")
         # todo: determine relevancy of these variables:
         # tessapi.SetVariable("tessedit_single_match", "0")
         #
         # tessedit_load_sublangs
         # tessedit_preserve_min_wd_len 2
         # tessedit_prefer_joined_punct 0
         # tessedit_write_rep_codes 0
         # tessedit_parallelize 0
         # tessedit_zero_rejection 0
         # tessedit_zero_kelvin_rejection 0
         # tessedit_reject_mode 0
         # tessedit_use_reject_spaces 1
         # tessedit_fix_fuzzy_spaces 1
         # tessedit_char_blacklist
         # tessedit_char_whitelist
         # chs_leading_punct ('`"
         # chs_trailing_punct1 ).,;:?!
         # chs_trailing_punct2 )'`"
         # numeric_punctuation .,
         # unrecognised_char |
         # ok_repeated_ch_non_alphanum_wds -?*=
         # conflict_set_I_l_1 Il1[]
         # preserve_interword_spaces 0
         # tessedit_enable_dict_correction 0
         # tessedit_enable_bigram_correction 1
         # stopper_smallword_size 2
         # wordrec_max_join_chunks 4
         # suspect_space_level 100
         # suspect_short_words 2
         # language_model_ngram_on 0
         # language_model_ngram_order 8
         # language_model_min_compound_length 3
         # language_model_penalty_non_freq_dict_word 0.1
         # language_model_penalty_non_dict_word 0.15
         # language_model_penalty_punc 0.2
         # language_model_penalty_case 0.1
         # language_model_penalty_script 0.5
         # language_model_penalty_chartype 0.3
         # language_model_penalty_spacing 0.05
         # textord_max_noise_size 7
         # enable_noise_removal 1
         # classify_bln_numeric_mode 0
         # lstm_use_matrix 1
         # user_words_file
         # user_patterns_file
         for (n, input_file) in enumerate(self.input_files):
             log.info("INPUT FILE %i / %s", n, input_file)
             pcgts = page_from_file(
                 self.workspace.download_file(input_file))
             # TODO use binarized / gray
             pil_image = self.workspace.resolve_image_as_pil(
                 pcgts.get_Page().imageFilename)
             tessapi.SetImage(pil_image)
             metadata = pcgts.get_Metadata()  # ensured by from_file()
             metadata.add_MetadataItem(
                 MetadataItemType(
                     type_="processingStep",
                     name=OCRD_TOOL['tools']['ocrd-tesserocr-recognize']
                     ['steps'][0],
                     value='ocrd-tesserocr-recognize',
                     Labels=[
                         LabelsType(externalRef="parameters",
                                    Label=[
                                        LabelType(
                                            type_=name,
                                            value=self.parameter[name])
                                        for name in self.parameter.keys()
                                    ])
                     ]))
             log.info("Recognizing text in page '%s'", pcgts.get_pcGtsId())
             regions = pcgts.get_Page().get_TextRegion()
             if not regions:
                 log.warning("Page contains no text regions")
             self._process_regions(regions, maxlevel, tessapi)
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(
                 ID=ID,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, ID),
                 content=to_xml(pcgts),
             )
Example #24
    def process(self):
        try:
            page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        if not torch.cuda.is_available():
            LOG.error("Your system has no CUDA installed. No GPU detected.")
            sys.exit(1)

        path = self.parameter['pix2pixHD']
        if not Path(path).is_dir():
            LOG.error("""\
                NVIDIA's pix2pixHD was not found at '%s'. Make sure the `pix2pixHD` parameter 
                in ocrd-tools.json points to the local path to the cloned pix2pixHD repository.

                pix2pixHD can be downloaded from https://github.com/NVIDIA/pix2pixHD
                """ % path)
            sys.exit(1)
        model_file_path = os.path.join(path, 'checkpoints/latest_net_G.pth')
        if not Path(model_file_path).is_file():
            LOG.error("""\
                pix2pixHD model file was not found at '%s'. Make sure this file exists.
                """ % model_file_path)
            sys.exit(1)

        opt, model = self.prepare_options(path)

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %s", page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='dewarped',
                feature_selector='binarized'
            )  # images should be deskewed and cropped
            if oplevel == 'page':
                dataset = self.prepare_data(opt, page_image, path)
                orig_img_size = page_image.size
                self._process_segment(model, dataset, page, page_xywh, page_id,
                                      input_file, orig_img_size, n)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion(
                )  #get all regions?
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for (k, region) in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    # TODO: region has to exist as a physical file to be processed by pix2pixHD
                    dataset = self.prepare_data(opt, region_image, path)
                    orig_img_size = region_image.size
                    self._process_segment(model, dataset, page, region_xywh,
                                          region.id, input_file, orig_img_size,
                                          n)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp, page_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(page_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=page_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        page_grp, file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'),
                                    force=self.parameter['force'])
        os.rmdir(self.input_file_grp +
                 "/test_A/")  #FIXME: better way of deleting a temp_dir?
Example #25
    def process(self):
        """Extract region images from the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Extract an image for each region (which depending on the workflow
        can already be deskewed, dewarped, binarized etc.), cropped to its
        minimal bounding box, and masked by the coordinate polygon outline.
        If ``transparency`` is true, then also add an alpha channel which is
        fully transparent outside of the mask.
        
        Create a JSON file with:
        * the IDs of the region and its parents,
        * the region's coordinates relative to the region image,
        * the region's absolute coordinates,
        * the (text) region's text content (if any),
        * the (text) region's TextStyle (if any),
        * the (text) region's @production (if any),
        * the (text) region's @readingDirection (if any),
        * the (text) region's @textLineOrder (if any),
        * the (text) region's @primaryScript (if any),
        * the (text) region's @primaryLanguage (if any),
        * the region's AlternativeImage/@comments (features),
        * the region's element class,
        * the region's @type,
        * the page's @type,
        * the page's DPI value.
        
        Write all files in the directory of the output file group, named like so:
        * ID + '.raw.png': region image (if the workflow provides raw images)
        * ID + '.bin.png': region image (if the workflow provides binarized images)
        * ID + '.nrm.png': region image (if the workflow provides grayscale-normalized images)
        * ID + '.json': region metadata.
        """
        # pylint: disable=attribute-defined-outside-init
        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            metadata = pcgts.get_Metadata() # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(
                                     externalModel="ocrd-tool",
                                     externalId="parameters",
                                     Label=[LabelType(type_=name,
                                                      value=self.parameter[name])
                                            for name in self.parameter.keys()])]))
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                transparency=self.parameter['transparency'])
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None
            ptype = page.get_type()

            regions = { 'advert': page.get_AdvertRegion(),
                        'text': page.get_TextRegion(),
                        'table': page.get_TableRegion(),
                        'chart': page.get_ChartRegion(),
                        'chem': page.get_ChemRegion(),
                        'graphic': page.get_GraphicRegion(),
                        'image': page.get_ImageRegion(),
                        'linedrawing': page.get_LineDrawingRegion(),
                        'maths': page.get_MathsRegion(),
                        'music': page.get_MusicRegion(),
                        'noise': page.get_NoiseRegion(),
                        'separator': page.get_SeparatorRegion(),
                        'unknown': page.get_UnknownRegion()
            }
            for rtype, rlist in regions.items():
                for region in rlist:
                    description = { 'region.ID': region.id, 'region.type': rtype }
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords,
                        transparency=self.parameter['transparency'])
                    description['subtype'] = region.get_type() if rtype in ['text', 'chart', 'graphic'] else None
                    description['coords_rel'] = coordinates_of_segment(
                        region, region_image, region_coords).tolist()
                    description['coords_abs'] = polygon_from_points(region.get_Coords().points)
                    if rtype == 'text':
                        rtext = region.get_TextEquiv()
                        if rtext:
                            description['region.text'] = rtext[0].Unicode
                        else:
                            description['region.text'] = ''
                        rstyle = region.get_TextStyle() or page.get_TextStyle()
                        if rstyle:
                            description['region.style'] = {
                                'fontFamily': rstyle.fontFamily,
                                'fontSize': rstyle.fontSize,
                                'xHeight': rstyle.xHeight,
                                'kerning': rstyle.kerning,
                                'serif': rstyle.serif,
                                'monospace': rstyle.monospace,
                                'bold': rstyle.bold,
                                'italic': rstyle.italic,
                                'smallCaps': rstyle.smallCaps,
                                'letterSpaced': rstyle.letterSpaced,
                                'strikethrough': rstyle.strikethrough,
                                'underlined': rstyle.underlined,
                                'underlineStyle': rstyle.underlineStyle,
                                'subscript': rstyle.subscript,
                                'superscript': rstyle.superscript
                            }
                        description['production'] = region.get_production()
                        description['readingDirection'] = (
                            region.get_readingDirection() or
                            page.get_readingDirection())
                        description['textLineOrder'] = (
                            region.get_textLineOrder() or
                            page.get_textLineOrder())
                        description['primaryScript'] = (
                            region.get_primaryScript() or
                            page.get_primaryScript())
                        description['primaryLanguage'] = (
                            region.get_primaryLanguage() or
                            page.get_primaryLanguage())
                    description['features'] = region_coords['features']
                    description['DPI'] = dpi
                    description['page.ID'] = page_id
                    description['page.type'] = ptype
                    description['file_grp'] = self.input_file_grp
                    description['METS.UID'] = self.workspace.mets.unique_identifier
                    if 'binarized' in region_coords['features']:
                        extension = '.bin'
                    elif 'grayscale_normalized' in region_coords['features']:
                        extension = '.nrm'
                    else:
                        extension = '.raw'
                    
                    file_path = self.workspace.save_image_file(
                        region_image,
                        file_id + '_' + region.id + extension,
                        self.output_file_grp,
                        page_id=page_id,
                        format='PNG')
                    file_path = file_path.replace(extension + '.png', '.json')
                    # write the region metadata next to the region image,
                    # closing the file handle properly:
                    with open(file_path, 'w') as json_file:
                        json.dump(description, json_file)
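The extractor above writes each region as an image/JSON pair with a shared basename into the output file group. A minimal consumer sketch under that assumption (helper name hypothetical), pairing every metadata file with its sibling .raw/.bin/.nrm PNG:

import json
from glob import glob
from os.path import splitext

from PIL import Image

def load_region_pairs(output_dir):
    """Yield (metadata, image) pairs as written by the extractor above:
    each <file_id>_<region_id>.json sits next to a .raw/.bin/.nrm PNG."""
    for json_path in sorted(glob(output_dir + '/*.json')):
        with open(json_path, 'r') as json_file:
            description = json.load(json_file)
        base = splitext(json_path)[0]
        image = None
        for extension in ('.raw', '.bin', '.nrm'):
            try:
                image = Image.open(base + extension + '.png')
                break
            except FileNotFoundError:
                continue
        yield description, image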
Example #26
 def test_concat_padded(self):
     self.assertEqual(concat_padded('x', 1), 'x_0001')
     self.assertEqual(concat_padded('x', 1, 2, 3), 'x_0001_0002_0003')
     self.assertEqual(concat_padded('x', 1, '2', 3), 'x_0001_2_0003')
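concat_padded itself is not shown here; the following is one plausible reconstruction consistent with the assertions above, zero-padding integer arguments to four digits and appending string arguments unchanged (other OCR-D versions additionally offset integers by one, cf. Example #29 below):

def concat_padded(base, *args):
    """Concatenate a base string with zero-padded 4-digit numbers;
    string arguments are appended unchanged."""
    ret = base
    for arg in args:
        if isinstance(arg, str):
            ret = "%s_%s" % (ret, arg)
        else:
            ret = "%s_%04d" % (ret, arg)
    return ret

# concat_padded('x', 1)         -> 'x_0001'
# concat_padded('x', 1, '2', 3) -> 'x_0001_2_0003'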
Example #27
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID
            page = pcgts.get_Page()

            # why does it save the image ??
            page_image, page_xywh, _ = self.workspace.image_from_page(
                page, page_id)

            if self.parameter['parallel'] < 2:
                LOG.info("INPUT FILE %s ", input_file.pageId or input_file.ID)
            raw = ocrolib.read_image_gray(page_image.filename)

            flat = raw
            #flat = np.array(binImg)
            # estimate skew angle and rotate
            if self.parameter['maxskew'] > 0:
                if self.parameter['parallel'] < 2:
                    LOG.info("Estimating Skew Angle")
                d0, d1 = flat.shape
                o0, o1 = int(self.parameter['bignore'] * d0), int(
                    self.parameter['bignore'] * d1)
                flat = amax(flat) - flat
                flat -= amin(flat)
                est = flat[o0:d0 - o0, o1:d1 - o1]
                ma = self.parameter['maxskew']
                ms = int(2 * self.parameter['maxskew'] *
                         self.parameter['skewsteps'])
                angle = self.estimate_skew_angle(est,
                                                 linspace(-ma, ma, ms + 1))
                flat = interpolation.rotate(flat,
                                            angle,
                                            mode='constant',
                                            reshape=0)
                flat = amax(flat) - flat
            else:
                angle = 0

            # self.write_angles_to_pageXML(base,angle)
            # estimate low and high thresholds
            if self.parameter['parallel'] < 2:
                LOG.info("Estimating Thresholds")
            d0, d1 = flat.shape
            o0, o1 = int(self.parameter['bignore'] * d0), int(
                self.parameter['bignore'] * d1)
            est = flat[o0:d0 - o0, o1:d1 - o1]
            if self.parameter['escale'] > 0:
                # by default, we use only regions that contain
                # significant variance; this makes the percentile
                # based low and high estimates more reliable
                e = self.parameter['escale']
                v = est - filters.gaussian_filter(est, e * 20.0)
                v = filters.gaussian_filter(v**2, e * 20.0)**0.5
                v = (v > 0.3 * amax(v))
                v = morphology.binary_dilation(v,
                                               structure=ones(
                                                   (int(e * 50), 1)))
                v = morphology.binary_dilation(v,
                                               structure=ones(
                                                   (1, int(e * 50))))
                if self.parameter['debug'] > 0:
                    imshow(v)
                    ginput(1, self.parameter['debug'])
                est = est[v]
            lo = stats.scoreatpercentile(est.ravel(), self.parameter['lo'])
            hi = stats.scoreatpercentile(est.ravel(), self.parameter['hi'])
            # rescale the image to get the gray scale image
            if self.parameter['parallel'] < 2:
                LOG.info("Rescaling")
            flat -= lo
            flat /= (hi - lo)
            flat = clip(flat, 0, 1)
            if self.parameter['debug'] > 0:
                imshow(flat, vmin=0, vmax=1)
                ginput(1, self.parameter['debug'])
            deskewed = 1 * (flat > self.parameter['threshold'])

            # output the normalized grayscale and the thresholded images
            LOG.info("%s lo-hi (%.2f %.2f) angle %4.1f" %
                     (pcgts.get_Page().imageFilename, lo, hi, angle))
            if self.parameter['parallel'] < 2:
                LOG.info("Writing")
            #ocrolib.write_image_binary(base+".ds.png", deskewed)

            #TODO: Need some clarification as the results effect the following pre-processing steps.
            #orientation = -angle
            #orientation = 180 - ((180 - orientation) % 360)
            pcgts.get_Page().set_orientation(angle)
            #print(orientation, angle)

            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
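estimate_skew_angle is not shown in this example. In ocropus-style binarization it is commonly implemented by rotating the image by each candidate angle and picking the one that maximizes the variance of the horizontal projection profile (rows align with text lines at the correct angle). A minimal sketch under that assumption:

import numpy as np
from scipy.ndimage import rotate  # same routine as interpolation.rotate above

def estimate_skew_angle(image, angles):
    """Return the candidate angle whose rotation gives the most line-like
    horizontal profile, i.e. maximal variance of the row means."""
    estimates = []
    for angle in angles:
        rotated = rotate(image, angle, order=0, mode='constant', reshape=False)
        profile = np.mean(rotated, axis=1)      # one value per pixel row
        estimates.append((np.var(profile), angle))
    _, best = max(estimates)
    return best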
Example #28
    def process(self):
        """Performs segmentation evaluation with Shapely on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.
        
        Return information on the plausibility of the segmentation into
        regions on the logging level.
        """
        plausibilize = self.parameter['plausibilize']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
                    # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
                    Labels=[
                        LabelsType(  #externalRef="parameters",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))
            page = pcgts.get_Page()

            regions = page.get_TextRegion()

            mark_for_deletion = set()
            mark_for_merging = set()

            for i in range(0, len(regions)):
                for j in range(i + 1, len(regions)):
                    LOG.info('Comparing regions "%s" and "%s"', regions[i].id,
                             regions[j].id)
                    region_poly1 = Polygon(
                        polygon_from_points(regions[i].get_Coords().points))
                    region_poly2 = Polygon(
                        polygon_from_points(regions[j].get_Coords().points))

                    LOG.debug('Checking for equality ...')
                    equality = region_poly1.almost_equals(region_poly2)
                    if equality:
                        LOG.warning('Regions %s and %s cover the same area.',
                                    regions[i].id, regions[j].id)
                        mark_for_deletion.add(j)

                    LOG.debug('Checking for containment ...')
                    containment_r = region_poly1.contains(region_poly2)
                    containment_l = region_poly2.contains(region_poly1)
                    if containment_r:
                        LOG.warning('Region %s contains %s',
                                    regions[i].id, regions[j].id)
                        mark_for_deletion.add(j)
                    if containment_l:
                        LOG.warning('Region %s contains %s',
                                    regions[j].id, regions[i].id)
                        mark_for_deletion.add(i)

            if plausibilize:
                new_regions = []
                for i in range(0, len(regions)):
                    if i not in mark_for_deletion:
                        new_regions.append(regions[i])
                page.set_TextRegion(new_regions)

                #LOG.info('Intersection %i', region_poly1.intersects(region_poly2))
                #LOG.info('Containment %i', region_poly1.contains(region_poly2))
                #if region_poly1.intersects(region_poly2):
                #    LOG.info('Area 1 %d', region_poly1.area)
                #    LOG.info('Area 2 %d', region_poly2.area)
                #    LOG.info('Area intersect %d', region_poly1.intersection(region_poly2).area)

            # Use input_file's basename for the new file -
            # this way the files retain the same basenames:
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
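Beyond exact equality and full containment, partial overlaps could be scored with Shapely as well, along the lines of the commented-out intersection code above. A minimal sketch (helper name hypothetical), reusing polygon_from_points from the example and computing intersection-over-union for a pair of regions:

from shapely.geometry import Polygon

from ocrd_utils import polygon_from_points

def region_overlap(region1, region2):
    """Intersection-over-union of two PAGE regions' coordinate polygons."""
    poly1 = Polygon(polygon_from_points(region1.get_Coords().points))
    poly2 = Polygon(polygon_from_points(region2.get_Coords().points))
    if not poly1.intersects(poly2):
        return 0.0
    union = poly1.union(poly2).area
    return poly1.intersection(poly2).area / union if union else 0.0

# e.g. treat region_overlap(regions[i], regions[j]) > 0.8 as a merge
# candidate instead of marking either region for deletion.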
Example #29
 def test_concat_padded(self):
     self.assertEqual(concat_padded('x', 0), 'x_0001')
     self.assertEqual(concat_padded('x', 0, 1, 2), 'x_0001_0002_0003')
     self.assertEqual(concat_padded('x', 0, '1', 2), 'x_0001_1_0003')
Example #30
    def process(self):
        """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices).
        
        Pages are processed incrementally: with alternative decoding enabled,
        each page's hypothesis graph is rated in the context of the previous
        page's traceback, so results are written back with a one-page delay
        and the last pending page is flushed after the loop.
        """
        level = self.parameter['textequiv_level']
        beam_width = self.parameter['beam_width']
        lm_weight = self.parameter['lm_weight']

        prev_traceback = None
        prev_pcgts = None
        prev_file_id = None
        prev_page_id = None
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), level)

            # annotate processing metadata:
            metadata = pcgts.get_Metadata()  # ensured by page_from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
                    value='ocrd-keraslm-rate',
                    Labels=[
                        LabelsType(externalRef="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            # context preprocessing:
            # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods),
            #       get global context variables from there (e.g. originInfo/dateIssued/@text for year)
            ident = self.workspace.mets.unique_identifier  # at least try to get purl
            context = [0]
            if ident:
                name = ident.split('/')[-1]
                year = name.split('_')[-1]
                if year.isnumeric():
                    year = ceil(int(year) / 10)
                    context = [year]
                    # todo: author etc

            # create a graph for the linear sequence of elements at the given level:
            graph, start_node, end_node = page_get_linear_graph_at(
                level, pcgts)

            # apply language model to (TextEquiv path in) graph,
            # remove non-path TextEquivs, modify confidences:
            if not self.parameter['alternative_decoding']:
                text = [(edge['element'], edge['alternatives'])
                        for edge in _get_edges(graph, 0)]  # graph's path
                textstring = u''.join(
                    textequivs[0].Unicode
                    for element, textequivs in text)  # same length as text
                LOG.info("Rating %d elements with a total of %d characters",
                         len(text), len(textstring))
                confidences = self.rater.rate(textstring,
                                              context)  # much faster
                i = 0
                for element, textequivs in text:
                    textequiv = textequivs[0]  # 1st choice only
                    if element:
                        element.set_TextEquiv([textequiv])  # delete others
                    textequiv_len = len(textequiv.Unicode)
                    conf = sum(confidences[i:i + textequiv_len]
                               ) / textequiv_len  # mean probability
                    conf2 = textequiv.conf
                    textequiv.set_conf(conf * lm_weight + conf2 *
                                       (1. - lm_weight))
                    i += textequiv_len
                if i != len(confidences):
                    LOG.critical(
                        "Input text length and output scores length are off by %d characters",
                        i - len(confidences))
                avg = sum(confidences) / len(confidences)
                ent = sum([-log(max(p, 1e-99), 2)
                           for p in confidences]) / len(confidences)
                ppl = pow(2.0, ent)  # character level
                ppll = pow(
                    2.0,
                    ent * len(confidences) /
                    len(text))  # textequiv level (including spaces/newlines)
                LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f", avg, ppl,
                         level, ppll)  # character need not always equal glyph!

                # ensure parent textequivs are up to date:
                page_update_higher_textequiv_levels(level, pcgts)

                # write back result
                file_id = input_file.ID.replace(self.input_file_grp,
                                                self.output_file_grp)
                if file_id == input_file.ID:
                    file_id = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=file_id,
                    pageId=input_file.pageId,
                    file_grp=self.output_file_grp,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(pcgts),
                )
            else:
                LOG.info("Rating %d elements including its alternatives",
                         end_node - start_node)
                path, entropy, traceback = self.rater.rate_best(
                    graph,
                    start_node,
                    end_node,
                    start_traceback=prev_traceback,
                    context=context,
                    lm_weight=lm_weight,
                    beam_width=beam_width,
                    beam_clustering_dist=BEAM_CLUSTERING_DIST
                    if BEAM_CLUSTERING_ENABLE else 0)

                if prev_pcgts:
                    _page_update_from_path(level, path, entropy)

                    # ensure parent textequivs are up to date:
                    page_update_higher_textequiv_levels(level, prev_pcgts)

                    # write back result
                    file_id = prev_file_id.replace(self.input_file_grp,
                                                   self.output_file_grp)
                    if file_id == prev_file_id:
                        file_id = concat_padded(self.output_file_grp, n - 1)
                    self.workspace.add_file(
                        ID=file_id,
                        pageId=prev_page_id,
                        file_grp=self.output_file_grp,
                        local_filename=os.path.join(self.output_file_grp,
                                                    file_id + '.xml'),
                        mimetype=MIMETYPE_PAGE,
                        content=to_xml(prev_pcgts),
                    )

                prev_page_id = input_file.pageId
                prev_file_id = input_file.ID
                prev_pcgts = pcgts
                prev_traceback = traceback

        if prev_pcgts:
            path, entropy, _ = self.rater.next_path(prev_traceback[0],
                                                    ([], prev_traceback[1]))
            _page_update_from_path(level, path, entropy)

            # ensure parent textequivs are up to date:
            page_update_higher_textequiv_levels(level, prev_pcgts)

            # write back result
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                pageId=input_file.pageId,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(prev_pcgts),
            )
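The perplexity bookkeeping in the non-alternative branch is easy to check in isolation. The following standalone sketch (function name hypothetical) applies the same formulas to a list of per-character probabilities, with a small worked example:

from math import log

def summarize_char_confidences(confidences, num_elements):
    """Summarize per-character LM probabilities the same way as above:
    mean probability, character-level perplexity, and element-level
    perplexity (the same total entropy spread over fewer units)."""
    avg = sum(confidences) / len(confidences)
    ent = sum(-log(max(p, 1e-99), 2) for p in confidences) / len(confidences)
    ppl = pow(2.0, ent)                                       # per character
    ppll = pow(2.0, ent * len(confidences) / num_elements)    # per element
    return avg, ppl, ppll

# e.g. three characters rated 0.9, 0.5 and 0.8 forming one element:
# avg ~ 0.733, char ppl ~ 1.41, element ppl ~ 2.78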