Example #1
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 2, "GT and OCR")
        assert_file_grp_cardinality(self.output_file_grp, 1)

        log = getLogger("processor.OcrdDinglehopperEvaluate")

        metrics = self.parameter["metrics"]
        textequiv_level = self.parameter["textequiv_level"]
        gt_grp, ocr_grp = self.input_file_grp.split(",")

        input_file_tuples = self.zip_input_files(on_error='abort')
        for n, (gt_file, ocr_file) in enumerate(input_file_tuples):
            if not gt_file or not ocr_file:
                # file/page was not found in this group
                continue
            gt_file = self.workspace.download_file(gt_file)
            ocr_file = self.workspace.download_file(ocr_file)
            page_id = gt_file.pageId

            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

            file_id = make_file_id(ocr_file, self.output_file_grp)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass
            cli_process(
                gt_file.local_filename,
                ocr_file.local_filename,
                report_prefix,
                metrics=metrics,
                textequiv_level=textequiv_level,
            )

            # Add reports to the workspace
            for report_suffix, mimetype in [
                [".html", "text/html"],
                [".json", "application/json"],
            ]:
                self.workspace.add_file(
                    ID=file_id + report_suffix,
                    file_grp=self.output_file_grp,
                    pageId=page_id,
                    mimetype=mimetype,
                    local_filename=report_prefix + report_suffix,
                )

            # Clear cache between files
            levenshtein_matrix_cache_clear()
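
Example #1 pairs the GT and OCR file groups page by page via Processor.zip_input_files; Example #9 further down does the same pairing by hand with mets.find_files. A rough, illustrative sketch of that pairing idea, assuming files can be matched on their physical pageId (the helper name is hypothetical, not ocrd's implementation):

    def zip_by_page(workspace, gt_grp, ocr_grp):
        """Yield (gt_file, ocr_file) pairs matched on physical page ID; None where a side is missing."""
        for page_id in workspace.mets.physical_pages:
            gt_files = workspace.mets.find_files(fileGrp=gt_grp, pageId=page_id)
            ocr_files = workspace.mets.find_files(fileGrp=ocr_grp, pageId=page_id)
            yield next(iter(gt_files), None), next(iter(ocr_files), None)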
Example #2
    def process(self):
        """Segment pages into regions using a Mask R-CNN model."""
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        if not tf.test.is_gpu_available():
            LOG.warning(
                "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
            )

        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_id = input_file.pageId or input_file.ID

            # todo rs: why not cropped?
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized,deskewed,cropped,clipped,non_text')
            # try to load pixel masks
            try:
                # todo rs: this combination only works for tiseg with use_deeplr=true
                mask_image, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='clipped',
                    feature_filter='binarized,deskewed,cropped,non_text')
            except Exception:
                mask_image = None
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi = round(dpi * 2.54)
            else:
                dpi = None

            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, mask_image, dpi)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
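
The resolution handling above (and repeated in several examples below) normalises the image metadata to DPI. A small sketch of that idiom, with a hypothetical helper name:

    def image_dpi(page_image_info):
        """Return DPI from OCR-D image metadata, or None when unknown (resolution == 1)."""
        if page_image_info.resolution == 1:
            return None
        dpi = page_image_info.resolution
        if page_image_info.resolutionUnit == 'cm':
            dpi = round(dpi * 2.54)  # pixels per cm -> pixels per inch
        return dpi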
Example #3
    def process(self):
        LOG = getLogger('OcrdAnybaseocrLayoutAnalyser')
        if not tf.test.is_gpu_available():
            LOG.error("Your system has no CUDA installed. No GPU detected.")
            # sys.exit(1)
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        model_path = Path(self.parameter['model_path'])
        class_mapper_path = Path(self.parameter['class_mapping_path'])
        if not model_path.is_file():
            LOG.error("""\
                Layout Classification model was not found at '%s'. Make sure the `model_path` parameter
                points to the local model path.
                model can be downloaded from http://url
                """ % model_path)
            sys.exit(1)
        else:

            LOG.info('Loading model from file %s', model_path)
            model = self.create_model(str(model_path))
            # load the mapping
            with open(str(class_mapper_path), "rb") as pickle_in:
                class_indices = pickle.load(pickle_in)
            label_mapping = dict((v, k) for k, v in class_indices.items())

            # print("INPUT FILE HERE",self.input_files)
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            page_id = input_file.pageId or input_file.ID
            size = 600, 500

            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')

            img_array = ocrolib.pil2array(
                page_image.resize((500, 600), Image.ANTIALIAS))
            img_array = img_array * 1. / 255.
            img_array = img_array[np.newaxis, :, :, np.newaxis]
            results = self.start_test(model, img_array, fname, label_mapping)
            LOG.info(results)
            self.workspace.mets.set_physical_page_for_file(
                "PHYS_000" + str(n), input_file)
            self.create_logmap_smlink(pcgts)
            self.write_to_mets(results, "PHYS_000" + str(n))
Example #4
 def test_assert_file_grp_cardinality(self):
     with self.assertRaisesRegex(AssertionError, "Expected exactly 5 output file groups, but '.'FOO', 'BAR'.' has 2"):
         assert_file_grp_cardinality('FOO,BAR', 5)
     with self.assertRaisesRegex(AssertionError, "Expected exactly 1 output file group, but '.'FOO', 'BAR'.' has 2"):
         assert_file_grp_cardinality('FOO,BAR', 1)
     assert_file_grp_cardinality('FOO,BAR', 2)
     with self.assertRaisesRegex(AssertionError, r"Expected exactly 1 output file group .foo bar., but '.'FOO', 'BAR'.' has 2"):
         assert_file_grp_cardinality('FOO,BAR', 1, 'foo bar')
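
The test in Example #4 pins down the contract that all the other examples rely on. A minimal sketch of a function satisfying these assertions (the real implementation lives in ocrd_utils and may differ):

    def assert_file_grp_cardinality(file_grps, n, msg=None):
        """Assert that the comma-separated fileGrp string names exactly n groups."""
        if isinstance(file_grps, str):
            file_grps = file_grps.split(',')
        assert len(file_grps) == n, \
            "Expected exactly %d output file group%s%s, but '%s' has %d" % (
                n,
                '' if n == 1 else 's',
                ' (%s)' % msg if msg else '',
                file_grps,
                len(file_grps))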
Example #5
 def process(self):
     LOG = getLogger('ocrd.dummy')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for input_file in self.input_files:
         input_file = self.workspace.download_file(input_file)
         file_id = make_file_id(input_file, self.output_file_grp)
         ext = MIME_TO_EXT.get(input_file.mimetype, '')
         local_filename = join(self.output_file_grp, file_id + ext)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         pcgts.set_pcGtsId(file_id)
         self.add_metadata(pcgts)
         LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
         if input_file.mimetype == MIMETYPE_PAGE:
             # Source file is PAGE-XML: Write out in-memory PcGtsType
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=to_xml(pcgts).encode('utf-8'))
         else:
             # Source file is not PAGE-XML: Copy byte-by-byte
             with open(input_file.local_filename, 'rb') as f:
                 content = f.read()
                 self.workspace.add_file(
                     ID=file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=content)
             if input_file.mimetype.startswith('image/'):
                 # write out the PAGE-XML representation for this image
                 page_file_id = file_id + '_PAGE'
                 pcgts.set_pcGtsId(page_file_id)
                 pcgts.get_Page().set_imageFilename(local_filename)
                 page_filename = join(self.output_file_grp, file_id + '.xml')
                 LOG.info("Add PAGE-XML %s generated for %s at %s",
                          page_file_id, file_id, page_filename)
                 self.workspace.add_file(
                     ID=page_file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename=page_filename,
                     content=to_xml(pcgts).encode('utf-8'))
Example #6
    def process(self):
        LOG = getLogger('OcrdAnybaseocrTextline')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed')

            if oplevel == 'page':
                LOG.warning("Operation level should be region.")
                self._process_segment(page_image, page, None, page_xywh,
                                      page_id, input_file, n)

            else:
                regions = page.get_TextRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                    continue
                for (k, region) in enumerate(regions):

                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)

                    self._process_segment(region_image, page, region,
                                          region_xywh, region.id, input_file,
                                          k)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #7
    def process(self):
        """Performs border detection on the workspace. """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('OcrdAnybaseocrCropper')

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                            left, top, right, bottom)

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id,
                feature_filter='cropped',
                feature_selector='binarized') # should also be deskewed

            if oplevel == "page":
                self._process_segment(
                    page_image, page, page_coords, page_id, input_file, n)
            else:
                raise Exception(
                    'Operation level %s, but should be "page".' % oplevel)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8')
            )
Example #8
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']
        LOG = getLogger('OcrdAnybaseocrBinarizer')

        for (n, input_file) in enumerate(self.input_files):
            file_id = make_file_id(input_file, self.output_file_grp)
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            pcgts.set_pcGtsId(file_id)
            self.add_metadata(pcgts)

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter="binarized")
            LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for (k, region) in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    self._process_segment(region_image, page, region_xywh,
                                          region.id, input_file,
                                          str(n) + "_" + str(k))

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #9
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 2, 'GT and OCR')
        assert_file_grp_cardinality(self.output_file_grp, 1)

        log = getLogger('processor.OcrdDinglehopperEvaluate')  # logger assumed, as in Example #1
        metrics = self.parameter['metrics']
        gt_grp, ocr_grp = self.input_file_grp.split(',')
        for n, page_id in enumerate(self.workspace.mets.physical_pages):
            gt_file = self.workspace.mets.find_files(fileGrp=gt_grp,
                                                     pageId=page_id)[0]
            ocr_file = self.workspace.mets.find_files(fileGrp=ocr_grp,
                                                      pageId=page_id)[0]
            gt_file = self.workspace.download_file(gt_file)
            ocr_file = self.workspace.download_file(ocr_file)
            log.info("INPUT FILES %i / %s↔ %s", n, gt_file, ocr_file)

            file_id = make_file_id(ocr_file, self.output_file_grp)
            report_prefix = os.path.join(self.output_file_grp, file_id)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass
            cli_process(gt_file.local_filename,
                        ocr_file.local_filename,
                        report_prefix,
                        metrics=metrics)

            # Add reports to the workspace
            for report_suffix, mimetype in [
                ['.html', 'text/html'],
                ['.json', 'application/json'],
            ]:
                self.workspace.add_file(ID=file_id + report_suffix,
                                        file_grp=self.output_file_grp,
                                        pageId=page_id,
                                        mimetype=mimetype,
                                        local_filename=report_prefix + report_suffix)

            # Clear cache between files
            levenshtein_matrix_cache_clear()
Example #10
    def process(self):
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']

        LOG = getLogger('OcrdAnybaseocrDeskewer')

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='deskewed',
                feature_selector='binarized')

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #11
    def process(self):
        LOG = getLogger('processor.KerasRate')  # logger assumed; not defined in this excerpt
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), self.parameter['textequiv_level'])
            self._process_page(pcgts)

            # write back result
            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
Example #12
    def process(self):
        LOG = getLogger('OcrdAnybaseocrTiseg')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for input_file in self.input_files:
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)

            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            if self.parameter['use_deeplr']:
                kwargs = {'feature_filter': 'binarized,deskewed,cropped'}
            else:
                # _should_ also be deskewed and cropped, but no need to enforce that here
                kwargs = {'feature_selector': 'binarized'}
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, **kwargs)

            self._process_segment(page, page_image, page_coords, page_id,
                                  input_file)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'),
            )
Example #13
 def process(self):
     LOG = getLogger('eynollah')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for n, input_file in enumerate(self.input_files):
         page_id = input_file.pageId or input_file.ID
         LOG.info("INPUT FILE %s (%d/%d) ", page_id, n + 1, len(self.input_files))
         pcgts = page_from_file(self.workspace.download_file(input_file))
         LOG.debug('width %s height %s', pcgts.get_Page().imageWidth, pcgts.get_Page().imageHeight)
         self.add_metadata(pcgts)
         page = pcgts.get_Page()
         # XXX loses DPI information
         # page_image, _, _ = self.workspace.image_from_page(page, page_id, feature_filter='binarized')
         image_filename = self.workspace.download_file(next(self.workspace.mets.find_files(url=page.imageFilename))).local_filename
         eynollah_kwargs = {
             'dir_models': self.resolve_resource(self.parameter['models']),
             'allow_enhancement': False,
             'curved_line': self.parameter['curved_line'],
             'full_layout': self.parameter['full_layout'],
             'allow_scaling': self.parameter['allow_scaling'],
             'headers_off': self.parameter['headers_off'],
             'override_dpi': self.parameter['dpi'],
             'logger': LOG,
             'pcgts': pcgts,
             'image_filename': image_filename
             }
         Eynollah(**eynollah_kwargs).run()
         file_id = make_file_id(input_file, self.output_file_grp)
         pcgts.set_pcGtsId(file_id)
         self.workspace.add_file(
             ID=file_id,
             file_grp=self.output_file_grp,
             pageId=page_id,
             mimetype=MIMETYPE_PAGE,
             local_filename=join(self.output_file_grp, file_id) + '.xml',
             content=to_xml(pcgts))
Example #14
    def process(self):
        """Rates textual annotation of PAGE input files, producing output files with LM scores (and choices).
        
        ... explain incremental page-wise processing here ...
        """
        LOG = getLogger('processor.KerasRate')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        level = self.parameter['textequiv_level']
        beam_width = self.parameter['beam_width']
        lm_weight = self.parameter['lm_weight']

        prev_traceback = None
        prev_pcgts = None
        prev_file = None
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            LOG.info("Scoring text in page '%s' at the %s level",
                     pcgts.get_pcGtsId(), level)

            # annotate processing metadata:
            metadata = pcgts.get_Metadata()  # ensured by page_from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=OCRD_TOOL['tools']['ocrd-keraslm-rate']['steps'][0],
                    value='ocrd-keraslm-rate',
                    Labels=[
                        LabelsType(externalRef="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            # context preprocessing:
            # todo: as soon as we have true MODS meta-data in METS (dmdSec/mdWrap/xmlData/mods),
            #       get global context variables from there (e.g. originInfo/dateIssued/@text for year)
            ident = self.workspace.mets.unique_identifier  # at least try to get purl
            context = [0]
            if ident:
                name = ident.split('/')[-1]
                year = name.split('_')[-1]
                if year.isnumeric():
                    year = ceil(int(year) / 10)
                    context = [year]
                    # todo: author etc

            # create a graph for the linear sequence of elements at the given level:
            graph, start_node, end_node = page_get_linear_graph_at(
                level, pcgts)

            # apply language model to (TextEquiv path in) graph,
            # remove non-path TextEquivs, modify confidences:
            if not self.parameter['alternative_decoding']:
                text = [(edge['element'], edge['alternatives'])
                        for edge in _get_edges(graph, 0)]  # graph's path
                textstring = u''.join(
                    textequivs[0].Unicode
                    for element, textequivs in text)  # same length as text
                LOG.info("Rating %d elements with a total of %d characters",
                         len(text), len(textstring))
                confidences = self.rater.rate(textstring,
                                              context)  # much faster
                i = 0
                for element, textequivs in text:
                    textequiv = textequivs[0]  # 1st choice only
                    if element:
                        element.set_TextEquiv([textequiv])  # delete others
                    textequiv_len = len(textequiv.Unicode)
                    conf = sum(confidences[i:i + textequiv_len]
                               ) / textequiv_len  # mean probability
                    conf2 = textequiv.conf
                    textequiv.set_conf(conf * lm_weight + conf2 *
                                       (1. - lm_weight))
                    i += textequiv_len
                if i != len(confidences):
                    LOG.critical(
                        "Input text length and output scores length are off by %d characters",
                        i - len(confidences))
                avg = sum(confidences) / len(confidences)
                ent = sum([-log(max(p, 1e-99), 2)
                           for p in confidences]) / len(confidences)
                ppl = pow(2.0, ent)  # character level
                ppll = pow(2.0, ent * len(confidences) / len(text))
                # textequiv level (including spaces/newlines)
                LOG.info("avg: %.3f, char ppl: %.3f, %s ppl: %.3f", avg, ppl,
                         level, ppll)  # character need not always equal glyph!

                # ensure parent textequivs are up to date:
                page_update_higher_textequiv_levels(level, pcgts)

                # write back result
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    pageId=input_file.pageId,
                    file_grp=self.output_file_grp,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    mimetype=MIMETYPE_PAGE,
                    content=to_xml(pcgts),
                )
            else:
                LOG.info("Rating %d elements including its alternatives",
                         end_node - start_node)
                path, entropy, traceback = self.rater.rate_best(
                    graph,
                    start_node,
                    end_node,
                    start_traceback=prev_traceback,
                    context=context,
                    lm_weight=lm_weight,
                    beam_width=beam_width,
                    beam_clustering_dist=BEAM_CLUSTERING_DIST
                    if BEAM_CLUSTERING_ENABLE else 0)

                if prev_pcgts:
                    _page_update_from_path(level, path, entropy)

                    # ensure parent textequivs are up to date:
                    page_update_higher_textequiv_levels(level, prev_pcgts)

                    # write back result
                    file_id = make_file_id(prev_file, self.output_file_grp)
                    prev_pcgts.set_pcGtsId(file_id)
                    self.workspace.add_file(
                        ID=file_id,
                        pageId=prev_file.pageId,
                        file_grp=self.output_file_grp,
                        local_filename=os.path.join(self.output_file_grp,
                                                    file_id + '.xml'),
                        mimetype=MIMETYPE_PAGE,
                        content=to_xml(prev_pcgts),
                    )

                prev_file = input_file
                prev_pcgts = pcgts
                prev_traceback = traceback

        if prev_pcgts:
            path, entropy, _ = self.rater.next_path(prev_traceback[0],
                                                    ([], prev_traceback[1]))
            _page_update_from_path(level, path, entropy)

            # ensure parent textequivs are up to date:
            page_update_higher_textequiv_levels(level, prev_pcgts)

            # write back result
            file_id = make_file_id(input_file, self.output_file_grp)
            prev_pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                pageId=input_file.pageId,
                file_grp=self.output_file_grp,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                mimetype=MIMETYPE_PAGE,
                content=to_xml(prev_pcgts),
            )
Example #15
    def process(self):
        """Despeckle the pages / regions / lines of the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``level-of-operation``.

        Next, for each file, crop each segment image according to the layout
        annotation (via coordinates into the higher-level image, or from the
        alternative image). Then despeckle by removing connected components
        smaller than ``noise_maxsize``. Apply results to the image and export
        it as an image file.

        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-DESPECK`` along with further
        identification of the input element.

        Reference each new image in the AlternativeImage of the element.

        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.OcropyDenoise')
        level = self.parameter['level-of-operation']
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            LOG.info("INPUT FILE %i / %s", n, input_file.pageId
                     or input_file.ID)
            file_id = make_file_id(input_file, self.output_file_grp)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_selector='binarized' if level == 'page' else '')
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                LOG.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            if level == 'page':
                self.process_segment(page, page_image, page_xywh, zoom,
                                     input_file.pageId, file_id)
            else:
                regions = page.get_AllRegions(classes=['Text'],
                                              order='reading-order')
                if not regions:
                    LOG.warning('Page "%s" contains no text regions', page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region,
                        page_image,
                        page_xywh,
                        feature_selector='binarized'
                        if level == 'region' else '')
                    if level == 'region':
                        self.process_segment(region, region_image, region_xywh,
                                             zoom, input_file.pageId,
                                             file_id + '_' + region.id)
                        continue
                    lines = region.get_TextLine()
                    if not lines:
                        LOG.warning(
                            'Page "%s" region "%s" contains no text lines',
                            page_id, region.id)
                    for line in lines:
                        line_image, line_xywh = self.workspace.image_from_segment(
                            line,
                            region_image,
                            region_xywh,
                            feature_selector='binarized')
                        self.process_segment(
                            line, line_image, line_xywh, zoom,
                            input_file.pageId,
                            file_id + '_' + region.id + '_' + line.id)

            # update METS (add the PAGE file):
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            LOG.info('created file ID: %s, file_grp: %s, path: %s', file_id,
                     self.output_file_grp, out.local_filename)
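
The docstring of Example #15 describes despeckling as removing connected components smaller than ``noise_maxsize``. A self-contained sketch of that core step (illustrative only, not ocrd_cis/ocropy's actual code; ``noise_maxsize`` is interpreted here as a pixel count):

    import numpy as np
    from scipy import ndimage

    def despeckle(binary, noise_maxsize):
        """Remove foreground components with fewer than noise_maxsize pixels."""
        labels, _ = ndimage.label(binary)
        sizes = np.bincount(labels.ravel())
        keep = sizes >= noise_maxsize
        keep[0] = True  # label 0 is background; always keep
        return np.where(keep[labels], binary, 0)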
Example #16
    def process(self):
        """Perform OCR recognition with Tesseract on the workspace.
        
        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the requested
        ``textequiv_level`` if it exists and ``overwrite_words`` is disabled,
        or to the line level otherwise. In the latter case,
        (remove any existing segmentation below the line level, and)
        create new segmentation below the line level if necessary.
        
        Set up Tesseract to recognise each segment's image (either from
        AlternativeImage or cropping the bounding box rectangle and masking
        it from the polygon outline) with the appropriate mode and ``model``.
        
        Put text and confidence results into the TextEquiv at ``textequiv_level``,
        removing any existing TextEquiv.
        
        Finally, make the higher levels consistent with these results by concatenation,
        ordered as appropriate for its readingDirection, textLineOrder, and ReadingOrder,
        and joined by whitespace, as appropriate for the respective level and Relation/join
        status.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        LOG.debug("TESSDATA: %s, installed Tesseract models: %s",
                  *get_languages())

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        maxlevel = self.parameter['textequiv_level']
        model = get_languages()[1][-1]  # last installed model
        if 'model' in self.parameter:
            model = self.parameter['model']
            for sub_model in model.split('+'):
                if sub_model not in get_languages()[1]:
                    raise Exception("configured model " + sub_model +
                                    " is not installed")

        with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi:
            LOG.info("Using model '%s' in %s for recognition at the %s level",
                     model,
                     get_languages()[0], maxlevel)
            if maxlevel == 'glyph':
                # populate GetChoiceIterator() with LSTM models, too:
                tessapi.SetVariable("lstm_choice_mode",
                                    "2")  # aggregate symbols
                tessapi.SetVariable("lstm_choice_iterations",
                                    "15")  # squeeze out more best paths
            # TODO: maybe warn/raise when illegal combinations or characters not in the model unicharset?
            if self.parameter['char_whitelist']:
                tessapi.SetVariable("tessedit_char_whitelist",
                                    self.parameter['char_whitelist'])
            if self.parameter['char_blacklist']:
                tessapi.SetVariable("tessedit_char_blacklist",
                                    self.parameter['char_blacklist'])
            if self.parameter['char_unblacklist']:
                tessapi.SetVariable("tessedit_char_unblacklist",
                                    self.parameter['char_unblacklist'])
            # todo: determine relevancy of these variables:
            # tessapi.SetVariable("tessedit_single_match", "0")
            #
            # tessedit_load_sublangs
            # tessedit_preserve_min_wd_len 2
            # tessedit_prefer_joined_punct 0
            # tessedit_write_rep_codes 0
            # tessedit_parallelize 0
            # tessedit_zero_rejection 0
            # tessedit_zero_kelvin_rejection 0
            # tessedit_reject_mode 0
            # tessedit_use_reject_spaces 1
            # tessedit_fix_fuzzy_spaces 1
            # tessedit_char_blacklist
            # tessedit_char_whitelist
            # chs_leading_punct ('`"
            # chs_trailing_punct1 ).,;:?!
            # chs_trailing_punct2 )'`"
            # numeric_punctuation .,
            # unrecognised_char |
            # ok_repeated_ch_non_alphanum_wds -?*=
            # conflict_set_I_l_1 Il1[]
            # preserve_interword_spaces 0
            # tessedit_enable_dict_correction 0
            # tessedit_enable_bigram_correction 1
            # stopper_smallword_size 2
            # wordrec_max_join_chunks 4
            # suspect_space_level 100
            # suspect_short_words 2
            # language_model_ngram_on 0
            # language_model_ngram_order 8
            # language_model_min_compound_length 3
            # language_model_penalty_non_freq_dict_word 0.1
            # language_model_penalty_non_dict_word 0.15
            # language_model_penalty_punc 0.2
            # language_model_penalty_case 0.1
            # language_model_penalty_script 0.5
            # language_model_penalty_chartype 0.3
            # language_model_penalty_spacing 0.05
            # textord_max_noise_size 7
            # enable_noise_removal 1
            # classify_bln_numeric_mode 0
            # lstm_use_matrix 1
            # user_words_file
            # user_patterns_file
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                page = pcgts.get_Page()

                # add metadata about this operation and its runtime parameters:
                metadata = pcgts.get_Metadata()  # ensured by from_file()
                metadata.add_MetadataItem(
                    MetadataItemType(
                        type_="processingStep",
                        name=self.ocrd_tool['steps'][0],
                        value=TOOL,
                        Labels=[
                            LabelsType(externalModel="ocrd-tool",
                                       externalId="parameters",
                                       Label=[
                                           LabelType(
                                               type_=name,
                                               value=self.parameter[name])
                                           for name in self.parameter.keys()
                                       ])
                        ]))
                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Processing page '%s'", page_id)
                regions = list(itertools.chain.from_iterable(
                    [page.get_TextRegion()] + [
                        subregion.get_TextRegion()
                        for subregion in page.get_TableRegion()
                    ]))
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                else:
                    self._process_regions(tessapi, regions, page_image,
                                          page_xywh)
                page_update_higher_textequiv_levels(maxlevel, pcgts)

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Example #17
    def process(self):
        """Performs heuristic page frame detection (cropping) on the workspace.
        
        Open and deserialize PAGE input files and their respective images.
        (Input should be deskewed already.) Retrieve the raw (non-binarized,
        uncropped) page image.
        
        Detect line segments via edge gradients, and cluster them into contiguous
        horizontal and vertical lines if possible. If candidates which are located
        at the margin and long enough (covering a large fraction of the page) exist
        on all four sides, then pick the best (i.e. thickest, longest and inner-most)
        one on each side and use their intersections as border points.
        
        Otherwise, first try to detect a ruler (i.e. image segment depicting a rule
        placed on the scan/photo for scale references) via thresholding and contour
        detection, identifying a single large rectangular region with a certain aspect
        ratio. Suppress (mask) any such segment during further calculations.

        Next in that line, try to detect text segments on the page. For that purpose,
        get the gradient of grayscale image, threshold and morphologically close it,
        then determine contours to define approximate text boxes. Merge these into
        columns, filtering candidates too small or entirely in the margin areas.
        Finally, merge the remaining columns across short gaps. If only one column
        remains, and it covers a significant fraction of the page, pick that segment
        as solution.
        
        Otherwise, keep the border points derived from line segments (intersecting
        with the full image on each side without line candidates).
        
        Lastly, map coordinates to the original (undeskewed) image and intersect
        the border polygon with the full image frame. Use that to define the page's
        Border.
        
        Moreover, crop (and mask) the image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.
        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self.logger = getLogger('processor.AnybaseocrCropper')

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            self.logger.info("INPUT FILE %i / %s", n, page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            # Check for existing Border --> already cropped
            border = page.get_Border()
            if border:
                left, top, right, bottom = bbox_from_points(
                    border.get_Coords().points)
                self.logger.warning('Overwriting existing Border: %i:%i,%i:%i',
                                    left, top, right, bottom)

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page,
                page_id,  # should be deskewed already
                feature_filter='cropped,binarized,grayscale_normalized')
            if self.parameter['dpi'] > 0:
                zoom = 300.0 / self.parameter['dpi']
            elif page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    dpi *= 2.54
                self.logger.info('Page "%s" uses %f DPI', page_id, dpi)
                zoom = 300.0 / dpi
            else:
                zoom = 1

            self._process_page(page, page_image, page_coords, input_file, zoom)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #18
    def process(self):
        """Detect font shapes via rule-based OCR with Tesseract on the workspace.
        
        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the line level.
        
        Set up Tesseract to recognise each word's image (either from
        AlternativeImage or cropping the bounding box rectangle and masking
        it from the polygon outline) in word mode and with the ``osd`` model.
        
        Query the result's font attributes and write them into the word element's
        ``TextStyle``.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrFontShape')
        LOG.debug("TESSDATA: %s, installed Tesseract models: %s",
                  *get_languages())

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        model = self.parameter['model']
        if model not in get_languages()[1]:
            raise Exception(
                "model " + model +
                " (needed for font style detection) is not installed")

        with PyTessBaseAPI(
                path=get_tessdata_path(),
                # oem=OEM.TESSERACT_LSTM_COMBINED,  # legacy required for OSD or WordFontAttributes!
                oem=OEM.TESSERACT_ONLY,  # legacy required for OSD or WordFontAttributes!
                lang=model) as tessapi:
            LOG.info(
                "Using model '%s' in %s for recognition at the word level",
                model,
                get_languages()[0])
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Processing page '%s'", page_id)
                regions = page.get_AllRegions(classes=['Text'])
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                else:
                    self._process_regions(tessapi, regions, page_image,
                                          page_coords)

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
Example #19
    def process(self):
        """Performs deskewing of the page / region with Tesseract on the workspace.

        Open and deserialise PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level
        for all text and table regions.

        Set up Tesseract to recognise the region image's orientation, skew
        and script (with both OSD and AnalyseLayout). Rotate the image
        accordingly, and annotate the angle, readingDirection and textlineOrder.
        
        Create a corresponding image file, and reference it as AlternativeImage
        in the element. Add the new image file to the workspace with the fileGrp USE
        given in the second position of the output fileGrp, or ``OCR-D-IMG-DESKEW``,
        and an ID based on input file and input element.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrDeskew')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        oplevel = self.parameter['operation_level']

        with PyTessBaseAPI(
                path=TESSDATA_PREFIX,
                lang="osd",  # osd required for legacy init!
                oem=OEM.TESSERACT_LSTM_COMBINED,  # legacy required for OSD!
                psm=PSM.AUTO_OSD) as tessapi:
            for n, input_file in enumerate(self.input_files):
                file_id = make_file_id(input_file, self.output_file_grp)
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                pcgts.set_pcGtsId(file_id)
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been rotated already,
                    # (we will overwrite @orientation anyway,)
                    # abort if no such image can be produced:
                    feature_filter='deskewed' if oplevel == 'page' else '')
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                LOG.info("Deskewing on '%s' level in page '%s'", oplevel,
                         page_id)

                if oplevel == 'page':
                    self._process_segment(tessapi, page, page_image, page_xywh,
                                          "page '%s'" % page_id,
                                          input_file.pageId, file_id)
                else:
                    regions = page.get_TextRegion() + page.get_TableRegion()
                    if not regions:
                        LOG.warning("Page '%s' contains no text or table regions",
                                    page_id)
                    for region in regions:
                        region_image, region_xywh = self.workspace.image_from_segment(
                            region,
                            page_image,
                            page_xywh,
                            # image must not have been rotated already,
                            # (we will overwrite @orientation anyway,)
                            # abort if no such image can be produced:
                            feature_filter='deskewed')
                        self._process_segment(tessapi, region, region_image,
                                              region_xywh,
                                              "region '%s'" % region.id,
                                              input_file.pageId,
                                              file_id + '_' + region.id)

                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
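# Note: the DPI resolution above (parameter override > image metadata >
# Tesseract's own estimate) recurs verbatim in most processors below. A
# minimal sketch of a shared helper; the name `_resolve_dpi` is an
# assumption, not part of the original sources:
def _resolve_dpi(parameter, page_image_info, page_id, log):
    """Return the effective DPI for Tesseract, or 0 to let it estimate one."""
    if parameter['dpi'] > 0:
        dpi = parameter['dpi']
        log.info("Page '%s' images will use %d DPI from parameter override",
                 page_id, dpi)
    elif page_image_info.resolution != 1:
        dpi = page_image_info.resolution
        if page_image_info.resolutionUnit == 'cm':
            # pixels per centimetre to pixels per inch
            dpi = round(dpi * 2.54)
        log.info("Page '%s' images will use %d DPI from image meta-data",
                 page_id, dpi)
    else:
        dpi = 0
        log.info("Page '%s' images will use DPI estimated from segmentation",
                 page_id)
    return dpi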
Example #20
    def process(self):
        LOG = getLogger('processor.RepairInconsistencies')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()

            # add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            regions = []
            regions.extend(page.get_TextRegion())
            for special_region in (page.get_TableRegion() +
                                   page.get_GraphicRegion()):
                regions.extend(special_region.get_TextRegion())

            for region in regions:
                textLineOrder = 'top-to-bottom'
                for segment in [region, page]:
                    if segment.textLineOrder is None:
                        continue
                    textLineOrder = segment.textLineOrder
                    break
                if textLineOrder not in ['top-to-bottom', 'bottom-to-top']:
                    LOG.info(
                        'Not processing page "%s" region "%s" (textLineOrder=%s)',
                        page_id, region.id, textLineOrder)
                    continue

                _fix_segment(region,
                             page_id,
                             reverse=(textLineOrder == 'bottom-to-top'))

                lines = region.get_TextLine()
                for line in lines:
                    readingDirection = 'left-to-right'
                    for segment in [line, region, page]:
                        if segment.readingDirection is None:
                            continue
                        readingDirection = segment.readingDirection
                        break
                    if readingDirection not in [
                            'left-to-right', 'right-to-left'
                    ]:
                        LOG.info(
                            'Not processing page "%s" line "%s" (readingDirection=%s)',
                            page_id, line.id, readingDirection)
                        continue

                    _fix_segment(line,
                                 page_id,
                                 reverse=(readingDirection == 'right-to-left'))

                    words = line.get_Word()
                    for word in words:
                        readingDirection = 'left-to-right'
                        for segment in [word, line, region, page]:
                            if segment.readingDirection is None:
                                continue
                            readingDirection = segment.readingDirection
                            break
                        if readingDirection not in [
                                'left-to-right', 'right-to-left'
                        ]:
                            LOG.info(
                                'Not processing page "%s" word "%s" (readingDirection=%s)',
                                page_id, word.id, readingDirection)
                            continue

                        _fix_segment(
                            word,
                            page_id,
                            reverse=(readingDirection == 'right-to-left'))

            file_id = make_file_id(input_file, self.output_file_grp)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
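# The attribute lookups above walk the segment hierarchy (word > line >
# region > page) three times with the same fallback pattern. A hypothetical
# helper (`_inherited_attribute` is not in the original code) could express
# the chain once:
def _inherited_attribute(name, segments, default):
    """Return the first non-None value of attribute `name` along `segments`."""
    for segment in segments:
        value = getattr(segment, name)
        if value is not None:
            return value
    return default

# equivalent to the word-level lookup above:
#   readingDirection = _inherited_attribute(
#       'readingDirection', [word, line, region, page], 'left-to-right')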
Example #21
    def process(self):
        log = getLogger('processor.OcrdSbbTextlineDetectorRecognize')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, input_file)

            file_id = make_file_id(input_file, self.output_file_grp)

            # Process the files
            try:
                os.mkdir(self.output_file_grp)
            except FileExistsError:
                pass

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = \
                self.workspace.image_from_page(
                        page, page_id,
                        feature_filter='cropped,binarized,grayscale_normalized'
                )

            with tempfile.TemporaryDirectory() as tmp_dirname:
                # Save the image
                # mkstemp returns (fd, path); close the fd so it does not leak
                fd, image_file = tempfile.mkstemp(dir=tmp_dirname,
                                                  suffix='.png')
                os.close(fd)
                page_image.save(image_file)

                # Segment the image
                model = self.parameter['model']
                x = textline_detector(image_file, tmp_dirname, file_id, model)
                x.run()

                # Read segmentation results
                tmp_filename = os.path.join(tmp_dirname, file_id) + '.xml'
                tmp_pcgts = ocrd_models.ocrd_page.parse(tmp_filename,
                                                        silence=True)
                tmp_page = tmp_pcgts.get_Page()

            # Create a new PAGE file from the input file
            pcgts.set_pcGtsId(file_id)
            page = pcgts.get_Page()

            # Merge results → PAGE file

            # 1. Border
            if page.get_Border():
                log.warning("Page already contained a border")
            # We need to translate the coordinates:
            text_border = tmp_page.get_Border()
            coords = text_border.get_Coords().get_points()
            polygon = polygon_from_points(coords)
            polygon_new = coordinates_for_segment(polygon, page_image,
                                                  page_coords)
            points_new = points_from_polygon(polygon_new)
            coords_new = CoordsType(points=points_new)
            text_border.set_Coords(coords_new)
            page.set_Border(text_border)

            # 2. ReadingOrder
            if page.get_ReadingOrder():
                log.warning("Page already contained a reading order")
            page.set_ReadingOrder(tmp_page.get_ReadingOrder())

            # 3. TextRegion
            if page.get_TextRegion():
                log.warning("Page already contained text regions")
            # We need to translate the coordinates:
            text_regions_new = []
            for text_region in tmp_page.get_TextRegion():
                coords = text_region.get_Coords().get_points()
                polygon = polygon_from_points(coords)
                polygon_new = coordinates_for_segment(polygon, page_image,
                                                      page_coords)
                points_new = points_from_polygon(polygon_new)
                coords_new = CoordsType(points=points_new)
                text_region.set_Coords(coords_new)
                text_regions_new.append(text_region)
            page.set_TextRegion(text_regions_new)

            # Save metadata about this operation
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=page_id,
                mimetype='application/vnd.prima.page+xml',
                local_filename=os.path.join(self.output_file_grp, file_id) +
                '.xml',
                content=ocrd_models.ocrd_page.to_xml(pcgts))
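# The Border and TextRegion blocks above repeat the same four-step coordinate
# translation. A minimal sketch of a shared helper, assuming the hypothetical
# name `_translate_segment_coords`:
def _translate_segment_coords(segment, page_image, page_coords):
    """Rewrite a segment's Coords from the derived image frame back to
    absolute page coordinates."""
    polygon = polygon_from_points(segment.get_Coords().get_points())
    polygon = coordinates_for_segment(polygon, page_image, page_coords)
    segment.set_Coords(CoordsType(points=points_from_polygon(polygon)))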
Example #22
    def process(self):
        """Performs word segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the textline level,
        and remove any existing Word elements (unless ``overwrite_words``
        is False).
        
        Set up Tesseract to detect words, and add each one to the line
        at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrSegmentWord')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_words = self.parameter['overwrite_words']

        with PyTessBaseAPI(
            psm=PSM.SINGLE_LINE,
            path=TESSDATA_PREFIX
        ) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()
                
                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info("Page '%s' images will use %d DPI from parameter override", page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info("Page '%s' images will use %d DPI from image meta-data", page_id, dpi)
                else:
                    dpi = 0
                    LOG.info("Page '%s' images will use DPI estimated from segmentation", page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                
                for region in page.get_TextRegion():
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    for line in region.get_TextLine():
                        if line.get_Word():
                            if overwrite_words:
                                LOG.info('removing existing Words in line "%s"', line.id)
                                line.set_Word([])
                            else:
                                LOG.warning('keeping existing Words in line "%s"', line.id)
                        LOG.debug("Detecting words in line '%s'", line.id)
                        line_image, line_coords = self.workspace.image_from_segment(
                            line, region_image, region_coords)
                        tessapi.SetImage(line_image)
                        for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True, raw_image=True)):
                            word_id = '%s_word%04d' % (line.id, word_no)
                            word_polygon = polygon_from_xywh(component[1])
                            word_polygon = coordinates_for_segment(word_polygon, line_image, line_coords)
                            word_polygon2 = polygon_for_parent(word_polygon, line)
                            if word_polygon2 is None:
                                # clipping to the line can fail, e.g. due to rotation
                                LOG.info('Ignoring word outside line: %s',
                                         points_from_polygon(word_polygon))
                                continue
                            word_points = points_from_polygon(word_polygon2)
                            line.add_Word(WordType(
                                id=word_id, Coords=CoordsType(word_points)))
                            
                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(
                    ID=file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=os.path.join(self.output_file_grp,
                                                file_id + '.xml'),
                    content=to_xml(pcgts))
Example #23
    def process(self):
        """
        Perform text recognition with Calamari on the workspace.

        If ``textequiv_level`` is ``word`` or ``glyph``, then additionally create word / glyph level segments by
        splitting at white space characters / glyph boundaries. In the case of ``glyph``, add all alternative character
        hypotheses down to ``glyph_conf_cutoff`` confidence threshold.
        """
        log = getLogger('processor.CalamariRecognize')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_coords, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector=self.features)

            for region in page.get_AllRegions(classes=['Text']):
                region_image, region_coords = self.workspace.image_from_segment(
                    region,
                    page_image,
                    page_coords,
                    feature_selector=self.features)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                line_images_np = []
                line_coordss = []
                for line in textlines:
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line,
                        region_image,
                        region_coords,
                        feature_selector=self.features)
                    if ('binarized' not in line_coords['features']
                            and 'grayscale_normalized'
                            not in line_coords['features']
                            and self.network_input_channels == 1):
                        # We cannot use a feature selector for this since we don't
                        # know whether the model expects (has been trained on)
                        # binarized or grayscale images; but raw images are likely
                        # always inadequate:
                        log.warning(
                            "Using raw image for line '%s' in region '%s'",
                            line.id, region.id)

                    line_image = line_image if all(line_image.size) else [[0]]
                    line_image_np = np.array(line_image, dtype=np.uint8)
                    line_images_np.append(line_image_np)
                    line_coordss.append(line_coords)
                raw_results_all = self.predictor.predict_raw(
                    line_images_np, progress_bar=False)

                for line, line_coords, raw_results in zip(
                        textlines, line_coordss, raw_results_all):

                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        yield word
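                    # _words yields maximal runs of spaces and non-spaces in
                    # input order, e.g.
                    #   list(_words("ab  cd e")) == ["ab", "  ", "cd", " ", "e"]
                    # so summing token lengths below keeps `i` aligned with
                    # `positions`.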

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            self.add_metadata(pcgts)
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
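# _page_update_higher_textequiv_levels is defined alongside the processor
# upstream; a rough sketch of its assumed behavior for the 'line' level used
# above (the real implementation handles further levels and separators):
def _page_update_higher_textequiv_levels(level, pcgts):
    """Regenerate region TextEquivs by concatenating their line texts."""
    assert level == 'line'  # this sketch only covers the case used above
    for region in pcgts.get_Page().get_AllRegions(classes=['Text']):
        region_text = '\n'.join(
            line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else ''
            for line in region.get_TextLine())
        region.set_TextEquiv([TextEquivType(Unicode=region_text)])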
Example #24
    def process(self):
        """Performs table cell segmentation with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the block level
        for table regions. If ``overwrite_regions`` is enabled and any
        layout annotation already exists inside, then remove it.
        
        Set up Tesseract to detect text blocks (as table cells).
        (This is not Tesseract's internal table structure recognition,
        but the general page segmentation.)
        Add each to the block at the detected coordinates.
        
        Produce a new output file by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrSegmentTable')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        overwrite_regions = self.parameter['overwrite_regions']

        with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here, so we won't get
            # tables inside tables, but try to analyse them as
            # independent text/line blocks:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                page_image, page_coords, page_image_info = self.workspace.image_from_page(
                    page, page_id)
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))

                #
                # prepare dict of reading order
                reading_order = dict()
                ro = page.get_ReadingOrder()
                if not ro:
                    LOG.warning("Page '%s' contains no ReadingOrder", page_id)
                    rogroup = None
                else:
                    rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
                    page_get_reading_order(reading_order, rogroup)
                #
                # dive into regions
                regions = page.get_TableRegion()
                for region in regions:
                    # delete or warn of existing regions:
                    if region.get_TextRegion():
                        if overwrite_regions:
                            LOG.info(
                                'removing existing TextRegions in block "%s" of page "%s"',
                                region.id, page_id)
                            for subregion in region.get_TextRegion():
                                if subregion.id in reading_order:
                                    regionref = reading_order[subregion.id]
                                    # could be any of the 6 types above:
                                    regionrefs = rogroup.__getattribute__(
                                        regionref.__class__.__name__.replace(
                                            'Type', ''))
                                    # remove in-place
                                    regionrefs.remove(regionref)
                                    # TODO: adjust index to make contiguous again?
                            region.set_TextRegion([])
                        else:
                            LOG.warning(
                                'keeping existing TextRegions in block "%s" of page "%s"',
                                region.id, page_id)
                    # get region image
                    region_image, region_coords = self.workspace.image_from_segment(
                        region, page_image, page_coords)
                    tessapi.SetImage(region_image)
                    LOG.info("Detecting table cells in region '%s'", region.id)
                    #
                    # detect the region segments:
                    tessapi.SetPageSegMode(PSM.SPARSE_TEXT)  # retrieve "cells"
                    # TODO: we should XY-cut the sparse cells and regroup them into consistent cells
                    layout = tessapi.AnalyseLayout()
                    roelem = reading_order.get(region.id)
                    if not roelem:
                        LOG.warning(
                            "Page '%s' table region '%s' is not referenced in reading order (%s)",
                            page_id, region.id, "no target to add cells into")
                    elif isinstance(roelem, (OrderedGroupType,
                                             OrderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an ordered group (%s)",
                            page_id, region.id, "cells will be appended")
                    elif isinstance(roelem, (UnorderedGroupType,
                                             UnorderedGroupIndexedType)):
                        LOG.warning(
                            "Page '%s' table region '%s' already has an unordered group (%s)",
                            page_id, region.id, "cells will not be appended")
                        roelem = None
                    elif isinstance(roelem, RegionRefIndexedType):
                        # replace regionref by group with same index and ref
                        # (which can then take the cells as subregions)
                        roelem2 = OrderedGroupIndexedType(
                            id=region.id + '_order',
                            index=roelem.index,
                            regionRef=roelem.regionRef)
                        roelem.parent_object_.add_OrderedGroupIndexed(roelem2)
                        roelem.parent_object_.get_RegionRefIndexed().remove(
                            roelem)
                        roelem = roelem2
                    elif isinstance(roelem, RegionRefType):
                        # replace regionref by group with same ref
                        # (which can then take the cells as subregions)
                        roelem2 = OrderedGroupType(id=region.id + '_order',
                                                   regionRef=roelem.regionRef)
                        roelem.parent_object_.add_OrderedGroup(roelem2)
                        roelem.parent_object_.get_RegionRef().remove(roelem)
                        roelem = roelem2
                    self._process_region(layout, region, roelem, region_image,
                                         region_coords)

                file_id = make_file_id(input_file, self.output_file_grp)
                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(force=True,
                                        ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
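# page_get_reading_order is a helper of the upstream module; a sketch of the
# assumed behavior (index reading-order elements by their regionRef,
# recursing into nested groups):
def page_get_reading_order(ro, rogroup):
    """Add all elements of the reading order group `rogroup` to the dict
    `ro`, mapping each regionRef to its reading-order element."""
    regionrefs = []
    if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRefIndexed() +
                      rogroup.get_OrderedGroupIndexed() +
                      rogroup.get_UnorderedGroupIndexed())
    if isinstance(rogroup, (UnorderedGroupType, UnorderedGroupIndexedType)):
        regionrefs = (rogroup.get_RegionRef() +
                      rogroup.get_OrderedGroup() +
                      rogroup.get_UnorderedGroup())
    for elem in regionrefs:
        ro[elem.get_regionRef()] = elem
        if not isinstance(elem, (RegionRefType, RegionRefIndexedType)):
            # groups can nest arbitrarily
            page_get_reading_order(ro, elem)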
Example #25
    def process(self):
        LOG = getLogger('OcrdAnybaseocrDewarper')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        if self.parameter['gpu_id'] > -1 and not torch.cuda.is_available():
            LOG.warning("torch cannot detect CUDA installation.")
            self.parameter['gpu_id'] = -1

        model_path = Path(self.resolve_resource(self.parameter['model_path']))
        if not model_path.is_file():
            LOG.error("pix2pixHD model file was not found at '%s'. "
                      "Make sure this file exists.", model_path)
            sys.exit(1)

        opt, model = prepare_options(
            gpu_id=self.parameter['gpu_id'],
            dataroot=str(Path(self.workspace.directory, self.input_file_grp)),
            model_path=model_path,
            resize_or_crop=self.parameter['imgresize'],
            loadSize=self.parameter['resizeHeight'],
            fineSize=self.parameter['resizeWidth'],
        )

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %s", page_id)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()

            try:
                page_image, page_xywh, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_filter='dewarped',
                    feature_selector='binarized'
                )  # images should be deskewed and cropped
            except Exception:
                page_image, page_xywh, _ = self.workspace.image_from_page(
                    page, page_id, feature_filter='dewarped'
                )  # images should be deskewed and cropped
            if oplevel == 'page':
                dataset = prepare_data(opt, page_image)
                orig_img_size = page_image.size
                self._process_segment(model, dataset, page, page_xywh, page_id,
                                      input_file, orig_img_size, n)
            else:
                # get all text and table regions
                regions = page.get_TextRegion() + page.get_TableRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                for region in regions:
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    # TODO: not tested on regions
                    # TODO: region has to exist as a physical file to be processed by pix2pixHD
                    dataset = prepare_data(opt, region_image)
                    orig_img_size = region_image.size
                    self._process_segment(model, dataset, page, region_xywh,
                                          region.id, input_file, orig_img_size,
                                          n)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #26
    def process(self):
        """Performs page cropping with Tesseract on the workspace.
        
        Open and deserialize PAGE input files and their respective images.
        Set up Tesseract to detect text blocks on each page, and find
        the largest coordinate extent spanning all of them. Use this
        extent in defining a Border, and add that to the page.
        
        Moreover, crop the original image accordingly, and reference the
        resulting image file as AlternativeImage in the Page element.
        
        Add the new image file to the workspace along with the output fileGrp,
        and using a file ID with suffix ``.IMG-CROP`` along with further
        identification of the input element.
        
        Produce new output files by serialising the resulting hierarchy.
        """
        LOG = getLogger('processor.TesserocrCrop')
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        padding = self.parameter['padding']
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            # disable table detection here (tables count as text blocks),
            # because we do not want to risk confusing the spine with
            # a column separator and thus creeping into a neighbouring
            # page:
            tessapi.SetVariable("textord_tabfind_find_tables", "0")
            for (n, input_file) in enumerate(self.input_files):
                page_id = input_file.pageId or input_file.ID
                LOG.info("INPUT FILE %i / %s", n, page_id)
                pcgts = page_from_file(
                    self.workspace.download_file(input_file))
                self.add_metadata(pcgts)
                page = pcgts.get_Page()

                # warn of existing Border:
                border = page.get_Border()
                if border:
                    left, top, right, bottom = bbox_from_points(
                        border.get_Coords().points)
                    LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                left, top, right, bottom)

                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    # image must not have been cropped already,
                    # abort if no such image can be produced:
                    feature_filter='cropped')
                if self.parameter['dpi'] > 0:
                    dpi = self.parameter['dpi']
                    LOG.info(
                        "Page '%s' images will use %d DPI from parameter override",
                        page_id, dpi)
                elif page_image_info.resolution != 1:
                    dpi = page_image_info.resolution
                    if page_image_info.resolutionUnit == 'cm':
                        dpi = round(dpi * 2.54)
                    LOG.info(
                        "Page '%s' images will use %d DPI from image meta-data",
                        page_id, dpi)
                else:
                    dpi = 0
                    LOG.info(
                        "Page '%s' images will use DPI estimated from segmentation",
                        page_id)
                if dpi:
                    tessapi.SetVariable('user_defined_dpi', str(dpi))
                    zoom = 300 / dpi
                else:
                    zoom = 1

                # warn of existing segmentation:
                regions = page.get_TextRegion()
                if regions:
                    min_x = page_image.width
                    min_y = page_image.height
                    max_x = 0
                    max_y = 0
                    for region in regions:
                        left, top, right, bottom = bbox_from_points(
                            region.get_Coords().points)
                        min_x = min(min_x, left)
                        min_y = min(min_y, top)
                        max_x = max(max_x, right)
                        max_y = max(max_y, bottom)
                    LOG.warning(
                        'Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                        min_x, max_x, min_y, max_y)

                LOG.debug("Cropping with Tesseract")
                tessapi.SetImage(page_image)
                # PSM.SPARSE_TEXT: get as much text as possible in no particular order
                # PSM.AUTO (default): includes tables (dangerous)
                tessapi.SetPageSegMode(tesserocr.PSM.SPARSE_TEXT)
                #
                # helper variables for saving the box coordinates
                #
                min_x = page_image.width
                min_y = page_image.height
                max_x = 0
                max_y = 0
                # iterate over all text blocks and compare their
                # bbox extent to the running min and max values
                for component in tessapi.GetComponentImages(
                        tesserocr.RIL.BLOCK, True):
                    image, xywh, index, _ = component
                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    left, top, right, bottom = bbox_from_xywh(xywh)
                    LOG.debug("Detected text region '%s': %i:%i,%i:%i", ID,
                              left, right, top, bottom)
                    # filter region results:
                    bin_bbox = image.getbbox()
                    if not bin_bbox:
                        # this does happen!
                        LOG.info(
                            "Ignoring region '%s' because its binarization is empty",
                            ID)
                        continue
                    width = bin_bbox[2] - bin_bbox[0]
                    if width < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.info(
                            "Ignoring region '%s' because its width is too small (%d)",
                            ID, width)
                        continue
                    height = bin_bbox[3] - bin_bbox[1]
                    if height < 25 / zoom:
                        # we must be conservative here: page numbers are tiny regions, too!
                        LOG.debug(
                            "Ignoring region '%s' because its height is too small (%d)",
                            ID, height)
                        continue
                    min_x = min(min_x, left)
                    min_y = min(min_y, top)
                    max_x = max(max_x, right)
                    max_y = max(max_y, bottom)
                    LOG.info("Updated page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)

                #
                # set the identified page border
                #
                # file_id is needed below even when no valid extent is found:
                file_id = make_file_id(input_file, self.output_file_grp)
                if min_x < max_x and min_y < max_y:
                    # add padding:
                    min_x = max(min_x - padding, 0)
                    max_x = min(max_x + padding, page_image.width)
                    min_y = max(min_y - padding, 0)
                    max_y = min(max_y + padding, page_image.height)
                    LOG.info("Padded page border: %i:%i,%i:%i", min_x, max_x,
                             min_y, max_y)
                    polygon = polygon_from_bbox(min_x, min_y, max_x, max_y)
                    polygon = coordinates_for_segment(polygon, page_image,
                                                      page_xywh)
                    polygon = polygon_for_parent(polygon, page)
                    border = BorderType(
                        Coords=CoordsType(points_from_polygon(polygon)))
                    # intersection with parent could have changed bbox,
                    # so recalculate:
                    bbox = bbox_from_polygon(
                        coordinates_of_segment(border, page_image, page_xywh))
                    # update PAGE (annotate border):
                    page.set_Border(border)
                    # update METS (add the image file):
                    page_image = crop_image(page_image, box=bbox)
                    page_xywh['features'] += ',cropped'
                    file_path = self.workspace.save_image_file(
                        page_image,
                        file_id + '.IMG-CROP',
                        page_id=input_file.pageId,
                        file_grp=self.output_file_grp)
                    # update PAGE (reference the image file):
                    page.add_AlternativeImage(
                        AlternativeImageType(filename=file_path,
                                             comments=page_xywh['features']))
                else:
                    LOG.error("Cannot find valid extent for page '%s'",
                              page_id)

                pcgts.set_pcGtsId(file_id)
                self.workspace.add_file(ID=file_id,
                                        file_grp=self.output_file_grp,
                                        pageId=input_file.pageId,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename=os.path.join(
                                            self.output_file_grp,
                                            file_id + '.xml'),
                                        content=to_xml(pcgts))
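# polygon_for_parent is a helper of the upstream processors; a rough sketch
# of the assumed behavior using shapely (the real implementation differs in
# detail, e.g. it also repairs invalid polygons):
from shapely.geometry import Polygon

def polygon_for_parent(polygon, parent):
    """Clip `polygon` to the parent segment; return None if the intersection
    is empty or degenerate (e.g. after rotation)."""
    child = Polygon(polygon)
    if hasattr(parent, 'get_Coords'):
        clip = Polygon(polygon_from_points(parent.get_Coords().points))
    else:
        # parent is the page: clip to the full page frame
        width, height = parent.get_imageWidth(), parent.get_imageHeight()
        clip = Polygon([(0, 0), (width, 0), (width, height), (0, height)])
    intersection = child.intersection(clip)
    if intersection.is_empty or intersection.geom_type != 'Polygon':
        return None
    # drop the closing point that shapely repeats
    return intersection.exterior.coords[:-1]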
Example #27
    def process(self):
        """
        Performs the recognition.
        """

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self._init_calamari()

        for (n, input_file) in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            for region in page.get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'",
                         len(textlines), region.id)
                for (line_no, line) in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line.id,
                              region.id)

                    line_image, line_coords = self.workspace.image_from_segment(
                        line, region_image, region_xywh)
                    line_image_np = np.array(line_image, dtype=np.uint8)

                    raw_results = list(
                        self.predictor.predict_raw([line_image_np],
                                                   progress_bar=False))[0]
                    for i, p in enumerate(raw_results):
                        p.prediction.id = "fold_{}".format(i)

                    prediction = self.voter.vote_prediction_result(raw_results)
                    prediction.id = "voted"

                    # Build line text on our own
                    #
                    # Calamari does whitespace post-processing on prediction.sentence, while it does not do the same
                    # on prediction.positions. Do it on our own to have consistency.
                    #
                    # XXX Check Calamari's built-in post-processing on prediction.sentence

                    def _sort_chars(p):
                        """Filter and sort chars of prediction p"""
                        chars = p.chars
                        chars = [
                            c for c in chars if c.char
                        ]  # XXX Note that omission probabilities are not normalized?!
                        chars = [
                            c for c in chars if c.probability >=
                            self.parameter['glyph_conf_cutoff']
                        ]
                        chars = sorted(chars,
                                       key=lambda k: k.probability,
                                       reverse=True)
                        return chars

                    def _drop_leading_spaces(positions):
                        return list(
                            itertools.dropwhile(
                                lambda p: _sort_chars(p)[0].char == " ",
                                positions))

                    def _drop_trailing_spaces(positions):
                        return list(
                            reversed(_drop_leading_spaces(
                                reversed(positions))))

                    def _drop_double_spaces(positions):
                        def _drop_double_spaces_generator(positions):
                            last_was_space = False
                            for p in positions:
                                if p.chars[0].char == " ":
                                    if not last_was_space:
                                        yield p
                                    last_was_space = True
                                else:
                                    yield p
                                    last_was_space = False

                        return list(_drop_double_spaces_generator(positions))

                    positions = prediction.positions
                    positions = _drop_leading_spaces(positions)
                    positions = _drop_trailing_spaces(positions)
                    positions = _drop_double_spaces(positions)
                    positions = list(positions)

                    line_text = ''.join(
                        _sort_chars(p)[0].char for p in positions)
                    if line_text != prediction.sentence:
                        log.warning(
                            "Our own line text is not the same as Calamari's: '%s' != '%s'",
                            line_text, prediction.sentence)

                    # Delete existing results
                    if line.get_TextEquiv():
                        log.warning("Line '%s' already contained text results",
                                    line.id)
                    line.set_TextEquiv([])
                    if line.get_Word():
                        log.warning(
                            "Line '%s' already contained word segmentation",
                            line.id)
                    line.set_Word([])

                    # Save line results
                    line_conf = prediction.avg_char_probability
                    line.set_TextEquiv(
                        [TextEquivType(Unicode=line_text, conf=line_conf)])

                    # Save word results
                    #
                    # Calamari OCR does not provide word positions, so we infer word positions from a. text segmentation
                    # and b. the glyph positions. This is necessary because the PAGE XML format enforces a strict
                    # hierarchy of lines > words > glyphs.

                    def _words(s):
                        """Split words based on spaces and include spaces as 'words'"""
                        spaces = None
                        word = ''
                        for c in s:
                            if c == ' ' and spaces is True:
                                word += c
                            elif c != ' ' and spaces is False:
                                word += c
                            else:
                                if word:
                                    yield word
                                word = c
                                spaces = (c == ' ')
                        if word:
                            yield word
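
                    # Hedged example of the segmentation above: for
                    # line_text = "foo  bar ", _words yields
                    # ['foo', '  ', 'bar', ' '] -- runs of spaces are kept as
                    # separate "words" so that string positions stay aligned
                    # with the glyph positions.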

                    if self.parameter['textequiv_level'] in ['word', 'glyph']:
                        word_no = 0
                        i = 0

                        for word_text in _words(line_text):
                            word_length = len(word_text)
                            if not all(c == ' ' for c in word_text):
                                word_positions = positions[i:i + word_length]
                                word_start = word_positions[0].global_start
                                word_end = word_positions[-1].global_end

                                polygon = polygon_from_x0y0x1y1([
                                    word_start, 0, word_end, line_image.height
                                ])
                                points = points_from_polygon(
                                    coordinates_for_segment(
                                        polygon, None, line_coords))
                                # XXX Crop to line polygon?

                                word = WordType(id='%s_word%04d' %
                                                (line.id, word_no),
                                                Coords=CoordsType(points))
                                word.add_TextEquiv(
                                    TextEquivType(Unicode=word_text))

                                if self.parameter[
                                        'textequiv_level'] == 'glyph':
                                    for glyph_no, p in enumerate(
                                            word_positions):
                                        glyph_start = p.global_start
                                        glyph_end = p.global_end

                                        polygon = polygon_from_x0y0x1y1([
                                            glyph_start, 0, glyph_end,
                                            line_image.height
                                        ])
                                        points = points_from_polygon(
                                            coordinates_for_segment(
                                                polygon, None, line_coords))

                                        glyph = GlyphType(
                                            id='%s_glyph%04d' %
                                            (word.id, glyph_no),
                                            Coords=CoordsType(points))

                                        # Add predictions (= TextEquivs)
                                        char_index_start = 1  # Must start with 1, see https://ocr-d.github.io/page#multiple-textequivs
                                        for char_index, char in enumerate(
                                                _sort_chars(p),
                                                start=char_index_start):
                                            glyph.add_TextEquiv(
                                                TextEquivType(
                                                    Unicode=char.char,
                                                    index=char_index,
                                                    conf=char.probability))

                                        word.add_Glyph(glyph)

                                line.add_Word(word)
                                word_no += 1

                            i += word_length

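            # propagate the new line-level text up to the region level, so
            # that the higher-level TextEquivs stay consistent with the lines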
            _page_update_higher_textequiv_levels('line', pcgts)

            # Add metadata about this operation and its runtime parameters:
            metadata = pcgts.get_Metadata()  # ensured by from_file()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(externalModel="ocrd-tool",
                                   externalId="parameters",
                                   Label=[
                                       LabelType(type_=name,
                                                 value=self.parameter[name])
                                       for name in self.parameter.keys()
                                   ])
                    ]))

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts))
Example #28
    def process(self):
        """Recognize lines / words / glyphs of the workspace.

        Open and deserialise each PAGE input file and its respective image,
        then iterate over the element hierarchy down to the requested
        ``textequiv_level``. If any layout annotation below the line level
        already exists, then remove it (regardless of ``textequiv_level``).

        Set up Ocropy to recognise each text line (via coordinates into
        the higher-level image, or from the alternative image; the image
        must have been binarised/grayscale-normalised, deskewed and dewarped
        already). Rescale and pad the image, then recognize.

        Create new elements below the line level, if necessary.
        Put text results and confidence values into new TextEquiv at
        ``textequiv_level``, and make the higher levels consistent with that
        up to the line level (by concatenation joined by whitespace).

        If a TextLine contained any previous text annotation, then compare
        that with the new result by aligning characters and computing the
        Levenshtein distance. Aggregate these scores for each file and print
        the line-wise and the total character error rates (CER).

        Produce a new output file by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        maxlevel = self.parameter['textequiv_level']
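
        # Note on the CER mentioned above: it is the Levenshtein distance
        # normalized by the length of the previous annotation, e.g. comparing
        # "fog" against "f0g" gives a CER of 1/3.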

        for (n, input_file) in enumerate(self.input_files):
            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId
                             or input_file.ID)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
            page = pcgts.get_Page()

            page_image, page_coords, _ = self.workspace.image_from_page(
                page, page_id)

            self.logger.info("Recognizing text in page '%s'", page_id)
            # region, line, word, or glyph level:
            regions = page.get_AllRegions(classes=['Text'])
            if not regions:
                self.logger.warning("Page '%s' contains no text regions",
                                    page_id)
            self.process_regions(regions, maxlevel, page_image, page_coords)

            # update METS (add the PAGE file):
            file_id = make_file_id(input_file, self.output_file_grp)
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            pcgts.set_pcGtsId(file_id)
            out = self.workspace.add_file(ID=file_id,
                                          file_grp=self.output_file_grp,
                                          pageId=input_file.pageId,
                                          local_filename=file_path,
                                          mimetype=MIMETYPE_PAGE,
                                          content=to_xml(pcgts))
            self.logger.info('created file ID: %s, file_grp: %s, path: %s',
                             file_id, self.output_file_grp, out.local_filename)
Example #29
    def process(self):

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('processor.AnybaseocrBlockSegmenter')

        if not tf.test.is_gpu_available():
            LOG.warning(
                "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
            )

        model_path = Path(self.parameter['block_segmentation_model'])
        model_weights = Path(self.parameter['block_segmentation_weights'])

        confidence = self.parameter['DETECTION_MIN_CONFIDENCE']

        class_names = [
            'BG', 'page-number', 'paragraph', 'catch-word', 'heading',
            'drop-capital', 'signature-mark', 'header', 'marginalia',
            'footnote', 'footnote-continued', 'caption', 'endnote', 'footer',
            'keynote', 'image', 'table', 'graphics'
        ]
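        # 'BG' is the mandatory Mask R-CNN background class (id 0); the
        # remaining names map the model's predicted class ids to region
        # labels.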

        if not model_weights.is_file():
            LOG.error(
                """\
                Block segmentation model weights file was not found at '%s'. Make sure the
                `block_segmentation_weights` parameter points to the local model weights path.
                """, model_weights)
            sys.exit(1)

        config = InferenceConfig(confidence)
        mrcnn_model = model.MaskRCNN(mode="inference",
                                     model_dir=str(model_path),
                                     config=config)
        mrcnn_model.load_weights(str(model_weights), by_name=True)
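        # by_name=True matches weights to layers by layer name, so weights can
        # be loaded even if the network topology differs slightly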

        oplevel = self.parameter['operation_level']
        for (n, input_file) in enumerate(self.input_files):

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_id = input_file.pageId or input_file.ID

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized,deskewed,cropped,clipped,non_text')
            # try to load pixel masks
            try:
                mask_image, mask_xywh, mask_image_info = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='clipped',
                    feature_filter='binarized,deskewed,cropped,non_text')
            except Exception:
                # no usable pixel mask for this page
                mask_image = None
            # warn if the page already contains segmentation results
            regions = page.get_TextRegion() + page.get_TableRegion()
            if regions:
                LOG.warning("Page '%s' already contains text/table regions!",
                            page_id)

            if oplevel == "page":
                self._process_segment(page_image, page, page_xywh, page_id,
                                      input_file, n, mrcnn_model, class_names,
                                      mask_image)
            else:
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #30
    def process(self):
        """Perform OCR post-correction with encoder-attention-decoder ANN on the workspace.
        
        Open and deserialise PAGE input files, then iterate over the element hierarchy
        down to the requested `textequiv_level`, making sequences of TextEquiv objects
        as lists of lines. Concatenate their string values, obeying rules of implicit
        whitespace, and map the string positions where the objects start.
        
        Next, transcode the input lines into output lines in parallel, and use
        the retrieved soft alignment scores to calculate hard alignment paths
        between input and output string via Viterbi decoding. Then use those
        to map back the start positions and overwrite each TextEquiv with its
        new content, paying special attention to whitespace:
        
        Distribute edits such that whitespace objects cannot become more than whitespace
        (or be deleted) and that non-whitespace objects must not start or end with
        whitespace (but may contain new whitespace in the middle).
        
        Subsequently, unless processing on the `line` level, make the Word segmentation
        consistent with that result again: merge around deleted whitespace tokens and
        split at whitespace inside non-whitespace tokens.
        
        Finally, make the levels above `textequiv_level` consistent with that
        textual result (via concatenation joined by whitespace).
        
        Produce new output files by serialising the resulting hierarchy.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)
        # Dragging Word/TextLine references along in all lists besides TextEquiv
        # is necessary because the generateDS version of the PAGE-XML model
        # has no references upwards in the hierarchy (from TextEquiv to containing
        # elements, from Glyph/Word/TextLine to Word/TextLine/TextRegion), and
        # its classes are not hashable.
        level = self.parameter['textequiv_level']
        for n, input_file in enumerate(self.input_files):
            self.logger.info("INPUT FILE %i / %s", n, input_file.pageId
                             or input_file.ID)

            pcgts = page_from_file(self.workspace.download_file(input_file))
            page_id = input_file.pageId or input_file.ID  # (PageType has no id)
            self.logger.info("Correcting text in page '%s' at the %s level",
                             page_id, level)

            # annotate processing metadata:
            self.add_metadata(pcgts)

            # get textequiv references for all lines:
            # FIXME: conf with TextEquiv alternatives
            line_sequences = _page_get_line_sequences_at(level, pcgts)

            # concatenate to strings and get dict of start positions to refs:
            input_lines, conf, textequiv_starts, word_starts, textline_starts = (
                _line_sequences2string_sequences(
                    self.s2s.mapping[0],
                    line_sequences,
                    charmap=self.parameter['charmap']))

            # correct string and get input-output alignment:
            # FIXME: split into self.batch_size chunks
            output_lines, output_probs, output_scores, alignments = (
                self.s2s.correct_lines(input_lines,
                                       conf,
                                       fast=self.parameter['fast_mode'],
                                       greedy=self.parameter['fast_mode']))

            # re-align (from alignment scores) and overwrite the textequiv references:
            for (input_line, output_line, output_prob, output_score, alignment,
                 textequivs, words,
                 textlines) in zip(input_lines, output_lines, output_probs,
                                   output_scores, alignments, textequiv_starts,
                                   word_starts, textline_starts):
                self.logger.debug('"%s" -> "%s"', input_line.rstrip('\n'),
                                  output_line.rstrip('\n'))

                # convert soft scores (seen from output) to hard path (seen from input):
                realignment = _alignment2path(alignment, len(input_line),
                                              len(output_line),
                                              1. / self.s2s.voc_size)

                # overwrite TextEquiv references:
                new_sequence = _update_sequence(input_line, output_line,
                                                output_prob, output_score,
                                                realignment, textequivs, words,
                                                textlines)

                # update Word segmentation:
                if level != 'line':
                    _resegment_sequence(new_sequence, level)

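                # output_score is presumably the mean negative log-likelihood
                # of the line, so np.exp() of it is the perplexity reported
                # below: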
                self.logger.info('corrected line with %d elements, ppl: %.3f',
                                 len(new_sequence), np.exp(output_score))

            # make higher levels consistent again:
            page_update_higher_textequiv_levels(level, pcgts)

            # write back result to new annotation:
            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            file_path = os.path.join(self.output_file_grp, file_id + '.xml')
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    local_filename=file_path,
                                    mimetype=MIMETYPE_PAGE,
                                    content=to_xml(pcgts))