Example #1
0
 def test_copies_ok(self):
     """Running DummyProcessor copies inputs to OUTPUT, and again to OUTPUT2."""
     with copy_of_directory(assets.url_of('SBB0000F29300010000/data')) as wsdir:
         workspace = Workspace(Resolver(), wsdir)
         input_files = workspace.mets.find_files(fileGrp='OCR-D-IMG')
         self.assertEqual(len(input_files), 3)
         output_files = workspace.mets.find_files(fileGrp='OUTPUT')
         self.assertEqual(len(output_files), 0)
         run_processor(
             DummyProcessor,
             input_file_grp='OCR-D-IMG',
             output_file_grp='OUTPUT',
             workspace=workspace
         )
         output_files = workspace.mets.find_files(fileGrp='OUTPUT')
         output_files.sort(key=lambda x: x.url)
         # Check the counts before indexing into the list, so a wrong
         # number of files yields a clear assertion failure rather than
         # an IndexError.
         self.assertEqual(len(output_files), 6)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*')), 6)
         self.assertEqual(len(workspace.mets.find_files(ID='//OUTPUT.*_PAGE')), 3)
         self.assertEqual(len(workspace.mets.find_files(fileGrp='OUTPUT', mimetype=MIMETYPE_PAGE)), 3)
         self.assertEqual(output_files[0].url, 'OUTPUT/OUTPUT_0001.tif')
         self.assertEqual(output_files[1].url, 'OUTPUT/OUTPUT_0001.xml')
         # The generated PAGE file must reference its own file ID and its image.
         self.assertEqual(page_from_file(output_files[1]).pcGtsId, output_files[1].ID)
         self.assertEqual(page_from_file(output_files[1]).get_Page().imageFilename, output_files[0].url)
         run_processor(
             DummyProcessor,
             input_file_grp='OUTPUT',
             output_file_grp='OUTPUT2',
             workspace=workspace
         )
         output2_files = workspace.mets.find_files(fileGrp='OUTPUT2')
         output2_files.sort(key=lambda x: x.url)
         self.assertEqual(len(output2_files), 3)
Example #2
0
 def test_page_from_file_no_existe(self):
     """page_from_file raises FileNotFoundError for a missing local file."""
     expected_message = "File not found: 'no-existe'"
     with self.assertRaisesRegex(FileNotFoundError, expected_message):
         empty_mets = OcrdMets.empty_mets()
         missing = empty_mets.add_file(
             'FOO', ID='foo', local_filename='no-existe', mimetype='foo/bar')
         page_from_file(missing)
Example #3
0
 def test_rename_file_group(self):
     """Renaming a file group moves files on disk and updates PAGE references."""
     with copy_of_directory(assets.path_to('kant_aufklaerung_1784-page-region-line-word_glyph/data')) as tempdir:
         workspace = Workspace(self.resolver, directory=tempdir)
         with pushd_popd(tempdir):
             pcgts_before = page_from_file(next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001')))
             assert pcgts_before.get_Page().imageFilename == 'OCR-D-IMG/OCR-D-IMG_0001.tif'
             workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')
             # The PAGE document must now point into the renamed group ...
             pcgts_after = page_from_file(next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001')))
             assert pcgts_after.get_Page().imageFilename == 'FOOBAR/OCR-D-IMG_0001.tif'
             # ... and the image must have moved on disk as well.
             assert Path('FOOBAR/OCR-D-IMG_0001.tif').exists()
             assert not Path('OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
Example #4
0
 def process(self):
     """
     Performs the line segmentation.

     For every input PAGE file, runs Tesseract textline detection on each
     TextRegion (cropped via its polygon), appends the detected lines to
     the region, and serializes the updated PAGE document into the output
     file group.
     """
     with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
         for page_no, infile in enumerate(self.input_files):
             pcgts = page_from_file(
                 self.workspace.download_file(infile))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 log.debug("Detecting lines in %s with tesseract",
                           region.id)
                 region_image = self.workspace.resolve_image_as_pil(
                     image_url,
                     polygon_from_points(region.get_Coords().points))
                 tessapi.SetImage(region_image)
                 # Line boxes come back relative to the region crop, so
                 # shift them by the region's own offset.
                 region_xywh = xywh_from_points(region.get_Coords().points)
                 for line_no, component in enumerate(
                         tessapi.GetComponentImages(RIL.TEXTLINE, True)):
                     line_xywh = component[1]
                     line_xywh['x'] += region_xywh['x']
                     line_xywh['y'] += region_xywh['y']
                     region.add_TextLine(TextLineType(
                         id='%s_line%04d' % (region.id, line_no),
                         Coords=CoordsType(points_from_xywh(line_xywh))))
             file_id = concat_padded(self.output_file_grp, page_no)
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 mimetype=MIMETYPE_PAGE,
                 local_filename='%s/%s' % (self.output_file_grp, file_id),
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #5
0
    def process(self):
        """Classify page layout with a pretrained model and record results in METS.

        Loads the model and its class mapping once, then for each input
        PAGE file resizes the binarized page image, runs the classifier,
        and writes the result plus a physical-page link into the METS.

        Exits the process when no GPU is available or the model file is
        missing.
        """
        if not tf.test.is_gpu_available():
            LOG.error("Your system has no CUDA installed. No GPU detected.")
            sys.exit(1)
        model_path = Path(self.parameter['model_path'])
        class_mapper_path = Path(self.parameter['class_mapping_path'])
        if not model_path.is_file():
            LOG.error("""\
                Layout Classfication model was not found at '%s'. Make sure the `model_path` parameter
                points to the local model path.
                model can be downloaded from http://url
                """ % model_path)
            sys.exit(1)

        LOG.info('Loading model from file %s', model_path)
        model = self.create_model(str(model_path))
        # Load the class mapping; a context manager closes the file even
        # if unpickling fails (the handle was previously leaked).
        with open(str(class_mapper_path), "rb") as pickle_in:
            class_indices = pickle.load(pickle_in)
        # Invert the mapping: class index -> class label.
        label_mapping = dict((v, k) for k, v in class_indices.items())

        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            fname = pcgts.get_Page().imageFilename
            page_id = input_file.pageId or input_file.ID

            # Record this processing step (with its parameters) in the PAGE metadata.
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(
                MetadataItemType(
                    type_="processingStep",
                    name=self.ocrd_tool['steps'][0],
                    value=TOOL,
                    Labels=[
                        LabelsType(  #externalRef="parameter",
                            Label=[
                                LabelType(type_=name,
                                          value=self.parameter[name])
                                for name in self.parameter.keys()
                            ])
                    ]))

            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", page_id)

            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized')

            # The model expects a 500x600 image scaled to [0, 1] with
            # batch and channel axes added.
            img_array = ocrolib.pil2array(
                page_image.resize((500, 600), Image.ANTIALIAS))
            img_array = img_array * 1. / 255.
            img_array = img_array[np.newaxis, :, :, np.newaxis]
            results = self.start_test(model, img_array, fname, label_mapping)
            LOG.info(results)
            self.workspace.mets.set_physical_page_for_file(
                "PHYS_000" + str(n), input_file)
            self.create_logmap_smlink(pcgts)
            self.write_to_mets(results, "PHYS_000" + str(n))
Example #6
0
 def zip_input_files(self, ifgs):
     """Align the files of several input file groups page by page.

     Arguments:
         ifgs (list): names of the input file groups to align.

     Returns:
         A list of tuples, one per physical page, each holding one file
         per input file group (``None`` where a group has no match).
         Pages with no file in the *first* group are dropped entirely.
     """
     ifts = list()  # file tuples
     for page_id in self.workspace.mets.physical_pages:
         ifiles = list()
         for ifg in ifgs:
             LOG.debug("adding input file group %s to page %s", ifg,
                       page_id)
             files = self.workspace.mets.find_files(pageId=page_id,
                                                    fileGrp=ifg)
             if not files:
                 # fall back for missing pageId via Page imageFilename:
                 # parse every PAGE file of the group and match the page
                 # through the pageId of its referenced image file.
                 all_files = self.workspace.mets.find_files(fileGrp=ifg)
                 for file_ in all_files:
                     pcgts = page_from_file(
                         self.workspace.download_file(file_))
                     image_url = pcgts.get_Page().get_imageFilename()
                     img_files = self.workspace.mets.find_files(
                         url=image_url)
                     if img_files and img_files[0].pageId == page_id:
                         files = [file_]
                         break
             if not files:
                 # other fallback options?
                 LOG.error('found no page %s in file group %s', page_id,
                           ifg)
                 ifiles.append(None)
             else:
                 ifiles.append(files[0])
         # keep only pages that are present in the first group
         if ifiles[0]:
             ifts.append(tuple(ifiles))
     return ifts
Example #7
0
 def process(self):
     """
     Detect words inside each text line with Tesseract and add them to
     the PAGE document, writing one output file per input page.
     """
     with PyTessBaseAPI(
         psm=PSM.SINGLE_LINE,
         path=TESSDATA_PREFIX,
     ) as tessapi:
         for page_no, infile in enumerate(self.input_files):
             pcgts = page_from_file(self.workspace.download_file(infile))
             page = pcgts.get_Page()
             image_url = page.imageFilename
             for region in page.get_TextRegion():
                 for line in region.get_TextLine():
                     log.debug("Detecting words in line '%s'", line.id)
                     line_image = self.workspace.resolve_image_as_pil(
                         image_url, polygon_from_points(line.get_Coords().points))
                     tessapi.SetImage(line_image)
                     # Word boxes are relative to the line crop; shift by the line offset.
                     line_offset = xywh_from_points(line.get_Coords().points)
                     for word_no, component in enumerate(tessapi.GetComponentImages(RIL.WORD, True)):
                         word_xywh = component[1]
                         word_xywh['x'] += line_offset['x']
                         word_xywh['y'] += line_offset['y']
                         line.add_Word(WordType(
                             id='%s_word%04d' % (line.id, word_no),
                             Coords=CoordsType(points_from_xywh(word_xywh))))
             file_id = concat_padded(self.output_file_grp, page_no)
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 local_filename='%s/%s' % (self.output_file_grp, file_id),
                 mimetype=MIMETYPE_PAGE,
                 content=to_xml(pcgts).encode('utf-8'),
             )
Example #8
0
    def process(self):
        """Performs segmentation evaluation with Shapely on the workspace.

        Open and deserialize PAGE input files and their respective images,
        then iterate over the element hierarchy down to the region level.

        Compare region polygons of the first (ground-truth) input file
        group with each of the other groups.
        """

        ifgs = self.input_file_grp.split(",") # input file groups
        if len(ifgs) < 2:
            raise Exception("need multiple input file groups to compare")

        # get input files:
        ifts = self._zip_input_files(ifgs) # input file tuples
        for ift in ifts:
            pages = []
            page_id = None
            for i, input_file in enumerate(ift):
                if not input_file:
                    # file/page was not found in this group
                    continue
                if not i:
                    # Remember the page ID here instead of reading it from
                    # the loop variable afterwards, which could be None
                    # (skipped file) once the loop has finished.
                    page_id = input_file.pageId
                    LOG.info("processing page %s", page_id)
                LOG.info("INPUT FILE for '%s': '%s'", ifgs[i], input_file.ID)
                pcgts = page_from_file(self.workspace.download_file(input_file))
                pages.append(pcgts.get_Page())
            if not pages:
                # nothing to compare on this page at all
                continue
            # First group is the ground truth; compare all others against it.
            gt_page = pages[0]
            for pred_page in pages[1:]:
                self._compare_segmentation(gt_page, pred_page, page_id)
Example #9
0
    def validate(filename=None,
                 ocrd_page=None,
                 ocrd_file=None,
                 strictness='strict',
                 strategy='index1'):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            strictness (string): 'strict', 'lax', 'fix' or 'off'
            strategy (string): Currently only 'index1'

        Returns:
            report (:class:`ValidationReport`) Report on the validity
        """
        # Resolve the page source first; precedence is ocrd_page over
        # ocrd_file over filename.
        if ocrd_page:
            page = ocrd_page
        elif ocrd_file:
            page = page_from_file(ocrd_file)
        elif filename:
            page = parse(filename, silence=True)
        else:
            raise Exception(
                "At least one of ocrd_page, ocrd_file or filename must be set")
        validator = PageValidator(page, strictness, strategy)
        return validator._validate()  # pylint: disable=protected-access
Example #10
0
    def process(self):
        """Deskew each page and save the updated PAGE document.

        Splits the output file group into a PAGE group and an image group
        (falling back to a default image group when only one is given),
        then processes every input file at the configured operation level.
        """
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            # Only one group given: use it for PAGE output, default for images.
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            # Record this processing step (with its parameters) in the PAGE metadata.
            metadata = pcgts.get_Metadata()
            metadata.add_MetadataItem(MetadataItemType(
                type_="processingStep",
                name=self.ocrd_tool['steps'][0],
                value=TOOL,
                Labels=[LabelsType(  #externalRef="parameters",
                    Label=[LabelType(type_=name, value=self.parameter[name])
                           for name in self.parameter.keys()])]))
            page = pcgts.get_Page()
            angle = page.get_orientation()
            if angle:
                LOG.warning('Overwriting existing deskewing angle: %i', angle)
            # Request a page image that has not been deskewed yet.
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_filter='deskewed')

            if oplevel != "page":
                LOG.warning('Operation level %s, but should be "page".',
                            oplevel)
                break
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)

            # Derive the output file ID from the input's by swapping the
            # group prefix, so basenames stay stable across groups; fall
            # back to a padded ID if the prefix did not occur.
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #11
0
 def process(self):
     """
     Performs the binarization.

     Binarizes every text line of every region with kraken's nlbin and
     stores each result as a PNG file in the output file group.
     """
     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         for region in pcgts.get_Page().get_TextRegion():
             textlines = region.get_TextLine()
             log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
             for (line_no, line) in enumerate(textlines):
                 log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
                 image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
                 bin_image = kraken.binarization.nlbin(image)
                 bin_image_bytes = io.BytesIO()
                 bin_image.save(bin_image_bytes, format='PNG')
                 # BUGFIX: include region and line in the ID — with only
                 # the page index, every line of a page got the same file
                 # ID and the outputs clobbered each other.
                 ID = concat_padded(self.output_file_grp, n, region.id, line_no)
                 self.workspace.add_file(
                     self.output_file_grp,
                     pageId=input_file.pageId,
                     ID=ID,
                     basename="%s.bin.png" % ID,
                     mimetype='image/png',
                     content=bin_image_bytes.getvalue())
Example #12
0
 def _resolve_image_file(self, input_file: OcrdFile) -> str:
     """Return the path of the image belonging to *input_file*.

     For a PAGE file the image filename is read from the parsed
     document; any other mimetype is taken to be the image itself.
     """
     if input_file.mimetype != MIMETYPE_PAGE:
         return input_file.local_filename
     pcgts = page_from_file(self.workspace.download_file(input_file))
     return pcgts.get_Page().imageFilename
Example #13
0
 def test_page_from_file(self):
     """An image file yields a generated PCGTS whose ID matches the file's."""
     ocrd_file = create_ocrd_file_with_defaults(mimetype='image/tiff',
                                                local_filename=SAMPLE_IMG,
                                                ID='file1')
     self.assertEqual(ocrd_file.mimetype, 'image/tiff')
     pcgts = page_from_file(ocrd_file)
     self.assertEqual(pcgts.pcGtsId, ocrd_file.ID)
     self.assertEqual(pcgts.get_Page().imageWidth, 1457)
Example #14
0
    def process(self):
        """
        Performs the cropping.

        Uses Tesseract block detection to find all content regions on the
        page and sets the page Border to their joint bounding box.
        """
        with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
            for (n, input_file) in enumerate(self.input_files):
                pcgts = page_from_file(self.workspace.download_file(input_file))
                image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename)
                log.debug("Cropping with tesseract")
                tessapi.SetImage(image)

                #
                # helper variables for saving the box coordinates,
                # initialized so the first point always updates them
                #
                min_x = image.width
                min_y = image.height
                max_x = 0
                max_y = 0

                # iterate over all boxes and compare their extent
                # to the min and max values
                for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                    points, index = points_from_xywh(component[1]), component[2]

                    #
                    # the region reference in the reading order element
                    #
                    ID = "region%04d" % index
                    log.debug("Detected region '%s': %s", ID, points)

                    for pair in points.split(' '):
                        x, y = (int(pair.split(',')[0]), int(pair.split(',')[1]))
                        # BUGFIX: the four extreme updates must be independent.
                        # The previous if/elif chain skipped the max checks
                        # whenever a min branch matched, yielding a wrong
                        # bounding box.
                        min_x = min(min_x, x)
                        min_y = min(min_y, y)
                        max_x = max(max_x, x)
                        max_y = max(max_y, y)
                    log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))

                #
                # set the identified page border
                #
                brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)))
                pcgts.get_Page().set_Border(brd)

                ID = concat_padded(self.output_file_grp, n)
                self.workspace.add_file(
                    ID=ID,
                    file_grp=self.output_file_grp,
                    mimetype=MIMETYPE_PAGE,
                    local_filename='%s/%s' % (self.output_file_grp, ID),
                    content=to_xml(pcgts).encode('utf-8'),
                )
Example #15
0
    def process(self):
        """
        Segment with ocropy.

        For each input page, reads the page image as a binary array,
        estimates or takes the configured text scale, computes a line
        segmentation, wraps all detected lines in a single page-sized
        dummy region, and writes the resulting PAGE document into the
        output file group.
        """

        for (n, input_file) in enumerate(self.input_files):
            log.info("INPUT FILE %i / %s", n, input_file)
            downloaded_file = self.workspace.download_file(input_file)
            log.info("downloaded_file %s", downloaded_file)
            pcgts = page_from_file(downloaded_file)
            # page dimensions are needed for the dummy region polygon below
            page_width = pcgts.get_Page().get_imageWidth()
            page_height = pcgts.get_Page().get_imageHeight()
            # TODO binarized variant from get_AlternativeImage()
            image_url = pcgts.get_Page().imageFilename
            log.info("pcgts %s", pcgts)

            # invert the binary image (1 - binary); presumably the
            # segmenter expects foreground as 1 — TODO confirm
            binary = ocrolib.read_image_binary(
                self.workspace.download_url(image_url))
            binary = 1 - binary

            # use the configured scale, or estimate it from the image when 0
            scale = self.parameter['scale'] if self.parameter[
                'scale'] != 0 else psegutils.estimate_scale(binary)
            log.debug(binary)

            pseg = self.compute_segmentation(binary, scale)
            log.debug("pseg=%s", pseg)

            # TODO reading order / enumber
            #  log.debug("finding reading order")
            #  lines = psegutils.compute_lines(pseg, scale)
            #  order = psegutils.reading_order([l.bounds for l in lines])
            #  lsort = psegutils.topsort(order)

            regions = ocrolib.RegionExtractor()
            regions.setPageLines(pseg)

            # a single region spanning the whole page holds all lines,
            # since this segmenter produces no region structure itself
            dummyRegion = TextRegionType(
                id="dummy",
                Coords=CoordsType(
                    points="0,0 %s,0 %s,%s 0,%s" %
                    (page_width, page_width, page_height, page_height)))
            pcgts.get_Page().add_TextRegion(dummyRegion)

            # NOTE(review): loop starts at 1 — presumably index 0 is the
            # page background; confirm against RegionExtractor semantics
            for lineno in range(1, regions.length()):
                log.debug("id=%s bbox=%s", regions.id(lineno),
                          regions.bbox(lineno))
                textline = TextLineType(
                    id=concat_padded("line", lineno),
                    Coords=CoordsType(
                        points=points_from_y0x0y1x1(regions.bbox(lineno))))
                dummyRegion.add_TextLine(textline)
            ID = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(ID=ID,
                                    file_grp=self.output_file_grp,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename="%s/%s.xml" %
                                    (self.output_file_grp, ID),
                                    content=to_xml(pcgts))
Example #16
0
    def process(self):
        """Classify the font/type group of each page image.

        Runs the typegroups classifier over every input page image and
        stores the normalised scores of all non-noise classes as the
        page's ``fontFamily`` text style; pages detected as pure noise
        get their primary script cleared instead.
        """
        network_file = self.parameter['network']
        stride = self.parameter['stride']
        classifier = TypegroupsClassifier.load(network_file)

        # Classes that signal "no actual text" and must not contribute
        # to the script score.
        ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                       'Empty Pages', 'Woodcuts - Engravings')

        self.log.debug('Processing: %s', self.input_files)
        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            image_url = pcgts.get_Page().imageFilename
            pil_image = self.workspace.resolve_image_as_pil(image_url)
            result = classifier.run(pil_image, stride)

            # Sum of positive scores of all non-noise classes, used for
            # normalisation below.
            score_sum = 0
            for typegroup in classifier.classMap.cl2id:
                if typegroup not in ignore_type:
                    score_sum += max(0, result[typegroup])

            script_highscore = 0
            noise_highscore = 0
            result_map = {}
            output = ''
            for typegroup in classifier.classMap.cl2id:
                score = result[typegroup]
                if typegroup in ignore_type:
                    noise_highscore = max(noise_highscore, score)
                else:
                    script_highscore = max(script_highscore, score)
                    # Guard against division by zero when no non-noise
                    # class scored positively.
                    normalised_score = max(0, score / score_sum) if score_sum else 0
                    # NOTE(review): keyed by score, so two classes with an
                    # identical score collide and one is dropped — confirm
                    # whether that is acceptable.
                    result_map[normalised_score] = typegroup
            if noise_highscore > script_highscore:
                pcgts.get_Page().set_primaryScript(None)
                self.log.debug(
                    'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                    noise_highscore, script_highscore)
            else:
                # Build a "class:percent" summary, highest score first,
                # skipping entries that round down to 0%.
                for k in sorted(result_map, reverse=True):
                    intk = round(100 * k)
                    if intk <= 0:
                        continue
                    if output != '':
                        output = '%s, ' % output
                    output = '%s%s:%d' % (output, result_map[k], intk)
                self.log.debug('Detected %s', output)
                page = pcgts.get_Page()
                textStyle = page.get_TextStyle()
                if not textStyle:
                    textStyle = TextStyleType()
                    page.set_TextStyle(textStyle)
                textStyle.set_fontFamily(output)
                ID = concat_padded(self.output_file_grp, input_file.ID)
                self.workspace.add_file(ID=ID,
                                        file_grp=self.output_file_grp,
                                        mimetype=MIMETYPE_PAGE,
                                        local_filename="%s/%s" %
                                        (self.output_file_grp, ID),
                                        content=to_xml(pcgts))
Example #17
0
 def process(self):
     """
     Performs the binarization.

     Depending on the ``level-of-operation`` parameter, binarizes the
     whole page image ('page') or each text line (any other value) with
     kraken's nlbin and saves the results as PNG files in the output
     file group.

     NOTE(review): in the 'block' branch the region image is resolved
     but never binarized or saved — looks unfinished; confirm intent.
     """
     log = getLogger('processor.KrakenBinarize')
     log.debug('Level of operation: "%s"',
               self.parameter['level-of-operation'])
     log.debug('Input file group %s', self.input_file_grp)
     log.debug('Input files %s', [str(f) for f in self.input_files])
     for (n, input_file) in enumerate(self.input_files):
         log.info("INPUT FILE %i / %s", n, input_file)
         pcgts = page_from_file(self.workspace.download_file(input_file))
         image_url = pcgts.get_Page().imageFilename
         log.info("pcgts %s", pcgts)
         if self.parameter['level-of-operation'] == 'page':
             # binarize the full page image and store it as one PNG
             log.info("About to binarize page '%s'", pcgts.pcGtsId)
             image = self.workspace.resolve_image_as_pil(image_url)
             bin_image = kraken.binarization.nlbin(image)
             bin_image_bytes = io.BytesIO()
             bin_image.save(bin_image_bytes, format='PNG')
             ID = concat_padded(self.output_file_grp, n)
             self.workspace.add_file(self.output_file_grp,
                                     pageId=input_file.pageId,
                                     ID=ID,
                                     mimetype='image/png',
                                     local_filename="%s/%s" %
                                     (self.output_file_grp, ID),
                                     content=bin_image_bytes.getvalue())
         else:
             for region in pcgts.get_Page().get_TextRegion():
                 if self.parameter['level-of-operation'] == 'block':
                     log.info("About to binarize region '%s'", region.id)
                     # NOTE(review): the cropped region image is resolved
                     # here but nothing further is done with it (no nlbin,
                     # no add_file) — dead code or missing output step?
                     image = self.workspace.resolve_image_as_pil(
                         image_url,
                         polygon_from_points(region.get_Coords().points))
                 else:
                     textlines = region.get_TextLine()
                     log.info("About to binarize %i lines of region '%s'",
                              len(textlines), region.id)
                     for (line_no, line) in enumerate(textlines):
                         log.debug("Binarizing line '%s' in region '%s'",
                                   line_no, region.id)
                         image = self.workspace.resolve_image_as_pil(
                             image_url,
                             polygon_from_points(line.get_Coords().points))
                         bin_image = kraken.binarization.nlbin(image)
                         bin_image_bytes = io.BytesIO()
                         bin_image.save(bin_image_bytes, format='PNG')
                         # ID encodes page index, region and line to stay unique
                         ID = concat_padded(self.output_file_grp, n,
                                            region.id, line_no)
                         self.workspace.add_file(
                             self.output_file_grp,
                             pageId=input_file.pageId,
                             ID=ID,
                             local_filename="%s/%s" %
                             (self.output_file_grp, ID),
                             mimetype='image/png',
                             content=bin_image_bytes.getvalue())
Example #18
0
 def test_page_from_file(self):
     """An image OcrdFile yields a generated PCGTS whose ID matches the file's."""
     ocrd_file = OcrdFile(None,
                          mimetype='image/tiff',
                          local_filename=SAMPLE_IMG,
                          ID='file1')
     self.assertEqual(ocrd_file.mimetype, 'image/tiff')
     pcgts = page_from_file(ocrd_file)
     self.assertEqual(pcgts.pcGtsId, ocrd_file.ID)
     self.assertEqual(pcgts.get_Page().imageWidth, 1457)
Example #19
0
    def process(self):
        """
        Cut page images into per-line images.

        For every text line of every region, crops the line from the page
        image, binarizes it (Otsu after Gaussian filtering), resizes it to
        48 pixel height, and saves it as a PNG under ``linesdir``.
        """
        linesdir = self.parameter['linesdir']
        # Create the target directory up front; exist_ok avoids the
        # check-then-create race the per-line os.path.exists test had.
        os.makedirs(linesdir, exist_ok=True)

        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            pil_image = self.workspace.resolve_image_as_pil(
                pcgts.get_Page().imageFilename)

            self.log.info("Preparing page '%s'", pcgts.get_pcGtsId())
            page = pcgts.get_Page()

            # region, line, word, or glyph level:
            regions = page.get_TextRegion()
            if not regions:
                self.log.warning("Page contains no text regions")

            # basename of the input file URL (leading '/', extension
            # stripped), used to build unique per-line image names
            index = input_file.url.rfind('/')
            fgrp = input_file.url[index:-4]

            for region in regions:
                self.log.info("Preparing region '%s'", region.id)

                textlines = region.get_TextLine()
                if not textlines:
                    self.log.warning("Region '%s' contains no text lines",
                                     region.id)
                    continue

                for line in textlines:
                    self.log.info("Cutting line '%s'", line.id)

                    # get box from points and crop the line from the page
                    box = bounding_box(line.get_Coords().points)
                    cropped_image = pil_image.crop(box=box)

                    # binarize with Otsu's thresholding after Gaussian filtering
                    bin_image = binarize(cropped_image)

                    # resize image to 48 pixel height
                    final_img = resize_keep_ratio(bin_image)

                    suffix = fgrp + '-' + str(region.id) + '-' + str(
                        line.id) + '.png'
                    final_img.save(linesdir + suffix)
Example #20
0
    def process(self):
        """Perform the (text) recognition on all input files.

        Reads previously produced prediction files from the configured
        lines directory, merges their text into each page's regions via
        ``process_regions``, and writes one PAGE-XML output per page.
        """
        self.maxlevel = self.parameter['textequiv_level']
        linesdir = self.parameter['linesdir']

        if self.maxlevel not in ['line', 'word', 'glyph']:
            raise Exception(
                "currently only implemented at the line/glyph level")

        # First (and only consumed) entry of the walk: the lines dir itself.
        root, _, files = next(os.walk(linesdir))
        self.root = root
        # Basenames of all prediction files, with the trailing
        # 9-character suffix stripped.
        predfiles = [name[:-9] for name in files if '.pred' in name]

        for n, input_file in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))

            self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId())
            page = pcgts.get_Page()

            # File-group prefix: URL basename without its 4-char extension.
            slash = input_file.url.rfind('/') + 1
            fgrp = input_file.url[slash:-4]

            # region, line, word, or glyph level:
            regions = page.get_TextRegion()
            if not regions:
                self.log.warning("Page contains no text regions")

            self.process_regions(regions, predfiles, fgrp)

            ID = concat_padded(self.output_file_grp, n)
            self.log.info('creating file id: %s, name: %s, file_grp: %s', ID,
                          input_file.basename, self.output_file_grp)

            # Keep the input file's basename so the outputs retain
            # recognizable names.
            out = self.workspace.add_file(
                ID=ID,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                basename=self.output_file_grp + '-' + input_file.basename,
                mimetype=MIMETYPE_PAGE,
                content=to_xml(pcgts),
            )
            self.log.info('created file %s', out)
Example #21
0
    def process(self):
        """Segment pages into regions using a Mask R-CNN model.

        For each input PAGE file: loads the raw page image (and, if
        available, a clipped pixel mask), derives the DPI from the image
        metadata, runs ``_process_segment`` on the page, and adds the
        resulting PAGE-XML to the output file group.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        LOG = getLogger('processor.AnybaseocrBlockSegmenter')
        if not tf.test.is_gpu_available():
            LOG.warning(
                "Tensorflow cannot detect CUDA installation. Running without GPU will be slow."
            )

        for input_file in self.input_files:
            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            page_id = input_file.pageId or input_file.ID

            # todo rs: why not cropped?
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page,
                page_id,
                feature_filter='binarized,deskewed,cropped,clipped,non_text')
            # try to load pixel masks
            try:
                # todo rs: this combination only works for tiseg with use_deeplr=true
                mask_image, _, _ = self.workspace.image_from_page(
                    page,
                    page_id,
                    feature_selector='clipped',
                    feature_filter='binarized,deskewed,cropped,non_text')
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the reason. Catch
            # Exception only and record why the mask is unavailable.
            except Exception as err:
                LOG.debug("No clipped mask image for page '%s': %s",
                          page_id, err)
                mask_image = None
            if page_image_info.resolution != 1:
                dpi = page_image_info.resolution
                if page_image_info.resolutionUnit == 'cm':
                    # convert dots-per-cm to dots-per-inch
                    dpi = round(dpi * 2.54)
            else:
                # a resolution of 1 means "unknown"
                dpi = None

            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, mask_image, dpi)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(ID=file_id,
                                    file_grp=self.output_file_grp,
                                    pageId=input_file.pageId,
                                    mimetype=MIMETYPE_PAGE,
                                    local_filename=os.path.join(
                                        self.output_file_grp,
                                        file_id + '.xml'),
                                    content=to_xml(pcgts).encode('utf-8'))
Example #22
0
    def process(self):
        """Process all input files at the configured operation level,
        recording this processing step in each page's metadata and
        writing one PAGE-XML output per input file."""
        # Split the output group into a PAGE group and an image group;
        # fall back to the default image group if only one was given.
        try:
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_IMAGE_GRP
            LOG.info(
                "No output file group for images specified, falling back to '%s'",
                FALLBACK_IMAGE_GRP)
        oplevel = self.parameter['operation_level']

        for n, input_file in enumerate(self.input_files):
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.image_grp)
            page_id = input_file.pageId or input_file.ID
            LOG.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))
            # Record this processing step, with every parameter as a label.
            parameter_labels = [
                LabelType(type_=name, value=self.parameter[name])
                for name in self.parameter
            ]
            pcgts.get_Metadata().add_MetadataItem(
                MetadataItemType(type_="processingStep",
                                 name=self.ocrd_tool['steps'][0],
                                 value=TOOL,
                                 Labels=[LabelsType(Label=parameter_labels)]))
            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id)

            if oplevel == "page":
                self._process_segment(page, page_image.filename, page_id,
                                      file_id + ".ds")

            # Output file id: rename the group prefix; if that changed
            # nothing, generate a padded id instead.
            file_id = input_file.ID.replace(self.input_file_grp,
                                            self.output_file_grp)
            if file_id == input_file.ID:
                file_id = concat_padded(self.output_file_grp, n)

            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'))
    def process(self):
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)
            page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id)            
            # image_coords = pcgts.get_Page().get_Border().get_Coords().points.split()

            # why does it return Image type when there is data (border info from crop)
            print("----------", type(page_image), page_xywh)

            # I: binarized-input-image; imftext: output-text-portion.png; imfimage: output-image-portion.png
            '''
Example #24
0
def test_rename_file_group(tmp_path):
    """Renaming a file group must update PAGE references, files on
    disk, and the physical-page mapping."""
    # arrange: copy the asset workspace into a temp dir and open it
    copytree(
        assets.path_to(
            'kant_aufklaerung_1784-page-region-line-word_glyph/data'),
        str(tmp_path))
    workspace = Workspace(Resolver(), directory=str(tmp_path))

    # TODO clear semantics
    # requires rather odd additional path-setting because root path from
    # workspace is not propagated - works only if called inside workspace
    # which can be achieved with pushd_popd functionalities
    gt_file = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    relative_name = gt_file.local_filename
    gt_file.local_filename = join(tmp_path, relative_name)
    # sanity check before acting: PAGE still points at the old group
    assert page_from_file(gt_file).get_Page().imageFilename == \
        'OCR-D-IMG/OCR-D-IMG_0001.tif'

    # act
    workspace.rename_file_group('OCR-D-IMG', 'FOOBAR')

    # assert: the PAGE file now references the renamed group ...
    renamed = next(workspace.mets.find_files(ID='OCR-D-GT-SEG-WORD_0001'))
    renamed.local_filename = join(tmp_path, relative_name)
    assert page_from_file(renamed).get_Page().imageFilename == \
        'FOOBAR/FOOBAR_0001.tif'
    # ... the image exists only under its new path ...
    assert Path(tmp_path / 'FOOBAR/FOOBAR_0001.tif').exists()
    assert not Path('OCR-D-IMG/OCR-D-IMG_0001.tif').exists()
    # ... and the physical-page mapping followed the rename.
    assert workspace.mets.get_physical_pages(
        for_fileIds=['OCR-D-IMG_0001']) == [None]
    assert workspace.mets.get_physical_pages(
        for_fileIds=['FOOBAR_0001']) == ['phys_0001']
Example #25
0
    def validate(filename=None,
                 ocrd_page=None,
                 ocrd_file=None,
                 page_textequiv_consistency='strict',
                 page_textequiv_strategy='first',
                 check_baseline=True,
                 check_coords=True):
        """
        Validates a PAGE file for consistency by filename, OcrdFile or passing OcrdPage directly.

        Arguments:
            filename (string): Path to PAGE
            ocrd_page (OcrdPage): OcrdPage instance
            ocrd_file (OcrdFile): OcrdFile instance wrapping OcrdPage
            page_textequiv_consistency (string): 'strict', 'lax', 'fix' or 'off'
            page_textequiv_strategy (string): Currently only 'first'
            check_baseline (bool): whether Baseline must be fully within TextLine/Coords
            check_coords (bool): whether *Region/TextLine/Word/Glyph must each be fully
                                 contained within Border/*Region/TextLine/Word, resp.

        Returns:
            report (:class:`ValidationReport`) Report on the validity

        Raises:
            Exception: if none of ocrd_page/ocrd_file/filename is given, or
                the requested strategy / consistency level is not implemented
        """
        log = getLogger('ocrd.page_validator.validate')
        if ocrd_page:
            page = ocrd_page
            file_id = ocrd_page.get_pcGtsId()
        elif ocrd_file:
            page = page_from_file(ocrd_file)
            file_id = ocrd_file.ID
        elif filename:
            page = parse(filename, silence=True)
            file_id = filename
        else:
            raise Exception(
                "At least one of ocrd_page, ocrd_file or filename must be set")
        # BUG FIX: ('first') is just the string 'first', so `not in` was a
        # substring test -- e.g. 'fir' or 'st' would wrongly be accepted.
        # A one-element tuple makes it real membership.
        if page_textequiv_strategy not in ('first',):
            raise Exception("page_textequiv_strategy %s not implemented" %
                            page_textequiv_strategy)
        if page_textequiv_consistency not in ('strict', 'lax', 'fix', 'off'):
            raise Exception(
                "page_textequiv_consistency level %s not implemented" %
                page_textequiv_consistency)
        report = ValidationReport()
        log.info("Validating input file '%s'", file_id)
        validate_consistency(page, page_textequiv_consistency,
                             page_textequiv_strategy, check_baseline,
                             check_coords, report, file_id)
        return report
Example #26
0
 def _validate_dimension(self):
     """
     Validate that PAGE @imageHeight/@imageWidth agree with the
     referenced image's actual pixel dimensions.
     """
     self.log.info('_validate_dimension')
     for page_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
         # Remote files cannot be inspected unless --download was set.
         if not is_local_filename(page_file.url) and not self.download:
             self.report.add_notice("_validate_dimension: Not executed because --download wasn't set and PAGE might reference remote (Alternative)Images <%s>" % page_file.url)
             continue
         page = page_from_file(page_file).get_Page()
         _, _, exif = self.workspace.image_from_page(page, page_file.pageId)
         if exif.height != page.imageHeight:
             self.report.add_error("PAGE '%s': @imageHeight != image's actual height (%s != %s)" % (page_file.ID, page.imageHeight, exif.height))
         if exif.width != page.imageWidth:
             self.report.add_error("PAGE '%s': @imageWidth != image's actual width (%s != %s)" % (page_file.ID, page.imageWidth, exif.width))
Example #27
0
 def _validate_page(self):
     """
     Run PageValidator on the PAGE-XML documents referenced in the METS.
     """
     self.log.debug('_validate_page')
     for ocrd_file in self.mets.find_files(mimetype=MIMETYPE_PAGE):
         # Make sure the file is available locally before parsing it.
         self.workspace.download_file(ocrd_file)
         # Text-equiv strictness and which coordinate checks run are
         # driven by this validator's configuration.
         page_report = PageValidator.validate(ocrd_file=ocrd_file,
                                              page_textequiv_consistency=self.page_strictness,
                                              check_coords=self.page_coordinate_consistency in ['poly', 'both'],
                                              check_baseline=self.page_coordinate_consistency in ['baseline', 'both'])
         pg = page_from_file(ocrd_file)
         # The PAGE document's pcGtsId should mirror the METS file ID;
         # warn (don't fail) on mismatch.
         if pg.pcGtsId != ocrd_file.ID:
             page_report.add_warning('pc:PcGts/@pcGtsId differs from mets:file/@ID: "%s" !== "%s"' % (pg.pcGtsId or '', ocrd_file.ID or ''))
         self.report.merge_report(page_report)
Example #28
0
    def process(self):
        """
        Performs the recognition.

        Runs the Calamari predictor on every text line of every region,
        votes over the per-fold predictions, stores the voted text (with
        confidence) on each line, propagates the text to higher levels,
        and adds one PAGE-XML output per input file.
        """
        self._init_calamari()

        for n, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID
            log.info("INPUT FILE %i / %s", n, page_id)
            pcgts = page_from_file(self.workspace.download_file(input_file))

            page = pcgts.get_Page()
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id)

            for region in page.get_TextRegion():
                region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh)

                textlines = region.get_TextLine()
                log.info("About to recognize %i lines of region '%s'", len(textlines), region.id)
                for line_no, line in enumerate(textlines):
                    log.debug("Recognizing line '%s' in region '%s'", line_no, region.id)

                    line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh)
                    image_array = np.array(line_image, dtype=np.uint8)

                    # Predict with all folds, tag each fold's result, then vote.
                    raw_results = list(self.predictor.predict_raw([image_array], progress_bar=False))[0]
                    for fold, raw in enumerate(raw_results):
                        raw.prediction.id = "fold_{}".format(fold)

                    voted = self.voter.vote_prediction_result(raw_results)
                    voted.id = "voted"

                    line.set_TextEquiv([TextEquivType(
                        Unicode=voted.sentence,
                        conf=voted.avg_char_probability)])

            # Propagate the recognized line text up to word/region levels.
            _page_update_higher_textequiv_levels('line', pcgts)

            file_id = self._make_file_id(input_file, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp, file_id + '.xml'),
                content=to_xml(pcgts))
Example #29
0
 def process(self):
     """Copy input files into the output file group.

     PAGE-XML inputs are re-serialized from the in-memory DOM (with a
     fresh pcGtsId and processing metadata); any other file is copied
     byte-by-byte, and image inputs additionally get a generated
     PAGE-XML file referencing the copied image.
     """
     LOG = getLogger('ocrd.dummy')
     assert_file_grp_cardinality(self.input_file_grp, 1)
     assert_file_grp_cardinality(self.output_file_grp, 1)
     for input_file in self.input_files:
         input_file = self.workspace.download_file(input_file)
         file_id = make_file_id(input_file, self.output_file_grp)
         ext = MIME_TO_EXT.get(input_file.mimetype, '')
         local_filename = join(self.output_file_grp, file_id + ext)
         # FIX: input_file was already downloaded (and rebound) above --
         # the second workspace.download_file() call was redundant.
         pcgts = page_from_file(input_file)
         pcgts.set_pcGtsId(file_id)
         self.add_metadata(pcgts)
         LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
         if input_file.mimetype == MIMETYPE_PAGE:
             # Source file is PAGE-XML: Write out in-memory PcGtsType
             self.workspace.add_file(
                 ID=file_id,
                 file_grp=self.output_file_grp,
                 pageId=input_file.pageId,
                 mimetype=input_file.mimetype,
                 local_filename=local_filename,
                 content=to_xml(pcgts).encode('utf-8'))
         else:
             # Source file is not PAGE-XML: Copy byte-by-byte
             with open(input_file.local_filename, 'rb') as f:
                 content = f.read()
                 self.workspace.add_file(
                     ID=file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=input_file.mimetype,
                     local_filename=local_filename,
                     content=content)
             if input_file.mimetype.startswith('image/'):
                 # write out the PAGE-XML representation for this image
                 page_file_id = file_id + '_PAGE'
                 pcgts.set_pcGtsId(page_file_id)
                 pcgts.get_Page().set_imageFilename(local_filename)
                 page_filename = join(self.output_file_grp, file_id + '.xml')
                 LOG.info("Add PAGE-XML %s generated for %s at %s",
                          page_file_id, file_id, page_filename)
                 self.workspace.add_file(
                     ID=page_file_id,
                     file_grp=self.output_file_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
                     local_filename=page_filename,
                     content=to_xml(pcgts).encode('utf-8'))
    def process(self):
        """Detect text lines, preferably per region (page level is only
        a warned-about fallback), and add one PAGE-XML output per page."""
        LOG = getLogger('OcrdAnybaseocrTextline')

        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        oplevel = self.parameter['operation_level']

        for page_no, input_file in enumerate(self.input_files):
            page_id = input_file.pageId or input_file.ID

            pcgts = page_from_file(self.workspace.download_file(input_file))
            self.add_metadata(pcgts)
            page = pcgts.get_Page()
            LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

            # Require a binarized and deskewed page image.
            page_image, page_xywh, page_image_info = self.workspace.image_from_page(
                page, page_id, feature_selector='binarized,deskewed')

            if oplevel == 'page':
                LOG.warning("Operation level should be region.")
                self._process_segment(page_image, page, None, page_xywh,
                                      page_id, input_file, page_no)
            else:
                regions = page.get_TextRegion()
                if not regions:
                    LOG.warning("Page '%s' contains no text regions", page_id)
                    continue
                for region_no, region in enumerate(regions):
                    region_image, region_xywh = self.workspace.image_from_segment(
                        region, page_image, page_xywh)
                    self._process_segment(region_image, page, region,
                                          region_xywh, region.id, input_file,
                                          region_no)

            file_id = make_file_id(input_file, self.output_file_grp)
            pcgts.set_pcGtsId(file_id)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=MIMETYPE_PAGE,
                local_filename=os.path.join(self.output_file_grp,
                                            file_id + '.xml'),
                content=to_xml(pcgts).encode('utf-8'))