def test_orderedgroup_export_order():
    """
    See https://github.com/OCR-D/core/issues/475
    """
    # arrange
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)

    # act
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    xml_before = to_xml(og)
    children = og.get_AllIndexed()

    # assert
    assert len(children) == 22
    assert [c.index for c in children] == list(range(0, 22))
    # mix up the indexes
    children[0].index = 11
    children[11].index = 3
    children[3].index = 0
    assert [c.index for c in children] == [11, 1, 2, 0, 4, 5, 6, 7, 8, 9, 10, 3, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
    assert [c.index for c in og.get_AllIndexed()] == list(range(0, 22))
    assert og.get_AllIndexed()[1].__class__ == OrderedGroupIndexedType
    # serialize and make sure the correct order was serialized
    new_pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    new_og = new_pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    assert [c.index for c in new_og.get_AllIndexed()] == list(range(0, 22))
    xml_after = to_xml(new_og)
def test_to_xml(self):
    # with open('/tmp/test.xml', 'w') as f:
    #     f.write(to_xml(self.pcgts))
    self.assertIn(
        ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"',
        to_xml(self.pcgts)[:1000])
    self.assertIn('</TextRegion', to_xml(self.pcgts))
def process(self):
    LOG = getLogger('ocrd.dummy')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    for input_file in self.input_files:
        input_file = self.workspace.download_file(input_file)
        file_id = make_file_id(input_file, self.output_file_grp)
        ext = MIME_TO_EXT.get(input_file.mimetype, '')
        local_filename = join(self.output_file_grp, file_id + ext)
        pcgts = page_from_file(input_file)  # already downloaded above
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        LOG.info("cp %s %s # %s -> %s", input_file.url, local_filename, input_file.ID, file_id)
        if input_file.mimetype == MIMETYPE_PAGE:
            # Source file is PAGE-XML: write out the in-memory PcGtsType
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=to_xml(pcgts).encode('utf-8'))
        else:
            # Source file is not PAGE-XML: copy byte by byte
            with open(input_file.local_filename, 'rb') as f:
                content = f.read()
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                pageId=input_file.pageId,
                mimetype=input_file.mimetype,
                local_filename=local_filename,
                content=content)
            if input_file.mimetype.startswith('image/'):
                # write out the PAGE-XML representation for this image
                page_file_id = file_id + '_PAGE'
                pcgts.set_pcGtsId(page_file_id)
                pcgts.get_Page().set_imageFilename(local_filename)
                page_filename = join(self.output_file_grp, file_id + '.xml')
                LOG.info("Add PAGE-XML %s generated for %s at %s",
                         page_file_id, file_id, page_filename)
                self.workspace.add_file(
                    ID=page_file_id,
                    file_grp=self.output_file_grp,
                    pageId=input_file.pageId,
                    mimetype=MIMETYPE_PAGE,
                    local_filename=page_filename,
                    content=to_xml(pcgts).encode('utf-8'))
def process(self): """ Performs the line segmentation. """ with PyTessBaseAPI( psm=PSM.SINGLE_LINE, path=TESSDATA_PREFIX, ) as tessapi: for (n, input_file) in enumerate(self.input_files): pcgts = page_from_file(self.workspace.download_file(input_file)) image_url = pcgts.get_Page().imageFilename for region in pcgts.get_Page().get_TextRegion(): for line in region.get_TextLine(): log.debug("Detecting words in line '%s'", line.id) image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points)) tessapi.SetImage(image) offset = xywh_from_points(line.get_Coords().points) for (word_no, component) in enumerate(tessapi.GetComponentImages(RIL.WORD, True)): word_id = '%s_word%04d' % (line.id, word_no) word_xywh = component[1] word_xywh['x'] += offset['x'] word_xywh['y'] += offset['y'] line.add_Word(WordType(id=word_id, Coords=CoordsType(points_from_xywh(word_xywh)))) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, local_filename='%s/%s' % (self.output_file_grp, ID), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts).encode('utf-8'), )
def process(self): """ Performs the line segmentation. """ with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: for (n, input_file) in enumerate(self.input_files): pcgts = page_from_file( self.workspace.download_file(input_file)) image_url = pcgts.get_Page().imageFilename for region in pcgts.get_Page().get_TextRegion(): log.debug("Detecting lines in %s with tesseract", region.id) image = self.workspace.resolve_image_as_pil( image_url, polygon_from_points(region.get_Coords().points)) tessapi.SetImage(image) offset = xywh_from_points(region.get_Coords().points) for (line_no, component) in enumerate( tessapi.GetComponentImages(RIL.TEXTLINE, True)): line_id = '%s_line%04d' % (region.id, line_no) line_xywh = component[1] line_xywh['x'] += offset['x'] line_xywh['y'] += offset['y'] line_points = points_from_xywh(line_xywh) region.add_TextLine( TextLineType(id=line_id, Coords=CoordsType(line_points))) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename='%s/%s' % (self.output_file_grp, ID), content=to_xml(pcgts).encode('utf-8'), )
def process(self): try: self.page_grp, self.image_grp = self.output_file_grp.split(',') except ValueError: self.page_grp = self.output_file_grp self.image_grp = FALLBACK_IMAGE_GRP LOG.info( "No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP) oplevel = self.parameter['operation_level'] for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) metadata = pcgts.get_Metadata() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, Labels=[ LabelsType( #externalRef="parameters", Label=[ LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) page = pcgts.get_Page() angle = page.get_orientation() if angle: LOG.warning('Overwriting existing deskewing angle: %i', angle) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter='deskewed') if oplevel == "page": self._process_segment(page_image, page, page_xywh, page_id, input_file, n) else: LOG.warning('Operation level %s, but should be "page".', oplevel) break # Use input_file's basename for the new file - # this way the files retain the same basenames: file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'))
def process(self): """ Segment with ocropy """ for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) downloaded_file = self.workspace.download_file(input_file) log.info("downloaded_file %s", downloaded_file) pcgts = page_from_file(downloaded_file) page_width = pcgts.get_Page().get_imageWidth() page_height = pcgts.get_Page().get_imageHeight() # TODO binarized variant from get_AlternativeImage() image_url = pcgts.get_Page().imageFilename log.info("pcgts %s", pcgts) binary = ocrolib.read_image_binary( self.workspace.download_url(image_url)) binary = 1 - binary scale = self.parameter['scale'] if self.parameter[ 'scale'] != 0 else psegutils.estimate_scale(binary) log.debug(binary) pseg = self.compute_segmentation(binary, scale) log.debug("pseg=%s", pseg) # TODO reading order / enumber # log.debug("finding reading order") # lines = psegutils.compute_lines(pseg, scale) # order = psegutils.reading_order([l.bounds for l in lines]) # lsort = psegutils.topsort(order) regions = ocrolib.RegionExtractor() regions.setPageLines(pseg) dummyRegion = TextRegionType( id="dummy", Coords=CoordsType( points="0,0 %s,0 %s,%s 0,%s" % (page_width, page_width, page_height, page_height))) pcgts.get_Page().add_TextRegion(dummyRegion) for lineno in range(1, regions.length()): log.debug("id=%s bbox=%s", regions.id(lineno), regions.bbox(lineno)) textline = TextLineType( id=concat_padded("line", lineno), Coords=CoordsType( points=points_from_y0x0y1x1(regions.bbox(lineno)))) dummyRegion.add_TextLine(textline) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file(ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename="%s/%s.xml" % (self.output_file_grp, ID), content=to_xml(pcgts))
def process(self):
    network_file = self.parameter['network']
    stride = self.parameter['stride']
    classifier = TypegroupsClassifier.load(network_file)

    ignore_type = ('Adornment', 'Book covers and other irrelevant data',
                   'Empty Pages', 'Woodcuts - Engravings')

    self.log.debug('Processing: %s', self.input_files)
    for (_, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        image_url = pcgts.get_Page().imageFilename
        pil_image = self.workspace.resolve_image_as_pil(image_url)
        result = classifier.run(pil_image, stride)

        score_sum = 0
        for typegroup in classifier.classMap.cl2id:
            if typegroup not in ignore_type:
                score_sum += max(0, result[typegroup])

        script_highscore = 0
        noise_highscore = 0
        result_map = {}
        output = ''
        for typegroup in classifier.classMap.cl2id:
            score = result[typegroup]
            if typegroup in ignore_type:
                noise_highscore = max(noise_highscore, score)
            else:
                script_highscore = max(script_highscore, score)
                normalised_score = max(0, score / score_sum)
                result_map[normalised_score] = typegroup
        if noise_highscore > script_highscore:
            pcgts.get_Page().set_primaryScript(None)
            self.log.debug(
                'Detected only noise (such as empty page or book cover). noise_highscore=%s > script_highscore=%s',
                noise_highscore, script_highscore)
        else:
            for k in sorted(result_map, reverse=True):
                intk = round(100 * k)
                if intk <= 0:
                    continue
                if output != '':
                    output = '%s, ' % output
                output = '%s%s:%d' % (output, result_map[k], intk)
            self.log.debug('Detected %s', output)
            page = pcgts.get_Page()
            textStyle = page.get_TextStyle()
            if not textStyle:
                textStyle = TextStyleType()
                page.set_TextStyle(textStyle)
            textStyle.set_fontFamily(output)

        ID = concat_padded(self.output_file_grp, input_file.ID)
        self.workspace.add_file(ID=ID,
                                file_grp=self.output_file_grp,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s" %
                                (self.output_file_grp, ID),
                                content=to_xml(pcgts))
def process(self): """ Performs the cropping. """ with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: # print(self.input_file_grp) for (n, input_file) in enumerate(self.input_files): # print(input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) log.debug("Cropping with tesseract") tessapi.SetImage(image) # # helper variables for saving the box coordinates # min_x = image.width min_y = image.height max_x = 0 max_y = 0 # iterate over all boxes and compare their extent # to the min and max values for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): points, index = points_from_xywh(component[1]), component[2] # # the region reference in the reading order element # ID = "region%04d" % index log.debug("Detected region '%s': %s", ID, points) for pair in points.split(' '): x, y = (int(pair.split(',')[0]), int(pair.split(',')[1])) if x < min_x: min_x = x if y < min_y: min_y = y elif x > max_x: max_x = x elif y > max_y: max_y = y log.debug("Updated page border: %i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y)) # # set the identified page border # brd = BorderType(Coords=CoordsType("%i,%i %i,%i %i,%i %i,%i" % (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))) pcgts.get_Page().set_Border(brd) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename='%s/%s' % (self.output_file_grp, ID), content=to_xml(pcgts).encode('utf-8'), )
def test_serialize_no_empty_readingorder():
    """
    https://github.com/OCR-D/core/issues/602
    """
    pcgts = page_from_image(create_ocrd_file_with_defaults(url=assets.path_to('kant_aufklaerung_1784/data/OCR-D-IMG/INPUT_0017.tif')))
    pcgts.get_Page().set_ReadingOrder(ReadingOrderType())
    assert pcgts.get_Page().get_ReadingOrder()
    pcgts = parseString(to_xml(pcgts, skip_declaration=True))
    assert not pcgts.get_Page().get_ReadingOrder()
def test_to_xml_unicode_nsprefix(self):
    with open(assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml'), 'rb') as f:
        from_xml = f.read()
    self.assertIn('<Unicode>', from_xml.decode('utf-8'), 'without NS prefix')
    self.assertIn('<Created', from_xml.decode('utf-8'), 'without NS prefix')
    pcgts = parseString(from_xml, silence=True)
    as_xml = to_xml(pcgts)
    self.assertIn('<pc:Unicode>', as_xml, 'with NS prefix')
    self.assertIn('<pc:Created>', as_xml, 'with NS prefix')
def process(self): """ Performs the (text) recognition. """ # print(self.parameter) self.maxlevel = self.parameter['textequiv_level'] linesdir = self.parameter['linesdir'] if self.maxlevel not in ['line', 'word', 'glyph']: raise Exception( "currently only implemented at the line/glyph level") root, _, files = os.walk(linesdir).__next__() self.root = root predfiles = [] for file in files: if '.pred' in file: predfiles.append(file[:-9]) ######################################################################################## # self.log.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): # self.log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) self.log.info("Processing text in page '%s'", pcgts.get_pcGtsId()) page = pcgts.get_Page() index = input_file.url.rfind('/') + 1 fgrp = input_file.url[index:-4] # region, line, word, or glyph level: regions = page.get_TextRegion() if not regions: self.log.warning("Page contains no text regions") self.process_regions(regions, predfiles, fgrp) ID = concat_padded(self.output_file_grp, n) self.log.info('creating file id: %s, name: %s, file_grp: %s', ID, input_file.basename, self.output_file_grp) # Use the input file's basename for the new file # this way the files retain the same basenames. out = self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, pageId=input_file.pageId, basename=self.output_file_grp + '-' + input_file.basename, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), ) self.log.info('created file %s', out)
def redraw(self) -> None:
    if self.current:
        self.text_view.set_tooltip_text(self.page_id)
        if self.current.file:
            with self.document.path(self.current.file).open('r') as f:
                text = f.read()
        else:
            text = to_xml(self.current.pc_gts)
        self.buffer.set_text(text)
    else:
        self.buffer.set_text('')
def process(self): try: self.page_grp, self.image_grp = self.output_file_grp.split(',') except ValueError: self.page_grp = self.output_file_grp self.image_grp = FALLBACK_IMAGE_GRP LOG.info( "No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP) oplevel = self.parameter['operation_level'] for (n, input_file) in enumerate(self.input_files): file_id = input_file.ID.replace(self.input_file_grp, self.image_grp) page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) metadata = pcgts.get_Metadata() metadata.add_MetadataItem( MetadataItemType( type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, Labels=[ LabelsType( #externalRef="parameters", Label=[ LabelType(type_=name, value=self.parameter[name]) for name in self.parameter.keys() ]) ])) page = pcgts.get_Page() page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) if oplevel == "page": self._process_segment(page, page_image.filename, page_id, file_id + ".ds") file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) if file_id == input_file.ID: file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'))
def process(self): """Segment pages into regions using a Mask R-CNN model.""" assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) LOG = getLogger('processor.AnybaseocrBlockSegmenter') if not tf.test.is_gpu_available(): LOG.warning( "Tensorflow cannot detect CUDA installation. Running without GPU will be slow." ) for input_file in self.input_files: pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() page_id = input_file.pageId or input_file.ID # todo rs: why not cropped? page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter='binarized,deskewed,cropped,clipped,non_text') # try to load pixel masks try: # todo rs: this combination only works for tiseg with use_deeplr=true mask_image, _, _ = self.workspace.image_from_page( page, page_id, feature_selector='clipped', feature_filter='binarized,deskewed,cropped,non_text') except: mask_image = None if page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) else: dpi = None self._process_segment(page_image, page, page_xywh, page_id, input_file, mask_image, dpi) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) self.workspace.add_file(ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join( self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8'))
def test_simpletypes(self):
    pcgts = parseString(simple_page, silence=True)
    self.assertTrue(isinstance(pcgts.get_Page().imageWidth, int))
    el = pcgts.get_Page().get_TextRegion()[0].get_TextLine()[0].get_Word()[0].get_TextEquiv()[0]
    self.assertTrue(isinstance(el.conf, float))
    # XXX no validation on setting attributes :-(
    # c.f. https://www.davekuhlman.org/generateDS.html#simpletype
    # el.set_conf('2.0987')
    # self.assertTrue(isinstance(el.conf, float))
    with self.assertRaisesRegex(TypeError, ''):
        el.set_conf('I AM NOT A FLOAT DEAL WITH IT')
        parseString(to_xml(pcgts).encode('utf8'))
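# NOTE: `simple_page` is a fixture referenced by test_simpletypes above but not
# shown in this excerpt. A hypothetical minimal stand-in (assumed, not the
# original fixture): one region/line/word chain with an integer imageWidth and
# a float conf attribute, which is all the test inspects.
simple_page = b'''<?xml version="1.0" encoding="UTF-8"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
  <Metadata>
    <Creator>test</Creator>
    <Created>2020-01-01T00:00:00</Created>
    <LastChange>2020-01-01T00:00:00</LastChange>
  </Metadata>
  <Page imageFilename="test.png" imageWidth="100" imageHeight="100">
    <TextRegion id="r1">
      <Coords points="0,0 100,0 100,100 0,100"/>
      <TextLine id="l1">
        <Coords points="0,0 100,0 100,20 0,20"/>
        <Word id="w1">
          <Coords points="0,0 50,0 50,20 0,20"/>
          <TextEquiv conf="0.9"><Unicode>Hello</Unicode></TextEquiv>
        </Word>
      </TextLine>
    </TextRegion>
  </Page>
</PcGts>'''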
def process(self): """ Performs the (text) recognition. """ mainIndex = self.parameter['mainIndex'] for (n, input_file) in enumerate(self.input_files): alignurl = input_file.url pcgts = parse(alignurl, True) page = pcgts.get_Page() regions = page.get_TextRegion() pagecontent = '' for region in regions: regioncontent = '' lines = region.get_TextLine() for line in lines: linecontent = '' words = line.get_Word() for word in words: wordunicode = word.get_TextEquiv()[mainIndex].Unicode word.add_TextEquiv(TextEquivType(Unicode=wordunicode)) linecontent += ' ' + wordunicode line.add_TextEquiv(TextEquivType(Unicode=regioncontent)) regioncontent += '\n' + linecontent region.add_TextEquiv(TextEquivType(Unicode=regioncontent)) pagecontent += '\n' + regioncontent page.add_TextEquiv(TextEquivType(Unicode=pagecontent)) ID = concat_padded(self.output_file_grp, n) self.log.info('creating file id: %s, name: %s, file_grp: %s', ID, input_file.basename, self.output_file_grp) # Use the input file's basename for the new file # this way the files retain the same basenames. out = self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, pageId=input_file.pageId, basename=self.output_file_grp + '-' + input_file.basename, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), ) self.log.info('created file %s', out)
def test_to_xml_unicode_nsprefix():
    """see https://github.com/OCR-D/core/pull/474#issuecomment-621477590"""
    # arrange
    with open(assets.path_to('kant_aufklaerung_1784-binarized/data/OCR-D-GT-WORD/INPUT_0020.xml'), 'rb') as f:
        from_xml = f.read()

    # assert
    assert '<Unicode>' in from_xml.decode('utf-8'), 'without NS prefix'
    assert '<Created' in from_xml.decode('utf-8'), 'without NS prefix'
    pcgts = parseString(from_xml, silence=True)
    as_xml = to_xml(pcgts)
    assert '<pc:Unicode>' in as_xml, 'with NS prefix'
    assert '<pc:Created>' in as_xml, 'with NS prefix'
def process(self): """ Performs the recognition. """ self._init_calamari() for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID log.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) page = pcgts.get_Page() page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) for region in pcgts.get_Page().get_TextRegion(): region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) textlines = region.get_TextLine() log.info("About to recognize %i lines of region '%s'", len(textlines), region.id) for (line_no, line) in enumerate(textlines): log.debug("Recognizing line '%s' in region '%s'", line_no, region.id) line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) line_image_np = np.array(line_image, dtype=np.uint8) raw_results = list(self.predictor.predict_raw([line_image_np], progress_bar=False))[0] for i, p in enumerate(raw_results): p.prediction.id = "fold_{}".format(i) prediction = self.voter.vote_prediction_result(raw_results) prediction.id = "voted" line_text = prediction.sentence line_conf = prediction.avg_char_probability line.set_TextEquiv([TextEquivType(Unicode=line_text, conf=line_conf)]) _page_update_higher_textequiv_levels('line', pcgts) file_id = self._make_file_id(input_file, n) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts))
def process(self):
    LOG = getLogger('OcrdAnybaseocrTextline')
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    oplevel = self.parameter['operation_level']

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        LOG.info("INPUT FILE %s", input_file.pageId or input_file.ID)

        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized,deskewed')

        if oplevel == 'page':
            LOG.warning("Operation level should be region.")
            self._process_segment(page_image, page, None, page_xywh,
                                  page_id, input_file, n)
        else:
            regions = page.get_TextRegion()
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
                continue
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                self._process_segment(region_image, page, region,
                                      region_xywh, region.id, input_file, k)

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def negative2zero(inputfile, outputfile):
    print("Setting negative coords to zero...")
    pcgts = parse(inputfile, silence=True)
    page = pcgts.get_Page()
    for attr in dir(page):
        if "get_" in attr and "Region" in attr:
            for regiondata in getattr(page, attr)():
                if attr == "get_TextRegion":
                    for textline in regiondata.get_TextLine():
                        textcoords = textline.get_Coords()
                        textcoords.set_points(update_points(textcoords.get_points()))
                regcoords = regiondata.get_Coords()
                regcoords.set_points(update_points(regcoords.get_points()))
    content = to_xml(pcgts)
    with open(outputfile, "w") as fout:
        fout.write(content)
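# NOTE: `update_points` is used by negative2zero above but not defined in this
# excerpt. A minimal sketch under the assumption that it takes a PAGE points
# string of the form "x1,y1 x2,y2 ..." and clamps every negative coordinate to
# zero:
def update_points(points):
    pairs = (pair.split(',') for pair in points.split(' '))
    return ' '.join('%d,%d' % (max(0, int(x)), max(0, int(y))) for x, y in pairs)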
def process(self): """ Performs the region segmentation. """ with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: # print(self.input_file_grp) for (n, input_file) in enumerate(self.input_files): # print(input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) log.debug("Detecting regions with tesseract") tessapi.SetImage(image) for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True): points, index = points_from_xywh(component[1]), component[2] # # the region reference in the reading order element # ID = "region%04d" % index log.debug("Detected region '%s': %s", ID, points) # <pg:ReadingOrder> ro = pcgts.get_Page().get_ReadingOrder() if ro is None: ro = ReadingOrderType() pcgts.get_Page().set_ReadingOrder(ro) # <pg:OrderedGroup> og = ro.get_OrderedGroup() if og is None: og = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(og) # <pg:RegionRefIndexed> og.add_RegionRefIndexed(RegionRefIndexedType(regionRef=ID, index=index)) # # text region # pcgts.get_Page().add_TextRegion(TextRegionType(id=ID, Coords=CoordsType(points=points))) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=ID, file_grp=self.output_file_grp, mimetype=MIMETYPE_PAGE, local_filename='%s/%s' % (self.output_file_grp, ID), content=to_xml(pcgts).encode('utf-8'), )
def process(self): """Performs border detection on the workspace. """ assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) LOG = getLogger('OcrdAnybaseocrCropper') oplevel = self.parameter['operation_level'] for (n, input_file) in enumerate(self.input_files): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) self.add_metadata(pcgts) page = pcgts.get_Page() # Check for existing Border --> already cropped border = page.get_Border() if border: left, top, right, bottom = bbox_from_points( border.get_Coords().points) LOG.warning('Overwriting existing Border: %i:%i,%i:%i', left, top, right, bottom) page = pcgts.get_Page() page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_filter='cropped', feature_selector='binarized') # should also be deskewed if oplevel == "page": self._process_segment( page_image, page, page_coords, page_id, input_file, n) else: raise Exception( 'Operation level %s, but should be "page".', oplevel) file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), content=to_xml(pcgts).encode('utf-8') )
def process(self): for (n, input_file) in enumerate(self.input_files): LOG.info("INPUT FILE %i / %s", n, input_file) local_input_file = self.workspace.download_file(input_file) pcgts = parse(local_input_file.url, silence=True) LOG.info("Scoring text in page '%s' at the %s level", pcgts.get_pcGtsId(), self.parameter['textequiv_level']) self._process_page(pcgts) # write back result file_id = concat_padded(self.output_file_grp, n) self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, local_filename=os.path.join(self.output_file_grp, file_id), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), )
def process(self):
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    oplevel = self.parameter['operation_level']
    LOG = getLogger('OcrdAnybaseocrBinarizer')

    for (n, input_file) in enumerate(self.input_files):
        file_id = make_file_id(input_file, self.output_file_grp)
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        pcgts.set_pcGtsId(file_id)
        self.add_metadata(pcgts)
        page = pcgts.get_Page()

        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_filter="binarized")
        LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)

        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh,
                                  page_id, input_file, n)
        else:
            regions = page.get_TextRegion() + page.get_TableRegion()
            if not regions:
                LOG.warning("Page '%s' contains no text regions", page_id)
            for (k, region) in enumerate(regions):
                region_image, region_xywh = self.workspace.image_from_segment(
                    region, page_image, page_xywh)
                # TODO: not tested on regions
                self._process_segment(region_image, page, region_xywh,
                                      region.id, input_file,
                                      str(n) + "_" + str(k))

        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self):
    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()

        regions = page.get_TextRegion()
        for region in regions:
            if region.readingDirection != 'left-to-right':
                LOG.info('Not processing region "%s" (not left-to-right)',
                         region.id)
                continue
            if len(region.get_TextLine()) > 1 and \
                    region.textLineOrder != 'top-to-bottom':
                LOG.info('Not processing region "%s" (not top-to-bottom)',
                         region.id)
                continue
            _fix_lines(region)
            lines = region.get_TextLine()
            for line in lines:
                _fix_words(line)
                words = line.get_Word()
                for word in words:
                    _fix_glyphs(word)

        file_id = input_file.ID.replace(self.input_file_grp,
                                        self.output_file_grp)
        if file_id == input_file.ID:
            file_id = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))
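# NOTE: `_fix_lines`, `_fix_words` and `_fix_glyphs` are referenced above but
# not shown in this excerpt. A hypothetical sketch of `_fix_lines` illustrating
# one plausible repair, reordering a region's TextLines top-to-bottom by the
# y-coordinate of their first coordinate point (the actual helpers may repair
# differently):
def _fix_lines(region):
    def top(line):
        # y-coordinate of the first "x,y" pair in the line's Coords
        first_point = line.get_Coords().points.split(' ')[0]
        return int(first_point.split(',')[1])
    region.set_TextLine(sorted(region.get_TextLine(), key=top))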
def test_empty_groups_to_regionrefindexed(self):
    """
    Corollary
    See https://github.com/OCR-D/core/issues/475
    """
    with open(assets.path_to('gutachten/data/TEMP1/PAGE_TEMP1.xml'), 'r') as f:
        pcgts = parseString(f.read().encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    self.assertTrue(isinstance(children[1], OrderedGroupIndexedType))
    self.assertTrue(isinstance(children[21], UnorderedGroupIndexedType))
    # empty all the elements in the first OrderedGroupIndexed
    children[1].set_RegionRefIndexed([])
    # serialize and parse to see the empty group converted
    pcgts = parseString(to_xml(pcgts).encode('utf8'), silence=True)
    og = pcgts.get_Page().get_ReadingOrder().get_OrderedGroup()
    children = og.get_AllIndexed()
    self.assertTrue(isinstance(children[1], RegionRefIndexedType))
    self.assertTrue(isinstance(children[21], RegionRefIndexedType))
def process(self):
    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)
    oplevel = self.parameter['operation_level']
    LOG = getLogger('OcrdAnybaseocrDeskewer')

    for (n, input_file) in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()
        angle = page.get_orientation()
        if angle:
            LOG.warning('Overwriting existing deskewing angle: %i', angle)

        page_image, page_xywh, page_image_info = self.workspace.image_from_page(
            page, page_id,
            feature_filter='deskewed',
            feature_selector='binarized')

        if oplevel == "page":
            self._process_segment(page_image, page, page_xywh, page_id,
                                  input_file, n)
        else:
            LOG.warning('Operation level %s, but should be "page".', oplevel)
            break

        file_id = make_file_id(input_file, self.output_file_grp)
        pcgts.set_pcGtsId(file_id)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts).encode('utf-8'))
def process(self): """ Segment with kraken """ log = getLogger('processor.KrakenSegment') for (n, input_file) in enumerate(self.input_files): log.info("INPUT FILE %i / %s", n, input_file) downloaded_file = self.workspace.download_file(input_file) log.info("downloaded_file %s", downloaded_file) pcgts = page_from_file(downloaded_file) # TODO binarized variant from get_AlternativeImage() image_url = pcgts.get_Page().imageFilename log.info("pcgts %s", pcgts) im = self.workspace.resolve_image_as_pil(image_url) log.info('Segmenting') log.info('Params %s', self.parameter) res = segment(im, self.parameter['text_direction'], self.parameter['scale'], self.parameter['maxcolseps'], self.parameter['black_colseps']) if self.parameter['script_detect']: res = detect_scripts(im, res) dummyRegion = TextRegionType() pcgts.get_Page().add_TextRegion(dummyRegion) # print(res) for lineno, box in enumerate(res['boxes']): textline = TextLineType( id=concat_padded("line", lineno), Coords=CoordsType(points=points_from_x0y0x1y1(box))) dummyRegion.add_TextLine(textline) ID = concat_padded(self.output_file_grp, n) self.workspace.add_file(self.output_file_grp, pageId=input_file.pageId, ID=ID, mimetype=MIMETYPE_PAGE, local_filename="%s/%s.xml" % (self.output_file_grp, ID), content=to_xml(pcgts).encode('utf-8'))
def process(self): ifgs = self.input_file_grp.split(",") # input file groups if len(ifgs) < 2: raise Exception("need at least two input file groups to align") ifts = self.zip_input_files(ifgs) # input file tuples for _id, ift in enumerate(ifts): alignments = json.loads(self.run_java_aligner(ift)) pcgts = self.align(alignments, ift) # keep the right part after OCR-D-...-filename # and prepend output_file_grp input_file = ift[0].input_file file_id = make_file_id(input_file, self.output_file_grp) pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, file_grp=self.output_file_grp, pageId=input_file.pageId, local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), mimetype=MIMETYPE_PAGE, content=to_xml(pcgts), ) self.log.info('created file %s', out)