def process(self):
    """
    Segment every input page into text lines with ocropy.

    For each input file the PAGE document is loaded, the page image is
    binarized and inverted, a line segmentation is computed, and every
    detected line is stored as a ``TextLine`` inside a single "dummy"
    ``TextRegion`` whose outline is the full page. The resulting PAGE
    document is added to the output file group.
    """
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        page = pcgts.get_Page()
        page_width = page.get_imageWidth()
        page_height = page.get_imageHeight()
        # TODO binarized variant from get_AlternativeImage()
        image_url = page.imageFilename
        log.info("pcgts %s", pcgts)

        # read the binarized image and invert it
        binary = 1 - ocrolib.read_image_binary(
            self.workspace.download_url(image_url))

        # a configured scale of 0 means "estimate it from the image"
        configured_scale = self.parameter['scale']
        if configured_scale != 0:
            scale = configured_scale
        else:
            scale = psegutils.estimate_scale(binary)
        log.debug(binary)

        pseg = self.compute_segmentation(binary, scale)
        log.debug("pseg=%s", pseg)

        # TODO reading order / enumber
        # (psegutils.compute_lines / reading_order / topsort are not used yet)

        extractor = ocrolib.RegionExtractor()
        extractor.setPageLines(pseg)

        # one region spanning the whole page acts as parent of all lines
        full_page_outline = "0,0 %s,0 %s,%s 0,%s" % (
            page_width, page_width, page_height, page_height)
        page_region = TextRegionType(
            id="dummy", Coords=CoordsType(points=full_page_outline))
        page.add_TextRegion(page_region)

        # index 0 is skipped; lines are taken from index 1 onwards
        for lineno in range(1, extractor.length()):
            bbox = extractor.bbox(lineno)
            log.debug("id=%s bbox=%s", extractor.id(lineno), bbox)
            line_coords = CoordsType(points=points_from_y0x0y1x1(bbox))
            page_region.add_TextLine(
                TextLineType(id=concat_padded("line", lineno),
                             Coords=line_coords))

        ID = concat_padded(self.output_file_grp, n)
        output_path = "%s/%s.xml" % (self.output_file_grp, ID)
        self.workspace.add_file(ID=ID,
                                file_grp=self.output_file_grp,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=output_path,
                                content=to_xml(pcgts))
def _process_region(self, it, region, rogroup, region_image, region_coords):
    """
    Add one text sub-region (cell) to ``region`` per text block of ``it``.

    Walks the layout iterator ``it`` block by block. Blocks whose type is
    one of the text block types become ``TextRegionType`` children of
    ``region`` (with a matching sub-type or orientation); all other block
    types are skipped. If ``rogroup`` is given, each new cell is also
    referenced in it, numbered after the highest index already present.
    """
    LOG = getLogger('processor.TesserocrSegmentTable')
    # equivalent to GetComponentImages with raw_image=True,
    # (which would also give raw coordinates),
    # except we are also interested in the iterator's BlockType() here,

    # continue numbering after the highest index already in the group
    next_index = 0
    if rogroup:
        members = (rogroup.get_RegionRefIndexed() +
                   rogroup.get_OrderedGroupIndexed() +
                   rogroup.get_UnorderedGroupIndexed())
        for member in members:
            next_index = max(next_index, member.index + 1)

    # block types that override the default (paragraph) text type
    special_types = {
        PT.HEADING_TEXT: TextTypeSimpleType.HEADING,
        PT.PULLOUT_TEXT: TextTypeSimpleType.FLOATING,
        PT.CAPTION_TEXT: TextTypeSimpleType.CAPTION,
    }
    accepted_types = (PT.FLOWING_TEXT, PT.HEADING_TEXT, PT.PULLOUT_TEXT,
                      PT.CAPTION_TEXT, PT.VERTICAL_TEXT)

    while it and not it.Empty(RIL.BLOCK):
        outline = polygon_from_x0y0x1y1(it.BoundingBox(RIL.BLOCK))
        outline = coordinates_for_segment(outline, region_image, region_coords)
        points = points_from_polygon(outline)
        ID = region.id + "_%04d" % next_index
        cell = TextRegionType(id=ID,
                              Coords=CoordsType(points=points),
                              type=TextTypeSimpleType.PARAGRAPH)

        block_type = it.BlockType()
        if block_type not in accepted_types:
            # ignore non-text blocks entirely
            it.Next(RIL.BLOCK)
            continue
        if block_type in special_types:
            cell.set_type(special_types[block_type])
        elif block_type == PT.VERTICAL_TEXT:
            cell.set_orientation(90.0)

        LOG.info("Detected cell '%s': %s (%s)",
                 ID, points, membername(PT, block_type))
        region.add_TextRegion(cell)
        # add the region reference in the reading order element
        if rogroup:
            rogroup.add_RegionRefIndexed(
                RegionRefIndexedType(regionRef=ID, index=next_index))

        # iterator increment
        next_index += 1
        it.Next(RIL.BLOCK)
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n,
                     mrcnn_model, class_names):
    """
    Detect blocks on a page image and add them to ``page`` as text regions.

    Runs ``mrcnn_model.detect`` on the page image. For every detected box,
    the corresponding image crop is saved into ``self.image_grp`` and a
    ``TextRegionType`` with the box outline, the class name as type and the
    crop as ``AlternativeImage`` is added to ``page``.

    Args:
        page_image: PIL image of the page.
        page: PAGE page object receiving the regions.
        page_xywh: dict whose ``'features'`` entry is extended with
            ``',blksegmented'`` and used as image comment.
        page_id: page ID passed on when saving the crops.
        input_file: input file object; its ``ID`` is used to derive the
            output file ID.
        n: running number of the input file (fallback for the file ID).
        mrcnn_model: detector providing ``detect``.
        class_names: sequence mapping class IDs to region type names.
    """
    img_array = ocrolib.pil2array(page_image)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    page_xywh['features'] += ',blksegmented'

    # numpy image arrays are indexed (row, column) = (y, x)
    height, width = img_array.shape[:2]

    # the output file ID does not depend on the individual box
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)

    for i, roi in enumerate(r['rois']):
        # boxes are (y1, x1, y2, x2), as in the sibling block segmenter
        min_y, min_x, max_y, max_x = roi
        # small post-processing for paragraphs, so the outermost characters
        # are not cut off: widen the box horizontally, within the image
        # NOTE(review): class id 2 is presumably 'paragraph' -- confirm
        # against class_names
        if r['class_ids'][i] == 2:
            min_x = max(min_x - 5, 0)
            max_x = min(max_x + 10, width)

        # extract the crop (rows are y, columns are x)
        region_img = ocrolib.array2pil(img_array[min_y:max_y, min_x:max_x])
        file_path = self.workspace.save_image_file(region_img,
                                                   file_id + "_" + str(i),
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=page_xywh['features'])
        # PAGE points are "x,y" pairs, clockwise from the top left corner
        coords = CoordsType(
            "%i,%i %i,%i %i,%i %i,%i" %
            (min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y))
        textregion = TextRegionType(Coords=coords,
                                    type_=class_names[r['class_ids'][i]])
        textregion.add_AlternativeImage(ai)
        page.add_TextRegion(textregion)
def render_text_region(self, text_region: TextRegionType) -> None:
    """
    Render a text region's lines, words and glyphs depth-first.

    Each text line is passed to ``self.render_type``, followed by each of
    its words, each word being followed by its glyphs.
    """
    for text_line in text_region.get_TextLine():
        self.render_type(text_line)
        for text_word in text_line.get_Word():
            self.render_type(text_word)
            for text_glyph in text_word.get_Glyph():
                self.render_type(text_glyph)
def process(self):
    """
    Segment every input page into text lines with kraken.

    For each input file the PAGE document is loaded, the page image is
    segmented with kraken's ``segment`` (optionally followed by
    ``detect_scripts``), and every resulting box is stored as a
    ``TextLine`` inside a single "dummy" ``TextRegion`` spanning the whole
    image. The resulting PAGE document is added to the output file group.
    """
    log = getLogger('processor.KrakenSegment')
    for (n, input_file) in enumerate(self.input_files):
        log.info("INPUT FILE %i / %s", n, input_file)
        downloaded_file = self.workspace.download_file(input_file)
        log.info("downloaded_file %s", downloaded_file)
        pcgts = page_from_file(downloaded_file)
        # TODO binarized variant from get_AlternativeImage()
        image_url = pcgts.get_Page().imageFilename
        log.info("pcgts %s", pcgts)

        im = self.workspace.resolve_image_as_pil(image_url)

        log.info('Segmenting')
        log.info('Params %s', self.parameter)
        res = segment(im,
                      self.parameter['text_direction'],
                      self.parameter['scale'],
                      self.parameter['maxcolseps'],
                      self.parameter['black_colseps'])
        if self.parameter['script_detect']:
            res = detect_scripts(im, res)

        # PAGE requires every region to carry an id and Coords, so the
        # parent of all lines gets the outline of the full image
        # (same convention as the ocropy segmenter)
        image_width, image_height = im.size
        dummy_region = TextRegionType(
            id="dummy",
            Coords=CoordsType(
                points="0,0 %s,0 %s,%s 0,%s" % (image_width, image_width,
                                                image_height, image_height)))
        pcgts.get_Page().add_TextRegion(dummy_region)

        for lineno, box in enumerate(res['boxes']):
            textline = TextLineType(
                id=concat_padded("line", lineno),
                Coords=CoordsType(points=points_from_x0y0x1y1(box)))
            dummy_region.add_TextLine(textline)

        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(self.output_file_grp,
                                pageId=input_file.pageId,
                                ID=ID,
                                mimetype=MIMETYPE_PAGE,
                                local_filename="%s/%s.xml" %
                                (self.output_file_grp, ID),
                                content=to_xml(pcgts).encode('utf-8'))
def test_alternative_image_additions():
    """AlternativeImage can be added on every level of the PAGE hierarchy."""
    pcgts = PcGtsType(pcGtsId="foo")
    assert pcgts.pcGtsId == 'foo'

    # act: build the hierarchy Page > TextRegion > TextLine > Word > Glyph
    page = PageType()
    region = TextRegionType()
    line = TextLineType()
    word = WordType()
    glyph = GlyphType()
    pcgts.set_Page(page)
    page.add_TextRegion(region)
    region.add_TextLine(line)
    line.add_Word(word)
    word.add_Glyph(glyph)

    # ... and attach an AlternativeImage to each level
    for segment in (page, region, line, word, glyph):
        segment.add_AlternativeImage(AlternativeImageType())
def process(self):
    """
    Performs the region segmentation.

    For each input file, the page image is handed to tesseract and every
    detected block becomes a ``TextRegion`` named after its block index.
    Each region is also referenced from the page's ordered reading-order
    group, which is created on demand. The resulting PAGE document is
    added to the output file group.
    """
    with tesserocr.PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
        for (n, input_file) in enumerate(self.input_files):
            pcgts = page_from_file(self.workspace.download_file(input_file))
            page = pcgts.get_Page()
            image = self.workspace.resolve_image_as_pil(page.imageFilename)
            log.debug("Detecting regions with tesseract")
            tessapi.SetImage(image)
            for component in tessapi.GetComponentImages(tesserocr.RIL.BLOCK, True):
                points = points_from_xywh(component[1])
                index = component[2]

                # the region reference in the reading order element
                region_id = "region%04d" % index
                log.debug("Detected region '%s': %s", region_id, points)

                # <pg:ReadingOrder>, created when the first block is seen
                reading_order = page.get_ReadingOrder()
                if reading_order is None:
                    reading_order = ReadingOrderType()
                    page.set_ReadingOrder(reading_order)

                # <pg:OrderedGroup>
                ordered_group = reading_order.get_OrderedGroup()
                if ordered_group is None:
                    ordered_group = OrderedGroupType(id="reading-order")
                    reading_order.set_OrderedGroup(ordered_group)

                # <pg:RegionRefIndexed>
                ordered_group.add_RegionRefIndexed(
                    RegionRefIndexedType(regionRef=region_id, index=index))

                # text region
                page.add_TextRegion(
                    TextRegionType(id=region_id,
                                   Coords=CoordsType(points=points)))

            file_id = concat_padded(self.output_file_grp, n)
            self.workspace.add_file(
                ID=file_id,
                file_grp=self.output_file_grp,
                mimetype=MIMETYPE_PAGE,
                local_filename='%s/%s' % (self.output_file_grp, file_id),
                content=to_xml(pcgts).encode('utf-8'),
            )
def add_region(region: RectSegment, index: int, region_type: str):
    """
    Add ``region`` to the enclosing ``page`` as a PAGE region.

    The segment's polygon is converted to absolute coordinates relative to
    the enclosing ``page_image`` / ``page_coords``. ``region_type`` selects
    the PAGE region class: ``"text"`` and ``"image"`` map to text and image
    regions, anything else becomes a noise region.
    """
    from ocrd_utils import coordinates_for_segment, points_from_polygon

    outline = coordinates_for_segment(polygon_from_segment(region),
                                      page_image, page_coords)
    coords = CoordsType(points=points_from_polygon(outline))
    indexed_id = "region%04d" % index

    if region_type == "text":
        page.add_TextRegion(TextRegionType(id=indexed_id, Coords=coords))
        return
    if region_type == "image":
        page.add_ImageRegion(ImageRegionType(id=indexed_id, Coords=coords))
        return
    page.add_NoiseRegion(NoiseRegionType(id=indexed_id, Coords=coords))
def add_region(region: Segment, index: int, type: str):
    """
    Add the rectangular ``region`` to the enclosing ``page`` as a PAGE region.

    The rectangle spanned by ``region.x_start``/``y_start`` and
    ``region.x_end``/``y_end`` is written as a PAGE points string
    (``"x,y x,y ..."``). ``type`` selects the PAGE region class: ``"text"``
    and ``"image"`` map to text and image regions, anything else becomes a
    noise region.
    """
    indexed_id = "region%04d" % index
    # walk around the rectangle so that the outline does not cross itself
    corners = [
        (region.x_start, region.y_start),
        (region.x_end, region.y_start),
        (region.x_end, region.y_end),
        (region.x_start, region.y_end),
    ]
    # PAGE expects space-separated integer "x,y" pairs, not a list repr
    points = " ".join("%i,%i" % corner for corner in corners)
    coords = CoordsType(points=points)
    if type == "text":
        page.add_TextRegion(
            TextRegionType(id=indexed_id, Coords=coords))
    elif type == "image":
        page.add_ImageRegion(
            ImageRegionType(id=indexed_id, Coords=coords))
    else:
        page.add_NoiseRegion(
            NoiseRegionType(id=indexed_id, Coords=coords))
def _process_page(page, page_image, page_xywh, pageId, file_id):
    """
    Segment ``page_image`` and add the detected regions to ``page``.

    The image is passed to ``Segmentator.segmentate_image``, which yields
    region polygons and one class prediction per region. Prediction ``1``
    is stored as a text region, ``2`` as an image region and everything
    else as a noise region. ``page_xywh``, ``pageId`` and ``file_id`` are
    currently unused.
    """
    settings = SegmentationSettings(debug=False, enable_preprocessing=False)
    # TODO: does this still need to be cropped or do we not need page_xywh?
    # Same for points below
    regions, classification = Segmentator(settings).segmentate_image(
        np.asarray(page_image))

    for count, (region, prediction) in enumerate(zip(regions, classification)):
        ID = "region%04d" % count
        # the exterior ring repeats its first vertex at the end; PAGE wants
        # each vertex once, as space-separated integer "x,y" pairs
        vertices = list(region.exterior.coords)[:-1]
        points = " ".join("%i,%i" % (xy[0], xy[1]) for xy in vertices)
        coords = CoordsType(points=points)
        # FIXME: these are not all types in the model, also check if they match
        if prediction == 1:
            page.add_TextRegion(TextRegionType(id=ID, Coords=coords))
        elif prediction == 2:
            page.add_ImageRegion(ImageRegionType(id=ID, Coords=coords))
        else:
            page.add_NoiseRegion(NoiseRegionType(id=ID, Coords=coords))
def test_alternativeImage(self):
    """AlternativeImage can be added on every level of the PAGE hierarchy."""
    pcgts = PcGtsType(pcGtsId="foo")
    self.assertEqual(pcgts.pcGtsId, 'foo')

    # Page/AlternativeImage
    parent = PageType()
    pcgts.set_Page(parent)
    parent.add_AlternativeImage(AlternativeImageType())

    # TextRegion, TextLine, Word and Glyph: create each level, attach it
    # to the level above and give it an AlternativeImage
    levels = (
        (TextRegionType, 'add_TextRegion'),
        (TextLineType, 'add_TextLine'),
        (WordType, 'add_Word'),
        (GlyphType, 'add_Glyph'),
    )
    for segment_type, adder_name in levels:
        child = segment_type()
        getattr(parent, adder_name)(child)
        child.add_AlternativeImage(AlternativeImageType())
        parent = child
def build_pagexml_full_layout(
        self, found_polygons_text_region, found_polygons_text_region_h,
        page_coord, order_of_texts, id_of_texts, all_found_texline_polygons,
        all_found_texline_polygons_h, all_box_coord, all_box_coord_h,
        found_polygons_text_region_img, found_polygons_tables,
        found_polygons_drop_capitals, found_polygons_marginals,
        all_found_texline_polygons_marginals, all_box_coord_marginals, slopes,
        slopes_h, slopes_marginals, cont_page,
        polygons_lines_to_be_written_in_xml):
    """
    Build the PAGE document for the full-layout mode.

    Uses ``self.pcgts`` if set, otherwise creates a fresh PAGE document.
    Sets the page border from ``cont_page``, writes the reading order, and
    adds, in this order: paragraph regions, header regions, marginalia
    (each with their text lines), drop capitals, image regions, separator
    regions and table regions. Region IDs are drawn consecutively from one
    counter, so the order of the loops determines the IDs.

    Returns:
        the PAGE document (``pcgts``).
    """
    self.logger.debug('enter build_pagexml_full_layout')

    # create the file structure
    pcgts = self.pcgts if self.pcgts else create_page_xml(
        self.image_filename, self.height_org, self.width_org)
    page = pcgts.get_Page()
    page.set_Border(
        BorderType(Coords=CoordsType(
            points=self.calculate_page_coords(cont_page))))

    counter = EynollahIdCounter()
    # marginalia IDs are pre-computed with a counter starting after the
    # ordered text regions, for use in the reading order
    _counter_marginals = EynollahIdCounter(region_idx=len(order_of_texts))
    id_of_marginalia = [
        _counter_marginals.next_region_id for _ in found_polygons_marginals
    ]
    xml_reading_order(page, order_of_texts, id_of_marginalia)

    for mm, polygon in enumerate(found_polygons_text_region):
        textregion = TextRegionType(
            id=counter.next_region_id,
            type_='paragraph',
            Coords=CoordsType(points=self.calculate_polygon_coords(
                polygon, page_coord)))
        page.add_TextRegion(textregion)
        self.serialize_lines_in_region(textregion,
                                       all_found_texline_polygons, mm,
                                       page_coord, all_box_coord, slopes,
                                       counter)

    self.logger.debug('len(found_polygons_text_region_h) %s',
                      len(found_polygons_text_region_h))
    for mm, polygon in enumerate(found_polygons_text_region_h):
        textregion = TextRegionType(
            id=counter.next_region_id,
            type_='header',
            Coords=CoordsType(points=self.calculate_polygon_coords(
                polygon, page_coord)))
        page.add_TextRegion(textregion)
        self.serialize_lines_in_region(textregion,
                                       all_found_texline_polygons_h, mm,
                                       page_coord, all_box_coord_h, slopes_h,
                                       counter)

    for mm, polygon in enumerate(found_polygons_marginals):
        marginal = TextRegionType(
            id=counter.next_region_id,
            type_='marginalia',
            Coords=CoordsType(points=self.calculate_polygon_coords(
                polygon, page_coord)))
        page.add_TextRegion(marginal)
        self.serialize_lines_in_marginal(
            marginal, all_found_texline_polygons_marginals, mm, page_coord,
            all_box_coord_marginals, slopes_marginals, counter)

    for polygon in found_polygons_drop_capitals:
        page.add_TextRegion(
            TextRegionType(
                id=counter.next_region_id,
                type_='drop-capital',
                Coords=CoordsType(points=self.calculate_polygon_coords(
                    polygon, page_coord))))

    for polygon in found_polygons_text_region_img:
        page.add_ImageRegion(
            ImageRegionType(
                id=counter.next_region_id,
                Coords=CoordsType(points=self.calculate_polygon_coords(
                    polygon, page_coord))))

    for polygon in polygons_lines_to_be_written_in_xml:
        # separators must be SeparatorRegionType (not ImageRegionType) to
        # be serialized as SeparatorRegion; no page offset is applied here
        page.add_SeparatorRegion(
            SeparatorRegionType(
                id=counter.next_region_id,
                Coords=CoordsType(points=self.calculate_polygon_coords(
                    polygon, [0, 0, 0, 0]))))

    for polygon in found_polygons_tables:
        page.add_TableRegion(
            TableRegionType(
                id=counter.next_region_id,
                Coords=CoordsType(points=self.calculate_polygon_coords(
                    polygon, page_coord))))

    return pcgts
def _process_segment(self, page_image, page, page_xywh, page_id, input_file,
                     mask, dpi):
    """
    Detect layout regions on a page image and add them to ``page``.

    Steps:

    1. Optionally remove existing text regions (parameter ``overwrite``).
    2. Run ``self.mrcnn_model.detect`` restricted to ``active_classes``.
    3. If a segmentation ``mask`` is given, grow the detected boxes so they
       also cover left-over mask pixels within distance ``th``.
    4. With ``use_masks``, turn each instance mask into a hull polygon.
    5. With ``post_process``, drop or merge overlapping detections.
    6. Derive a reading order from the box coordinates.
    7. Add each remaining detection as image, table, graphic or text region
       (clipped to the page border, if any) and reference it in a new
       ordered reading-order group.

    Args:
        page_image: PIL image of the page.
        page: PAGE page object to annotate.
        page_xywh: coordinate information passed to
            ``coordinates_for_segment``.
        page_id: page ID (used in log messages and as reading-order
            group ID).
        input_file: input file object (unused here).
        mask: optional PIL segmentation mask image, or a false value.
        dpi: image resolution, used to estimate the glyph scale.

    Raises:
        Exception: if the model returns a class ID outside ``CLASS_NAMES``.
    """
    LOG = getLogger('processor.AnybaseocrBlockSegmenter')
    # check for existing text regions and whether to overwrite them
    if page.get_TextRegion() or page.get_TableRegion():
        if self.parameter['overwrite']:
            LOG.info('removing existing text/table regions in page "%s"',
                     page_id)
            # NOTE(review): only text regions are removed here, although
            # the message also mentions table regions -- confirm intent
            page.set_TextRegion([])
        else:
            LOG.warning('keeping existing text/table regions in page "%s"',
                        page_id)
    # check if border exists
    border_polygon = None
    if page.get_Border():
        border_coords = page.get_Border().get_Coords()
        border_points = polygon_from_points(border_coords.get_points())
        border_polygon = Polygon(border_points)

    LOG.info('detecting regions on page "%s"', page_id)
    img_array = ocrolib.pil2array(page_image)
    if len(img_array.shape) <= 2:
        # replicate the single channel three times
        img_array = np.stack((img_array, ) * 3, axis=-1)
    # numpy image arrays are indexed (row, column) = (y, x)
    height, width = img_array.shape[:2]
    # convert to incidence matrix
    class_ids = np.array([[
        1 if category in self.parameter['active_classes'] else 0
        for category in CLASS_NAMES
    ]], dtype=np.int32)
    results = self.mrcnn_model.detect([img_array],
                                      verbose=0,
                                      active_class_ids=class_ids)
    r = results[0]
    LOG.info('found %d candidates on page "%s"', len(r['rois']), page_id)

    th = self.parameter['th']
    # check for existing segmentation mask
    # this code executes only when the workflow had tiseg run before with use_deeplr=true
    if mask:
        mask = ocrolib.pil2array(mask)
        mask = mask // 255
        mask = 1 - mask
        # multiply all the bounding box part with 2
        # (pixels inside box i get the value i + 2, left-over pixels stay 1)
        for i in range(len(r['rois'])):
            min_y, min_x, max_y, max_x = r['rois'][i]
            mask[min_y:max_y, min_x:max_x] *= i + 2

        # check for left over pixels and add them to the bounding boxes
        pixel_added = True
        while pixel_added:
            pixel_added = False
            left_over = np.where(mask == 1)
            for y, x in zip(left_over[0], left_over[1]):
                local_mask = mask[y - th:y + th, x - th:x + th]
                candidates = np.where(local_mask > 1)
                candidates = [k for k in zip(candidates[0], candidates[1])]
                if len(candidates) > 0:
                    pixel_added = True
                    # find closest pixel with x>1
                    candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 +
                                                          (j[1] - th)**2))
                    index = local_mask[candidates[0]] - 2

                    # add pixel to mask/bbox
                    # y,x to bbox with index
                    if y < r['rois'][index][0]:
                        r['rois'][index][0] = y
                    elif y > r['rois'][index][2]:
                        r['rois'][index][2] = y
                    if x < r['rois'][index][1]:
                        r['rois'][index][1] = x
                    elif x > r['rois'][index][3]:
                        r['rois'][index][3] = x

                    # update the mask
                    mask[y, x] = index + 2

    for i in range(len(r['rois'])):
        class_id = r['class_ids'][i]
        if class_id >= len(CLASS_NAMES):
            raise Exception(
                'Unexpected class id %d - model does not match' % class_id)

    # find hull contours on masks
    if self.parameter['use_masks']:
        r.setdefault('polygons', list())
        # estimate glyph scale (roughly)
        scale = int(dpi / 6)
        scale = scale + (scale + 1) % 2  # odd
        for i in range(len(r['rois'])):
            mask = r['masks'][:, :, i]
            mask = cv2.dilate(mask.astype(np.uint8),
                              np.ones((scale, scale), np.uint8)) > 0
            # close mask until we have a single outer contour
            contours = None
            for _ in range(10):
                mask = cv2.morphologyEx(
                    mask.astype(np.uint8), cv2.MORPH_CLOSE,
                    np.ones((scale, scale), np.uint8)) > 0
                contours, _ = cv2.findContours(mask.astype(np.uint8),
                                               cv2.RETR_EXTERNAL,
                                               cv2.CHAIN_APPROX_SIMPLE)
                if len(contours) == 1:
                    break
            r['polygons'].append(Polygon(
                contours[0][:, 0, :]))  # already in x,y order

    # to reduce overlaps, apply IoU-based non-maximum suppression
    # (and other post-processing against overlaps) across classes,
    # but not on the raw pixels, but the smoothed hull polygons
    LOG.info('post-processing detections on page "%s"', page_id)
    worse = []
    if self.parameter['post_process']:
        active = True

        def _merge_rois(i, j):
            """merges i into j"""
            nonlocal r, active
            r['rois'][j][0] = min(r['rois'][i][0], r['rois'][j][0])
            r['rois'][j][1] = min(r['rois'][i][1], r['rois'][j][1])
            r['rois'][j][2] = max(r['rois'][i][2], r['rois'][j][2])
            r['rois'][j][3] = max(r['rois'][i][3], r['rois'][j][3])
            r['polygons'][j] = r['polygons'][i].union(r['polygons'][j])
            #r['scores'][j] = max(r['scores'][i], r['scores'][i])
            # a merge changes geometry, so another pass is needed
            active = True

        # find overlapping pairs
        while active:
            active = False
            for i in range(len(r["class_ids"])):
                if i in worse:
                    continue
                for j in range(i + 1, len(r['class_ids'])):
                    if j in worse:
                        continue
                    iclass = r['class_ids'][i]
                    jclass = r['class_ids'][j]
                    iname = CLASS_NAMES[iclass]
                    jname = CLASS_NAMES[jclass]
                    if (iname == 'drop-capital') != (jname == 'drop-capital'):
                        # ignore drop-capital overlapping with others
                        continue
                    # rs todo: lower priority for footnote?
                    if (r['rois'][i][1] > r['rois'][j][3]
                            or r['rois'][i][3] < r['rois'][j][1]
                            or r['rois'][i][0] > r['rois'][j][2]
                            or r['rois'][i][2] < r['rois'][j][0]):
                        # no overlap (cut)
                        continue
                    iscore = r['scores'][i]
                    jscore = r['scores'][j]
                    if not self.parameter['use_masks']:
                        # without polygons, the lower-scoring box loses
                        LOG.debug(
                            "roi %d[%s] overlaps roi %d[%s] and %s (replacing)",
                            i, iname, j, jname,
                            "looses" if iscore < jscore else "wins")
                        if iscore < jscore:
                            worse.append(i)
                            break
                        else:
                            worse.append(j)
                            continue
                    # compare masks
                    ipoly = r['polygons'][i]
                    jpoly = r['polygons'][j]
                    isize = ipoly.area
                    jsize = jpoly.area
                    inter = ipoly.intersection(jpoly).area
                    union = ipoly.union(jpoly).area
                    # LOG.debug("%d/%d %dpx/%dpx shared %dpx overall %dpx",
                    #           i, j, isize, jsize, inter, union)
                    if inter / isize > self.parameter['min_share_drop']:
                        LOG.debug(
                            "roi %d[%s] contains roi %d[%s] (replacing)",
                            j, jname, i, iname)
                        worse.append(i)
                        break
                    elif inter / jsize > self.parameter['min_share_drop']:
                        LOG.debug(
                            "roi %d[%s] contains roi %d[%s] (replacing)",
                            i, iname, j, jname)
                        worse.append(j)
                    elif inter / union > self.parameter['min_iou_drop']:
                        LOG.debug(
                            "roi %d[%s] heavily overlaps roi %d[%s] and %s (replacing)",
                            i, iname, j, jname,
                            "looses" if iscore < jscore else "wins")
                        if iscore < jscore:
                            worse.append(i)
                            break
                        else:
                            worse.append(j)
                    elif inter / isize > self.parameter['min_share_merge']:
                        LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                  j, jname, i, iname)
                        worse.append(i)
                        _merge_rois(i, j)
                        break
                    elif inter / jsize > self.parameter['min_share_merge']:
                        LOG.debug("roi %d[%s] covers roi %d[%s] (merging)",
                                  i, iname, j, jname)
                        worse.append(j)
                        _merge_rois(j, i)
                    elif inter / union > self.parameter['min_iou_merge']:
                        LOG.debug(
                            "roi %d[%s] slightly overlaps roi %d[%s] and %s (merging)",
                            i, iname, j, jname,
                            "looses" if iscore < jscore else "wins")
                        if iscore < jscore:
                            worse.append(i)
                            _merge_rois(i, j)
                            break
                        else:
                            worse.append(j)
                            _merge_rois(j, i)

    # define reading order on basis of coordinates
    # partial_order[i, j] == 1 means region i comes before region j
    partial_order = np.zeros((len(r['rois']), len(r['rois'])), np.uint8)
    for i, (min_y_i, min_x_i, max_y_i, max_x_i) in enumerate(r['rois']):
        for j, (min_y_j, min_x_j, max_y_j, max_x_j) in enumerate(r['rois']):
            if min_x_i < max_x_j and max_x_i > min_x_j:
                # xoverlaps
                if min_y_i < min_y_j:
                    partial_order[i, j] = 1
            else:
                min_y = min(min_y_i, min_y_j)
                max_y = max(max_y_i, max_y_j)
                min_x = min(min_x_i, min_x_j)
                max_x = max(max_x_i, max_x_j)
                if next(
                    (False
                     for (min_y_k, min_x_k, max_y_k, max_x_k) in r['rois']
                     if (min_y_k < max_y and max_y_k > min_y
                         and min_x_k < max_x and max_x_k > min_x)), True):
                    # no k in between
                    if ((min_y_j + max_y_j) / 2 < min_y_i
                            and (min_y_i + max_y_i) / 2 > max_y_j):
                        # vertically unrelated
                        partial_order[j, i] = 1
                    elif max_x_i < min_x_j:
                        partial_order[i, j] = 1

    def _topsort(po):
        """Return the indices of ``po`` in an order where predecessors come first."""
        # builtin bool: the np.bool alias does not exist in NumPy 1.24+
        visited = np.zeros(po.shape[0], bool)
        result = list()

        def _visit(k):
            if visited[k]:
                return
            visited[k] = True
            # emit everything that must precede k before k itself
            for l in np.nonzero(po[:, k])[0]:
                _visit(l)
            result.append(k)

        for k in range(po.shape[0]):
            _visit(k)
        return result

    reading_order = _topsort(partial_order)

    # Creating Reading Order object in PageXML
    order_group = OrderedGroupType(caption="Regions reading order",
                                   id=page_id)
    reading_order_object = ReadingOrderType()
    reading_order_object.set_OrderedGroup(order_group)
    page.set_ReadingOrder(reading_order_object)

    for i in range(len(r['rois'])):
        min_y, min_x, max_y, max_x = r['rois'][i]
        score = r['scores'][i]
        class_id = r['class_ids'][i]
        class_name = CLASS_NAMES[class_id]
        if i in worse:
            LOG.debug(
                "Ignoring instance %d[%s] overlapping better/larger neighbour",
                i, class_name)
            continue

        if self.parameter['use_masks']:
            region_polygon = r['polygons'][i].exterior.coords[:-1]
        else:
            # paragraphs are widened a little, but kept within the image
            region_polygon = polygon_from_bbox(
                max(min_x - 5, 0) if class_name == 'paragraph' else min_x,
                min_y,
                min(max_x + 10, width) if class_name == 'paragraph' else max_x,
                max_y)

        # convert to absolute coordinates
        region_polygon = coordinates_for_segment(region_polygon, page_image,
                                                 page_xywh)
        # intersect with parent and plausibilize
        cut_region_polygon = Polygon(region_polygon)
        if border_polygon:
            cut_region_polygon = border_polygon.intersection(
                cut_region_polygon)
        if cut_region_polygon.is_empty:
            LOG.warning('region %d does not intersect page frame', i)
            continue
        if not cut_region_polygon.is_valid:
            LOG.warning('region %d has invalid polygon', i)
            continue
        region_polygon = cut_region_polygon.exterior.coords[:-1]
        region_coords = CoordsType(points_from_polygon(region_polygon),
                                   conf=score)
        order_index = reading_order.index(i)
        region_args = {
            'custom': 'readingOrder {index:' + str(order_index) + ';}',
            'id': 'region%04d' % i,
            'Coords': region_coords
        }
        if class_name == 'image':
            image_region = ImageRegionType(**region_args)
            page.add_ImageRegion(image_region)
        elif class_name == 'table':
            table_region = TableRegionType(**region_args)
            page.add_TableRegion(table_region)
        elif class_name == 'graphics':
            graphic_region = GraphicRegionType(**region_args)
            page.add_GraphicRegion(graphic_region)
        else:
            # every other class is a text region sub-type
            region_args['type_'] = class_name
            textregion = TextRegionType(**region_args)
            page.add_TextRegion(textregion)
        regionRefIndex = RegionRefIndexedType(index=order_index,
                                              regionRef=region_args['id'])
        order_group.add_RegionRefIndexed(regionRefIndex)
        LOG.info('added %s region on page "%s"', class_name, page_id)
def process(self):
    """Performs region segmentation by reading from COCO annotations.

    Open and deserialize the COCO JSON file from the second input file group.
    (It lists region categories/subtypes, file names and segmentations for all pages.)

    Open and deserialize each PAGE input file (or generate from image input file)
    from the first input file group. Now find this page in COCO:
    - try to match the PAGE ``imageFilename`` or METS file path
      to some COCO ``file_name``, otherwise
    - try to match the numeric part of the METS physical page ID
      to some COCO ``id``, otherwise
    - skip with an error.

    Then create and add a region for each ``segmentation``, converting its polygon
    to coordinate points and its COCO category to a region type (and subtype),
    either for a PubLayNet classification or PAGE classification
    (as produced by ocrd-segment-extract-pages), as indicated by ``source``.
    (Categories without a ``source`` entry count as PubLayNet.)

    Produce a new output file by serialising the resulting hierarchy.

    Afterwards, if there are still COCO images left unaccounted for (i.e.
    without corresponding input files), then show a warning.
    """
    LOG = getLogger('processor.ImportCOCOSegmentation')
    # Load JSON
    assert_file_grp_cardinality(self.input_file_grp, 2, 'base and COCO')
    self.input_file_grp, coco_grp = self.input_file_grp.split(',')  # pylint: disable=attribute-defined-outside-init
    if not self.input_files:
        LOG.warning('No input files to process')
        return
    if coco_grp in self.workspace.mets.file_groups:
        try:
            cocofile = next(
                f for f in self.workspace.mets.find_files(fileGrp=coco_grp)
                # if f.mimetype == 'application/json' and not f.pageId
                if not f.pageId)
        except StopIteration:
            raise Exception(
                "no non-page-specific file in second file group (COCO file)",
                coco_grp)
        cocofile = self.workspace.download_file(cocofile).local_filename
    elif os.path.isfile(coco_grp):
        # the second "file group" may also be a plain file path
        cocofile = coco_grp
    else:
        raise Exception("file not found in second file group (COCO file)",
                        coco_grp)

    LOG.info('Loading COCO annotations from "%s" into memory...', cocofile)
    with open(cocofile, 'r') as inp:
        coco = json.load(inp)
    LOG.info('Loaded JSON for %d images with %d regions in %d categories',
             len(coco['images']), len(coco['annotations']),
             len(coco['categories']))
    coco_source = 'PubLayNet'
    # Convert to usable dicts
    # classes:
    categories = dict()
    subcategories = dict()
    for cat in coco['categories']:
        # categories without a 'source' entry keep the PubLayNet default
        if cat.get('source') == 'PAGE':
            coco_source = 'PAGE'
        if 'supercategory' in cat and cat['supercategory']:
            categories[cat['id']] = cat['supercategory']
            subcategories[cat['id']] = cat['name']
        else:
            categories[cat['id']] = cat['name']
    # images and annotations:
    images_by_id = dict()
    images_by_filename = dict()
    for image in coco['images']:
        images_by_id[image['id']] = image
        images_by_filename[image['file_name']] = image
    for annotation in coco['annotations']:
        image = images_by_id[annotation['image_id']]
        regions = image.setdefault('regions', list())
        regions.append(annotation)
    del coco

    # region categories whose subcategory maps to a PAGE type enumeration
    # (keys are region names, because the class is looked up as
    # category + 'Type' below)
    typedict = {
        "TextRegion": TextTypeSimpleType,
        "GraphicRegion": GraphicsTypeSimpleType,
        "ChartRegion": ChartTypeSimpleType
    }

    LOG.info('Converting %s annotations into PAGE-XML', coco_source)
    for n, input_file in enumerate(self.input_files):
        page_id = input_file.pageId or input_file.ID
        # numeric part of the page ID: strip the digits off both ends to
        # get the non-digit characters, then strip those off the ID
        try:
            num_page_id = int(page_id.strip(page_id.strip("0123456789")))
        except ValueError:
            # no usable digit run: matching by COCO id is not possible
            num_page_id = None
        LOG.info("INPUT FILE %i / %s", n, page_id)
        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page = pcgts.get_Page()

        # find COCO image
        if page.imageFilename in images_by_filename:
            image = images_by_filename[page.imageFilename]
        elif num_page_id in images_by_id:
            image = images_by_id[num_page_id]
        else:
            LOG.error('Page "%s" / file "%s" not found in COCO', page_id,
                      page.imageFilename)
            # todo: maybe we should at least write the (unchanged) output PAGE?
            continue
        if image['width'] != page.imageWidth:
            LOG.error(
                'Page "%s" width %d does not match annotated width %d',
                page_id, page.imageWidth, image['width'])
        if image['height'] != page.imageHeight:
            LOG.error(
                'Page "%s" height %d does not match annotated height %d',
                page_id, page.imageHeight, image['height'])

        # todo: remove existing segmentation first?
        # (an image without annotations has no 'regions' entry)
        for region in image.get('regions', []):
            assert isinstance(
                region['segmentation'],
                list), "importing RLE/mask segmentation not implemented"
            # flat [x1, y1, x2, y2, ...] list -> array of (x, y) rows
            polygon = np.array(region['segmentation'])
            polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
            coords = CoordsType(points=points_from_polygon(polygon))
            category = categories[region['category_id']]
            if region['category_id'] in subcategories:
                subcategory = subcategories[region['category_id']]
            else:
                subcategory = None
            region_id = 'r' + str(region['id'])
            LOG.info('Adding region %s:%s [area %d]', category, subcategory
                     or '', region['area'])
            if coco_source == 'PubLayNet':
                if category == 'text':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown region category: %s' % category)
            else:  # 'PAGE'
                args = {'id': region_id, 'Coords': coords}
                if subcategory:
                    if category in typedict:
                        subtype = membername(typedict[category], subcategory)
                        if subtype == subcategory:
                            # not predefined in PAGE: use other + custom
                            args['custom'] = "subtype:%s" % subcategory
                            args['type_'] = "other"
                        else:
                            args['type_'] = subcategory
                    else:
                        args['custom'] = "subtype:%s" % subcategory
                if category + 'Type' not in globals():
                    raise Exception('unknown region category: %s' % category)
                region_type = globals()[category + 'Type']
                if region_type is BorderType:
                    page.set_Border(BorderType(Coords=coords))
                else:
                    region_obj = region_type(**args)
                    getattr(page, 'add_%s' % category)(region_obj)
        # remove the matched image from both dicts (by its own keys, so the
        # final check is right no matter how the image was found)
        images_by_id.pop(image['id'], None)
        images_by_filename.pop(image['file_name'], None)

        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.add_file(ID=file_id,
                                file_grp=self.output_file_grp,
                                pageId=input_file.pageId,
                                mimetype=MIMETYPE_PAGE,
                                local_filename=os.path.join(
                                    self.output_file_grp, file_id + '.xml'),
                                content=to_xml(pcgts))

    # warn of remaining COCO images
    if images_by_filename and not self.page_id:
        LOG.warning('%d images remain unaccounted for after processing',
                    len(images_by_filename))
        if LOG.isEnabledFor(logging.DEBUG):
            for filename in images_by_filename:
                LOG.debug('not found in workspace: "%s"', filename)
def build_pagexml_no_full_layout(
        self, found_polygons_text_region, page_coord, order_of_texts,
        id_of_texts, all_found_texline_polygons, all_box_coord,
        found_polygons_text_region_img, found_polygons_marginals,
        all_found_texline_polygons_marginals, all_box_coord_marginals, slopes,
        slopes_marginals, cont_page, polygons_lines_to_be_written_in_xml,
        found_polygons_tables):
    """Build the PAGE result when no full layout analysis was run.

    Reuses ``self.pcgts`` if set (otherwise creates a fresh PAGE document),
    sets the page border from ``cont_page`` and then adds, in this order:
    paragraph text regions (with their lines), marginalia text regions (with
    their lines), image regions, separator regions and table regions. Region
    IDs are drawn consecutively from one ``EynollahIdCounter``.

    Args:
        found_polygons_text_region: polygons of the main text regions.
        page_coord: page crop box; index 2 is added to x and index 0 to y
            for image and table polygons before rescaling.
        order_of_texts: reading order passed on to ``xml_reading_order``.
        id_of_texts: not used by this method.
        all_found_texline_polygons, all_box_coord, slopes: per-region line
            data, passed on to ``serialize_lines_in_region``.
        found_polygons_text_region_img: polygons of image regions.
        found_polygons_marginals: polygons of marginalia regions.
        all_found_texline_polygons_marginals, all_box_coord_marginals,
            slopes_marginals: per-marginal line data, passed on to
            ``serialize_lines_in_marginal``.
        cont_page: page contour passed to ``calculate_page_coords``.
        polygons_lines_to_be_written_in_xml: polygons of separators
            (rescaled without any page offset).
        found_polygons_tables: polygons of table regions.

    Returns:
        The PAGE root object (``pcgts``).
    """
    self.logger.debug('enter build_pagexml_no_full_layout')

    def scaled_points(polygon, x_offset=0, y_offset=0):
        """Serialize ``polygon`` as PAGE points, shifted and rescaled."""
        return ' '.join(
            str(int((polygon[idx, 0, 0] + x_offset) / self.scale_x)) + ',' +
            str(int((polygon[idx, 0, 1] + y_offset) / self.scale_y))
            for idx in range(len(polygon)))

    # create the file structure
    pcgts = self.pcgts if self.pcgts else create_page_xml(
        self.image_filename, self.height_org, self.width_org)
    page = pcgts.get_Page()
    page.set_Border(
        BorderType(Coords=CoordsType(
            points=self.calculate_page_coords(cont_page))))

    counter = EynollahIdCounter()
    if len(found_polygons_text_region) > 0:
        # marginalia are numbered after all ordered text regions
        _counter_marginals = EynollahIdCounter(
            region_idx=len(order_of_texts))
        id_of_marginalia = [
            _counter_marginals.next_region_id
            for _ in found_polygons_marginals
        ]
        xml_reading_order(page, order_of_texts, id_of_marginalia)

    for mm, polygon in enumerate(found_polygons_text_region):
        textregion = TextRegionType(
            id=counter.next_region_id,
            type_='paragraph',
            Coords=CoordsType(points=self.calculate_polygon_coords(
                polygon, page_coord)),
        )
        page.add_TextRegion(textregion)
        self.serialize_lines_in_region(textregion,
                                       all_found_texline_polygons, mm,
                                       page_coord, all_box_coord, slopes,
                                       counter)

    for mm, polygon in enumerate(found_polygons_marginals):
        marginal = TextRegionType(
            id=counter.next_region_id,
            type_='marginalia',
            Coords=CoordsType(points=self.calculate_polygon_coords(
                polygon, page_coord)))
        page.add_TextRegion(marginal)
        self.serialize_lines_in_marginal(
            marginal, all_found_texline_polygons_marginals, mm, page_coord,
            all_box_coord_marginals, slopes_marginals, counter)

    for polygon in found_polygons_text_region_img:
        page.add_ImageRegion(ImageRegionType(
            id=counter.next_region_id,
            Coords=CoordsType(points=scaled_points(
                polygon, page_coord[2], page_coord[0]))))

    # separators are rescaled only, without adding the page offset
    for polygon in polygons_lines_to_be_written_in_xml:
        page.add_SeparatorRegion(SeparatorRegionType(
            id=counter.next_region_id,
            Coords=CoordsType(points=scaled_points(polygon))))

    for polygon in found_polygons_tables:
        page.add_TableRegion(TableRegionType(
            id=counter.next_region_id,
            Coords=CoordsType(points=scaled_points(
                polygon, page_coord[2], page_coord[0]))))

    return pcgts
def _process_element(self, element, ignore, image, coords, element_id,
                     file_id, page_id, zoom=1.0, rogroup=None):
    """Add PAGE layout elements by segmenting an image.

    Given a PageType, TableRegionType or TextRegionType ``element``, and
    a corresponding binarized PIL.Image object ``image`` with coordinate
    metadata ``coords``, run line segmentation with Ocropy.

    If operating on the full page (or table), then also detect horizontal
    and vertical separators, and aggregate the lines into text regions
    afterwards.

    Add the resulting sub-segments to the parent ``element``.

    If ``ignore`` is not empty, then first suppress all foreground components
    in any of those segments' coordinates during segmentation, and if also
    in full page/table mode, then combine all separators among them with the
    newly detected separators to guide region segmentation.

    Args:
        element: the PAGE segment to add sub-segments to.
        ignore: list of existing segments to mask during segmentation.
        image: PIL image of ``element``.
        coords: coordinate metadata of ``image`` (its ``'features'`` entry
            is extended for the derived clipped image).
        element_id: identifier used in log messages and for new IDs.
        file_id: base name for the derived ``.IMG-CLIP`` image file.
        page_id: page ID under which the derived image is saved.
        zoom: scale factor passed to the segmentation helpers and used to
            adapt the minimum contour areas.
        rogroup: reading order group to append new regions to (optional).

    Returns:
        None. Results are annotated on ``element`` in place.
    """
    LOG = getLogger('processor.OcropySegment')
    if not image.width or not image.height:
        LOG.warning("Skipping '%s' with zero size", element_id)
        return
    element_array = pil2array(image)
    # NOTE: builtin bool/int replace the np.bool/np.int aliases,
    # which were removed in NumPy 1.24 (same dtypes).
    element_bin = np.array(element_array <= midrange(element_array), bool)
    sep_bin = np.zeros_like(element_bin, bool)
    ignore_labels = np.zeros_like(element_bin, int)
    for i, segment in enumerate(ignore):
        LOG.debug('masking foreground of %s "%s" for "%s"',
                  type(segment).__name__[:-4], segment.id, element_id)
        # mark these segments (e.g. separator regions, tables, images)
        # for workflows where they have been detected already;
        # these will be:
        # - ignored during text line segmentation (but not h/v-line detection)
        # - kept and reading-ordered during region segmentation (but not seps)
        segment_polygon = coordinates_of_segment(segment, image, coords)
        # If segment_polygon lies outside of element (causing
        # negative/above-max indices), either fully or partially,
        # then this will silently ignore them. The caller does
        # not need to concern herself with this.
        if isinstance(segment, SeparatorRegionType):
            sep_bin[draw.polygon(segment_polygon[:, 1],
                                 segment_polygon[:, 0],
                                 sep_bin.shape)] = True
        ignore_labels[draw.polygon(segment_polygon[:, 1],
                                   segment_polygon[:, 0],
                                   ignore_labels.shape)] = i + 1  # mapped back for RO
    if isinstance(element, PageType):
        element_name = 'page'
        fullpage = True
        report = check_page(element_bin, zoom)
    elif isinstance(element, TableRegionType) or (
            # sole/congruent text region of a table region?
            element.id.endswith('_text') and
            isinstance(element.parent_object_, TableRegionType)):
        element_name = 'table'
        fullpage = True
        report = check_region(element_bin, zoom)
    else:
        element_name = 'region'
        fullpage = False
        report = check_region(element_bin, zoom)
    LOG.info('computing line segmentation for %s "%s"', element_name,
             element_id)
    # TODO: we should downscale if DPI is large enough to save time
    try:
        if report:
            raise Exception(report)
        line_labels, hlines, vlines, images, colseps, scale = compute_segmentation(
            # suppress separators and ignored regions for textline estimation
            # but keep them for h/v-line detection (in fullpage mode):
            element_bin,
            seps=(sep_bin + ignore_labels) > 0,
            zoom=zoom,
            fullpage=fullpage,
            spread_dist=round(self.parameter['spread'] / zoom * 300 / 72),  # in pt
            # these are ignored when not in fullpage mode:
            maxcolseps=self.parameter['maxcolseps'],
            maxseps=self.parameter['maxseps'],
            maximages=self.parameter['maximages']
            if element_name != 'table' else 0,
            csminheight=self.parameter['csminheight'],
            hlminwidth=self.parameter['hlminwidth'])
    except Exception as err:
        if isinstance(element, TextRegionType):
            LOG.error('Cannot line-segment region "%s": %s', element_id, err)
            # as a fallback, add a single text line comprising the whole region:
            element.add_TextLine(
                TextLineType(id=element_id + "_line",
                             Coords=element.get_Coords()))
        else:
            LOG.error('Cannot line-segment %s "%s": %s', element_name,
                      element_id, err)
        return

    LOG.info('Found %d text lines for %s "%s"',
             len(np.unique(line_labels)) - 1, element_name, element_id)
    # post-process line labels
    if isinstance(element, (PageType, TableRegionType)):
        # aggregate text lines to text regions
        try:
            # pass ignored regions as "line labels with initial assignment",
            # i.e. identical line and region labels
            # to detect their reading order among the others
            # (these cannot be split or grouped together with other regions)
            line_labels = np.where(line_labels, line_labels + len(ignore),
                                   ignore_labels)
            # suppress separators/images in fg and try to use for partitioning slices
            sepmask = np.maximum(np.maximum(hlines, vlines),
                                 np.maximum(sep_bin, images))
            region_labels = lines2regions(
                element_bin,
                line_labels,
                rlabels=ignore_labels,
                sepmask=np.maximum(sepmask, colseps),  # add bg
                # decide horizontal vs vertical cut when gaps of similar size
                prefer_vertical=not isinstance(element, TableRegionType),
                gap_height=self.parameter['gap_height'],
                gap_width=self.parameter['gap_width'],
                scale=scale,
                zoom=zoom)
            LOG.info('Found %d text regions for %s "%s"',
                     len(np.unique(region_labels)) - 1, element_name,
                     element_id)
        except Exception as err:
            LOG.error('Cannot region-segment %s "%s": %s', element_name,
                      element_id, err)
            # fallback: one single new region comprising all new lines
            region_labels = np.where(line_labels > len(ignore),
                                     1 + len(ignore), line_labels)

        # prepare reading order group index
        if rogroup:
            if isinstance(rogroup,
                          (OrderedGroupType, OrderedGroupIndexedType)):
                index = 0
                # start counting from largest existing index
                for elem in (rogroup.get_RegionRefIndexed() +
                             rogroup.get_OrderedGroupIndexed() +
                             rogroup.get_UnorderedGroupIndexed()):
                    if elem.index >= index:
                        index = elem.index + 1
            else:
                index = None
        # find contours around region labels (can be non-contiguous):
        region_no = 0
        for region_label in np.unique(region_labels):
            if not region_label:
                continue  # no bg
            region_mask = region_labels == region_label
            region_line_labels = line_labels * region_mask
            region_line_labels0 = np.setdiff1d(region_line_labels, [0])
            if not np.all(region_line_labels0 > len(ignore)):
                # existing region from `ignore` merely to be ordered
                # (no new region, no actual text lines)
                region_line_labels0 = np.intersect1d(
                    region_line_labels0, ignore_labels)
                assert len(region_line_labels0) == 1, \
                    "region label %d has both existing regions and new lines (%s)" % (
                        region_label, str(region_line_labels0))
                region = ignore[region_line_labels0[0] - 1]
                if rogroup and region.parent_object_ == element and not isinstance(
                        region, SeparatorRegionType):
                    index = page_add_to_reading_order(
                        rogroup, region.id, index)
                LOG.debug('Region label %d is for ignored region "%s"',
                          region_label, region.id)
                continue
            # normal case: new lines inside new regions
            # remove binary-empty labels, and re-order locally
            order = morph.reading_order(region_line_labels)
            order[np.setdiff1d(region_line_labels0,
                               element_bin * region_line_labels)] = 0
            region_line_labels = order[region_line_labels]
            # avoid horizontal gaps
            region_line_labels = hmerge_line_seeds(
                element_bin, region_line_labels, scale,
                seps=np.maximum(sepmask, colseps))
            region_mask |= region_line_labels > 0
            # find contours for region (can be non-contiguous)
            regions, _ = masks2polygons(
                region_mask * region_label,
                element_bin,
                '%s "%s"' % (element_name, element_id),
                min_area=6000 / zoom / zoom,
                simplify=ignore_labels * ~(sep_bin))
            # find contours for lines (can be non-contiguous)
            lines, _ = masks2polygons(region_line_labels,
                                      element_bin,
                                      'region "%s"' % element_id,
                                      min_area=640 / zoom / zoom)
            # create new lines in new regions (allocating by intersection)
            line_polys = [Polygon(polygon) for _, polygon in lines]
            for _, region_polygon in regions:
                region_poly = prep(Polygon(region_polygon))
                # convert back to absolute (page) coordinates:
                region_polygon = coordinates_for_segment(
                    region_polygon, image, coords)
                region_polygon = polygon_for_parent(region_polygon, element)
                if region_polygon is None:
                    LOG.warning(
                        'Ignoring extant region contour for region label %d',
                        region_label)
                    continue
                # annotate result:
                region_no += 1
                region_id = element_id + "_region%04d" % region_no
                LOG.debug('Region label %d becomes ID "%s"', region_label,
                          region_id)
                region = TextRegionType(
                    id=region_id,
                    Coords=CoordsType(
                        points=points_from_polygon(region_polygon)))
                # find out which line (contours) belong to which region (contours)
                line_no = 0
                for i, line_poly in enumerate(line_polys):
                    if not region_poly.intersects(line_poly):  # .contains
                        continue
                    line_label, line_polygon = lines[i]
                    # convert back to absolute (page) coordinates:
                    line_polygon = coordinates_for_segment(
                        line_polygon, image, coords)
                    line_polygon = polygon_for_parent(line_polygon, region)
                    if line_polygon is None:
                        LOG.warning(
                            'Ignoring extant line contour for region label %d line label %d',
                            region_label, line_label)
                        continue
                    # annotate result:
                    line_no += 1
                    line_id = region_id + "_line%04d" % line_no
                    LOG.debug('Line label %d becomes ID "%s"', line_label,
                              line_id)
                    line = TextLineType(
                        id=line_id,
                        Coords=CoordsType(
                            points=points_from_polygon(line_polygon)))
                    region.add_TextLine(line)
                # if the region has received text lines, keep it
                if region.get_TextLine():
                    element.add_TextRegion(region)
                    LOG.info('Added region "%s" with %d lines for %s "%s"',
                             region_id, line_no, element_name, element_id)
                    if rogroup:
                        index = page_add_to_reading_order(
                            rogroup, region.id, index)
        # add additional image/non-text regions from compute_segmentation
        # (e.g. drop-capitals or images) ...
        image_labels, num_images = morph.label(images)
        LOG.info('Found %d large non-text/image regions for %s "%s"',
                 num_images, element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        image_polygons, _ = masks2polygons(
            image_labels, element_bin,
            '%s "%s"' % (element_name, element_id))
        for image_label, polygon in image_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning(
                    'Ignoring extant region contour for image label %d',
                    image_label)
                continue
            region_no += 1
            # annotate result:
            region_id = element_id + "_image%04d" % region_no
            element.add_ImageRegion(
                ImageRegionType(
                    id=region_id,
                    Coords=CoordsType(
                        points=points_from_polygon(region_polygon))))
        # split rulers into separator regions:
        hline_labels, num_hlines = morph.label(hlines)
        vline_labels, num_vlines = morph.label(vlines)
        LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines, num_vlines,
                 element_name, element_id)
        # find contours around region labels (can be non-contiguous):
        hline_polygons, _ = masks2polygons(
            hline_labels, element_bin,
            '%s "%s"' % (element_name, element_id))
        vline_polygons, _ = masks2polygons(
            vline_labels, element_bin,
            '%s "%s"' % (element_name, element_id))
        for _, polygon in hline_polygons + vline_polygons:
            # convert back to absolute (page) coordinates:
            region_polygon = coordinates_for_segment(polygon, image, coords)
            region_polygon = polygon_for_parent(region_polygon, element)
            if region_polygon is None:
                LOG.warning('Ignoring extant region contour for separator')
                continue
            # annotate result:
            region_no += 1
            region_id = element_id + "_sep%04d" % region_no
            element.add_SeparatorRegion(
                SeparatorRegionType(
                    id=region_id,
                    Coords=CoordsType(
                        points=points_from_polygon(region_polygon))))
        # annotate a text/image-separated image
        element_array[sepmask] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped,
            file_id + '.IMG-CLIP',
            page_id=page_id,
            file_grp=self.output_file_grp)
        element.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=coords['features'] + ',clipped'))
    else:
        # get mask from region polygon:
        region_polygon = coordinates_of_segment(element, image, coords)
        region_mask = np.zeros_like(element_bin, bool)
        region_mask[draw.polygon(region_polygon[:, 1],
                                 region_polygon[:, 0],
                                 region_mask.shape)] = True
        # ensure the new line labels do not extrude from the region:
        line_labels = line_labels * region_mask
        # find contours around labels (can be non-contiguous):
        line_polygons, _ = masks2polygons(line_labels,
                                          element_bin,
                                          'region "%s"' % element_id,
                                          min_area=640 / zoom / zoom)
        line_no = 0
        for line_label, polygon in line_polygons:
            # convert back to absolute (page) coordinates:
            line_polygon = coordinates_for_segment(polygon, image, coords)
            line_polygon = polygon_for_parent(line_polygon, element)
            if line_polygon is None:
                LOG.warning(
                    'Ignoring extant line contour for line label %d',
                    line_label)
                continue
            # annotate result:
            line_no += 1
            line_id = element_id + "_line%04d" % line_no
            element.add_TextLine(
                TextLineType(
                    id=line_id,
                    Coords=CoordsType(
                        points=points_from_polygon(line_polygon))))
        if not sep_bin.any():
            return  # no derived image
        # annotate a text/image-separated image
        element_array[sep_bin] = np.amax(element_array)  # clip to white/bg
        image_clipped = array2pil(element_array)
        file_path = self.workspace.save_image_file(
            image_clipped,
            file_id + '.IMG-CLIP',
            page_id=page_id,
            file_grp=self.output_file_grp)
        # update PAGE (reference the image file):
        element.add_AlternativeImage(
            AlternativeImageType(filename=file_path,
                                 comments=coords['features'] + ',clipped'))
def _process_segment(self, page_image, page, region_xywh, page_id,
                     input_file, n):
    """Split a page image into text lines and annotate them on the page.

    Converts ``page_image`` into an inverted array, computes an Ocropy
    line segmentation on it, sorts the lines by reading order, and for each
    line saves its thresholded crop as an image file in ``self.image_grp``
    and appends a TextLine (bounding box coords plus AlternativeImage) to
    the last TextRegion of ``page``. If the page has no TextRegion yet, one
    spanning the whole image is created first.

    Args:
        page_image: PIL image of the page.
        page: PAGE page object to annotate.
        region_xywh: coordinate metadata; its ``'features'`` entry gets
            ``",textline"`` appended and is used as image comment.
        page_id: page ID for the saved line images (also used in warnings).
        input_file: input file whose ID is the basis for the image file IDs.
        n: running file number, used if the ID cannot be derived by
            substituting the fileGrp name.

    Returns:
        None. Returns early (with a warning) on an implausible scale or
        when more lines than ``maxlines`` are found.
    """
    binary = ocrolib.pil2array(page_image)
    binary = np.array(1 - binary / np.amax(binary), 'B')
    if not page.get_TextRegion():
        # array axis 0 runs over rows (y), axis 1 over columns (x);
        # PAGE points are "x,y" pairs
        # (assumes pil2array yields a (height, width) array - TODO confirm)
        min_y, max_y = (0, binary.shape[0])
        min_x, max_x = (0, binary.shape[1])
        textregion = TextRegionType(
            Coords=CoordsType(points="%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y,
                               max_x, max_y, min_x, max_y)))
        page.add_TextRegion(textregion)
    else:
        textregion = page.get_TextRegion()[-1]
    # NOTE(review): looks like a debugging leftover (writes into the
    # current working directory) - confirm and remove
    ocrolib.write_image_binary("test.bin.png", binary)
    if self.parameter['scale'] == 0:
        scale = psegutils.estimate_scale(binary)
    else:
        scale = self.parameter['scale']
    if np.isnan(
            scale) or scale > 1000.0 or scale < self.parameter['minscale']:
        LOG.warning("%s: bad scale (%g); skipping\n", page_id, scale)
        return
    segmentation = self.compute_segmentation(binary, scale)
    if np.amax(segmentation) > self.parameter['maxlines']:
        LOG.warning("%s: too many lines %i", page_id, np.amax(segmentation))
        return
    lines = psegutils.compute_lines(segmentation, scale)
    order = psegutils.reading_order([ln.bounds for ln in lines])
    lsort = psegutils.topsort(order)
    lines = [lines[i] for i in lsort]
    region_xywh['features'] += ",textline"
    # the image file ID stem is the same for all lines of this page
    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
    if file_id == input_file.ID:
        file_id = concat_padded(self.image_grp, n)
    for i, ln in enumerate(lines):
        y_slice, x_slice = ln.bounds[0], ln.bounds[1]
        # NOTE(review): debugging leftover, see above
        ocrolib.write_image_binary("test.bin.png", binary[y_slice, x_slice])
        min_y, max_y = (y_slice.start, y_slice.stop)
        min_x, max_x = (x_slice.start, x_slice.stop)
        img = binary[y_slice, x_slice]
        img = np.array(255 * (img > ocrolib.midrange(img)), 'B')
        img = ocrolib.array2pil(img)
        file_path = self.workspace.save_image_file(img,
                                                   file_id + "_" + str(i),
                                                   page_id=page_id,
                                                   file_grp=self.image_grp)
        ai = AlternativeImageType(filename=file_path,
                                  comments=region_xywh['features'])
        line = TextLineType(
            Coords=CoordsType(points="%i,%i %i,%i %i,%i %i,%i" %
                              (min_x, min_y, max_x, min_y,
                               max_x, max_y, min_x, max_y)))
        line.add_AlternativeImage(ai)
        textregion.add_TextLine(line)
def test_deskewing(plain_workspace):
    """Check cropping and deskewing of a non-rectangular, skewed region.

    Draws a polygon mask on a blank page, annotates it as a TextRegion with
    an orientation, and verifies ``image_from_segment`` in three ways:
    without deskewing (pure crop), with deskewing (larger, rotated image),
    and via a previously saved AlternativeImage (same result, and no
    messages logged by ``ocrd_utils.crop_image``).
    """
    #from ocrd_utils import initLogging, setOverrideLogLevel
    #setOverrideLogLevel('DEBUG')
    size = (3000, 4000)
    poly = [[1403, 2573], [1560, 2573], [1560, 2598], [2311, 2598],
            [2311, 2757], [2220, 2757], [2220, 2798], [2311, 2798],
            [2311, 2908], [1403, 2908]]
    xywh = xywh_from_polygon(poly)
    bbox = bbox_from_polygon(poly)
    skew = 4.625
    image = Image.new('L', size)
    image = polygon_mask(image, poly)
    #image.show(title='image')
    pixels = np.count_nonzero(np.array(image) > 0)
    name = 'foo0'
    assert plain_workspace.save_image_file(image, name, 'IMG')
    pcgts = page_from_file(next(plain_workspace.mets.find_files(ID=name)))
    page = pcgts.get_Page()
    region = TextRegionType(
        id='nonrect',
        Coords=CoordsType(points=points_from_polygon(poly)),
        orientation=-skew)
    page.add_TextRegion(region)
    page_image, page_coords, _ = plain_workspace.image_from_page(page, '')
    #page_image.show(title='page_image')
    assert list(image.getdata()) == list(page_image.getdata())
    assert np.all(page_coords['transform'] == np.eye(3))
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, feature_filter='deskewed', fill=0)
    assert list(image.crop(bbox).getdata()) == list(reg_image.getdata())
    assert reg_image.width == xywh['w'] == 908
    assert reg_image.height == xywh['h'] == 335
    assert reg_coords['transform'][0, 2] == -xywh['x']
    assert reg_coords['transform'][1, 2] == -xywh['y']
    # same fg after cropping to minimal bbox
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert pixels == reg_pixels
    # now with deskewing (test for size after recropping)
    reg_image, reg_coords = plain_workspace.image_from_segment(
        region, page_image, page_coords, fill=0)
    #reg_image.show(title='reg_image')
    assert reg_image.width == 932 > xywh['w']
    assert reg_image.height == 382 > xywh['h']
    assert reg_coords['transform'][0, 1] != 0
    assert reg_coords['transform'][1, 0] != 0
    assert 'deskewed' in reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    reg_pixels = np.count_nonzero(np.array(reg_image) > 0)
    assert np.abs(pixels - reg_pixels) / pixels < 0.005
    reg_array = np.array(reg_image) > 0
    # now via AlternativeImage
    path = plain_workspace.save_image_file(reg_image, region.id + '_img',
                                           'IMG')
    region.add_AlternativeImage(
        AlternativeImageType(filename=path, comments=reg_coords['features']))
    logger_capture = FIFOIO(256)
    logger_handler = logging.StreamHandler(logger_capture)
    #logger_handler.setFormatter(logging.Formatter(fmt=LOG_FORMAT, datefmt=LOG_TIMEFMT))
    logger = logging.getLogger('ocrd_utils.crop_image')
    logger.addHandler(logger_handler)
    try:
        reg_image2, reg_coords2 = plain_workspace.image_from_segment(
            region, page_image, page_coords, fill=0)
        #reg_image2.show(title='reg_image2')
        logger_output = logger_capture.getvalue()
    finally:
        # detach before closing, so later log records from this logger
        # (in this or other tests) are not written to a closed stream
        logger.removeHandler(logger_handler)
        logger_capture.close()
    assert logger_output == ''
    assert reg_image2.width == reg_image.width
    assert reg_image2.height == reg_image.height
    assert np.allclose(reg_coords2['transform'], reg_coords['transform'])
    assert reg_coords2['features'] == reg_coords['features']
    # same fg after cropping to minimal bbox (roughly - due to aliasing)
    # NOTE(review): this counts reg_image again, so the assertion below is
    # trivially true; presumably reg_image2 was meant - confirm that the
    # stricter check holds before changing it
    reg_pixels2 = np.count_nonzero(np.array(reg_image) > 0)
    assert reg_pixels2 == reg_pixels
    reg_array2 = np.array(reg_image2) > 0
    assert 0.98 < np.sum(reg_array == reg_array2) / reg_array.size <= 1.0
def _process_page(self, it, page, page_image, page_coords, page_id):
    """Add one PAGE region per block of the Tesseract layout iterator.

    Walks ``it`` on block level. For every block, the outline (block
    polygon or bounding box, depending on the ``crop_polygons`` parameter)
    is converted to page coordinates and clipped to the page; blocks with
    no remaining outline are skipped. Depending on the block type, a
    Text-, Image-, Separator-, Maths-, Table- or NoiseRegion is added to
    ``page``. All regions except separators and noise are also referenced
    in the top-level OrderedGroup of the ReadingOrder (created on demand,
    continuing after the largest existing index, and removed again if it
    ends up empty).

    Args:
        it: Tesseract page iterator (may be falsy, then nothing is added).
        page: PAGE page object to annotate.
        page_image: image the iterator coordinates refer to.
        page_coords: coordinate metadata of ``page_image``.
        page_id: page ID (not used by this method).
    """
    LOG = getLogger('processor.TesserocrSegmentRegion')
    # equivalent to GetComponentImages with raw_image=True,
    # (which would also give raw coordinates),
    # except we are also interested in the iterator's BlockType() here,
    # and its BlockPolygon()
    index = 0
    ro = page.get_ReadingOrder()
    if not ro:
        ro = ReadingOrderType()
        page.set_ReadingOrder(ro)
    og = ro.get_OrderedGroup()
    if og:
        # start counting from largest existing index
        for elem in (og.get_RegionRefIndexed() +
                     og.get_OrderedGroupIndexed() +
                     og.get_UnorderedGroupIndexed()):
            if elem.index >= index:
                index = elem.index + 1
    else:
        # new top-level group
        og = OrderedGroupType(id="reading-order")
        ro.set_OrderedGroup(og)
    while it and not it.Empty(RIL.BLOCK):
        # (padding will be passed to both BoundingBox and GetImage)
        # (actually, Tesseract honours padding only on the left and bottom,
        #  whereas right and top are increased less!)
        bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding'])
        # sometimes these polygons are not planar, which causes
        # PIL.ImageDraw.Draw.polygon (and likely others as well)
        # to misbehave; however, PAGE coordinate semantics prohibit
        # multi-path polygons!
        # (probably a bug in Tesseract itself, cf. tesseract#2826):
        if self.parameter['crop_polygons']:
            polygon = it.BlockPolygon()
        else:
            polygon = polygon_from_x0y0x1y1(bbox)
        polygon = coordinates_for_segment(polygon, page_image, page_coords)
        polygon2 = polygon_for_parent(polygon, page)
        if polygon2 is not None:
            polygon = polygon2
        points = points_from_polygon(polygon)
        coords = CoordsType(points=points)
        if polygon2 is None:
            LOG.info('Ignoring extant region: %s', points)
            it.Next(RIL.BLOCK)
            continue
        # if xywh['w'] < 30 or xywh['h'] < 30:
        #     LOG.info('Ignoring too small region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        # region_image_bin = it.GetBinaryImage(RIL.BLOCK)
        # if not region_image_bin.getbbox():
        #     LOG.info('Ignoring binary-empty region: %s', points)
        #     it.Next(RIL.BLOCK)
        #     continue
        #
        # add the region reference in the reading order element
        # (will be removed again if Separator/Noise region below)
        ID = "region%04d" % index
        og.add_RegionRefIndexed(
            RegionRefIndexedType(regionRef=ID, index=index))
        #
        # region type switch
        #
        block_type = it.BlockType()
        if block_type in [
                PT.FLOWING_TEXT,
                PT.HEADING_TEXT,
                PT.PULLOUT_TEXT,
                PT.CAPTION_TEXT,
                # TABLE is contained in PTIsTextType, but
                # it is a bad idea to create a TextRegion
                # for it (better set `find_tables` False):
                # PT.TABLE,
                # will also get a 90° @orientation
                # (but that can be overridden by deskew/OSD):
                PT.VERTICAL_TEXT
        ]:
            # the generated PAGE classes name this keyword `type_`
            # (as the other region constructors in this file do)
            region = TextRegionType(id=ID,
                                    Coords=coords,
                                    type_=TextTypeSimpleType.PARAGRAPH)
            if block_type == PT.VERTICAL_TEXT:
                region.set_orientation(90.0)
            elif block_type == PT.HEADING_TEXT:
                region.set_type(TextTypeSimpleType.HEADING)
            elif block_type == PT.PULLOUT_TEXT:
                region.set_type(TextTypeSimpleType.FLOATING)
            elif block_type == PT.CAPTION_TEXT:
                region.set_type(TextTypeSimpleType.CAPTION)
            page.add_TextRegion(region)
            if self.parameter['sparse_text']:
                region.set_type(TextTypeSimpleType.OTHER)
                region.add_TextLine(
                    TextLineType(id=region.id + '_line', Coords=coords))
        elif block_type in [
                PT.FLOWING_IMAGE, PT.HEADING_IMAGE, PT.PULLOUT_IMAGE
        ]:
            region = ImageRegionType(id=ID, Coords=coords)
            page.add_ImageRegion(region)
        elif block_type in [PT.HORZ_LINE, PT.VERT_LINE]:
            region = SeparatorRegionType(id=ID, Coords=coords)
            page.add_SeparatorRegion(region)
            # undo appending in ReadingOrder
            og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
        elif block_type in [PT.INLINE_EQUATION, PT.EQUATION]:
            region = MathsRegionType(id=ID, Coords=coords)
            page.add_MathsRegion(region)
        elif block_type == PT.TABLE:
            # without API access to StructuredTable we cannot
            # do much for a TableRegionType (i.e. nrows, ncols,
            # coordinates of cells for recursive regions etc),
            # but this can be achieved afterwards by segment-table
            region = TableRegionType(id=ID, Coords=coords)
            page.add_TableRegion(region)
        else:
            region = NoiseRegionType(id=ID, Coords=coords)
            # the region must be passed, or it never reaches the page
            page.add_NoiseRegion(region)
            # undo appending in ReadingOrder
            og.set_RegionRefIndexed(og.get_RegionRefIndexed()[:-1])
        LOG.info("Detected region '%s': %s (%s)", ID, points,
                 membername(PT, block_type))
        #
        # iterator increment
        #
        index += 1
        it.Next(RIL.BLOCK)
    if (not og.get_RegionRefIndexed() and
            not og.get_OrderedGroupIndexed() and
            not og.get_UnorderedGroupIndexed()):
        # schema forbids empty OrderedGroup
        ro.set_OrderedGroup(None)
def convert(cocofile, directory):
    """Convert MS-COCO JSON to METS/PAGE XML files.

    Load JSON ``cocofile`` (in MS-COCO format) and chdir to ``directory``
    (which it refers to). Start a METS file mets.xml with references to
    the image files (under fileGrp ``OCR-D-IMG``) and their corresponding
    PAGE-XML annotations (under fileGrp ``OCR-D-GT-SEG-BLOCK``), as parsed
    from ``cocofile`` and written using the same basename.

    Args:
        cocofile: open file object with the MS-COCO JSON.
        directory: path of the directory containing the referenced images;
            the workspace (mets.xml and PAGE files) is created there.

    Raises:
        Exception: if an annotation has a category other than
            text/title/list/table/figure.
    """
    resolver = Resolver()
    with pushd_popd(directory):
        workspace = resolver.workspace_from_nothing('.')
        # https://github.com/ibm-aur-nlp/PubLayNet
        workspace.mets.unique_identifier = 'ocrd_PubLayNet_' + directory
        coco = json.load(cocofile)
        LOG.info('Loaded JSON for %d images with %d regions in %d categories',
                 len(coco['images']), len(coco['annotations']),
                 len(coco['categories']))
        categories = {}
        for cat in coco['categories']:
            categories[cat['id']] = cat['name']
        images = {}
        for image in coco['images']:
            # every image gets a region list, so that images without
            # any annotation still yield an (empty) PAGE file below
            image.setdefault('regions', [])
            images[image['id']] = image
        for annotation in coco['annotations']:
            image = images[annotation['image_id']]
            image['regions'].append(annotation)
        del coco

        LOG.info('Parsing annotations into PAGE-XML')
        for image in images.values():
            page_id = 'p' + str(image['id'])
            file_base, file_ext = os.path.splitext(image['file_name'])
            filename = file_base + '.xml'
            image_file = workspace.add_file('OCR-D-IMG',
                                            ID='OCR-D-IMG_' + page_id,
                                            pageId=page_id,
                                            mimetype=EXT_TO_MIME[file_ext],
                                            local_filename=image['file_name'])
            LOG.info('Added page %s file %s of type %s', image_file.pageId,
                     image_file.local_filename, image_file.mimetype)
            pcgts = page_from_image(image_file)
            pcgts.set_pcGtsId(page_id)
            page = pcgts.get_Page()
            assert page.imageWidth == image['width']
            assert page.imageHeight == image['height']
            regions = image['regions']
            for region in regions:
                # flat [x0, y0, x1, y1, ...] list -> array of (x, y) rows
                polygon = np.array(region['segmentation'])
                polygon = np.reshape(polygon, (polygon.shape[1] // 2, 2))
                coords = CoordsType(points=points_from_polygon(polygon))
                category = categories[region['category_id']]
                region_id = 'r' + str(region['id'])
                if category == 'text':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.PARAGRAPH)
                    page.add_TextRegion(region_obj)
                elif category == 'title':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.HEADING)  # CAPTION?
                    page.add_TextRegion(region_obj)
                elif category == 'list':
                    region_obj = TextRegionType(
                        id=region_id,
                        Coords=coords,
                        type_=TextTypeSimpleType.LISTLABEL)  # OTHER?
                    page.add_TextRegion(region_obj)
                elif category == 'table':
                    region_obj = TableRegionType(id=region_id, Coords=coords)
                    page.add_TableRegion(region_obj)
                elif category == 'figure':
                    region_obj = ImageRegionType(id=region_id, Coords=coords)
                    page.add_ImageRegion(region_obj)
                else:
                    raise Exception('unknown image category: %s' % category)
            page_file = workspace.add_file('OCR-D-GT-SEG-BLOCK',
                                           ID='OCR-D-GT-SEG-BLOCK_' + page_id,
                                           pageId=page_id,
                                           mimetype=MIMETYPE_PAGE,
                                           local_filename=filename,
                                           content=to_xml(pcgts))
            LOG.info('Added page %s file %s with %d regions',
                     page_file.pageId, page_file.local_filename,
                     len(regions))
        LOG.info('All done')
        workspace.save_mets()
def process(self):
    """Segment pages into regions+lines, tables into cells+lines, or regions into lines.

    Open and deserialise PAGE input files and their respective images,
    then iterate over the element hierarchy down to the requested level.

    Depending on ``level-of-operation``, consider existing segments:
    - If ``overwrite_separators=True`` on ``page`` level, then
      delete any SeparatorRegions.
    - If ``overwrite_regions=True`` on ``page`` level, then
      delete any top-level TextRegions (along with ReadingOrder).
    - If ``overwrite_regions=True`` on ``table`` level, then
      delete any TextRegions in TableRegions (along with their OrderedGroup).
    - If ``overwrite_lines=True`` on ``region`` level, then
      delete any TextLines in TextRegions.
    - If ``overwrite_order=True`` on ``page`` or ``table`` level, then
      delete the reading order OrderedGroup entry corresponding
      to the (page/table) segment.

    Next, get each element image according to the layout annotation (from
    the alternative image of the page/region, or by cropping via coordinates
    into the higher-level image) in binarized form, and represent it as an array
    with non-text regions and (remaining) text neighbours suppressed.

    Then compute a text line segmentation for that array (as a label mask).
    When ``level-of-operation`` is ``page`` or ``table``, this also entails
    detecting
    - up to ``maximages`` large foreground images,
    - up to ``maxseps`` foreground h/v-line separators and
    - up to ``maxcolseps`` background column separators
    before text line segmentation itself, as well as aggregating text lines
    to text regions afterwards.

    Text regions are detected via a hybrid variant recursive X-Y cut algorithm
    (RXYC): RXYC partitions the binarized image in top-down manner by detecting
    horizontal or vertical gaps. This implementation uses the bottom-up text line
    segmentation to guide the search, and also uses both pre-existing and newly
    detected separators to alternatively partition the respective boxes into
    non-rectangular parts.

    During line segmentation, suppress the foreground of all previously annotated
    regions (of any kind) and lines, except if just removed due to ``overwrite``.
    During region aggregation however, combine the existing separators with the
    new-found separators to guide the column search.

    On ``region`` level, if a region image has already been clipped against
    its neighbours (feature ``clipped``), then do not suppress anything for
    that particular region (other regions on the page are not affected).

    All detected segments (both text line and text region) are sorted according
    to their reading order (assuming a top-to-bottom, left-to-right ordering).
    When ``level-of-operation`` is ``page``, prefer vertical (column-first)
    succession of regions. When it is ``table``, prefer horizontal (row-first)
    succession of cells.

    Then for each resulting segment label, convert its background mask into
    polygon outlines by finding the outer contours consistent with the element's
    polygon outline. Annotate the result by adding it as a new TextLine/TextRegion:
    - If ``level-of-operation`` is ``region``, then append the new lines to the
      parent region.
    - If it is ``table``, then append the new lines to their respective regions,
      and append the new regions to the parent table.
      (Also, create an OrderedGroup for it as the parent's RegionRef.)
    - If it is ``page``, then append the new lines to their respective regions,
      and append the new regions to the page.
      (Also, create an OrderedGroup for it in the ReadingOrder.)

    Produce a new output file by serialising the resulting hierarchy.
    """
    LOG = getLogger('processor.OcropySegment')
    # FIXME: allow passing a-priori info on reading order / textline order
    # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture
    #  of different scripts; also, vertical writing needs internal rotation
    #  because our line segmentation only works for horizontal writing)
    overwrite_lines = self.parameter['overwrite_lines']
    overwrite_regions = self.parameter['overwrite_regions']
    overwrite_separators = self.parameter['overwrite_separators']
    overwrite_order = self.parameter['overwrite_order']
    oplevel = self.parameter['level-of-operation']

    assert_file_grp_cardinality(self.input_file_grp, 1)
    assert_file_grp_cardinality(self.output_file_grp, 1)

    for (n, input_file) in enumerate(self.input_files):
        LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
        file_id = make_file_id(input_file, self.output_file_grp)

        pcgts = page_from_file(self.workspace.download_file(input_file))
        self.add_metadata(pcgts)
        page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID  # (PageType has no id)
        page = pcgts.get_Page()

        # TODO: also allow grayscale_normalized (try/except?)
        page_image, page_coords, page_image_info = self.workspace.image_from_page(
            page, page_id, feature_selector='binarized')
        # zoom factor relative to 300 DPI: explicit parameter wins,
        # then image meta-data, otherwise no scaling
        if self.parameter['dpi'] > 0:
            zoom = 300.0 / self.parameter['dpi']
        elif page_image_info.resolution != 1:
            dpi = page_image_info.resolution
            if page_image_info.resolutionUnit == 'cm':
                dpi *= 2.54
            LOG.info('Page "%s" uses %f DPI', page_id, dpi)
            zoom = 300.0 / dpi
        else:
            zoom = 1

        # aggregate existing regions so their foreground can be ignored
        ignore = (page.get_ImageRegion() +
                  page.get_LineDrawingRegion() +
                  page.get_GraphicRegion() +
                  page.get_ChartRegion() +
                  page.get_MapRegion() +
                  page.get_MathsRegion() +
                  page.get_ChemRegion() +
                  page.get_MusicRegion() +
                  page.get_AdvertRegion() +
                  page.get_NoiseRegion() +
                  page.get_UnknownRegion() +
                  page.get_CustomRegion())
        if oplevel == 'page' and overwrite_separators:
            page.set_SeparatorRegion([])
        else:
            ignore.extend(page.get_SeparatorRegion())
        # prepare reading order
        reading_order = dict()
        ro = page.get_ReadingOrder()
        if ro:
            rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            if rogroup:
                page_get_reading_order(reading_order, rogroup)

        # get segments to process / overwrite
        if oplevel == 'page':
            ignore.extend(page.get_TableRegion())
            regions = list(page.get_TextRegion())
            if regions:
                # page is already region-segmented
                if overwrite_regions:
                    LOG.info('removing existing TextRegions in page "%s"', page_id)
                    # we could remove all other region types as well,
                    # but this is more flexible (for workflows with
                    # specialized separator/image/table detectors):
                    page.set_TextRegion([])
                    page.set_ReadingOrder(None)
                    ro = None
                else:
                    LOG.warning('keeping existing TextRegions in page "%s"', page_id)
                    ignore.extend(regions)
            # create reading order if necessary
            if not ro or overwrite_order:
                ro = ReadingOrderType()
                page.set_ReadingOrder(ro)
            rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup()
            if not rogroup:
                # new top-level group
                rogroup = OrderedGroupType(id="reading-order")
                ro.set_OrderedGroup(rogroup)
            # go get TextRegions with TextLines (and SeparatorRegions):
            self._process_element(page, ignore, page_image, page_coords,
                                  page_id, file_id,
                                  input_file.pageId, zoom, rogroup=rogroup)
            if (not rogroup.get_RegionRefIndexed() and
                    not rogroup.get_OrderedGroupIndexed() and
                    not rogroup.get_UnorderedGroupIndexed()):
                # schema forbids empty OrderedGroup
                ro.set_OrderedGroup(None)
        elif oplevel == 'table':
            ignore.extend(page.get_TextRegion())
            regions = list(page.get_TableRegion())
            if not regions:
                LOG.warning('Page "%s" contains no table regions', page_id)
            for region in regions:
                subregions = region.get_TextRegion()
                if subregions:
                    # table is already cell-segmented
                    if overwrite_regions:
                        LOG.info('removing existing TextRegions in table "%s"', region.id)
                        region.set_TextRegion([])
                        roelem = reading_order.get(region.id)
                        # replace by empty group with same index and ref
                        # (which can then take the cells as subregions)
                        reading_order[region.id] = page_subgroup_in_reading_order(roelem)
                    else:
                        LOG.warning('skipping table "%s" with existing TextRegions', region.id)
                        continue
                # TODO: also allow grayscale_normalized (try/except?)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords, feature_selector='binarized')
                # ignore everything but the current table region
                subignore = regions + ignore
                subignore.remove(region)
                # create reading order group if necessary
                roelem = reading_order.get(region.id)
                if not roelem:
                    LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)",
                                page_id, region.id, "no target to add cells to")
                elif overwrite_order:
                    # replace by empty ordered group with same (index and) ref
                    # (which can then take the cells as subregions)
                    roelem = page_subgroup_in_reading_order(roelem)
                    reading_order[region.id] = roelem
                elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)):
                    LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)",
                                page_id, region.id, "cells will be appended")
                elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)):
                    LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)",
                                page_id, region.id, "cells will not be appended")
                    roelem = None
                else:
                    # replace regionRef(Indexed) by group with same index and ref
                    # (which can then take the cells as subregions)
                    roelem = page_subgroup_in_reading_order(roelem)
                    reading_order[region.id] = roelem
                # go get TextRegions with TextLines (and SeparatorRegions)
                self._process_element(region, subignore, region_image, region_coords,
                                      region.id, file_id + '_' + region.id,
                                      input_file.pageId, zoom, rogroup=roelem)
        else:  # 'region'
            regions = list(page.get_TextRegion())
            # besides top-level text regions, line-segment any table cells,
            # and for tables without any cells, add a pseudo-cell
            for region in page.get_TableRegion():
                subregions = region.get_TextRegion()
                if subregions:
                    regions.extend(subregions)
                else:
                    subregion = TextRegionType(id=region.id + '_text',
                                               Coords=region.get_Coords(),
                                               # as if generated from parser:
                                               parent_object_=region)
                    region.add_TextRegion(subregion)
                    regions.append(subregion)
            if not regions:
                LOG.warning('Page "%s" contains no text regions', page_id)
            for region in regions:
                if region.get_TextLine():
                    if overwrite_lines:
                        LOG.info('removing existing TextLines in page "%s" region "%s"',
                                 page_id, region.id)
                        region.set_TextLine([])
                    else:
                        LOG.warning('keeping existing TextLines in page "%s" region "%s"',
                                    page_id, region.id)
                        ignore.extend(region.get_TextLine())
                # TODO: also allow grayscale_normalized (try/except?)
                region_image, region_coords = self.workspace.image_from_segment(
                    region, page_image, page_coords, feature_selector='binarized')
                # if the region images have already been clipped against their neighbours specifically,
                # then we don't need to suppress all neighbours' foreground generally here
                # (decide this per region: the page-wide list must stay intact
                #  for subsequent regions which have not been clipped)
                if 'clipped' in region_coords['features'].split(','):
                    subignore = []
                else:
                    subignore = ignore
                # go get TextLines
                self._process_element(region, subignore, region_image, region_coords,
                                      region.id, file_id + '_' + region.id,
                                      input_file.pageId, zoom)

        # update METS (add the PAGE file):
        file_path = os.path.join(self.output_file_grp, file_id + '.xml')
        pcgts.set_pcGtsId(file_id)
        out = self.workspace.add_file(
            ID=file_id,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            local_filename=file_path,
            mimetype=MIMETYPE_PAGE,
            content=to_xml(pcgts))
        LOG.info('created file ID: %s, file_grp: %s, path: %s',
                 file_id, self.output_file_grp, out.local_filename)
def _process_segment(self, page_image, page, page_xywh, page_id, input_file, n, mrcnn_model, class_names, mask):
    """Detect block regions in ``page_image`` and annotate them in ``page``.

    Run ``mrcnn_model.detect`` on the page image, optionally grow the
    detected boxes with left-over foreground pixels of ``mask``, shrink
    boxes that vertically overlap an earlier box, derive a reading order
    from the box coordinates, and finally add one region per detection
    (ImageRegion / TableRegion / GraphicRegion for class ids 15 / 16 / 17,
    TextRegion otherwise) plus a ReadingOrder with one RegionRefIndexed
    per added region.

    If the page has a Border, regions are clipped to it and detections
    entirely outside of it are dropped (both from the regions and from the
    reading order). If it has no Border, the detected boxes are used as is.

    Does nothing (besides logging) if the page already has TextRegions and
    the ``overwrite`` parameter is false.

    Args:
        page_image: PIL image of the page.
        page: PAGE ``Page`` element to annotate (modified in place).
        page_xywh: coordinate dict belonging to ``page_image`` (passed on to
            ``coordinates_for_segment``).
        page_id: identifier used in log messages and as prefix of region ids.
        input_file: METS file the page came from (for output file ids).
        n: index of the input file (unused here).
        mrcnn_model: model object providing ``detect``.
        class_names: lookup from class id to the region ``type_`` value.
        mask: optional PIL image with a prior segmentation mask, or a
            false value to skip mask-based box growing.
    """
    LOG = getLogger('OcrdAnybaseocrBlockSegmenter')
    # check for existing text regions and whether to overwrite them
    if page.get_TextRegion():
        if self.parameter['overwrite']:
            LOG.info('removing existing TextRegions in page "%s"', page_id)
            page.set_TextRegion([])
        else:
            LOG.warning('keeping existing TextRegions in page "%s"', page_id)
            return
    # check if border exists
    border = None
    if page.get_Border():
        border_coords = page.get_Border().get_Coords()
        border_points = polygon_from_points(border_coords.get_points())
        border = Polygon(border_points)

    img_array = ocrolib.pil2array(page_image)
    # NOTE(review): looks like a debugging left-over (writes into the
    # current working directory on every call) -- confirm and remove
    page_image.save('./checkthis.png')
    if len(img_array.shape) <= 2:
        # the detector is fed a 3-channel array
        img_array = np.stack((img_array, ) * 3, axis=-1)
    results = mrcnn_model.detect([img_array], verbose=1)
    r = results[0]
    # each ROI is indexed [0..3]; judging from the array slicing below,
    # indexes 0/2 address rows and 1/3 address columns, although the
    # local names (kept from the original) say ``x`` for 0/2 and ``y`` for 1/3
    rois = r['rois']
    class_ids = r['class_ids']

    th = self.parameter['th']
    # check for existing segmentation mask
    # this code executes only when use_deeplr is set to True in ocrd-tool.json file
    if mask:
        mask = ocrolib.pil2array(mask)
        mask = mask // 255
        mask = 1 - mask
        # label the pixels inside the i-th bounding box with i + 2
        # (so 1 remains for foreground outside of any box)
        for i in range(len(rois)):
            min_x = rois[i][0]
            min_y = rois[i][1]
            max_x = rois[i][2]
            max_y = rois[i][3]
            mask[min_x:max_x, min_y:max_y] *= i + 2
        # NOTE(review): debugging left-over as well? -- confirm and remove
        cv2.imwrite('mask_check.png', mask * (255 / (len(rois) + 2)))

        # check for left over pixels and add them to the bounding boxes
        pixel_added = True
        while pixel_added:
            pixel_added = False
            left_over = np.where(mask == 1)
            for x, y in zip(left_over[0], left_over[1]):
                # NOTE(review): for x < th or y < th the start index becomes
                # negative (i.e. counts from the far end) -- confirm intent
                local_mask = mask[x - th:x + th, y - th:y + th]
                candidates = np.where(local_mask > 1)
                candidates = [k for k in zip(candidates[0], candidates[1])]
                if len(candidates) > 0:
                    pixel_added = True
                    # find closest pixel with x>1
                    candidates.sort(key=lambda j: np.sqrt((j[0] - th)**2 + (j[1] - th)**2))
                    index = local_mask[candidates[0]] - 2
                    # add pixel to mask/bbox
                    # x,y to bbox with index
                    if x < rois[index][0]:
                        rois[index][0] = x
                    elif x > rois[index][2]:
                        rois[index][2] = x
                    if y < rois[index][1]:
                        rois[index][1] = y
                    elif y > rois[index][3]:
                        rois[index][3] = y
                    # update the mask
                    mask[x, y] = index + 2

    # resolving overlapping problem:
    # compare each box against all previously seen boxes (grouped by class)
    bbox_dict = {}
    for i in range(len(rois)):
        min_x = rois[i][0]
        min_y = rois[i][1]
        max_x = rois[i][2]
        max_y = rois[i][3]
        region_bbox = [min_y, min_x, max_y, max_x]
        for bboxes in bbox_dict.values():
            for bbox in bboxes:
                # overlap along the [0]..[2] axis (shared by both checks)
                overlaps = ((region_bbox[0] >= bbox[0] and region_bbox[0] <= bbox[2]) or
                            (region_bbox[2] >= bbox[0] and region_bbox[2] <= bbox[2]) or
                            (region_bbox[0] <= bbox[0] and region_bbox[2] >= bbox[2]))
                if not overlaps or class_ids[i] == 5:
                    continue
                # checking for ymax case with vertical overlapping
                if region_bbox[3] <= bbox[3] and region_bbox[3] >= bbox[1]:
                    rois[i][2] = bbox[1] - 1
                # checking for ymin now
                if region_bbox[1] <= bbox[3] and region_bbox[1] >= bbox[1]:
                    rois[i][0] = bbox[3] + 1
        bbox_dict.setdefault(class_ids[i], []).append(region_bbox)

    # NOTE(review): this is the first array dimension (the number of rows),
    # despite the name kept from the original -- confirm the intended axis
    width = img_array.shape[0]

    def roi_bounds(i):
        """Return ROI ``i`` as (min_x, min_y, max_x, max_y) after the class-2 adjustment."""
        min_x = rois[i][0]
        min_y = rois[i][1]
        max_x = rois[i][2]
        max_y = rois[i][3]
        if class_ids[i] == 2:
            if (min_y - 5) > width:
                min_y -= 5
            if (max_y + 10) < width:
                min_y += 10
        return min_x, min_y, max_x, max_y

    # define reading order on basis of coordinates
    reading_order = []
    for i in range(len(rois)):
        min_x, min_y, max_x, max_y = roi_bounds(i)
        reading_order.append((min_y, min_x, max_y, max_x))
    reading_order = sorted(reading_order, key=lambda box: (box[1], box[0]))
    # NOTE(review): the list is re-arranged while being indexed;
    # statement order is kept exactly as it was
    for i in range(len(reading_order)):
        min_y, min_x, max_y, max_x = reading_order[i]
        # extend the i-th box to the [1]-axis origin before testing overlap
        min_y = 0
        i_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
        for j in range(i + 1, len(reading_order)):
            min_y, min_x, max_y, max_x = reading_order[j]
            j_poly = Polygon([[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]])
            inter = i_poly.intersection(j_poly)
            if inter:
                reading_order.insert(j + 1, reading_order[i])
                del reading_order[i]

    # create regions and their reading order references in one pass,
    # so that a reference exists if and only if its region does
    order_group = OrderedGroupType(caption="Regions reading order", id=page_id)
    for i in range(len(rois)):
        min_x, min_y, max_x, max_y = roi_bounds(i)
        # point order resolves the flipped coordinates of the detector
        region_polygon = [[min_y, min_x], [max_y, min_x], [max_y, max_x], [min_y, max_x]]
        if border:
            cut_region_polygon = border.intersection(Polygon(region_polygon))
            if cut_region_polygon.is_empty:
                continue
        else:
            # no border annotated: nothing to clip against
            cut_region_polygon = Polygon(region_polygon)
        # NOTE(review): assumes the intersection is a single polygon
        # (anything without ``exterior`` would fail here) -- confirm
        # drop the closing point, which repeats the first one
        cut_region_polygon = [
            j for j in zip(list(cut_region_polygon.exterior.coords.xy[0]),
                           list(cut_region_polygon.exterior.coords.xy[1]))
        ][:-1]

        region_polygon = coordinates_for_segment(cut_region_polygon, page_image, page_xywh)
        region_points = points_from_polygon(region_polygon)

        read_order = reading_order.index((min_y, min_x, max_y, max_x))
        region_id = '%s_region%04d' % (page_id, i)
        order_group.add_RegionRefIndexed(
            RegionRefIndexedType(index=read_order, regionRef=region_id))

        # save the cropped region image
        region_img = img_array[min_x:max_x, min_y:max_y]
        region_img = ocrolib.array2pil(region_img)
        file_id = make_file_id(input_file, self.output_file_grp)
        self.workspace.save_image_file(region_img,
                                       file_id + "_" + str(i),
                                       page_id=page_id,
                                       file_grp=self.output_file_grp)

        coords = CoordsType(region_points)
        custom = 'readingOrder {index:' + str(read_order) + ';}'
        region_type = class_names[class_ids[i]]
        if class_ids[i] == 15:
            page.add_ImageRegion(ImageRegionType(
                custom=custom, id=region_id, Coords=coords, type_=region_type))
        elif class_ids[i] == 16:
            page.add_TableRegion(TableRegionType(
                custom=custom, id=region_id, Coords=coords, type_=region_type))
        elif class_ids[i] == 17:
            page.add_GraphicRegion(GraphicRegionType(
                custom=custom, id=region_id, Coords=coords, type_=region_type))
        else:
            page.add_TextRegion(TextRegionType(
                custom=custom, id=region_id, Coords=coords, type_=region_type))

    # Creating Reading Order object in PageXML
    reading_order_object = ReadingOrderType()
    reading_order_object.set_OrderedGroup(order_group)
    page.set_ReadingOrder(reading_order_object)
def process(self):
    """Deskew and binarize the image of every input page.

    For each input file: read the page image in grayscale, optionally
    estimate a skew angle (if ``maxskew`` > 0) and rotate by it,
    estimate low/high gray levels from percentiles (``lo``, ``hi``) of
    the image interior, rescale and clip to [0, 1], threshold with
    ``threshold``, and write the result next to the original image
    as ``<base>.ds.png``.

    The angle is annotated as ``orientation`` of a new TextRegion,
    and a file entry is added to the output file group.
    """
    for (n, input_file) in enumerate(self.input_files):
        pcgts = page_from_file(self.workspace.download_file(input_file))
        page = pcgts.get_Page()
        fname = page.imageFilename
        img = self.workspace.resolve_image_as_pil(fname)
        param = self.parameter
        # progress messages are only printed when not running in parallel
        verbose = param['parallel'] < 2
        base, _ = ocrolib.allsplitext(fname)

        if verbose:
            print_info("=== %s " % fname)
        flat = ocrolib.read_image_gray(img.filename)

        # estimate skew angle and rotate
        angle = 0
        if param['maxskew'] > 0:
            if verbose:
                print_info("estimating skew angle")
            rows, cols = flat.shape
            skip_rows = int(param['bignore'] * rows)
            skip_cols = int(param['bignore'] * cols)
            # work on the inverted image, shifted to a minimum of zero
            flat = amax(flat) - flat
            flat -= amin(flat)
            interior = flat[skip_rows:rows - skip_rows, skip_cols:cols - skip_cols]
            max_angle = param['maxskew']
            steps = int(2 * param['maxskew'] * param['skewsteps'])
            angle = self.estimate_skew_angle(
                interior, linspace(-max_angle, max_angle, steps + 1))
            flat = interpolation.rotate(flat, angle, mode='constant', reshape=0)
            # invert back
            flat = amax(flat) - flat

        # estimate low and high thresholds
        if verbose:
            print_info("estimating thresholds")
        rows, cols = flat.shape
        skip_rows = int(param['bignore'] * rows)
        skip_cols = int(param['bignore'] * cols)
        est = flat[skip_rows:rows - skip_rows, skip_cols:cols - skip_cols]
        if param['escale'] > 0:
            # by default, we use only regions that contain
            # significant variance; this makes the percentile
            # based low and high estimates more reliable
            escale = param['escale']
            sigma = escale * 20.0
            width = int(escale * 50)
            variance = est - filters.gaussian_filter(est, sigma)
            variance = filters.gaussian_filter(variance**2, sigma)**0.5
            variance = (variance > 0.3 * amax(variance))
            variance = morphology.binary_dilation(
                variance, structure=ones((width, 1)))
            variance = morphology.binary_dilation(
                variance, structure=ones((1, width)))
            if param['debug'] > 0:
                imshow(variance)
                ginput(1, param['debug'])
            est = est[variance]
        samples = est.ravel()
        lo = stats.scoreatpercentile(samples, param['lo'])
        hi = stats.scoreatpercentile(est.ravel(), param['hi'])

        # rescale the image to get the gray scale image
        if verbose:
            print_info("rescaling")
        flat -= lo
        flat /= (hi - lo)
        flat = clip(flat, 0, 1)
        if param['debug'] > 0:
            imshow(flat, vmin=0, vmax=1)
            ginput(1, param['debug'])
        deskewed = 1 * (flat > param['threshold'])

        # output the normalized grayscale and the thresholded images
        print_info("%s lo-hi (%.2f %.2f) angle %4.1f" %
                   (page.imageFilename, lo, hi, angle))
        if verbose:
            print_info("writing")
        out_image = base + ".ds.png"
        ocrolib.write_image_binary(out_image, deskewed)

        # annotate the detected angle
        page.add_TextRegion(TextRegionType(orientation=angle))

        ID = concat_padded(self.output_file_grp, n)
        self.workspace.add_file(
            ID=ID,
            file_grp=self.output_file_grp,
            pageId=input_file.pageId,
            mimetype="image/png",
            url=out_image,
            local_filename='%s/%s' % (self.output_file_grp, ID),
            content=to_xml(pcgts).encode('utf-8'))