def addRegionToDom(page, ipage, lc, bVerbose):
    """
    Create one TextRegion DOM node per cluster and move the TextLine nodes
    of the cluster into it.

    page     -- DOM node to which the new TextRegion nodes are appended
    ipage    -- page number, used in the region id ("p<ipage>_r<index>")
    lc       -- mapping from a cluster key to the list of TextLine DOM nodes
                belonging to that cluster
    bVerbose -- not used by this function
    """
    for iRegion, key in enumerate(lc):
        lTextLine = lc[key]

        ndRegion = PageXml.createPageXmlNode('TextRegion')
        ndRegion.set('id', "p%d_r%d" % (ipage, iRegion))

        # the region geometry is derived from the lines of the cluster
        ndCoords = PageXml.createPageXmlNode('Coords')
        ndRegion.append(ndCoords)
        ndCoords.set('points', getClusterCoords(lTextLine))

        # detach each line from its current parent and re-attach it
        # under the new region
        for ndLine in lTextLine:
            ndLine.getparent().remove(ndLine)
            ndRegion.append(ndLine)

        page.append(ndRegion)
def addClusterToDom(self, dCluster, bMoveContent=False):
    """
    Add Cluster elements to the Page DOM node.

    dCluster     -- mapping from a cluster key to the sequence of indices
                    (into self.lNode) of the nodes forming that cluster
    bMoveContent -- when True, the DOM nodes of the cluster members are
                    moved under the new Cluster element

    The page node is taken from the first member of the first cluster; a
    comment is appended to it once, before the first Cluster element.
    Each Cluster gets a "content" attribute (space-separated member ids)
    and a Coords child holding the minimum rotated rectangle of the union
    of the member shapes ("" when that rectangle cannot be computed).
    """

    def _formatPoints(coords):
        # "x1,y1 x2,y2 ..." with integer coordinates
        return ' '.join("%s,%s" % (int(pt[0]), int(pt[1])) for pt in coords)

    pageNode = None
    for lnidx in dCluster.values():
        if pageNode is None:
            pageNode = self.lNode[lnidx[0]].page.node
            pageNode.append(
                etree.Comment("Clusters created by the conjugate graph"))

        # Make it robust to bad data: skip members without a usable shape
        lp = []
        for _i in lnidx:
            try:
                lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
            except ValueError:
                pass
        contour = cascaded_union(
            [p if p.is_valid else p.convex_hull for p in lp])

        # Best effort: the rectangle may be a polygon (exterior ring), a
        # degenerate geometry exposing its coords directly, or something
        # with no coordinate sequence at all (e.g. multi-part geometries).
        # Only Exception is caught, so that KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            spoints = _formatPoints(
                contour.minimum_rotated_rectangle.exterior.coords)
        except Exception:
            try:
                spoints = _formatPoints(
                    contour.minimum_rotated_rectangle.coords)
            except Exception:
                spoints = ""

        ndCluster = PageXml.createPageXmlNode('Cluster')
        # add the space separated list of node ids
        ndCluster.set(
            "content",
            " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
        coords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(coords)
        coords.set('points', spoints)
        pageNode.append(ndCluster)

        if bMoveContent:
            # move the DOM node of the content to the cluster
            for _i in lnidx:
                ndCluster.append(self.lNode[_i].node)

    return
def addEdgeToDoc(self, Y_proba):
    """
    Add one Edge element per graph edge to the page, for display purposes.

    Y_proba -- per-edge class probabilities; the predicted class of edge i
               is Y_proba.argmax(axis=1)[i]

    Each Edge element carries the ids of its two end nodes ("src", "tgt"),
    the edge class name ("type") and, when available, the predicted label
    and its probability. Nothing is done when the graph has no node.
    """
    if not self.lNode:
        return

    ndPage = self.lNode[0].page.node
    ndPage.append(
        etree.Comment("\nEdges labeled by the conjugate graph\n"))

    lPredicted = Y_proba.argmax(axis=1)
    for iEdge, edge in enumerate(self.lEdge):
        ndA, ndB = edge.A, edge.B
        ndEdge = PageXml.createPageXmlNode("Edge")
        try:
            iLabel = lPredicted[iEdge]
            ndEdge.set("label", self.lEdgeLabel[iLabel])
            ndEdge.set("proba", "%.3f" % Y_proba[iEdge, iLabel])
        except IndexError:
            # case of a conjugate graph without edge, so the edges
            # of the original graph cannot be labelled
            pass
        ndEdge.set("src", ndA.node.get("id"))
        ndEdge.set("tgt", ndB.node.get("id"))
        ndEdge.set("type", edge.__class__.__name__)
        ndPage.append(ndEdge)
        ndEdge.tail = "\n"
        PageXml.setPoints(ndEdge, [(ndA.x1, ndA.y1), (ndB.x1, ndB.y1)])

    return
def addEdgeToDOM(self, Y_proba):
    """
    Add one labelled Edge element per graph edge to the page, for display
    purposes.

    Y_proba -- per-edge class probabilities; the predicted class of edge i
               is Y_proba.argmax(axis=1)[i]

    Each Edge element carries the predicted label ("label"), its
    probability ("proba"), the ids of its two end nodes ("src", "tgt")
    and the edge class name ("type").

    Nothing is done when the graph has no node, since there is then no
    page to attach the edges to.
    """
    if not self.lNode:
        return

    Y = Y_proba.argmax(axis=1)
    ndPage = self.lNode[0].page.node
    ndPage.append(etree.Comment("Edges labeled by the conjugate graph"))
    for i, edge in enumerate(self.lEdge):
        cls = Y[i]
        A, B = edge.A, edge.B
        ndEdge = PageXml.createPageXmlNode("Edge")
        ndEdge.set("label", self.lEdgeLabel[cls])
        ndEdge.set("proba", "%.3f" % Y_proba[i, cls])
        ndEdge.set("src", A.node.get("id"))
        ndEdge.set("tgt", B.node.get("id"))
        ndEdge.set("type", edge.__class__.__name__)
        ndPage.append(ndEdge)
        # the edge is drawn between the (x1, y1) corners of its end nodes
        PageXml.setPoints(ndEdge, [(A.x1, A.y1), (B.x1, B.y1)])

    return
def addEdgesToXml(cls, ndPage, sAlgo, lCluster):
    """
    Add a ClusterEdge element to the page for every pair of linked clusters.

    ndPage   -- page DOM node receiving the comment and the new elements
    sAlgo    -- value of the "algo" attribute of each ClusterEdge
    lCluster -- clusters; each exposes dsEdge (edge type -> linked
                clusters), cnt, name and shape

    An edge A <--> B is emitted once per edge type, from the cluster with
    the smaller cnt to the one with the larger cnt. Its points are the
    representative points of the two cluster shapes.

    Return the number of ClusterEdge elements added.
    """
    ndPage.append(etree.Comment("\nInter-cluster edges by tabulate_cluster scale_H=%.2f sclae_V=%.2f\n" %(
        cls.scale_H, cls.scale_V)))

    setSeen = set()
    for A in lCluster:
        for edge_type, lLinked in A.dsEdge.items():
            for B in lLinked:
                if A.cnt >= B.cnt:
                    continue
                key = (A, B, edge_type)
                if key in setSeen:
                    continue

                # ok, let's add the edge A <--> B
                ndEdge = PageXml.createPageXmlNode("ClusterEdge")
                ndEdge.set("src", A.name)
                ndEdge.set("tgt", B.name)
                ndEdge.set("type", edge_type)
                ndEdge.set("algo", sAlgo)

                ptA = A.shape.representative_point()
                ptB = B.shape.representative_point()
                PageXml.setPoints(ndEdge,
                                  list(ptA.coords) + list(ptB.coords))
                ndEdge.tail = "\n"
                ndPage.append(ndEdge)

                setSeen.add(key)

    # exactly one element was added per recorded key
    return len(setSeen)
def makeClusterNode(self, sAlgo):
    """
    Build and return a Cluster XML node describing this cluster.

    sAlgo -- value of the "algo" attribute

    The node carries the cluster name, the space-separated ids of its
    members ("content") and a Coords child whose points are derived from
    self.shape (empty string when the cluster has no shape).
    """
    if self.shape is None:
        sPoints = ""
    else:
        sPoints = ShapeLoader.getCoordsString(self.shape)

    ndCluster = PageXml.createPageXmlNode('Cluster')
    ndCluster.set("name", self.name)
    ndCluster.set("algo", sAlgo)
    # space separated list of node ids
    ndCluster.set("content", " ".join(self.setID))

    ndCoords = PageXml.createPageXmlNode('Coords')
    ndCoords.set('points', sPoints)
    ndCluster.append(ndCoords)

    ndCluster.tail = "\n"
    return ndCluster
def addClusterToDom(self, lCluster, bMoveContent=False, sAlgo="",
                    pageNode=None):
    """
    Add one Cluster element per cluster to the Page DOM node.

    lCluster     -- iterable of clusters, each an iterable of indices into
                    self.lNode
    bMoveContent -- when True, move the DOM nodes of the members under the
                    new Cluster element
    sAlgo        -- value of the "algo" attribute of each Cluster
    pageNode     -- page to append to; when None, the page of the first
                    member of the first cluster is used and a comment is
                    appended to it first

    The cluster name is its position in lCluster.
    Return the list of created Cluster nodes.
    """
    lNdCluster = []
    for iCluster, lnidx in enumerate(lCluster):
        if pageNode is None:
            # lnidx may not support indexing: take its first item by iterating
            for idx in lnidx:
                pageNode = self.lNode[idx].page.node
                break
            pageNode.append(
                etree.Comment(
                    "\nClusters created by the conjugate graph\n"))

        lNdMember = [self.lNode[_i].node for _i in lnidx]

        ndCluster = PageXml.createPageXmlNode('Cluster')
        ndCluster.set("name", str(iCluster))
        ndCluster.set("algo", sAlgo)
        # space separated list of node ids
        ndCluster.set("content",
                      " ".join(nd.get("id") for nd in lNdMember))

        ndCoords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(ndCoords)
        ndCoords.set('points',
                     ShapeLoader.minimum_rotated_rectangle(lNdMember))

        pageNode.append(ndCluster)
        ndCluster.tail = "\n"

        if bMoveContent:
            # move the DOM node of the content to the cluster
            for nd in lNdMember:
                ndCluster.append(nd)

        lNdCluster.append(ndCluster)

    return lNdCluster
def addEdgeToDOM(self, Y=None):
    """
    Add one Edge element per graph edge to the page, for display purposes.

    Y -- not used by this implementation

    Each Edge element carries the ids of its two end nodes ("src", "tgt")
    and the edge class name ("type"). The page is the one of the first
    node of the graph.
    """
    ndPage = self.lNode[0].page.node
    ndPage.append(etree.Comment("Edges added to the XML for convenience"))

    for edge in self.lEdge:
        ndA = edge.A
        ndB = edge.B

        ndEdge = PageXml.createPageXmlNode("Edge")
        ndEdge.set("src", ndA.node.get("id"))
        ndEdge.set("tgt", ndB.node.get("id"))
        ndEdge.set("type", edge.__class__.__name__)
        ndEdge.tail = "\n"
        ndPage.append(ndEdge)

        # the edge is drawn between the (x1, y1) corners of its end nodes
        PageXml.setPoints(ndEdge, [(ndA.x1, ndA.y1), (ndB.x1, ndB.y1)])

    return
def convertDSObject(self, DSObject, pageXmlParentNode):
    """
    Convert DSObject (and, recursively, its sub-objects) into PageXml and
    add the result as a child of pageXmlParentNode.

    The PageXml tag is looked up in self.dTagNameMapping from the DS
    object name; an undeclared name is reported on stdout and skipped.

    Typical output for a line:
        <TextLine id="..." custom="readingOrder {index:0;}">
            <Coords points="218,65 280,65 280,100 218,100"/>
            <Baseline points="218,95 280,95"/>
            <TextEquiv><Unicode>10.</Unicode></TextEquiv>
        </TextLine>

    and for a table cell:
        <TableCell row="0" col="0" colSpan="1" id="...">
            <Coords points="221,246 221,1094 451,1094 451,246"/>
            <CornerPts>0 1 2 3</CornerPts>
        </TableCell>

    Typical DS input:
        <TEXT x="52.8" y="41.04" height="6.72" width="76.08"
              points="52.8,41.04,128.88,41.04,..."
              blpoints="52.8,47.76,128.88,44.64" type="RB" id="..."/>
    """
    try:
        pageXmlName = self.dTagNameMapping[DSObject.getName()]
    except KeyError:
        print(DSObject.getName(), " not declared")
        return

    def _pageElement(sTag):
        # new element in the PageXml namespace
        return etree.Element('{%s}%s' % (self.pageXmlNS, sTag))

    domNode = PageXml.createPageXmlNode(pageXmlName)
    sID = DSObject.getID()
    if sID:
        domNode.set("id", "nle_%s" % sID)
    else:
        self.addNLEID(domNode)
    pageXmlParentNode.append(domNode)

    # Coords: from the 'points' attribute if any, else from the bounding box
    coordsNode = _pageElement('Coords')
    if DSObject.hasAttribute('points'):
        sPoints = self.DSPoint2PagePoints(DSObject.getAttribute('points'))
    else:
        print(etree.tostring(DSObject.getNode()))
        sPoints = self.BB2Polylines(DSObject.getX(), DSObject.getY(),
                                    DSObject.getHeight(),
                                    DSObject.getWidth())
    coordsNode.set('points', sPoints)
    domNode.append(coordsNode)

    # attributes copied as they are
    for attr in ('custom', 'structure', 'col', 'type',
                 'DU_row', 'DU_header', 'DU_col'):
        if DSObject.hasAttribute(attr):
            domNode.set(attr, DSObject.getAttribute(attr))

    # Baseline, built from 'blpoints': <Baseline points="218,95 280,95"/>
    # (the Baseline needs to be left-right!!)
    if DSObject.hasAttribute('blpoints'):
        domBaseLine = _pageElement('Baseline')
        domBaseLine.set(
            'points',
            self.DSPoint2PagePoints(DSObject.getAttribute('blpoints')))
        domNode.append(domBaseLine)

    # text content: <TextEquiv> <Unicode>des</Unicode> </TextEquiv>
    sContent = DSObject.getContent()
    if sContent is not None:
        TextEquivDom = _pageElement('TextEquiv')
        unicodeDom = _pageElement('Unicode')
        unicodeDom.text = sContent
        TextEquivDom.append(unicodeDom)
        domNode.append(TextEquivDom)

    # attributes specific to a cell: row, col, colSpan, rowSpan
    if pageXmlName == 'TableCell':
        domNode.set('row', str(DSObject.getIndex()[0]))
        domNode.set('col', str(DSObject.getIndex()[1]))
        cornerNode = _pageElement('CornerPts')
        cornerNode.text = "0 1 2 3"
        domNode.append(cornerNode)
        domNode.set('colSpan', str(DSObject.getColSpan()))
        domNode.set('rowSpan', str(DSObject.getRowSpan()))

    # sub-objects become children of the node just created
    for subobject in DSObject.getObjects():
        self.convertDSObject(subobject, domNode)
def convertTableCells(self, document):
    """
    Turn every TableCell of the document into a TextRegion.

    For each TableRegion, an OrderedGroup is added to the ReadingOrder
    element; the cells, sorted by row, are moved next to the table (under
    the parent of the table), renamed TextRegion, given a reading-order
    index, stripped of their cell-specific attributes and CornerPts
    children, and referenced from the OrderedGroup by a RegionRefIndexed:

        <ReadingOrder>
          <OrderedGroup id="ro_1493890373963" caption="Regions reading order">
            <RegionRefIndexed index="0" regionRef="region_1493009854544_3646"/>
            <RegionRefIndexed index="1" regionRef="region_1493009858356_3647"/>
          </OrderedGroup>
        </ReadingOrder>

    A converted cell is removed from the document when
    self.resizeCell(cell, self.ns) returns a true value.
    The document is passed to PageXml.validate at the end.
    """
    root = document.getroot()
    lTables = root.xpath("//a:%s" % ("TableRegion"), namespaces=self.ns)
    lRO = root.xpath("//a:%s" % ("ReadingOrder"), namespaces=self.ns)
    if lRO:
        ro = lRO[0]
    else:
        # NOTE(review): this new ReadingOrder is never attached to the
        # document, so the groups built below do not appear in the output
        # in that case -- confirm where it should be inserted.
        ro = PageXml.createPageXmlNode('ReadingOrder')

    for table in lTables:
        orderGroup = PageXml.createPageXmlNode('OrderedGroup')
        ro.append(orderGroup)
        orderGroup.set('{%s}id' % PageXml.NS_PAGE_XML, table.get('id'))
        orderGroup.set('{%s}caption' % PageXml.NS_PAGE_XML,
                       'Cell2TextRegion')

        lCells = table.xpath("./a:%s" % ("TableCell"), namespaces=self.ns)
        ## sort cells by rows
        lCells.sort(key=lambda x: int(x.get('row')))
        for i, cell in enumerate(lCells):
            # move the cell out of the table, as a sibling region
            table.getparent().append(cell)
            cell.tag = '{%s}TextRegion' % (PageXml.NS_PAGE_XML)
            cell.set('custom', "readingOrder {index:%d;}" % i)

            # delete the cell properties; a cell may lack some of them
            # (e.g. no rowSpan/colSpan), which must not abort the conversion
            for propname in ['row', 'col', 'rowSpan', 'colSpan']:
                cell.attrib.pop(propname, None)
            # leftBorderVisible, topBorderVisible, rightBorderVisible and
            # bottomBorderVisible are left in place (to do)

            # delete the CornerPts children
            for c in cell.xpath("./a:%s" % ("CornerPts"),
                                namespaces=self.ns):
                c.getparent().remove(c)

            reind = PageXml.createPageXmlNode('RegionRefIndexed')
            orderGroup.append(reind)
            reind.set('{%s}index' % PageXml.NS_PAGE_XML, str(i))
            reind.set('{%s}regionRef' % PageXml.NS_PAGE_XML,
                      cell.get('id'))

            ## resize cell/region:
            if self.resizeCell(cell, self.ns):
                cell.getparent().remove(cell)

    PageXml.validate(document)
def addClusterToDom(self, lCluster, sAlgo=""):
    """
    From the predicted clusters, create new TextRegion elements.
    The original TextRegion elements, if any, are removed.

    lCluster -- iterable of clusters, each an iterable of indices into
                self.lNode
    sAlgo    -- only used when delegating to the parent class

    When options.bEvalRegion is set, the work is delegated to the parent
    class so that the original TextRegion elements are kept next to the
    produced Cluster elements.

    Otherwise, for the page of the first node of the graph:
    - the type of each existing TextRegion is copied onto its TextLine
      descendants as "type_gt", to preserve the GT info, if any;
    - the ReadingOrder section, if any, is replaced by a new one;
    - each cluster becomes a TextRegion "cluster_<n>" holding the DOM
      nodes of its members, typed by majority vote of the members' "type";
    - the old TextRegion elements are removed.

    Return the list of created TextRegion nodes.
    """
    if getattr(options, "bEvalRegion", False):
        # in order to evaluate, we must keep the original TextRegion and
        # the Cluster elements that are produced
        return super(My_ConjugateSegmenterGraph_MultiSinglePageXml,
                     self).addClusterToDom(lCluster, sAlgo=sAlgo)

    lNdCluster = []
    dNS = {
        "pc":
        "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
    }

    # the graph has been constructed for a certain page
    pageNode = self.lNode[0].page.node

    # enumerate the TextRegion elements to remove
    lNdOldTextRegion = pageNode.xpath(".//pc:TextRegion", namespaces=dNS)

    # we copy the type attribute to the inner TextLine, to preserve the GT
    # info, if any
    for ndTR in lNdOldTextRegion:
        sType = str(ndTR.get("type"))
        for ndTL in ndTR.xpath(".//pc:TextLine", namespaces=dNS):
            ndTL.set("type_gt", sType)

    # replace the ReadingOrder section by a new one; a page without any
    # ReadingOrder simply gets a fresh one
    lNdOldReadingOrder = pageNode.xpath(".//pc:ReadingOrder",
                                        namespaces=dNS)
    if lNdOldReadingOrder:
        pageNode.remove(lNdOldReadingOrder[0])
    ndReadingOrder = PageXml.createPageXmlNode('ReadingOrder')
    ndOrderedGroup = PageXml.createPageXmlNode('OrderedGroup')
    ndOrderedGroup.set("id", "ro_1")
    ndOrderedGroup.set("caption", "Regions reading order")
    ndReadingOrder.append(ndOrderedGroup)
    pageNode.append(ndReadingOrder)

    # loop over clusters
    for ic, c in enumerate(lCluster):
        lNdMember = [self.lNode[_i].node for _i in c]

        ndCluster = PageXml.createPageXmlNode('TextRegion')
        scid = "cluster_%d" % (ic + 1)
        ndCluster.set("id", scid)
        ndCluster.set("custom", "readingOrder {index:%d;}" % ic)

        # TextRegion bounding box
        coords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(coords)
        coords.set('points',
                   ShapeLoader.minimum_rotated_rectangle(lNdMember))

        # if the inner TextLine are tagged, let's do a vote to tag the
        # Cluster
        dType = Counter(sType
                        for sType in (nd.get('type') for nd in lNdMember)
                        if sType is not None)
        mc = dType.most_common(1)
        if mc:
            sXmlLabel = mc[0][0]
            ndCluster.set("type", sXmlLabel)
            PageXml.setCustomAttr(ndCluster, "structure", "type",
                                  sXmlLabel)

        # TextLine: move the DOM node of the content to the cluster
        for nd in lNdMember:
            ndCluster.append(nd)

        pageNode.append(ndCluster)
        ndCluster.tail = "\n"
        lNdCluster.append(ndCluster)

        ndRegionRefIndexed = PageXml.createPageXmlNode('RegionRefIndexed')
        ndRegionRefIndexed.set("index", str(ic))
        ndRegionRefIndexed.set("regionRef", scid)
        ndRegionRefIndexed.tail = "\n"
        ndOrderedGroup.append(ndRegionRefIndexed)

    # remove the old TextRegion elements (their lines were moved above)
    for nd in lNdOldTextRegion:
        nd.getparent().remove(nd)

    return lNdCluster
def makeTableNode(self):
    """
    Build and return the TableRegion DOM tree of this table.

    self._dCellNd maps (row, col) to the list of text nodes of that cell.
    Every non-empty (row, col) position yields a TableCell with a Coords
    child (convex hull of its text nodes), a CornerPts child and the text
    nodes themselves, which are moved under the cell.
    The table Coords is the minimum rotated rectangle of the union of the
    cell shapes; it is the first child of the table.
    """
    lKey = self._dCellNd.keys()
    lRow = sorted({_row for _row, _col in lKey})
    lCol = sorted({_col for _row, _col in lKey})

    ndTable = PageXml.createPageXmlNode("TableRegion")
    ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
    ndTable.tail = "\n"

    lCellShape = []  # kept to compute the table contour
    lNdCell = []
    for row in lRow:
        for col in lCol:
            lNdText = self._dCellNd[(row, col)]
            if not lNdText:
                continue

            # <TableCell row="0" col="1" rowSpan="1" colSpan="1" id="...">
            #     <Coords points="480,42 485,323 878,323 874,38"/>
            ndCell = PageXml.createPageXmlNode("TableCell")
            ndCell.set(
                "id", "p%s_t%s_r%s_c%s" %
                (self.pagenum, self.tablenum, row, col))

            # shape of the cell
            oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
            lCellShape.append(oHull)

            # Coords sub-element
            ndCellCoords = PageXml.createPageXmlNode("Coords")
            ndCellCoords.set(
                "points",
                ShapeLoader.getCoordsString(oHull, bFailSafe=True))
            ndCellCoords.tail = "\n"
            ndCell.append(ndCellCoords)

            # position of the cell in the table; no cell spans
            ndCell.set("row", str(row))
            ndCell.set("rowSpan", "1")
            ndCell.set("col", str(col))
            ndCell.set("colSpan", "1")
            ndCell.tail = "\n"

            # corners
            ndCorner = PageXml.createPageXmlNode("CornerPts")
            ndCorner.text = "0 1 2 3"
            ndCell.append(ndCorner)

            # content of the cell
            for ndText in lNdText:
                ndCell.append(ndText)

            lNdCell.append(ndCell)

    # Table geometry
    oContour = cascaded_union(
        [p if p.is_valid else p.convex_hull for p in lCellShape])
    ndTableCoords = PageXml.createPageXmlNode("Coords")
    ndTableCoords.set(
        "points",
        ShapeLoader.getCoordsString(oContour.minimum_rotated_rectangle,
                                    bFailSafe=True))
    ndTableCoords.tail = "\n"
    ndTable.append(ndTableCoords)

    for ndCell in lNdCell:
        ndTable.append(ndCell)

    return ndTable
def mergeBaselineCells(self, coldir, colid, docid):
    """
    Merge the text lines of the .pxml pages of a document into the table
    cells of its .xml pages, and (re)generate the .mpxml of the document.

    The document folder is <coldir>/<sCOL>/<docid>:
    - its *.pxml files (stuff processed on Transkribus) provide the
      TextLine elements;
    - its *.xml files (CVL template tool) provide the TableRegion and its
      TableCell elements; their own TextRegion elements are removed.
    The result is written to <coldir>/<sCOL>/<docid>.mpxml

    For each text line, the Coords are first normalised to the bounding
    box of the baseline extended 30 pixels upwards. The line is then moved
    into the cell of the first table of the page that best overlaps the
    baseline, or into a new TextRegion of the page when the best overlap
    is 0 or the table has no cell.

    colid is not used.

    Raises ValueError when a page has no TableRegion.
    Both sets of files must yield the same number of pages (asserted).
    """
    xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
    mpxml = xmlpath + ".mpxml"

    # template pages (tables) and HTR pages (text lines)
    lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
    pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

    lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
    mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

    lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
    lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

    assert len(lXMLPage) == len(lPXMLPage)
    for iPage, cvlpage in enumerate(lXMLPage):
        ## remove TextRegion from the template page
        for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
            tr.getparent().remove(tr)

        # collect the text lines of the corresponding HTR page
        pxmlpage = lPXMLPage[iPage]
        lTL = []
        for ndRegion in PageXml.getChildByName(pxmlpage, 'TextRegion'):
            lTL.extend(PageXml.getChildByName(ndRegion, 'TextLine'))

        ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
        if len(ltable) == 0:
            raise ValueError("NO TABLE")
        lCells = PageXml.getChildByName(ltable[0], 'TableCell')
        lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

        iHeight = 30  # in pixel
        for ndTL in lTL:
            ## normalization: Coords rebuilt from the baseline
            coord = PageXml.getChildByName(ndTL, 'Coords')[0]
            coordB = PageXml.getChildByName(ndTL, 'Baseline')[0]
            oBaseline = Polygon(PageXml.getPointList(coordB))
            x1, y1, x2, y2 = oBaseline.getBoundingBox()
            coord.set(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

            # overlap of the baseline with each cell
            lOverlap = [self.signedOverlap(c, oBaseline) for c in lC]
            best = max(lOverlap, default=0)
            if best == 0:
                # no overlapping cell: the line gets its own region
                region = PageXml.createPageXmlNode('TextRegion')
                cvlpage.append(region)
                region.append(ndTL)
            else:
                cell = lCells[lOverlap.index(best)]
                cell.append(ndTL)

    pxmldoc.write(mpxml)
def map_to_rows(cls, ndPage, maxRow, lCluster):
    """
    Find horizontal separators between the rows and add them to the page.

    ndPage   -- page DOM node; its "imageWidth" attribute gives the
                separator length
    maxRow   -- highest row index
    lCluster -- clusters; each exposes getSetID() (ids of its elements)
                and minrow, and supports len()

    Each cluster gets minY/maxY (extreme Y of the end points of the
    Baseline of its elements), possibly an updated minrow, and row1/row2.
    The SeparatorRegion elements carrying an "algo" attribute that are
    children of the page are removed, then one SeparatorRegion per row is
    appended, 20 units above the smallest baseline Y of the row.
    """
    iBigY = 9999999999

    # reflect each cluster by the highest point (highest ending points of
    # baselines)
    dMinYByRow = defaultdict(lambda: iBigY)
    n = 2 * sum(len(c) for c in lCluster)
    # x,y coordinates of the end points; filled but not used further,
    # the linear separator computation being disabled
    X = np.zeros(shape=(n, 2))
    iPt = 0
    for c in lCluster:
        c.maxY = -1
        c.minY = iBigY
        for _id in c.getSetID():
            # the element looks like:
            # <TextLine id="r1l5" ... row="0" rowSpan="1" col="0" colSpan="1">
            #   <Coords points="217,688 245,685 ... 217,645"/>
            #   <Baseline points="217,688 245,685 ... 358,689"/>
            #   <TextEquiv><Unicode>ung.</Unicode></TextEquiv>
            # </TextLine>
            ndBaseline = ndPage.xpath(".//*[@id='%s']/pg:Baseline"%_id,
                                      namespaces=dNS)[0]
            oLine = ShapeLoader.node_to_LineString(ndBaseline)
            pA, pB = oLine.boundary.geoms
            minY = min(pA.y, pB.y)
            c.minY = min(c.minY, minY)
            c.maxY = max(c.maxY, max((pA.y, pB.y)))
            dMinYByRow[c.minrow] = min(dMinYByRow[c.minrow], minY)
            # for the linear separators
            X[iPt, :] = (pA.x, pA.y)
            X[iPt + 1, :] = (pB.x, pB.y)
            iPt += 2

    # check consistency: a cluster starting below the top of a later row
    # is moved to the last such row
    for c in lCluster:
        for iRow in range(maxRow, c.minrow, -1):
            if c.minY > dMinYByRow[iRow]:
                assert c.minrow < iRow
                # how possible??? fix!!
                c.minrow = iRow
                break

    # compute row1 and row2
    for c in lCluster:
        c.row1 = c.minrow
        c.row2 = c.minrow
        for iRow in range(0, maxRow+1):
            if c.maxY > dMinYByRow[iRow]:
                c.row2 = iRow
            else:
                break

    # now compute the separators
    w = float(ndPage.get("imageWidth"))
    Y = np.zeros(shape=(n,))  # labels; unused for now

    # remove the separators previously produced by an algorithm
    for ndOld in ndPage.xpath(".//pg:SeparatorRegion[@algo]",
                              namespaces=dNS):
        ndPage.remove(ndOld)

    for row in range(maxRow+1):
        # horizontal line, slightly above the row
        Y0 = dMinYByRow[row] - 20
        Yw = Y0
        ndCoords = PageXml.createPageXmlNode("Coords")
        ndCoords.set("points", "%d,%d %d,%d" %(0, Y0, w, Yw))
        ndSep = PageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("algo", "tabulate_rows")
        ndSep.append(ndCoords)
        ndSep.tail = "\n"
        ndPage.append(ndSep)

    return