def addEdgesToXml(cls, ndPage, sAlgo, lCluster):
    """
    Append one ClusterEdge element to ndPage per distinct inter-cluster edge.

    ndPage   : DOM node receiving a leading XML comment and the ClusterEdge nodes
    sAlgo    : value stored in the "algo" attribute of every created edge
    lCluster : clusters; each one exposes .cnt, .name, .shape and .dsEdge, a
               dictionary mapping an edge type to the clusters linked to it

    An edge is only emitted from A to B when A.cnt < B.cnt, and at most once
    per (A, B, edge type).

    return the number of ClusterEdge elements appended
    """
    cnt = 0
    # NOTE(review): "sclae_V" is a typo, kept as-is because the text is written to the XML
    ndPage.append(etree.Comment("\nInter-cluster edges by tabulate_cluster scale_H=%.2f sclae_V=%.2f\n" % (cls.scale_H, cls.scale_V)))
    setEdges = set()
    for A in lCluster:
        for edge_type, lLinked in A.dsEdge.items():
            for B in lLinked:
                if A.cnt >= B.cnt:
                    continue
                key = (A, B, edge_type)
                if key in setEdges:
                    continue
                # ok, let's add the edge A <--> B
                ndEdge = PageXml.createPageXmlNode("ClusterEdge")
                ndEdge.set("src", A.name)
                ndEdge.set("tgt", B.name)
                ndEdge.set("type", edge_type)
                ndEdge.set("algo", sAlgo)
                # The edge is drawn between one representative point of each shape.
                # (the disabled alternative used shapely.ops.nearest_points)
                ptA = A.shape.representative_point()
                ptB = B.shape.representative_point()
                PageXml.setPoints(ndEdge, list(ptA.coords) + list(ptB.coords))
                ndEdge.tail = "\n"
                ndPage.append(ndEdge)
                setEdges.add(key)
                cnt += 1
    return cnt
def findTemplate(self, doc):
    """
    Build a new document from the page holding the first TableRegion of doc.

    The parent node of the first TableRegion is unlinked from doc and becomes
    the root element of a freshly created PageXml document, in which all four
    borders of every TableCell are flagged visible.

    return the new document, or None if doc contains no TableRegion
    """
    lTable = PageXml.getChildByName(doc.getRootElement(), 'TableRegion')
    if lTable == []:
        return None

    # lazy guy! the page is taken to be the direct parent of the first table
    page = lTable[0].parent

    newDoc, _ = PageXml.createPageXmlDocument('XRCE', '', 0, 0)
    page.unlinkNode()
    newDoc.setRootElement(page)
    ### need to add the ns!!

    # Borders must be visible
    for cell in PageXml.getChildByName(newDoc.getRootElement(), 'TableCell'):
        for sSide in ("left", "right", "top", "bottom"):
            cell.setProp(sSide + "BorderVisible", 'true')

    return newDoc
def setDomNodeLabel(self, domnode, sLabel):
    """
    Set the DOM node label in the format-dependent way.

    The default label (self.sDefaultLabel) is not written to the DOM; any
    other label is translated by self.dLabel2XmlLabel and stored in the custom
    attribute of domnode.

    return the label
    """
    if sLabel == self.sDefaultLabel:
        # nothing to store for the default label
        return sLabel

    PageXml.setCustomAttr(domnode,
                          self.sCustAttr_STRUCTURE,
                          self.sCustAttr2_TYPE,
                          self.dLabel2XmlLabel[sLabel])
    return sLabel
def setDocNodeLabel(self, graph_node, sLabel):
    """
    Set the label of the DOM node of a graph node, in the format-dependent way.

    The label is translated by self.dLabel2XmlLabel; a label missing from that
    dictionary is written as "other". The XML label is stored both in the
    attribute named self.sLabelAttr and in the custom attribute
    (structure / type).
    """
    try:
        sXmlLabel = self.dLabel2XmlLabel[sLabel]
    except KeyError:
        sXmlLabel = "other"

    ndDom = graph_node.node
    ndDom.set(self.sLabelAttr, sXmlLabel)
    # tagging using the custom XML attribute
    PageXml.setCustomAttr(ndDom, "structure", "type", sXmlLabel)
def run(self):
    """
    Merge the text regions of the main document into the cells of the table
    document loaded from self.tableFile, and stamp the result's metadata.

    return the merged DOM
    """
    domDoc = self.loadDom()
    domTable = self.loadDom(filename=self.tableFile)
    domMerged = self.mergetextRegion2Cell(domDoc, domTable)
    PageXml.setMetadata(domMerged, None, 'NLE',
                        Comments='TextRegion/TableCell Merging')
    return domMerged
def addClusterToDom(self, dCluster, bMoveContent=False):
    """
    Add Cluster elements to the Page DOM node

    dCluster     : dictionary whose values are lists of indices into self.lNode
    bMoveContent : if True, the DOM nodes of the cluster members are moved
                   under the new Cluster element

    The page node is the one of the first member of the first cluster. Each
    Cluster gets a "content" attribute (space separated member ids) and a
    Coords child holding the minimum rotated rectangle of the union of the
    member polygons, or "" when that rectangle cannot be computed.
    """
    pageNode = None
    for _, lnidx in dCluster.items():
        if pageNode is None:
            pageNode = self.lNode[lnidx[0]].page.node
            pageNode.append(etree.Comment("Clusters created by the conjugate graph"))

        # Make it robust to bad data: nodes without a usable polygon are skipped
        lp = []
        for _i in lnidx:
            try:
                lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
            except ValueError:
                pass
        contour = cascaded_union([p if p.is_valid else p.convex_hull for p in lp])

        # Best effort: polygon-like result first, then any geometry exposing
        # .coords, else no points at all.
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit propagate.
        try:
            spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                               for pt in contour.minimum_rotated_rectangle.exterior.coords)
        except Exception:
            try:
                spoints = ' '.join("%s,%s" % (int(pt[0]), int(pt[1]))
                                   for pt in contour.minimum_rotated_rectangle.coords)
            except Exception:
                # JL got once a: NotImplementedError: Multi-part geometries
                # do not provide a coordinate sequence
                spoints = ""

        ndCluster = PageXml.createPageXmlNode('Cluster')
        # add the space separated list of node ids
        ndCluster.set("content", " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
        coords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(coords)
        coords.set('points', spoints)
        pageNode.append(ndCluster)

        if bMoveContent:
            # move the DOM node of the content to the cluster
            for _i in lnidx:
                ndCluster.append(self.lNode[_i].node)

    return
def test_getMetadata():
    """
    PageXml.getMetadata must return the same Creator / Created / LastChange
    values whether it is given the document or the first child of its root.
    """
    doc = getMetadataTestDOM()
    nd = doc.getroot()

    lExpected = (("Creator", "Tilla"),
                 ("Created", "2016-08-18T13:35:08.252+07:00"),
                 ("LastChange", "2016-12-01T09:53:39.610+01:00"))

    # from the document
    md = PageXml.getMetadata(doc)
    for sName, sValue in lExpected:
        assert getattr(md, sName) == sValue
    assert md.Comments == None

    # from the first child of the root
    md = PageXml.getMetadata(None, nd[0])
    for sName, sValue in lExpected:
        assert getattr(md, sName) == sValue
def setDocNodeLabel(self, graph_node, sLabel):
    """
    Set the DOM node label in the format-dependent way, then propagate it to
    the TextLine descendants of the node.

    The parent implementation tags the node itself; the value it leaves in the
    attribute named self.sLabelAttr is then copied onto every inner TextLine,
    both as that attribute and as the custom attribute (structure / type).

    return the label read back from the node, as a string
    """
    # tagging the TextRegion with @type
    super(NodeType_for_TextRegion, self).setDocNodeLabel(graph_node, sLabel)

    ndRegion = graph_node.node
    sXmlLabel = str(ndRegion.get(self.sLabelAttr))

    # tagging inner TextLine
    dNS = {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
    for ndLine in ndRegion.xpath(".//pc:TextLine", namespaces=dNS):
        ndLine.set(self.sLabelAttr, sXmlLabel)
        PageXml.setCustomAttr(ndLine, "structure", "type", sXmlLabel)

    return sXmlLabel
def run(self):
    """
    Convert the table cells of the loaded document and stamp its metadata.

    The metadata is first set through PageXml; if that raises a ValueError,
    MultiPageXml is used instead.

    return the converted DOM
    """
    docdom = self.loadDom()
    self.convertTableCells(docdom)

    sComment = 'TableCell 2 TextRegion'
    try:
        PageXml.setMetadata(docdom, None, 'NLE', Comments=sComment)
    except ValueError:
        MultiPageXml.setMetadata(docdom, None, 'NLE', Comments=sComment)

    return docdom
def regularTextLines(self, doc, iHeight=50):
    """
    from a baseline: create a regular TextLine:
        rectangle along the baseline

    doc     : document whose TextLine elements are rewritten in place
    iHeight : height of the rectangle above the top of the baseline bounding
              box, in the unit of the coordinates (not points!! 300 dpi : 62.5)

    For each TextLine, the 'points' of its first child are replaced by the
    rectangle (x1, y1-iHeight) (x2, y1-iHeight) (x2, y2) (x1, y2), where
    x1, y1, x2, y2 is the bounding box of the points of its second child.
    A coordinate pair that cannot be parsed is skipped, after printing the
    TextLine.
    """
    lTextLines = PageXml.getChildByName(doc.getRootElement(), 'TextLine')
    for tl in lTextLines:
        # assumes the first child is Coords and its next sibling is Baseline -- TODO confirm
        coord = tl.children
        baseline = coord.next
        sPoints = baseline.prop('points')

        lXY = []
        for sPair in sPoints.split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                # function-call form: same output on Python 2, valid on Python 3
                print(tl)

        plg = Polygon(lXY)
        x1, y1, x2, y2 = plg.getBoundingBox()
        coord.setProp('points',
                      "%d,%d %d,%d %d,%d %d,%d" % (x1, y1 - iHeight,
                                                   x2, y1 - iHeight,
                                                   x2, y2,
                                                   x1, y2))
def _iter_GraphNode(self, doc, domNdPage, page): """ Get the DOM, the DOM page node, the page object iterator on the DOM, that returns nodes (of class Block) """ #--- XPATH contexts assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes" lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page for ndBlock in lNdBlock: domid = ndBlock.get("id") sText = "" #now we need to infer the bounding box of that object (x1, y1), (x2, y2) = PageXml.getPointList(ndBlock) #the polygon orientation = 0 classIndex = 0 #is computed later on #and create a Block # we pass the coordinates, not x1,y1,w,h !! cutBlk = Block(page, ((x1, y1), (x2, y2)), sText, orientation, classIndex, self, ndBlock, domid=domid) # Create the shapely shape cutBlk.shape = geom.LineString([(x1, y1), (x2, y2)]) cutBlk.angle = float(ndBlock.get("DU_angle")) cutBlk.angle_freq = float(ndBlock.get("DU_angle_freq")) cutBlk.angle_cumul_freq = float(ndBlock.get("DU_angle_cumul_freq")) cutBlk.set_support = literal_eval(ndBlock.get("DU_set_support")) yield cutBlk return
def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page): """ Get the DOM, the DOM page node, the page object iterator on the DOM, that returns nodes """ assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes" lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #order blocks from top to bottom of page lOrderedNdBlock = list() for ndBlock in lNdBlock: lXY = PageXml.getPointList(ndBlock) #the polygon if lXY == []: raise ValueError("Node %x has invalid coordinates" % str(ndBlock)) plg = Polygon(lXY) _, (xg, yg) = plg.getArea_and_CenterOfMass() lOrderedNdBlock.append( (yg, ndBlock)) #we want to order from top to bottom, so that TextRegions of different resolution are not interleaved lOrderedNdBlock.sort() for _, ndBlock in lOrderedNdBlock: yield ndBlock return
def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None): """ Extract the text of a DOM node Get the DOM, the DOM page node, the page object DOM node, and optionally an xpath context return a unicode string """ lNdText = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS) if len(lNdText) != 1: if len(lNdText) > 1: raise ValueError( "More than 1 textual content for this node: %s" % etree.tostring(ndBlock)) #let's try to get th etext of the words, and concatenate... # traceln("Warning: no text in node %s => looking at words!"%ndBlock.prop("id")) # lsText = [ntext.content.decode('utf-8').strip() for ntext in ctxt.xpathEval('.//pc:Word/pc:TextEquiv//text()')] #if we have both PlainText and UnicodeText in XML, :-/ lsText = [ _nd.text.strip() for _nd in ctxt.xpathEval('.//pc:Word/pc:TextEquiv') ] #if we have both PlainText and UnicodeText in XML, :-/ return " ".join(lsText) return PageXml.makeText(lNdText[0])
def cluster2Region(doc, fTH=0.5, bVerbose=True):
    """
    Replace, page by page, the TextRegions of doc by regions built from its
    clusters.

    For each Page: the clusters are read, one region per cluster is added to
    the DOM, then the pre-existing TextRegion nodes, the Edge nodes and the
    Cluster nodes are deleted.

    fTH is not used by this function.
    return doc
    """
    ndRoot = doc.getroot()

    # get pages (numbered from 1)
    for pnum, ndPage in enumerate(PageXml.xpath(ndRoot, "//pc:Page"), 1):
        # get cluster
        dClusters = getCLusters(ndPage)
        lRegionsNd = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
        if bVerbose:
            traceln(" %d clusters and %d regions found" % (len(dClusters), len(lRegionsNd)))

        addRegionToDom(ndPage, pnum, dClusters, bVerbose)
        if bVerbose:
            traceln(" %d regions created" % (len(dClusters)))

        # the old regions first, then the edges, then the clusters
        deleteRegionsinDOM(ndPage, lRegionsNd)
        for sXpath in (".//pg:Edge", ".//pg:Cluster"):
            deleteRegionsinDOM(ndPage, ndPage.xpath(sXpath, namespaces=dNS))

    return doc
def run(self, domDoc):
    """
    conversion

    Load domDoc as an XMLDSDocument and create one PageXml document per page.
    The image size of each PageXml page is the page size converted by
    convertDot2Pixel at self.dpi. Depending on self.bRegionOnly, either only
    the regions or the whole DS page is converted.

    return a list of (PageXml document, 'imageFilename' attribute of the page)
    """
    ODoc = XMLDSDocument()
    ODoc.loadFromDom(domDoc)

    lPageXmlDoc = []
    for page in ODoc.getPages():
        try:
            filename = os.path.basename(page.getAttribute('imageFilename'))
        except Exception:
            # best effort, e.g. when the page has no image filename;
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through
            filename = "fakename"

        pageXmlDoc, pageNode = PageXml.createPageXmlDocument(
            creatorName='NLE',
            filename=filename,
            imgW=convertDot2Pixel(self.dpi, page.getWidth()),
            imgH=convertDot2Pixel(self.dpi, page.getHeight()))
        self.pageXmlNS = etree.QName(pageXmlDoc.getroot()).namespace

        if self.bRegionOnly:
            self.convertOnlyRegion(page, pageNode)
        else:
            self.convertDSPage(page, pageNode)

        lPageXmlDoc.append((pageXmlDoc, page.getAttribute('imageFilename')))

    return lPageXmlDoc
def parseDomNodeLabel(self, domnode, defaultCls=None):
    """
    Parse the label of the DOM node and return it.

    The XML label is read from the custom attribute of domnode and translated
    by self.dXmlLabel2Label.

    - no custom label (PageXmlException): return self.sDefaultLabel if
      self.bOther is true, otherwise the PageXmlException is re-raised
    - XML label not in self.dXmlLabel2Label: it must be accepted by
      self.checkIsIgnored, in which case self.sDefaultLabel is returned;
      otherwise raise a ValueError
    - KeyError while reading the label: raise a ValueError unless
      self.sDefaultLabel is set, in which case it is returned

    defaultCls is not used by this function.
    """
    sLabel = self.sDefaultLabel
    try:
        try:
            sXmlLabel = PageXml.getCustomAttr(domnode, self.sCustAttr_STRUCTURE, self.sCustAttr2_TYPE)
        except PageXmlException:
            if self.bOther:
                # absence of label but bOther was True (I guess)
                return self.sDefaultLabel
            raise

        try:
            sLabel = self.dXmlLabel2Label[sXmlLabel]
        except KeyError:
            # not a label of interest
            try:
                self.checkIsIgnored(sXmlLabel)
            except Exception:
                # Exception (not a bare except): a KeyboardInterrupt or
                # SystemExit must not be reported as an invalid label
                raise ValueError("Invalid label '%s' in node %s"%(sXmlLabel, etree.tostring(domnode)))
    except KeyError:
        # no label at all
        if not self.sDefaultLabel:
            raise ValueError("Missing label in node %s"%etree.tostring(domnode))

    return sLabel
def _iter_GraphNode(self, doc, domNdPage, page): """ Get the DOM, the DOM page node, the page object iterator on the DOM, that returns nodes (of class Block) """ #--- XPATH contexts assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes" lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page for ndBlock in lNdBlock: domid = ndBlock.get("id") sText = self._get_GraphNodeText(doc, domNdPage, ndBlock) if sText == None: sText = "" traceln("Warning: no text in node %s"%domid) #raise ValueError, "No text in node: %s"%ndBlock #now we need to infer the bounding box of that object lXY = PageXml.getPointList(ndBlock) #the polygon if lXY == []: continue plg = Polygon(lXY) try: x1,y1, x2,y2 = plg.fitRectangle() except ZeroDivisionError: # traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum)) # continue # if True: # #we reduce a bit this rectangle, to ovoid overlap # w,h = x2-x1, y2-y1 # dx = max(w * 0.066, min(20, w/3)) #we make sure that at least 1/"rd of te width will remain! # dy = max(h * 0.066, min(20, w/3)) # x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ] x1,y1,x2,y2 = plg.getBoundingBox() except ValueError: x1,y1,x2,y2 = plg.getBoundingBox() #we reduce a bit this rectangle, to ovoid overlap if not(self.BBoxDeltaFun is None): w,h = x2-x1, y2-y1 dx = self.BBoxDeltaFun(w) dy = self.BBoxDeltaFun(h) x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ] #TODO orientation = 0 #no meaning for PageXml classIndex = 0 #is computed later on #and create a Block blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid) yield blk raise StopIteration()
def makeClusterNode(self, sAlgo):
    """
    Create an XML node reflecting the cluster

    The Cluster node carries the cluster name, the given algorithm name and
    the space separated list of member ids (self.setID). Its Coords child
    holds the coordinates string of self.shape, or "" when there is no shape.

    return the new Cluster node
    """
    ndCluster = PageXml.createPageXmlNode('Cluster')
    lAttr = (("name", self.name),
             ("algo", sAlgo),
             ("content", " ".join(self.setID)))  # space separated list of node ids
    for sName, sValue in lAttr:
        ndCluster.set(sName, sValue)

    ndCoords = PageXml.createPageXmlNode('Coords')
    ndCluster.append(ndCoords)
    sPoints = "" if self.shape is None else ShapeLoader.getCoordsString(self.shape)
    ndCoords.set('points', sPoints)

    ndCluster.tail = "\n"
    return ndCluster
def reinitPage(self, doc):
    """
    empty page

    Detach every Page element found under the root of doc.

    The document is accessed with the lxml API (doc.getroot()), so the nodes
    are removed with the lxml API as well: unlinkNode() belongs to the
    libxml2 bindings and is not available on lxml elements.
    """
    lNodes = PageXml.getChildByName(doc.getroot(), 'Page')
    for node in lNodes:
        ndParent = node.getparent()
        if ndParent is not None:
            ndParent.remove(node)
def _convertPageAnnotation(self, pnum, page, domNdPage):
    """
    Convert the human annotation of the TextRegions of one page into a
    semantic label, a segmentation label and, when available, a resolution
    number, all stored as attributes of each DOM node.

    The nodes are visited in the order given by
    self._iter_TextRegionNodeTop2Bottom. The segmentation label alternates
    each time a new resolution starts; self.prevResolutionNumber carries the
    state from one node to the next.

    raise a ValueError if an annotation is not a known label and does not
    match self.creResolutionHumanLabel, or if it has no resolution number
    """
    for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):
        try:
            sResoNum = None
            lbl = PageXml.getCustomAttr(nd, "structure", "type")
            if lbl in ["heading"]:
                semLabel = self.dAnnotMapping[lbl]
                #heading may indicate a new resolution!
                if self.prevResolutionNumber == None:
                    #for instance 2 consecutive headings
                    sgmLabel = self._getCurrentSegmentationLabel( )
                else:
                    sgmLabel = self._switchSegmentationLabel()
                    #so that next number does not switch Heigh/Ho label
                    self.prevResolutionNumber = None
            elif lbl in ["header", "page-number", "marginalia"]:
                #continuation of a resolution
                semLabel = self.dAnnotMapping[lbl]
                sgmLabel = self._getCurrentSegmentationLabel()
            else:
                o = self.creResolutionHumanLabel.match(lbl)
                if not o:
                    raise ValueError("%s is not a valid human annotation" % lbl)
                #group 1 is "" for the resolution number
                semLabel = self.dAnnotMapping[o.group( 1)]
                #Here we have a resolution number!
                sResoNum = o.group(2)
                if not sResoNum:
                    raise ValueError( "%s is not a valid human annotation - missing resolution number" % lbl)
                #now switch between heigh and ho !! :))
                if self.prevResolutionNumber != None and self.prevResolutionNumber != sResoNum:
                    #we got a new number, so switching segmentation label!
                    sgmLabel = self._switchSegmentationLabel()
                else:
                    #either same number or switching already done due to a heading
                    sgmLabel = self._getCurrentSegmentationLabel()
                self.prevResolutionNumber = sResoNum
        except PageXmlException:
            #no annotation on this node: 'other', within the current segment
            semLabel = self.sOther
            sgmLabel = self._getCurrentSegmentationLabel()
        nd.set(self.sSemAttr, semLabel)
        nd.set(self.sSgmAttr, sgmLabel)
        if sResoNum:
            #only when the number is part of the human annotation!
            nd.set(self.sNumAttr, sResoNum )
def _convertPageAnnotation(self, pnum, page, domNdPage): """ """ #change: on each page we start by Heigh bRestartAtEachPageWithHeigh = True if bRestartAtEachPageWithHeigh: self._initSegmentationLabel() for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page): try: lbl = PageXml.getCustomAttr(nd, "structure", "type") except PageXmlException: nd.set(self.sSemAttr, self.sOther) nd.set(self.sSgmAttr, self.sOther) continue #this node has no annotation whatsoever if lbl in ["heading", "header", "page-number", "marginalia"]: semLabel = lbl sgmLabel = self.sOther #those elements are not part of a resolution sResoNum = None else: o = self.creResolutionHumanLabel.match(lbl) if not o: raise ValueError("%s is not a valid human annotation" % lbl) semLabel = o.group(1) #"" for the resolution number #now decide on the segmentation label sResoNum = o.group(2) if not sResoNum: raise ValueError( "%s is not a valid human annotation - missing resolution number" % lbl) #now switch between heigh and ho !! :)) if self.prevResolutionNumber == sResoNum: sgmLabel = self.prevSgmLbl else: sgmLabel = self._getNextSegmentationLabel(self.prevSgmLbl) assert bRestartAtEachPageWithHeigh or sResoNum not in self.lSeenResoNum, "ERROR: the ordering of the block has not preserved resolution number contiguity" self.lSeenResoNum.append(sResoNum) self.prevResolutionNumber, self.prevSgmLbl = sResoNum, sgmLabel #always have a semantic label sNewSemLbl = self.dAnnotMapping[semLabel] assert sNewSemLbl nd.set(self.sSemAttr, sNewSemLbl) #DU annotation #resolution parts also have a segmentation label and a resolution number assert sgmLabel nd.set(self.sSgmAttr, sgmLabel) #DU annotation if sResoNum: nd.set(self.sNumAttr, sResoNum)
def addEdgeToDOM(self, Y=None):
    """
    To display the graph conveniently we add new Edge elements

    One Edge element per edge of self.lEdge is appended to the page node of
    the first graph node, after an XML comment. Each Edge carries the ids of
    its source and target DOM nodes, the class name of the edge as its type,
    and two points: (x1, y1) of the source and (x1, y1) of the target.

    Y is not used by this function.
    """
    ndPage = self.lNode[0].page.node
    ndPage.append(etree.Comment("Edges added to the XML for convenience"))

    for edge in self.lEdge:
        blkSrc, blkTgt = edge.A, edge.B

        ndEdge = PageXml.createPageXmlNode("Edge")
        ndEdge.set("src", blkSrc.node.get("id"))
        ndEdge.set("tgt", blkTgt.node.get("id"))
        ndEdge.set("type", edge.__class__.__name__)
        ndEdge.tail = "\n"
        ndPage.append(ndEdge)

        lPoint = [(blkSrc.x1, blkSrc.y1), (blkTgt.x1, blkTgt.y1)]
        PageXml.setPoints(ndEdge, lPoint)

    return
def test_malformed_custom():
    """
    PageXml.parseCustomAttr must reject malformed custom attributes with a
    ValueError, while accepting spaces inside names and values.
    """
    lsMalformed = ("a {x1;}",
                   "a x1;}",
                   "a { x1;",
                   "a { x1 }")
    for sCustom in lsMalformed:
        with pytest.raises(ValueError):
            PageXml.parseCustomAttr(sCustom)

    # open question: should "a { x:1 }" fail?
    # open question: should "a { x:1 2}" fail? (or do we allow spaces in names or values?)
    assert PageXml.parseCustomAttr("a { x:1 2}") == {'a': {'x': '1 2'}}
    assert PageXml.parseCustomAttr(" a b { x y : 1 2 }") == {'a b': {'x y': '1 2'}}
def addClusterToDom(self, lCluster, bMoveContent=False, sAlgo="", pageNode=None):
    """
    Add Cluster elements to the Page DOM node

    lCluster     : clusters, each one an iterable of indices into self.lNode;
                   the position of a cluster in lCluster becomes its name
    bMoveContent : if True, the DOM nodes of the members are moved under the
                   new Cluster element
    sAlgo        : value of the "algo" attribute of each Cluster
    pageNode     : DOM node receiving the clusters; when None, the page node
                   of the first member of the cluster is used, and an XML
                   comment is appended to it first

    return the list of created Cluster nodes
    """
    lNdCluster = []
    for iCluster, lnidx in enumerate(lCluster):
        if pageNode is None:
            # take the page of the first member
            for idx in lnidx:
                pageNode = self.lNode[idx].page.node
                break
            pageNode.append(etree.Comment("\nClusters created by the conjugate graph\n"))

        ndCluster = PageXml.createPageXmlNode('Cluster')
        ndCluster.set("name", str(iCluster))
        ndCluster.set("algo", sAlgo)
        # add the space separated list of node ids
        sContent = " ".join(self.lNode[_i].node.get("id") for _i in lnidx)
        ndCluster.set("content", sContent)

        ndCoords = PageXml.createPageXmlNode('Coords')
        ndCluster.append(ndCoords)
        lNdMember = [self.lNode[_i].node for _i in lnidx]
        ndCoords.set('points', ShapeLoader.minimum_rotated_rectangle(lNdMember))

        pageNode.append(ndCluster)
        ndCluster.tail = "\n"

        if bMoveContent:
            # move the DOM node of the content to the cluster
            for _i in lnidx:
                ndCluster.append(self.lNode[_i].node)

        lNdCluster.append(ndCluster)

    return lNdCluster
def unLinkTextLines(self, doc):
    """
    delete textlines and baselines

    Every TextLine element of doc is unlinked from the tree and freed.
    return doc
    """
    for ndTextLine in PageXml.getChildByName(doc.getRootElement(), 'TextLine'):
        ndTextLine.unlinkNode()
        ndTextLine.freeNode()
    return doc
def unLinkTable(self, doc):
    """
    delete table

    Every TableRegion element of doc is unlinked from the tree and freed.
    return doc
    """
    for ndTable in PageXml.getChildByName(doc.getRootElement(), 'TableRegion'):
        ndTable.unlinkNode()
        ndTable.freeNode()
    return doc
def test_getsetCustomAttr():
    """
    PageXml.getCustomAttr / setCustomAttr: a set value is returned as given
    and read back as a string; an unknown name raises a PageXmlException.
    """
    sXml = b""" <TextRegion type="page-number" id="p1_region_1471502505726_2" custom="readingOrder {index:9;} structure {type:page-number;}"> <Coords points="972,43 1039,43 1039,104 972,104"/> </TextRegion> """
    nd = etree.parse(BytesIO(sXml)).getroot()

    # overwrite an existing value
    assert PageXml.getCustomAttr(nd, "readingOrder", "index") == '9'
    assert PageXml.setCustomAttr(nd, "readingOrder", "index", 99) == 99
    assert PageXml.getCustomAttr(nd, "readingOrder", "index") == '99'
    assert PageXml.getCustomAttr(nd, "readingOrder") == {'index': '99'}

    # add a new value
    assert PageXml.setCustomAttr(nd, "readingOrder", "toto", "zou") == "zou"
    assert PageXml.getCustomAttr(nd, "readingOrder", "toto") == 'zou'

    # unknown sub-attribute, then unknown attribute
    lMissing = (("readingOrder", "axiste_pas"),
                ("axiste_pas_non_plus", "axiste_pas"))
    for sAttr, sSubAttr in lMissing:
        with pytest.raises(PageXmlException):
            PageXml.getCustomAttr(nd, sAttr, sSubAttr)
def extractFileNamesFromMPXML(self, mpxmldoc):
    """
    to ensure correct file order !
    duplicated from performCVLLA.py

    Build, in document order, one .xml file path per Page element of
    mpxmldoc: <absolute path of coldir/sCOL/docid>/<imageFilename without
    its last 4 characters>.xml

    return a list of paths (a real list on both Python 2 and Python 3)
    """
    xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))

    lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
    # [:-4] drops the last 4 characters, presumably a 3-letter extension
    # and its dot -- confirm
    return ["%s%s%s.xml" % (xmlpath, os.sep, nd.prop('imageFilename')[:-4])
            for nd in lNd]
def extractFileNamesFromMPXML(self, doc):
    """
    to ensure correct file order !

    Build, in document order, one .xml file path per Page element of doc:
    <absolute path of coldir/col/docid>/<imageFilename without its last 4
    characters>.xml

    return the list of paths
    """
    sColDir = "%s%s%s%s%s" % (self.coldir, os.sep, 'col', os.sep, self.docid)
    xmlpath = os.path.abspath(sColDir)

    lNdPage = PageXml.getChildByName(doc.getroot(), 'Page')
    return ["%s%s%s.xml" % (xmlpath, os.sep, ndPage.get('imageFilename')[:-4])
            for ndPage in lNdPage]
def convertTR2Sep(filename):
    """
    Turn the TextRegions annotated as separators into SeparatorRegions.

    The file is parsed, and every TextRegion whose custom attribute contains
    "separator" is renamed to SeparatorRegion. Its first Coords descendant
    is then replaced by a 2-point line: the horizontal or vertical median of
    the fitted rectangle of its polygon (of its bounding box when the fit
    raises a ValueError).

    return the parsed, modified, XML tree
    """
    print (filename)
    tagname='TextRegion'
    xml = etree.parse(filename)
    dNS = {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}

    for ndRegion in xml.getroot().findall(f".//pc:{tagname}[@custom]", dNS):
        if "separator" not in ndRegion.get('custom'):
            continue

        ndRegion.tag = 'SeparatorRegion'

        # now we need to convert that object to a line
        lXY = PageXml.getPointList(ndRegion)  # the polygon
        assert lXY, "Separator without Coord??"
        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1, y1, x2, y2 = plg.getBoundingBox()

        bHorizontal = abs(x2-x1) > abs(y2-y1)
        if bHorizontal:
            y1 = (y1+y2)/2
            y2 = y1
        else:
            x1 = (x1+x2)/2
            x2 = x1

        lNdCoords = ndRegion.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})
        PageXml.setPoints(lNdCoords[0], [(x1,y1), (x2,y2)])

    return xml