def addEdgesToXml(cls, ndPage, sAlgo, lCluster):
    """
    Append one ClusterEdge element per inter-cluster edge to the page node.

    ndPage   : DOM node receiving an explanatory comment and the edges
    sAlgo    : value stored in the "algo" attribute of each edge
    lCluster : clusters exposing .cnt, .name, .shape and .dsEdge, where
               dsEdge maps an edge type to the clusters linked by that type

    An edge A <--> B is emitted only when A.cnt < B.cnt, and at most once
    per (A, B, edge type) triple. The edge geometry joins the representative
    points of the two cluster shapes.

    Return the number of edges added.
    """
    cnt = 0
    ndPage.append(etree.Comment("\nInter-cluster edges by tabulate_cluster scale_H=%.2f scale_V=%.2f\n" % (
        cls.scale_H, cls.scale_V)))

    setEdges = set()

    for A in lCluster:
        for edge_type, lLinked in A.dsEdge.items():
            for B in lLinked:
                # keep a single direction for each pair of clusters
                if A.cnt >= B.cnt:
                    continue
                key = (A, B, edge_type)
                if key in setEdges:
                    continue

                # ok, let's add the edge A <--> B
                ndEdge = PageXml.createPageXmlNode("ClusterEdge")
                ndEdge.set("src", A.name)
                ndEdge.set("tgt", B.name)
                ndEdge.set("type", edge_type)
                ndEdge.set("algo", sAlgo)
                ptA = A.shape.representative_point()
                ptB = B.shape.representative_point()
                PageXml.setPoints(ndEdge, list(ptA.coords) + list(ptB.coords))
                ndEdge.tail = "\n"
                ndPage.append(ndEdge)

                setEdges.add(key)
                cnt += 1

    return cnt
Ejemplo n.º 2
0
    def findTemplate(self,doc):
        """
            Build a new document whose root is the page holding the first
            TableRegion of doc, with all cell borders flagged as visible.

            Return None when doc contains no TableRegion.
        """
        lTables = PageXml.getChildByName(doc.getRootElement(),'TableRegion')
        if lTables == []:
            return None

        # shortcut: the parent of the first table is taken as the page
        ndPage = lTables[0].parent
        newDoc, _unused = PageXml.createPageXmlDocument('XRCE', '', 0,0)
        ndPage.unlinkNode()
        newDoc.setRootElement(ndPage)
        # NOTE: the namespace still needs to be added to the new root

        # Borders must be visible on every side of every cell
        for ndCell in PageXml.getChildByName(newDoc.getRootElement(),'TableCell'):
            for sSide in ("left", "right", "top", "bottom"):
                ndCell.setProp(sSide + "BorderVisible", 'true')

        return newDoc
Ejemplo n.º 3
0
 def setDomNodeLabel(self, domnode, sLabel):
     """
     Store the label on the DOM node as a custom attribute.

     The default label is left implicit: nothing is written for it.
     Return sLabel unchanged.
     """
     if sLabel == self.sDefaultLabel:
         return sLabel

     PageXml.setCustomAttr(domnode,
                           self.sCustAttr_STRUCTURE,
                           self.sCustAttr2_TYPE,
                           self.dLabel2XmlLabel[sLabel])
     return sLabel
Ejemplo n.º 4
0
 def setDocNodeLabel(self, graph_node, sLabel):
     """
     Write the label of a graph node onto its DOM node.

     The label is translated through dLabel2XmlLabel; labels without a
     translation are written as "other". The value goes both into the
     sLabelAttr attribute and into the custom structure/type attribute.
     """
     try:
         sXml = self.dLabel2XmlLabel[sLabel]
     except KeyError:
         # unknown labels fall back to the catch-all XML label
         sXml = "other"

     graph_node.node.set(self.sLabelAttr, sXml)
     # also tag through the custom XML attribute
     PageXml.setCustomAttr(graph_node.node, "structure", "type", sXml)
Ejemplo n.º 5
0
    def run(self):
        """
        Load the document and the table file, merge the text regions into
        the table cells, stamp the metadata and return the merged DOM.
        """
        domText = self.loadDom()
        domTable = self.loadDom(filename=self.tableFile)

        domMerged = self.mergetextRegion2Cell(domText, domTable)
        PageXml.setMetadata(domMerged, None, 'NLE',
                            Comments='TextRegion/TableCell Merging')
        return domMerged
Ejemplo n.º 6
0
    def addClusterToDom(self, dCluster, bMoveContent=False):
        """
        Add Cluster elements to the Page DOM node

        dCluster     : dictionary whose values are lists of indices into
                       self.lNode (the keys are not used)
        bMoveContent : when True, the DOM nodes of the cluster members are
                       moved under the new Cluster element

        The page node is taken from the first member of the first cluster.
        Each Cluster gets a "content" attribute (space-separated member ids)
        and a Coords child holding the minimum rotated rectangle of the union
        of the member polygons, or an empty string when it cannot be computed.
        """
        pageNode = None
        for lnidx in dCluster.values():
            if pageNode is None:
                pageNode = self.lNode[lnidx[0]].page.node
                pageNode.append(
                    etree.Comment("Clusters created by the conjugate graph"))

            # Make it robust to bad data: nodes without a usable polygon are skipped
            lp = []
            for _i in lnidx:
                try:
                    lp.append(ShapeLoader.node_to_Polygon(self.lNode[_i].node))
                except ValueError:
                    pass
            contour = cascaded_union(
                [p if p.is_valid else p.convex_hull for p in lp])

            try:
                spoints = ' '.join(
                    "%s,%s" % (int(pt[0]), int(pt[1]))
                    for pt in contour.minimum_rotated_rectangle.exterior.coords)
            except Exception:
                # degenerate rectangle (e.g. a line or a point) has no exterior
                try:
                    spoints = ' '.join(
                        "%s,%s" % (int(pt[0]), int(pt[1]))
                        for pt in contour.minimum_rotated_rectangle.coords)
                except Exception:
                    # JL got once a: NotImplementedError: Multi-part geometries
                    # do not provide a coordinate sequence
                    spoints = ""

            ndCluster = PageXml.createPageXmlNode('Cluster')
            # add the space separated list of node ids
            ndCluster.set(
                "content",
                " ".join(self.lNode[_i].node.get("id") for _i in lnidx))
            coords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(coords)
            coords.set('points', spoints)
            pageNode.append(ndCluster)

            if bMoveContent:
                # move the DOM node of the content to the cluster
                for _i in lnidx:
                    ndCluster.append(self.lNode[_i].node)

        return
Ejemplo n.º 7
0
def test_getMetadata():
    """Check getMetadata when given the document, then when given the metadata node."""
    doc = getMetadataTestDOM()
    nd = doc.getroot()

    # metadata looked up from the document
    md = PageXml.getMetadata(doc)
    assert md.Creator == "Tilla"
    assert md.Created == "2016-08-18T13:35:08.252+07:00"
    assert md.LastChange == "2016-12-01T09:53:39.610+01:00"
    assert md.Comments is None

    # metadata read from the first child of the root
    md = PageXml.getMetadata(None, nd[0])
    assert md.Creator == "Tilla"
    assert md.Created == "2016-08-18T13:35:08.252+07:00"
    assert md.LastChange == "2016-12-01T09:53:39.610+01:00"
Ejemplo n.º 8
0
    def setDocNodeLabel(self, graph_node, sLabel):
        """
        Label the TextRegion through the parent class, then copy the
        resulting XML label onto every TextLine found below it.

        Return the XML label read back from the region node.
        """
        # tagging the TextRegion with @type
        super(NodeType_for_TextRegion, self).setDocNodeLabel(graph_node, sLabel)

        ndRegion = graph_node.node
        sXmlLabel = str(ndRegion.get(self.sLabelAttr))

        # tagging inner TextLine
        dPageNS = {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
        for ndLine in ndRegion.xpath(".//pc:TextLine", namespaces=dPageNS):
            ndLine.set(self.sLabelAttr, sXmlLabel)
            PageXml.setCustomAttr(ndLine, "structure", "type", sXmlLabel)

        return sXmlLabel
Ejemplo n.º 9
0
    def run(self):
        """
        Load the document, convert its table cells and stamp the metadata.

        The metadata is first set with PageXml; if that raises ValueError,
        MultiPageXml is used instead. Return the modified DOM.
        """
        docdom = self.loadDom()
        self.convertTableCells(docdom)

        tArgs = (docdom, None, 'NLE')
        sComment = 'TableCell 2 TextRegion'
        try:
            PageXml.setMetadata(*tArgs, Comments=sComment)
        except ValueError:
            MultiPageXml.setMetadata(*tArgs, Comments=sComment)
        return docdom
Ejemplo n.º 10
0
    def regularTextLines(self, doc):
        """
            from a baseline: create a regular TextLine: rectangle along the baseline

            For every TextLine of doc, the 'points' of its first child are
            replaced by a rectangle computed from the bounding box of the
            points of the following sibling, extended upwards by a fixed height.
            Pairs of the 'points' attribute that cannot be parsed are skipped,
            after printing the TextLine.
        """

        lTextLines = PageXml.getChildByName(doc.getRootElement(), 'TextLine')
        for tl in lTextLines:
            # assumes the first child is the Coords node and the next sibling
            # is the Baseline node - TODO confirm
            coord = tl.children
            baseline = tl.children.next
            sPoints = baseline.prop('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    # parenthesised form is valid on both Python 2 and Python 3
                    print(tl)
            plg = Polygon(lXY)
            iHeight = 50  # not points!!  300 dpi : 62.5
            x1, y1, x2, y2 = plg.getBoundingBox()
            coord.setProp(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
Ejemplo n.º 11
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = ""
            
            #now we need to infer the bounding box of that object
            (x1, y1), (x2, y2) = PageXml.getPointList(ndBlock)  #the polygon
            
            orientation = 0 
            classIndex = 0   #is computed later on

            #and create a Block
            # we pass the coordinates, not x1,y1,w,h !!
            cutBlk = Block(page, ((x1, y1), (x2, y2)), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            # Create the shapely shape
            cutBlk.shape = geom.LineString([(x1, y1), (x2, y2)])
            cutBlk.angle = float(ndBlock.get("DU_angle"))
            cutBlk.angle_freq       = float(ndBlock.get("DU_angle_freq"))
            cutBlk.angle_cumul_freq = float(ndBlock.get("DU_angle_cumul_freq"))
            cutBlk.set_support      = literal_eval(ndBlock.get("DU_set_support"))
            
            yield cutBlk
            
        return        
    def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes
        """    
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)

        #order blocks from top to bottom of page
        lOrderedNdBlock = list()
        for ndBlock in lNdBlock:
            
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                raise ValueError("Node %x has invalid coordinates" % str(ndBlock))
            
            plg = Polygon(lXY)
            _, (xg, yg) = plg.getArea_and_CenterOfMass()
            
            lOrderedNdBlock.append( (yg, ndBlock))  #we want to order from top to bottom, so that TextRegions of different resolution are not interleaved
            
        lOrderedNdBlock.sort()
        
        for _, ndBlock in lOrderedNdBlock: yield ndBlock
            
        return        
Ejemplo n.º 13
0
    def _get_GraphNodeText(self, doc, domNdPage, ndBlock, ctxt=None):
        """
        Extract the text of a DOM node
        
        Get the DOM, the DOM page node, the page object DOM node, and optionally an xpath context

        return a unicode string
        """
        lNdText = ndBlock.xpath(self.sxpTextual, namespaces=self.dNS)
        if len(lNdText) != 1:
            if len(lNdText) > 1:
                raise ValueError(
                    "More than 1 textual content for this node: %s" %
                    etree.tostring(ndBlock))

            #let's try to get th etext of the words, and concatenate...
            # traceln("Warning: no text in node %s => looking at words!"%ndBlock.prop("id"))
            # lsText = [ntext.content.decode('utf-8').strip() for ntext in ctxt.xpathEval('.//pc:Word/pc:TextEquiv//text()')] #if we have both PlainText and UnicodeText in XML, :-/
            lsText = [
                _nd.text.strip()
                for _nd in ctxt.xpathEval('.//pc:Word/pc:TextEquiv')
            ]  #if we have both PlainText and UnicodeText in XML, :-/
            return " ".join(lsText)

        return PageXml.makeText(lNdText[0])
Ejemplo n.º 14
0
def cluster2Region(doc, fTH=0.5, bVerbose=True):
    """
    On every page of doc, create regions from the clusters, then delete the
    pre-existing TextRegion nodes, the Edge nodes and the Cluster nodes.

    fTH is not used here. Return doc.
    """
    ndRoot = doc.getroot()

    # pages are numbered from 1 when passed to addRegionToDom
    for iPageNum, ndPage in enumerate(PageXml.xpath(ndRoot, "//pc:Page"), 1):
        dClusters = getCLusters(ndPage)
        lOldRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
        if bVerbose:
            traceln(" %d clusters and %d regions found" %
                    (len(dClusters), len(lOldRegions)))

        addRegionToDom(ndPage, iPageNum, dClusters, bVerbose)
        if bVerbose:
            traceln(" %d regions created" % (len(dClusters)))
        deleteRegionsinDOM(ndPage, lOldRegions)

        # each query runs only after the previous deletion is done
        for sxpObsolete in (".//pg:Edge", ".//pg:Cluster"):
            deleteRegionsinDOM(ndPage, ndPage.xpath(sxpObsolete, namespaces=dNS))

    return doc
Ejemplo n.º 15
0
    def run(self, domDoc):
        """
            conversion

            Load domDoc as an XMLDSDocument and create one PageXml document
            per page, sized from the page width and height converted with
            self.dpi. Depending on self.bRegionOnly, either only the regions
            or the whole page is converted.

            Return a list of (PageXml document, imageFilename attribute of
            the page) pairs.
        """
        ODoc = XMLDSDocument()
        ODoc.loadFromDom(domDoc)
        lPageXmlDoc = []
        lPages = ODoc.getPages()
        for page in lPages:
            try:
                filename = os.path.basename(page.getAttribute('imageFilename'))
            except Exception:
                # no usable image filename on this page: best-effort placeholder
                filename = "fakename"
            pageXmlDoc, pageNode = PageXml.createPageXmlDocument(
                creatorName='NLE',
                filename=filename,
                imgW=convertDot2Pixel(self.dpi, page.getWidth()),
                imgH=convertDot2Pixel(self.dpi, page.getHeight()))
            self.pageXmlNS = etree.QName(pageXmlDoc.getroot()).namespace
            if self.bRegionOnly:
                self.convertOnlyRegion(page, pageNode)
            else:
                self.convertDSPage(page, pageNode)
            lPageXmlDoc.append(
                (pageXmlDoc, page.getAttribute('imageFilename')))

        return lPageXmlDoc
Ejemplo n.º 16
0
 def parseDomNodeLabel(self, domnode, defaultCls=None):
     """
     Parse the label of the DOM node and return it.

     The XML label is read from the custom attribute and translated through
     dXmlLabel2Label. An XML label without translation is passed to
     checkIsIgnored: if that check succeeds the default label is returned,
     otherwise a ValueError is raised.

     When the custom attribute cannot be read (PageXmlException), the default
     label is returned if bOther is set, otherwise the exception propagates.

     defaultCls is not used here.
     """
     sLabel = self.sDefaultLabel
     try:
         try:
             sXmlLabel = PageXml.getCustomAttr(domnode, self.sCustAttr_STRUCTURE, self.sCustAttr2_TYPE)
         except PageXmlException:
             if self.bOther:
                 return self.sDefaultLabel  #absence of label but bOther was True (I guess)
             raise
         try:
             sLabel = self.dXmlLabel2Label[sXmlLabel]
         except KeyError:
             #not a label of interest
             try:
                 self.checkIsIgnored(sXmlLabel)
             except Exception:
                 # Exception rather than a bare except, so that KeyboardInterrupt
                 # and SystemExit are not turned into a ValueError
                 raise ValueError("Invalid label '%s' in node %s"%(sXmlLabel, etree.tostring(domnode)))
     except KeyError:
         #no label at all
         if not self.sDefaultLabel: raise ValueError("Missing label in node %s"%etree.tostring(domnode))

     return sLabel
Ejemplo n.º 17
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText == None:
                sText = ""
                traceln("Warning: no text in node %s"%domid) 
                #raise ValueError, "No text in node: %s"%ndBlock 
            
            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue
            
            plg = Polygon(lXY)
            try:
                x1,y1, x2,y2 = plg.fitRectangle()
            except ZeroDivisionError:
#                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
#                 continue
#             if True:
#                 #we reduce a bit this rectangle, to ovoid overlap
#                 w,h = x2-x1, y2-y1
#                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/"rd of te width will remain!
#                 dy = max(h * 0.066, min(20, w/3))
#                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1,y1,x2,y2 = plg.getBoundingBox()
            except ValueError:
                x1,y1,x2,y2 = plg.getBoundingBox()
                
                
            #we reduce a bit this rectangle, to ovoid overlap
            if not(self.BBoxDeltaFun is None):
                w,h = x2-x1, y2-y1
                dx = self.BBoxDeltaFun(w)
                dy = self.BBoxDeltaFun(h)
                x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]
                
            
            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0   #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            yield blk
            
        raise StopIteration()        
Ejemplo n.º 18
0
 def makeClusterNode(self, sAlgo):
     """
     Build and return the Cluster XML node describing this cluster.

     The node carries the cluster name, the algorithm name, the
     space-separated ids of its members, and a Coords child whose points are
     empty when the cluster has no shape.
     """
     ndCluster = PageXml.createPageXmlNode('Cluster')
     ndCluster.set("name", self.name)
     ndCluster.set("algo", sAlgo)
     # space separated list of node ids
     sContent = " ".join(self.setID)
     ndCluster.set("content", sContent)

     ndCoords = PageXml.createPageXmlNode('Coords')
     ndCluster.append(ndCoords)
     if self.shape is not None:
         sPoints = ShapeLoader.getCoordsString(self.shape)
     else:
         sPoints = ""
     ndCoords.set('points', sPoints)

     ndCluster.tail = "\n"
     return ndCluster
Ejemplo n.º 19
0
    def reinitPage(self, doc):
        """
         empty page

         Detach every Page node returned by PageXml.getChildByName from its
         parent. A node without parent is left in place.
        """
        lNodes = PageXml.getChildByName(doc.getroot(), 'Page')

        for node in lNodes:
            # doc is accessed through getroot(), i.e. the lxml-style API, whose
            # elements have no unlinkNode(): removal goes through the parent
            ndParent = node.getparent()
            if ndParent is not None:
                ndParent.remove(node)
Ejemplo n.º 20
0
    def _convertPageAnnotation(self, pnum, page, domNdPage):
        """
        Convert the human annotation of each node of the page into a semantic
        label, a segmentation label and, when present, a resolution number.

        The nodes are visited in the order produced by
        _iter_TextRegionNodeTop2Bottom. The human label is read from the
        custom structure/type attribute and handled in three ways:
        - "heading": mapped through dAnnotMapping; the segmentation label is
          switched unless no resolution number is pending
        - "header", "page-number", "marginalia": mapped through
          dAnnotMapping, with the current segmentation label
        - anything else must match creResolutionHumanLabel, group 1 being
          mapped through dAnnotMapping and group 2 being the resolution number

        When a PageXmlException is raised while processing a node, the node
        gets self.sOther as semantic label and the current segmentation label.

        Raise ValueError when a label matches none of the above, or carries no
        resolution number.

        pnum is not used here. self.prevResolutionNumber is updated along the
        way.
        """
        for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):

            try:
                # reset for each node, before anything can raise
                sResoNum = None
                lbl = PageXml.getCustomAttr(nd, "structure", "type")

                if lbl in ["heading"]:
                    semLabel = self.dAnnotMapping[lbl]
                    #heading may indicate a new resolution!
                    if self.prevResolutionNumber == None:
                        sgmLabel = self._getCurrentSegmentationLabel(
                        )  #for instance 2 consecutive headings
                    else:
                        sgmLabel = self._switchSegmentationLabel()
                        self.prevResolutionNumber = None  #so that next number does not switch Heigh/Ho label
                elif lbl in ["header", "page-number", "marginalia"]:
                    #continuation of a resolution
                    semLabel = self.dAnnotMapping[lbl]
                    sgmLabel = self._getCurrentSegmentationLabel()
                else:
                    # any other label must follow the resolution pattern
                    o = self.creResolutionHumanLabel.match(lbl)
                    if not o:
                        raise ValueError("%s is not a valid human annotation" %
                                         lbl)
                    semLabel = self.dAnnotMapping[o.group(
                        1)]  #"" for the resolution number

                    #Here we have a resolution number!
                    sResoNum = o.group(2)
                    if not sResoNum:
                        raise ValueError(
                            "%s is not a valid human annotation - missing resolution number"
                            % lbl)

                    #now switch between heigh and ho !! :))
                    if self.prevResolutionNumber != None and self.prevResolutionNumber != sResoNum:
                        #we got a new number, so switching segmentation label!
                        sgmLabel = self._switchSegmentationLabel()
                    else:
                        #either same number or switching already done due to a heading
                        sgmLabel = self._getCurrentSegmentationLabel()

                    self.prevResolutionNumber = sResoNum

            except PageXmlException:
                # fall back to the "other" semantic label and the current segmentation label
                semLabel = self.sOther
                sgmLabel = self._getCurrentSegmentationLabel()

            nd.set(self.sSemAttr, semLabel)
            nd.set(self.sSgmAttr, sgmLabel)
            if sResoNum:
                nd.set(self.sNumAttr, sResoNum
                       )  #only when the number is part of the human annotation!
Ejemplo n.º 21
0
    def _convertPageAnnotation(self, pnum, page, domNdPage):
        """
        Convert the human annotation of each node of the page into a semantic
        label, a segmentation label and, when present, a resolution number.

        The segmentation label is re-initialised at the start of the page.
        The nodes are visited in the order produced by
        _iter_TextRegionNodeTop2Bottom:
        - a node without the custom structure/type attribute gets self.sOther
          as both semantic and segmentation label
        - "heading", "header", "page-number" and "marginalia" keep their
          label as semantic label, with self.sOther as segmentation label
        - any other label must match creResolutionHumanLabel, group 1 being
          the semantic label and group 2 the resolution number; the
          segmentation label is kept while the resolution number is the same
          as the previous one, and is otherwise obtained from
          _getNextSegmentationLabel

        The semantic label is always mapped through dAnnotMapping before
        being written.

        Raise ValueError when a label matches none of the above, or carries no
        resolution number.

        pnum is not used here. self.prevResolutionNumber, self.prevSgmLbl and
        self.lSeenResoNum are updated along the way.
        """

        #change: on each page we start by Heigh
        bRestartAtEachPageWithHeigh = True
        if bRestartAtEachPageWithHeigh: self._initSegmentationLabel()

        for nd in self._iter_TextRegionNodeTop2Bottom(domNdPage, page):

            try:
                lbl = PageXml.getCustomAttr(nd, "structure", "type")
            except PageXmlException:
                nd.set(self.sSemAttr, self.sOther)
                nd.set(self.sSgmAttr, self.sOther)
                continue  #this node has no annotation whatsoever

            if lbl in ["heading", "header", "page-number", "marginalia"]:
                semLabel = lbl
                sgmLabel = self.sOther  #those elements are not part of a resolution
                sResoNum = None
            else:
                # any other label must follow the resolution pattern
                o = self.creResolutionHumanLabel.match(lbl)
                if not o:
                    raise ValueError("%s is not a valid human annotation" %
                                     lbl)
                semLabel = o.group(1)  #"" for the resolution number

                #now decide on the segmentation label
                sResoNum = o.group(2)
                if not sResoNum:
                    raise ValueError(
                        "%s is not a valid human annotation - missing resolution number"
                        % lbl)

                #now switch between heigh and ho !! :))
                if self.prevResolutionNumber == sResoNum:
                    sgmLabel = self.prevSgmLbl
                else:
                    sgmLabel = self._getNextSegmentationLabel(self.prevSgmLbl)
                    # since the flag above is True, this assertion can never fail
                    assert bRestartAtEachPageWithHeigh or sResoNum not in self.lSeenResoNum, "ERROR: the ordering of the block has not preserved resolution number contiguity"
                    self.lSeenResoNum.append(sResoNum)

                self.prevResolutionNumber, self.prevSgmLbl = sResoNum, sgmLabel

            #always have a semantic label
            sNewSemLbl = self.dAnnotMapping[semLabel]
            assert sNewSemLbl
            nd.set(self.sSemAttr, sNewSemLbl)  #DU annotation

            #resolution parts also have a segmentation label and a resolution number
            assert sgmLabel
            nd.set(self.sSgmAttr, sgmLabel)  #DU annotation

            if sResoNum:
                nd.set(self.sNumAttr, sResoNum)
Ejemplo n.º 22
0
    def addEdgeToDOM(self, Y=None):
        """
        Append one Edge element per graph edge to the page node of the first
        graph node, so that the graph can be displayed conveniently.

        Each Edge records the ids of its two end nodes, the class name of the
        edge, and a line joining the (x1, y1) corners of the two ends.
        Y is not used here.
        """
        ndPage = self.lNode[0].page.node
        ndPage.append(etree.Comment("Edges added to the XML for convenience"))

        for oEdge in self.lEdge:
            blkSrc, blkTgt = oEdge.A, oEdge.B
            ndEdge = PageXml.createPageXmlNode("Edge")
            ndEdge.set("src", oEdge.A.node.get("id"))
            ndEdge.set("tgt", oEdge.B.node.get("id"))
            ndEdge.set("type", oEdge.__class__.__name__)
            ndEdge.tail = "\n"
            ndPage.append(ndEdge)
            lEndPoints = [(blkSrc.x1, blkSrc.y1), (blkTgt.x1, blkTgt.y1)]
            PageXml.setPoints(ndEdge, lEndPoints)
Ejemplo n.º 23
0
def test_malformed_custom():
    """Malformed custom attributes are rejected; spaces in names and values are kept."""
    lsMalformed = ["a {x1;}", "a x1;}", "a { x1;", "a { x1 }"]
    for sMalformed in lsMalformed:
        with pytest.raises(ValueError):
            PageXml.parseCustomAttr(sMalformed)

    # open question: should "a { x:1 }" fail?
    dParsed = PageXml.parseCustomAttr("a { x:1  2}")
    assert dParsed == {'a': {'x': '1  2'}}

    # open question: should "a { x:1  2}" fail? (or do we allow spaces in names or values?)
    dParsed = PageXml.parseCustomAttr("  a b   {   x y : 1  2  }")
    assert dParsed == {'a b': {'x y': '1  2'}}
    def addClusterToDom(self,
                        lCluster,
                        bMoveContent=False,
                        sAlgo="",
                        pageNode=None):
        """
        Create one Cluster element per cluster and append it to the page node.

        lCluster     : sequence of clusters, each an iterable of indices into
                       self.lNode; the position in the sequence is the name
        bMoveContent : when True, the member DOM nodes are moved under the
                       new Cluster element
        sAlgo        : value of the "algo" attribute
        pageNode     : target node; when None, the page of the first member
                       of the first cluster is used and receives a comment

        Return the list of created Cluster nodes.
        """
        lCreated = []
        for iName, lMemberIdx in enumerate(lCluster):
            if pageNode is None:
                # take the page of the first member, if any
                for iFirst in lMemberIdx:
                    pageNode = self.lNode[iFirst].page.node
                    break
                sNote = "\nClusters created by the conjugate graph\n"
                pageNode.append(etree.Comment(sNote))

            ndCluster = PageXml.createPageXmlNode('Cluster')
            ndCluster.set("name", str(iName))
            ndCluster.set("algo", sAlgo)
            # space separated list of node ids
            sContent = " ".join(self.lNode[_i].node.get("id") for _i in lMemberIdx)
            ndCluster.set("content", sContent)

            ndCoords = PageXml.createPageXmlNode('Coords')
            ndCluster.append(ndCoords)
            lMemberNd = [self.lNode[_i].node for _i in lMemberIdx]
            ndCoords.set('points', ShapeLoader.minimum_rotated_rectangle(lMemberNd))

            pageNode.append(ndCluster)
            ndCluster.tail = "\n"

            if bMoveContent:
                # move the DOM node of the content to the cluster
                for _i in lMemberIdx:
                    ndCluster.append(self.lNode[_i].node)

            lCreated.append(ndCluster)

        return lCreated
Ejemplo n.º 25
0
 def unLinkTextLines(self,doc):
     """
         Unlink and free every TextLine node of the document.

         Return doc.
     """
     # an empty result simply means there is nothing to loop over
     for ndLine in PageXml.getChildByName(doc.getRootElement(),'TextLine'):
         ndLine.unlinkNode()
         ndLine.freeNode()
     return doc
Ejemplo n.º 26
0
 def unLinkTable(self,doc):
     """
         Unlink and free every TableRegion node of the document.

         Return doc.
     """
     # an empty result simply means there is nothing to loop over
     for ndTable in PageXml.getChildByName(doc.getRootElement(),'TableRegion'):
         ndTable.unlinkNode()
         ndTable.freeNode()
     return doc
Ejemplo n.º 27
0
def test_getsetCustomAttr():
    """Read, overwrite and add custom attributes; missing ones raise PageXmlException."""
    sXml = b"""
            <TextRegion type="page-number" id="p1_region_1471502505726_2" custom="readingOrder {index:9;} structure {type:page-number;}">
                <Coords points="972,43 1039,43 1039,104 972,104"/>
            </TextRegion>
            """
    ndRegion = etree.parse(BytesIO(sXml)).getroot()

    # overwrite an existing value
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "index") == '9'
    assert PageXml.setCustomAttr(ndRegion, "readingOrder", "index", 99) == 99
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "index") == '99'

    # without a sub-attribute name, the whole dictionary comes back
    assert PageXml.getCustomAttr(ndRegion, "readingOrder") == {'index': '99'}

    # add a new value
    assert PageXml.setCustomAttr(ndRegion, "readingOrder", "toto", "zou") == "zou"
    assert PageXml.getCustomAttr(ndRegion, "readingOrder", "toto") == 'zou'

    # unknown sub-attribute, then unknown attribute
    for sAttr in ("readingOrder", "axiste_pas_non_plus"):
        with pytest.raises(PageXmlException):
            PageXml.getCustomAttr(ndRegion, sAttr, "axiste_pas")
Ejemplo n.º 28
0
    def extractFileNamesFromMPXML(self, mpxmldoc):
        """
            to ensure correct file order !

            duplicated from performCVLLA.py

            Return the list of XML file paths, one per Page node of mpxmldoc,
            in the order of the Page nodes. Each path is the collection
            folder of the document, followed by the imageFilename of the page
            without its last 4 characters, followed by ".xml".
        """
        xmlpath = os.path.abspath(os.path.join(self.coldir, sCOL, self.docid))

        lNd = PageXml.getChildByName(mpxmldoc.getRootElement(), 'Page')
        # a real list, not a lazy map object, so that the result can be
        # indexed and measured on Python 3 as well
        return [
            "%s%s%s.xml" % (xmlpath, os.sep, x.prop('imageFilename')[:-4])
            for x in lNd
        ]
Ejemplo n.º 29
0
    def extractFileNamesFromMPXML(self, doc):
        """
            List the XML file of each Page node of doc, in page order.

            Each path is made of the 'col' folder of the document, the
            imageFilename of the page without its last 4 characters, and the
            ".xml" suffix.
        """
        sColDir = "%s%s%s%s%s" % (self.coldir, os.sep, 'col', os.sep, self.docid)
        sRoot = os.path.abspath(sColDir)

        lsFile = []
        for ndPage in PageXml.getChildByName(doc.getroot(), 'Page'):
            sStem = ndPage.get('imageFilename')[:-4]
            lsFile.append("%s%s%s.xml" % (sRoot, os.sep, sStem))
        return lsFile
def convertTR2Sep(filename):
    """
    Parse the file and turn every TextRegion whose custom attribute contains
    "separator" into a SeparatorRegion whose Coords are a 2-point line.

    The line is the median of the rectangle fitted to the region polygon
    (or of its bounding box when the fitting fails): horizontal when the
    rectangle is wider than high, vertical otherwise.

    Return the parsed and modified tree.
    """
    print (filename)
    tagname='TextRegion'
    xml = etree.parse(filename)
    ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})

    for x in ltextsep:
        if "separator" not in x.get('custom'):
            continue

        # rename the element while keeping it in its namespace: a bare tag
        # name would move the element out of the PAGE namespace
        sNS = etree.QName(x).namespace
        x.tag = "{%s}SeparatorRegion" % sNS if sNS else 'SeparatorRegion'

        #now we need to convert that object to a line
        lXY = PageXml.getPointList(x)  #the polygon
        assert lXY, "Separator without Coord??"

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except (ValueError, ZeroDivisionError):
            # degenerate polygons make the fitting fail
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1,y1,x2,y2 = plg.getBoundingBox()

        if abs(x2-x1) > abs(y2-y1): # horizontal
            y1 = (y1+y2)/2
            y2 = y1
        else:
            x1 = (x1+x2)/2
            x2=x1

        ndCoord = x.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])

    return xml