def isBaselineHorizontal(ndText):
    """
    Tell whether the Baseline of a text node is at least as wide as it is high.

    The 'Baseline' children of ndText are looked up with
    MultiPageXml.getChildByName and the first one is converted with
    ShapeLoader.node_to_LineString.

    Return True when the bounding box of that line satisfies
    width >= height, False otherwise.
    Return True as well (horizontal is the default assumption) when ndText
    has no Baseline, or when the Baseline cannot be converted.
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText, 'Baseline')
    if not lNdBaseline:
        return True

    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # unusable baseline: keep the default answer, but do not swallow
        # KeyboardInterrupt / SystemExit as a bare "except:" would
        return True

    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx - minx >= maxy - miny)
 def _iter_GraphNode(self, doc, domNdPage, page):
     """
     Yield the blocks produced by the parent class, each one augmented with
     the shape object reflecting its baseline.

     For each yielded block:
     - blk.shape    is the object built by ShapeLoader.node_to_LineString from
                    the first pc:Baseline found under blk.node
     - blk.du_index is the "du_index" attribute of that Baseline, as an int

     A block is skipped (not yielded) when any of this fails, e.g. no
     Baseline, invalid Baseline (a trace is emitted in that case), or a
     missing / non-integer du_index.
     """
     for blk in super()._iter_GraphNode(doc, domNdPage, page):
         try:
             ndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)[0]
             try:
                 o = ShapeLoader.node_to_LineString(ndBaseline)
             except ValueError:
                 traceln("SKIPPING INVALID Baseline: ", etree.tostring(ndBaseline))
                 continue
             blk.shape = o
             blk.du_index = int(ndBaseline.get("du_index"))
         except Exception:
             # best effort: a block we cannot decorate is silently skipped
             continue
         # The yield is deliberately outside the try: a GeneratorExit (or any
         # exception thrown in by the consumer) must not be swallowed,
         # otherwise closing the generator fails with
         # "generator ignored GeneratorExit".
         yield blk
Example #3
0
    def addSeparatorFeature(self):
        """
        Set edge.bCrossingSep on every edge of self.lEdge.

        The pc:SeparatorRegion elements of the document tree (reached from the
        first node of self.lNode) are loaded as line objects and indexed in an
        rtree. Each edge is then reflected by the segment going from
        (A.x1, A.y1) to (B.x1, B.y1), and its bCrossingSep attribute tells
        whether that segment intersects at least one separator.

        Counts are traced when self.bVerbose is set.
        """
        from xml_formats.PageXml import PageXml

        # graphical separators, as line objects
        nsmap = {"pc": PageXml.NS_PAGE_XML}
        tree = self.lNode[0].node.getroottree()
        loSep = [ShapeLoader.node_to_LineString(ndSep)
                 for ndSep in tree.xpath(".//pc:SeparatorRegion", namespaces=nsmap)]

        if self.bVerbose:
            traceln(" %d graphical separators" % len(loSep))

        # spatial index over the bounding boxes of the separators
        rtree = index.Index()
        for iSep, oSep in enumerate(loSep):
            rtree.insert(iSep, oSep.bounds)

        nCrossing = 0
        for edge in self.lEdge:
            oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                     (edge.B.x1, edge.B.y1)])
            prepared = prep(oEdge)
            # the rtree only gives candidates; any() stops at the first
            # separator that really intersects the segment
            edge.bCrossingSep = any(prepared.intersects(loSep[iSep])
                                    for iSep in rtree.intersection(oEdge.bounds))
            if edge.bCrossingSep:
                nCrossing += 1

        if self.bVerbose:
            traceln(
                " %d (/ %d) edges crossing at least one graphical separator" %
                (nCrossing, len(self.lEdge)))
def normaliseElement(nd, iNorm):
    """
    Replace the Coords of an element by a band of height iNorm built on its
    Baseline.

    The new "points" attribute lists the points of the baseline, followed by
    the points of the same line shifted by -iNorm along y, in reverse order.
    Coordinates are truncated with int().

    nd    : element in which a Baseline (xpBASELINE) and a pg:Coords are
            looked up; its first pg:Coords is modified in place
    iNorm : vertical offset between the baseline and the shifted line

    Raise NormaliseException if there is no Baseline or if the Baseline cannot
    be turned into a line. An element without any pg:Coords makes the final
    lookup raise IndexError.
    Return None.
    """
    def _serialise(points):
        # "x,y x,y ..." with integer coordinates
        return ' '.join("%d,%d" % (int(pt[0]), int(pt[1])) for pt in points)

    try:
        ndBaseline = nd.xpath(xpBASELINE, namespaces=dNS)[0]
    except IndexError:
        raise NormaliseException("WARNING: skipped element normalisation: no Baseline: %s" % etree.tostring(nd))

    try:
        baseline = ShapeLoader.node_to_LineString(ndBaseline)
    except ValueError:
        raise NormaliseException("WARNING: skipped element normalisation: invalid Coords: %s" % etree.tostring(nd))

    shifted = translate(baseline, yoff=-iNorm)

    # circular sequence: along the baseline, then back along the shifted line
    sPoints = _serialise(baseline.coords) + ' ' + _serialise(reversed(list(shifted.coords)))

    # ad-hoc way of setting the element coordinates
    nd.xpath(".//pg:Coords", namespaces=dNS)[0].set("points", sPoints)
def main(lsFilename, lsOutFilename):
    """
    Annotate the separators of each input file as consistent or not with the
    tables of the file, and write the result in the matching output file.

    Input and output names are paired with zip(), so extra names in the longer
    list are ignored.

    For each file:
    - every SeparatorRegion is loaded as a line and starts as consistent
    - in each TableRegion, each TextLine of each TableCell is reflected by a
      single point carrying the row and column range of its cell; the TextLine
      also gets a "Horizontal" attribute ("TRUE" or "nope")
    - a separator wider than high becomes inconsistent when some text above it
      belongs to a row >= the smallest row found below it; other separators
      are checked the same way with columns and left / right
    - each SeparatorRegion gets DU_Sep="S" (consistent) or DU_Sep="I"

    When a file has no separator, a warning is traced and the file is written
    without annotation. Statistics are traced per file and globally.
    """
    # for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root, 'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        for _o in loSep:
            _o._bConsistent = True

        if not lNdSep:
            traceln("Warning: no separator in %s"%sFilename)
        else:
            traceln("%25s  %d separators" % (sFilename, len(lNdSep)))
            lNdTR = MultiPageXml.getChildByName(root, 'TableRegion')
            for ndTR in lNdTR:
                lNdCells = MultiPageXml.getChildByName(ndTR, 'TableCell')
                if not lNdCells:
                    continue

                # build a list of Shapely objects augmented with our own table attributes
                loText = []
                for ndCell in lNdCells:
                    minRow = int(ndCell.get('row'))
                    minCol = int(ndCell.get('col'))
                    maxRow = minRow + int(ndCell.get('rowSpan')) - 1
                    maxCol = minCol + int(ndCell.get('colSpan')) - 1
                    for ndText in MultiPageXml.getChildByName(ndCell, 'TextLine'):
                        try:
                            oText = ShapeLoader.node_to_Polygon(ndText)
                        except Exception:
                            # skip the text line, but let KeyboardInterrupt /
                            # SystemExit go through
                            traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                            continue
                        # reflecting the textbox as a single point
                        (minx, miny, maxx, maxy) = oText.bounds

                        # half of the smallest side of the bounding box
                        fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0)
                        if isBaselineHorizontal(ndText):
                            # supposed horizontal text: point close to the
                            # left side, at mid-height
                            oText = geom.Point(minx + fDelta, (maxy + miny)/2.0)
                            ndText.set("Horizontal", "TRUE")
                        else:
                            # point close to the side of smallest y, at
                            # mid-width
                            ndText.set("Horizontal", "nope")
                            oText = geom.Point((minx + maxx)/2.0, miny + fDelta)

                        # considering it as a point; using its centroid
                        # does not work well due to long texts
                        oText._minRow, oText._minCol = minRow, minCol
                        oText._maxRow, oText._maxCol = maxRow, maxCol
                        if DEBUG: oText._domnd = ndText
                        loText.append(oText)

                traceln("    TableRegion  %d texts" % (len(loText)))

                if loText:
                    # checking in turn each separator for table-consistency
                    sp = ShapePartition(loText)

                    for oSep in loSep:
                        (minx, miny, maxx, maxy) = oSep.bounds
                        if maxx - minx >= maxy - miny:
                            # supposed horizontal: compare rows above / below
                            lAbove = sp.getObjectAboveLine(oSep)
                            if lAbove:
                                maxRowBefore = max(_o._maxRow for _o in lAbove)
                                lBelow = sp.getObjectBelowLine(oSep)
                                if lBelow:
                                    minRowAfter = min(_o._minRow for _o in lBelow)
                                    if maxRowBefore >= minRowAfter:
                                        oSep._bConsistent = False
                        else:
                            # supposed vertical: compare columns left / right
                            l1 = sp.getObjectOnLeftOfLine(oSep)
                            if l1:
                                maxColBefore = max(_o._maxCol for _o in l1)
                                l2 = sp.getObjectOnRightOfLine(oSep)
                                if l2:
                                    minColAfter = min(_o._minCol for _o in l2)
                                    if maxColBefore >= minColAfter:
                                        oSep._bConsistent = False
                                        if DEBUG:
                                            for o in l1:
                                                if o._maxCol >= minColAfter: print("too much on right", etree.tostring(o._domnd))
                                            for o in l2:
                                                if o._minCol <= maxColBefore: print("too much on left", etree.tostring(o._domnd))
                # end of TableRegion
            # end of document: store the verdict on each separator node
            for ndSep, oSep in zip(lNdSep, loSep):
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                else:
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1

        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        # the small constant avoids a division by zero when there is no separator
        traceln('%.2f%% consistent separators - annotation done for %s  --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))

        del doc
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), cnt))
    def addSeparatorFeature(self):
        """
        Compute, for every edge of self.lEdge, features describing how it
        crosses the graphical separators of the page.

        The pc:SeparatorRegion elements of the page containing the first node
        of self.lNode are loaded as line objects and indexed in an rtree. Each
        edge is reflected by the segment from (A.x1, A.y1) to (B.x1, B.y1).
        Only intersections that are a single point are kept as crossing
        points; the others are traced and skipped.

        Attributes set on each edge:
        - bCrossingSep    : True if there is at least one crossing point
        - sep_NbCrossing  : number of crossing points
        - sep_SpanLen     : width + height of the bounding box of the
                            crossing points
        - sep_AvgSpanSgmt : sep_SpanLen divided by the number of crossing
                            points
        - sep_AvgSepLen   : summed length of all intersecting separators
                            divided by the number of crossing points
        All numeric features are 0 when there is no crossing point.
        """
        # graphical separators of the page
        nsmap = {"pc": PageXml.NS_PAGE_XML}
        ndPage = self.lNode[0].node.xpath("ancestor::pc:Page", namespaces=nsmap)[0]
        loSep = [ShapeLoader.node_to_LineString(ndSep)
                 for ndSep in ndPage.xpath(".//pc:SeparatorRegion", namespaces=nsmap)]

        if self.bVerbose:
            traceln(" %d graphical separators" % len(loSep))

        # spatial index over the bounding boxes of the separators
        rtree = index.Index()
        for iSep, oSep in enumerate(loSep):
            rtree.insert(iSep, oSep.bounds)

        nCrossing = 0
        for edge in self.lEdge:
            oEdge = geom.LineString([(edge.A.x1, edge.A.y1),
                                     (edge.B.x1, edge.B.y1)])
            prepared = prep(oEdge)
            lPoints = []
            fTotalLen = 0
            for iSep in rtree.intersection(oEdge.bounds):
                # the rtree only gives candidates: check each one
                oSep = loSep[iSep]
                if not prepared.intersects(oSep):
                    continue
                # counted even if the intersection is skipped just below
                fTotalLen += oSep.length
                oHit = oEdge.intersection(oSep)
                if type(oHit) != geom.Point:
                    traceln('Intersection in not a point: skipping it')
                    continue
                lPoints.append(oHit)

            nPoints = len(lPoints)
            if nPoints == 0:
                edge.bCrossingSep = False
                edge.sep_NbCrossing = 0
                edge.sep_SpanLen = 0
                edge.sep_AvgSpanSgmt = 0
                edge.sep_AvgSepLen = 0
                continue

            nCrossing += 1
            x0, y0, x1, y1 = geom.MultiPoint(lPoints).bounds
            fSpan = abs(x0 - x1) + abs(y0 - y1)
            edge.bCrossingSep = True
            edge.sep_NbCrossing = nPoints
            edge.sep_SpanLen = fSpan
            edge.sep_AvgSpanSgmt = fSpan / nPoints
            edge.sep_AvgSepLen = fTotalLen / nPoints

        if self.bVerbose:
            traceln(
                " %d (/ %d) edges crossing at least one graphical separator" %
                (nCrossing, len(self.lEdge)))
Example #7
0
    def addSeparatorFeature(self):
        """
        Compute, for every edge of self.lEdge, features describing how it
        crosses the graphical separators of the page.

        The pc:SeparatorRegion elements of the page are loaded as line
        objects and indexed in an rtree. The result is cached on
        PageXmlSeparatorRegion so that a second call on the same object
        (by the reified edge code maybe) reuses it.

        Each edge is reflected by a segment lying between its two boxes:
        - if max(A.x1, B.x1) > min(A.x2, B.x2), a horizontal segment at the
          middle y, going from min(A.x2, B.x2) to max(A.x1, B.x1)
        - otherwise (vertical edge or box overlap), a vertical segment at the
          middle x, going from min(A.y2, B.y2) to max(A.y1, B.y1)

        Intersections that are a point or a multi-point give crossing points;
        any other kind of intersection is traced and skipped.

        Attributes set on each edge:
        - bCrossingSep    : True if there is at least one crossing point
        - sep_NbCrossing  : number of crossing points
        - sep_SpanLen     : width + height of the bounding box of the
                            crossing points
        - sep_AvgSpanSgmt : sep_SpanLen divided by the number of crossing
                            points
        - sep_AvgSepLen   : summed length of all intersecting separators
                            divided by the number of crossing points
        All numeric features are 0 when there is no crossing point.
        """
        if self._cachedSelf == self:
            # same call... (by reified edge code maybe)
            (loSep, idx) = PageXmlSeparatorRegion._cached
        else:
            # graphical separators
            dNS = {"pc":PageXml.NS_PAGE_XML}
            someNode = self.lNode[0]
            lndPage = someNode.node.xpath("//pc:Page", namespaces=dNS)
            assert len(lndPage) == 1, "INTERNAL ERROR: CODE NOT READY FOR MULTIPAGE..."
            ndPage = someNode.node.xpath("ancestor::pc:Page", namespaces=dNS)[0]
            lNdSep = ndPage.xpath(".//pc:SeparatorRegion", namespaces=dNS)
            loSep = [ShapeLoader.node_to_LineString(_nd) for _nd in lNdSep]

            if self.bVerbose: traceln("\t\t\t%d graphical separators"%len(loSep))

            # make an indexed rtree
            idx = index.Index()
            for i, oSep in enumerate(loSep):
                idx.insert(i, oSep.bounds)

            PageXmlSeparatorRegion._cachedSelf = self
            PageXmlSeparatorRegion._cached     = (loSep, idx)

        # take each edge in turn and list the separators it crosses
        nCrossing = 0
        for edge in self.lEdge:
            xLeft, xRight = max(edge.A.x1, edge.B.x1), min(edge.A.x2, edge.B.x2)
            yTop, yBottom = max(edge.A.y1, edge.B.y1), min(edge.A.y2, edge.B.y2)
            if xLeft > xRight:
                # horizontal edge
                yMiddle = (yTop + yBottom) / 2.0
                oEdge = geom.LineString([(xRight, yMiddle), (xLeft, yMiddle)])
            else:
                # vertical edge or box overlap
                xMiddle = (xLeft + xRight) / 2.0
                oEdge = geom.LineString([(xMiddle, yBottom), (xMiddle, yTop)])

            prepO = prep(oEdge)
            lCrossingPoints = []
            fSepTotalLen = 0
            for i in idx.intersection(oEdge.bounds):
                # the rtree only gives candidates: check each one in turn
                oSep = loSep[i]
                if not prepO.intersects(oSep):
                    continue
                # counted even if the intersection is skipped just below
                fSepTotalLen += oSep.length
                oPt = oEdge.intersection(oSep)
                if isinstance(oPt, geom.Point):
                    lCrossingPoints.append(oPt)
                elif isinstance(oPt, geom.MultiPoint):
                    # go through .geoms: a multi-part geometry is not
                    # iterable itself in Shapely 2
                    lCrossingPoints.extend(geom.Point(p.x, p.y) for p in oPt.geoms)
                else:
                    traceln('\t\t\tIntersection in not a point: skipping it')

            if lCrossingPoints:
                nCrossing += 1
                edge.bCrossingSep = True
                edge.sep_NbCrossing = len(lCrossingPoints)
                minx, miny, maxx, maxy = geom.MultiPoint(lCrossingPoints).bounds
                edge.sep_SpanLen = abs(minx-maxx) + abs(miny-maxy)
                edge.sep_AvgSpanSgmt = edge.sep_SpanLen / len(lCrossingPoints)
                edge.sep_AvgSepLen = fSepTotalLen / len(lCrossingPoints)
            else:
                edge.bCrossingSep = False
                edge.sep_NbCrossing = 0
                edge.sep_SpanLen = 0
                edge.sep_AvgSpanSgmt = 0
                edge.sep_AvgSepLen = 0

        if self.bVerbose:
            traceln("\t\t\t%d (/ %d) edges crossing at least one graphical separator"%(nCrossing, len(self.lEdge)))
    def map_to_rows(cls, ndPage, maxRow, lCluster):
        """
        Find the vertical extent of each cluster and of each row, then create
        one horizontal SeparatorRegion per row in the page.

        ndPage   : page element; its "imageWidth" attribute gives the length
                   of the separators
        maxRow   : index of the last row
        lCluster : clusters; each one provides getSetID() (ids of its
                   elements, each having a pg:Baseline) and a minrow attribute

        Side effects on each cluster:
        - minY, maxY : extreme y of the two end points of its baselines
        - minrow     : possibly raised by the consistency check
        - row1, row2 : row1 is minrow; row2 is the last row, starting from
                       row 0, in a run of rows whose minimum y is above maxY
                       (minrow if there is none)
        Side effects on the page: every SeparatorRegion having an "algo"
        attribute is removed, then one SeparatorRegion with
        algo="tabulate_rows" is appended per row in 0..maxRow, 20 units above
        the smallest y of the row.

        NOTE(review): a row without any cluster keeps the sentinel value, so
        its separator is written at y = 9999999999 - 20. Confirm this is
        intended.

        Return None.
        """
        iSENTINEL = 9999999999  # larger than any expected y coordinate
        iMARGIN = 20            # vertical gap between a separator and its row

        # reflect each cluster by the ending points of its baselines, e.g.:
        # <TextLine id="r1l5" ... row="0" rowSpan="1" col="0" colSpan="1">
        #   <Coords points="217,688 245,685 ... 217,645"/>
        #   <Baseline points="217,688 245,685 273,685 301,688 329,690 358,689"/>
        # </TextLine>
        dMinYByRow = defaultdict(lambda: iSENTINEL)
        for c in lCluster:
            c.maxY = -1
            c.minY = iSENTINEL
            for _id in c.getSetID():
                nd = ndPage.xpath(".//*[@id='%s']/pg:Baseline" % _id, namespaces=dNS)[0]
                ls = ShapeLoader.node_to_LineString(nd)
                # the boundary of the line is its two end points
                pA, pB = ls.boundary.geoms
                minY = min(pA.y, pB.y)
                c.minY = min(c.minY, minY)
                c.maxY = max(c.maxY, pA.y, pB.y)
                dMinYByRow[c.minrow] = min(dMinYByRow[c.minrow], minY)

        # check consistency: a cluster starting below the top of a later row
        # is moved to that row
        for c in lCluster:
            for i in range(maxRow, c.minrow, -1):
                if c.minY > dMinYByRow[i]:
                    assert c.minrow < i
                    # how possible??? fix!!
                    c.minrow = i
                    break

        # compute row1 and row2
        for c in lCluster:
            c.row1 = c.minrow
            c.row2 = c.minrow
            for i in range(0, maxRow+1):
                if c.maxY > dMinYByRow[i]:
                    c.row2 = i
                else:
                    break

        w = float(ndPage.get("imageWidth"))

        # remove the separators created by a previous run; they are selected
        # at any depth, so each one is removed from its own parent
        for nd in ndPage.xpath(".//pg:SeparatorRegion[@algo]", namespaces=dNS):
            nd.getparent().remove(nd)

        # one horizontal separator per row, across the page
        for row in range(maxRow+1):
            Y0 = dMinYByRow[row] - iMARGIN
            Yw = Y0
            ndSep = PageXml.createPageXmlNode("SeparatorRegion")
            ndSep.set("algo", "tabulate_rows")
            ndCoords = PageXml.createPageXmlNode("Coords")
            ndCoords.set("points", "%d,%d %d,%d" %(0, Y0, w, Yw))
            ndSep.append(ndCoords)
            ndSep.tail = "\n"
            ndPage.append(ndSep)

        return