Esempio n. 1
0
    def computeSpecialEdges(self, lClassicPageNode, lSpecialPageNode,
                            bCutIsBeforeText):
        """
        Compute:
        - edges between each block and the cut line above/across/below the block
        - edges between cut lines
        return a list of edges
        """

        #augment the block with the coordinate of its baseline central point
        for blk in lClassicPageNode:
            try:
                x, y = BaselineCutAnnotator.getDomBaselineXY(blk.node)
                blk.x_bslne = x
                blk.y_bslne = y
            except IndexError:
                traceln("** WARNING: no Baseline in ", blk.domid)
                traceln("** Using x2 and y2 instead... :-/")
                blk.x_bslne = blk.x2
                blk.y_bslne = blk.y2

        for cutBlk in lSpecialPageNode:
            assert cutBlk.y1 == cutBlk.y2
            cutBlk.y1 = int(round(cutBlk.y1))  #DeltaFun make float
            cutBlk.y2 = cutBlk.y1

        #block to cut line edges
        lEdge = []
        for blk in lClassicPageNode:
            for cutBlk in lSpecialPageNode:
                if blk.y_bslne == cutBlk.y1:
                    edge = Edge_BL(blk, cutBlk)
                    edge.len = 0
                    edge._type = 0  # Cut line is crossing the block
                    lEdge.append(edge)
                elif abs(blk.y_bslne - cutBlk.y1) <= self.iBlockVisibility:
                    edge = Edge_BL(blk, cutBlk)
                    # experiments show that abs helps
                    # edge.len = (blk.y_bslne - cutBlk.y1) / self.iBlockVisibility
                    edge.len = abs(blk.y_bslne -
                                   cutBlk.y1) / self.iBlockVisibility
                    edge._type = -1 if blk.y_bslne > cutBlk.y1 else +1
                    lEdge.append(edge)

        #sort those edge from top to bottom
        lEdge.sort(key=lambda o: o.B.y1)  # o.B.y1 == o.B.y2 by construction

        #now filter those edges
        n0 = len(lEdge)
        if False:
            print("--- before filtering: %d edges" % len(lEdge))
            lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
            for edge in lSortedEdge:
                print("Block domid=%s y1=%s y2=%s yg=%s" %
                      (edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne) +
                      "  %s line %s " %
                      (["↑", "-", "↓"][1 + edge._type], edge.B.y1) +
                      "domid=%s y1=%s  " % (edge.B.domid, edge.B.y1) +
                      str(id(edge)))
        lEdge = self._filterBadEdge(lEdge, lSpecialPageNode, bCutIsBeforeText)
        traceln(" - filtering: removed %d edges due to obstruction." %
                (n0 - len(lEdge)))
        if False:
            print("--- After filtering: %d edges" % len(lEdge))
            lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
            print(len(lSortedEdge))
            for edge in lSortedEdge:
                print("Block domid=%s y1=%s y2=%s yg=%s" %
                      (edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne) +
                      "  %s line %s " %
                      (["↑", "-", "↓"][1 + edge._type], edge.B.y1) +
                      "domid=%s y1=%s  " % (edge.B.domid, edge.B.y1) +
                      str(id(edge)))

        if self.iLineVisibility > 0:
            # Cut line to Cut line edges
            lSpecialPageNode.sort(key=lambda o: o.y1)
            for i, A in enumerate(lSpecialPageNode):
                for B in lSpecialPageNode[i + 1:]:
                    if B.y1 - A.y1 <= self.iLineVisibility:
                        edge = Edge_LL(A, B)
                        edge.len = (B.y1 - A.y1) / self.iLineVisibility
                        assert edge.len >= 0
                        lEdge.append(edge)
                    else:
                        break

        return lEdge
Esempio n. 2
0
    def parseDocFile(self, sFilename, iVerbose=0):
        """
        Load that document as a CRF Graph.
        Also set the self.doc variable!
        
        Return a CRF Graph object
        """
        self.doc = etree.parse(sFilename)
        self.lNode, self.lEdge = list(), list()
        self.lNodeBlock = []  # text node
        self.lNodeCutLine = []  # cut line node

        root = self.doc.getroot()

        doer = BaselineCutAnnotator()
        doer.setLabelScheme_SIO()  #use SIO instead of SO labels!

        #doer.setModulo(self.iModulo)  # this is optional

        #load the groundtruth table separators, if any, per page (1 in tABP)
        ltlYlX = doer.get_separator_YX_from_DOM(root, self.fMinPageCoverage)
        for (lHi, lVi) in ltlYlX:
            traceln(" - found %d horizontal,  %d vertical  GT separators" %
                    (len(lHi), len(lVi)))

        #create DOM node reflecting the cuts
        #first clean (just in case!)
        n = doer.remove_cuts_from_dom(root)
        if n > 0:
            traceln(" - removed %d pre-existing cut lines" % n)

        # if GT, then we have labelled cut lines in DOM
        _ltlYCutXCut = doer.add_cut_to_DOM(root, ltlYlX=ltlYlX)

        lClassicType = [
            nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType
        ]
        lSpecialType = [
            nt for nt in self.getNodeTypeList()
            if nt not in self._lClassicNodeType
        ]

        for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
            #now that we have the page, let's create the node for each type!
            lClassicPageNode = [
                nd for nodeType in lClassicType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]
            lSpecialPageNode = [
                nd for nodeType in lSpecialType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]

            self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
            self.lNodeBlock.extend(lClassicPageNode)

            self.lNode.extend(lSpecialPageNode)  # e.g. the cut lines!
            self.lNodeCutLine.extend(lSpecialPageNode)

            #no previous page to consider (for cross-page links...) => None
            lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
            self.lEdge.extend(lClassicPageEdge)

            # Now, compute edges between special and classic objects...
            lSpecialPageEdge = self.computeSpecialEdges(
                lClassicPageNode, lSpecialPageNode, doer.bCutIsBeforeText)
            self.lEdge.extend(lSpecialPageEdge)

            #if iVerbose>=2: traceln("\tPage %5d    %6d nodes    %7d edges"%(pnum, len(lPageNode), len(lPageEdge)))
            if iVerbose >= 2:
                traceln("\tPage %5d" % (pnum))
                traceln("\t   block: %6d nodes    %7d edges (to block)" %
                        (pnum, len(lClassicPageNode), len(lClassicPageEdge)))
                traceln("\t   line: %6d nodes    %7d edges (from block)" %
                        (pnum, len(lSpecialPageNode), len(lSpecialPageEdge)))

        if iVerbose:
            traceln("\t\t (%d nodes,  %d edges)" %
                    (len(self.lNode), len(self.lEdge)))

        return self
Esempio n. 3
0
    def computeSpecialEdges(self, lClassicPageNode, lSpecialPageNode,
                            bCutIsBeforeText):
        """
        Compute:
        - edges between each block and the cut line above/across/below the block
        - edges between cut lines
        return a list of edges
        """
       
        #augment the block with the coordinate of its baseline central point
        for blk in lClassicPageNode:
            try:
                x,y = BaselineCutAnnotator.getDomBaselineXY(blk.node)
                blk.x_bslne = x
                blk.y_bslne = y
            except IndexError:
                traceln("** WARNING: no Baseline in ", blk.domid)
                traceln("** Using x2 and y2 instead... :-/")
                blk.x_bslne = int(round(blk.x2))
                blk.y_bslne = int(round(blk.y2))
                
                
        
        for cutBlk in lSpecialPageNode:
            assert cutBlk.y1 == cutBlk.y2, "Update code: horizontal cut only for now"
            cutBlk.y1 = int(round(cutBlk.y1))  #DeltaFun make float
            cutBlk.y2 = cutBlk.y1

        #block to cut line edges
        lEdge = []
        for blk in lClassicPageNode:
            for cutBlk in lSpecialPageNode:
                if blk.y1 <= cutBlk.y1 and cutBlk.y1 <= blk.y2:
                    #crossing
                    typ = 2 if blk.y_bslne == cutBlk.y1 else 0
                    edge = Edge_BL(blk, cutBlk, typ)
                    edge.len = (blk.y_bslne - cutBlk.y1) / abs(blk.y1 - blk.y2)
                    lEdge.append(edge)                    
                elif abs(blk.y_bslne - cutBlk.y1) <= self.iBlockVisibility:
                    typ = -1 if blk.y_bslne > cutBlk.y1 else +1
                    edge = Edge_BL(blk, cutBlk, typ)
                    edge.len = abs(blk.y_bslne - cutBlk.y1) / self.iBlockVisibility
                    lEdge.append(edge)                    
        
        #sort those edge from top to bottom
        lEdge.sort(key=lambda o: o.B.y1)  # o.B.y1 == o.B.y2 by construction
        
        #now filter those edges
        n0 = len(lEdge)
        if False:
            print("--- before filtering: %d edges" % len(lEdge))
            lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
            for edge in lSortedEdge:
                print("Block domid=%s y1=%s y2=%s yg=%s"%(edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne)
                        + "  %s line %s "%(["↑", "x", "↓", "-"][1+edge._type],                                       
                                       edge.B.y1)
                        + "domid=%s y1=%s  " %(edge.B.domid, edge.B.y1)
                        +str(id(edge))
                    )
        lEdge = self._filterBadEdge(lEdge, lSpecialPageNode, bCutIsBeforeText)
        traceln(" - filtering: removed %d edges due to obstruction." % (n0-len(lEdge)))
        if False:
            print("--- After filtering: %d edges" % len(lEdge))
            lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
            print(len(lSortedEdge))
            for edge in lSortedEdge:
                print("Block domid=%s y1=%s y2=%s yg=%s"%(edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne)
                        + "  %s line %s "%(["↑", "x", "↓", "-"][1+edge._type],                                       
                                       edge.B.y1)
                        + "domid=%s y1=%s  " %(edge.B.domid, edge.B.y1)
                        +str(id(edge))
                    )

#         # grid line to grid line edges
#         n = len(dGridLineByIndex)
#         for i in range(n):
#             A = dGridLineByIndex[i]
#             for j in range(i+1, min(n, i+cls.iGridVisibility+1)):
#                 edge = Edge_LL(A, dGridLineByIndex[j])
#                 edge.len = (j - i)
#                 lEdge.append(edge) 
        
        return lEdge