def parseDocFile(self, sFilename, iVerbose=0):
    """
    Load that document as a CRF Graph.
    Also set the self.doc variable!

    The method:
    - parses sFilename into self.doc
    - resets self.lNode, self.lEdge, self.lNodeBlock and self.lNodeGridLine
    - removes any grid already present in the DOM, then adds a fresh one
    - for each page, creates the nodes of each node type, the edges between
      "classic" nodes, and the edges involving the "special" nodes
      (see computeSpecialEdges)

    sFilename: the file to parse
    iVerbose:  any true value traces the total node/edge counts;
               a value >= 2 also traces per-page counts

    Return a CRF Graph object (self)
    """
    self.doc = etree.parse(sFilename)
    self.lNode, self.lEdge = list(), list()
    self.lNodeBlock = []  # text node
    self.lNodeGridLine = []  # grid line node

    root = self.doc.getroot()

    doer = GridAnnotator(self.iGridStep_H, self.iGridStep_V)

    # map the groundtruth table separators, if any, to our grid
    ltlHlV = doer.get_grid_GT_index_from_DOM(root, self.fMinPageCoverage)
    for (lHi, lVi) in ltlHlV:
        traceln(" - found %d horizontal, %d vertical GT separators"
                % (len(lHi), len(lVi)))

    # create DOM node reflecting the grid
    # first clean (just in case!)
    n = doer.remove_grid_from_dom(root)
    if n > 0:
        traceln(" - removed %d existing grid lines" % n)

    # we add GridSeparator elements. Groundtruth ones have type="1"
    n = doer.add_grid_to_DOM(root, ltlHlV)
    traceln(" - added %d grid lines %s"
            % (n, (self.iGridStep_H, self.iGridStep_V)))

    # split the node types: the "classic" ones versus all the others
    lClassicType = [nt for nt in self.getNodeTypeList()
                    if nt in self._lClassicNodeType]
    lSpecialType = [nt for nt in self.getNodeTypeList()
                    if nt not in self._lClassicNodeType]

    for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
        # now that we have the page, let's create the node for each type!
        lClassicPageNode = [
            nd
            for nodeType in lClassicType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]
        lSpecialPageNode = [
            nd
            for nodeType in lSpecialType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]

        self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
        self.lNodeBlock.extend(lClassicPageNode)

        self.lNode.extend(lSpecialPageNode)  # e.g. the grid lines!
        self.lNodeGridLine.extend(lSpecialPageNode)

        # no previous page to consider (for cross-page links...) => None
        lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
        self.lEdge.extend(lClassicPageEdge)

        # Now, compute edges between special and classic objects...
        lSpecialPageEdge = self.computeSpecialEdges(lClassicPageNode,
                                                    lSpecialPageNode)
        self.lEdge.extend(lSpecialPageEdge)

        if iVerbose >= 2:
            traceln("\tPage %5d" % (pnum))
            # two placeholders, two values: the page number is traced above
            traceln("\t block: %6d nodes %7d edges (to block)"
                    % (len(lClassicPageNode), len(lClassicPageEdge)))
            traceln("\t line: %6d nodes %7d edges (from block)"
                    % (len(lSpecialPageNode), len(lSpecialPageEdge)))

    if iVerbose:
        traceln("\t\t (%d nodes, %d edges)"
                % (len(self.lNode), len(self.lEdge)))

    return self
def computeSpecialEdges(cls, lClassicPageNode, lSpecialPageNode):
    """
    Compute:
    - edges between each block and the grid line above/across/below the block
    - edges between grid lines

    lClassicPageNode: the block nodes of the page; must not be empty
    lSpecialPageNode: the grid line nodes of the page; each must snap to a
                      distinct grid index (asserted below)

    Each block-to-line edge gets:
    - a .len attribute: the signed distance from the block's vertical middle
      to the grid line, divided by iGridStep_V * iBlockVisibility
    - a ._gridtype attribute: -1 for the lines at index <= floor(y1/step),
      0 for the lines crossing the block, +1 for the lines at
      index >= ceil(y2/step)
    Each line-to-line edge gets a .len equal to the index difference.

    return a list of edges: the block-to-line edges kept by _filterBadEdge,
    followed by the line-to-line edges
    """
    iStep = cls.iGridStep_V

    # indexing the grid lines
    dGridLineByIndex = {
        GridAnnotator.snapToGridIndex(nd.y1, iStep): nd
        for nd in lSpecialPageNode
    }
    # two lines snapping to the same index would silently shadow each other
    for nd in lSpecialPageNode:
        assert dGridLineByIndex[GridAnnotator.snapToGridIndex(nd.y1, iStep)] == nd, \
            "internal error inconsistent grid"

    # block to grid line edges
    lEdge = []
    fLenNorm = float(iStep * cls.iBlockVisibility)
    # NOTE(review): imin starts at 100, so it stays at 100 if every visited
    # index is larger -- confirm _filterBadEdge tolerates that.
    imin, imax = 100, -1
    assert lClassicPageNode, "ERROR: empty page!!??"
    for ndBlock in lClassicPageNode:
        # index of the grid line at or above the top of the block, and of
        # the one at or below its bottom
        i1 = int(math.floor(ndBlock.y1 / float(iStep)))
        i2 = int(math.ceil(ndBlock.y2 / float(iStep)))
        assert i2 >= i1

        yBlkAvg = (ndBlock.y1 + ndBlock.y2) / 2.0

        # Also make visible the iBlockVisibility-1 previous grid lines, if any
        for i in range(max(0, i1 - cls.iBlockVisibility + 1), i1 + 1):
            edge = Edge_BL(ndBlock, dGridLineByIndex[i])
            edge.len = (yBlkAvg - i * iStep) / fLenNorm
            edge._gridtype = -1
            lEdge.append(edge)
            imin = min(i, imin)

        for i in range(max(0, i1 + 1), max(0, i2)):
            ndLine = dGridLineByIndex[i]
            edge = Edge_BL(ndBlock, ndLine)
            edge.len = (yBlkAvg - i * iStep) / fLenNorm
            edge._gridtype = 0  # grid line is crossing the block
            assert ndBlock.y1 < i * iStep
            assert i * iStep < ndBlock.y2
            lEdge.append(edge)
            imax = max(imax, i)

        for i in range(max(0, i2), i2 + cls.iBlockVisibility):
            try:
                ndLine = dGridLineByIndex[i]
            except KeyError:
                break  # out of the grid
            edge = Edge_BL(ndBlock, ndLine)
            edge.len = (yBlkAvg - i * iStep) / fLenNorm
            edge._gridtype = +1
            lEdge.append(edge)
            imax = max(imax, i)

    # now filter those edges
    n0 = len(lEdge)
    lEdge = cls._filterBadEdge(lEdge, imin, imax, dGridLineByIndex)
    # count before minus count after, so the trace shows a positive number
    traceln(" - filtering: removed %d edges due to obstruction."
            % (n0 - len(lEdge)))

    # what differs from the previous version
    cls._makeConsistentLabelForEmptyGridRow(lEdge, lClassicPageNode,
                                            dGridLineByIndex)

    # grid line to grid line edges
    # (looked up by 0..n-1, so the grid indices must be contiguous from 0)
    n = len(dGridLineByIndex)
    for i in range(n):
        A = dGridLineByIndex[i]
        for j in range(i + 1, min(n, i + cls.iGridVisibility + 1)):
            edge = Edge_LL(A, dGridLineByIndex[j])
            edge.len = (j - i)
            lEdge.append(edge)

    return lEdge