def __init__(self, A, B, typ):
    """
    Build an edge going from a block A to a cut line B.

    typ tells how the cut line relates to the block:
        -1 = upward
         0 = crossing
        +1 = downward
         2 = crossing, with cut line exactly at A baseline
    """
    # the generic Edge constructor receives both end points
    Edge.__init__(self, A, B)
    # keep the relation code on the edge, under the _type attribute
    self._type = typ
def parseDocFile(self, sFilename, iVerbose=0):
    """
    Load that document as a CRF Graph.
    Also set the self.doc variable!

    The DOM is first cleaned of any existing cut lines, then cut lines are
    added to it (labelled from the groundtruth separators, if any).
    Then, page by page, the nodes of each node type are created, as well as
    the edges among "classic" nodes and between classic and "special" nodes.

    sFilename: passed as is to etree.parse
    iVerbose:  any true value traces the total node and edge counts,
               >= 2 also traces per-page counts

    Side effects: (re)sets self.doc, self.lNode, self.lEdge, self.lNodeBlock
    and self.lNodeCutLine.

    Return a CRF Graph object (self)
    """
    self.doc = etree.parse(sFilename)
    self.lNode, self.lEdge = list(), list()
    self.lNodeBlock = []    # text node
    self.lNodeCutLine = []  # cut line node

    root = self.doc.getroot()

    doer = BaselineCutAnnotator()
    doer.setLabelScheme_SIO()  # use SIO instead of SO labels!
    # NOTE: doer.setModulo(self.iModulo) is optional and is not called here

    # load the groundtruth table separators, if any, per page (1 in tABP)
    ltlYlX = doer.get_separator_YX_from_DOM(root, self.fMinPageCoverage)
    for (lHi, lVi) in ltlYlX:
        traceln(" - found %d horizontal, %d vertical GT separators"
                % (len(lHi), len(lVi)))

    # create DOM node reflecting the cuts
    # first clean (just in case!)
    n = doer.remove_cuts_from_dom(root)
    if n > 0:
        traceln(" - removed %d pre-existing cut lines" % n)

    # if GT, then we have labelled cut lines in DOM
    # (called for its effect on the DOM; the returned value is not used)
    doer.add_cut_to_DOM(root, ltlYlX=ltlYlX)

    # split the node types into classic ones and the others (special)
    lNodeType = self.getNodeTypeList()
    lClassicType = [nt for nt in lNodeType
                    if nt in self._lClassicNodeType]
    lSpecialType = [nt for nt in lNodeType
                    if nt not in self._lClassicNodeType]

    for (pnum, page, domNdPage) in self._iter_Page_DocNode(self.doc):
        # now that we have the page, let's create the node for each type!
        lClassicPageNode = [
            nd
            for nodeType in lClassicType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]
        lSpecialPageNode = [
            nd
            for nodeType in lSpecialType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]

        self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
        self.lNodeBlock.extend(lClassicPageNode)

        self.lNode.extend(lSpecialPageNode)  # e.g. the cut lines!
        self.lNodeCutLine.extend(lSpecialPageNode)

        # no previous page to consider (for cross-page links...) => None
        lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
        self.lEdge.extend(lClassicPageEdge)

        # Now, compute edges between special and classic objects...
        lSpecialPageEdge = self.computeSpecialEdges(
            lClassicPageNode, lSpecialPageNode, doer.bCutIsBeforeText)
        self.lEdge.extend(lSpecialPageEdge)

        if iVerbose >= 2:
            traceln("\tPage %5d" % (pnum))
            # two placeholders => exactly two values (the page number is
            # already traced above; passing it again raised a TypeError)
            traceln("\t block: %6d nodes %7d edges (to block)"
                    % (len(lClassicPageNode), len(lClassicPageEdge)))
            traceln("\t line: %6d nodes %7d edges (from block)"
                    % (len(lSpecialPageNode), len(lSpecialPageEdge)))

    if iVerbose:
        traceln("\t\t (%d nodes, %d edges)"
                % (len(self.lNode), len(self.lEdge)))

    return self
def parseDocFile(self, sFilename, iVerbose=0):
    """
    Load that document as a CRF Graph.
    Also set the self.doc variable!

    The DOM is first cleaned of any existing grid lines, then grid lines are
    added to it (the groundtruth separators, if any, being mapped onto the
    grid). Then, page by page, the nodes of each node type are created, as
    well as the edges among "classic" nodes and between classic and
    "special" nodes.

    sFilename: passed as is to etree.parse
    iVerbose:  any true value traces the total node and edge counts,
               >= 2 also traces per-page counts

    Side effects: (re)sets self.doc, self.lNode, self.lEdge, self.lNodeBlock
    and self.lNodeGridLine.

    Return a CRF Graph object (self)
    """
    self.doc = etree.parse(sFilename)
    self.lNode, self.lEdge = list(), list()
    self.lNodeBlock = []     # text node
    self.lNodeGridLine = []  # grid line node

    root = self.doc.getroot()

    doer = GridAnnotator(self.iGridStep_H, self.iGridStep_V)

    # map the groundtruth table separators, if any, to our grid
    ltlHlV = doer.get_grid_GT_index_from_DOM(root, self.fMinPageCoverage)
    for (lHi, lVi) in ltlHlV:
        traceln(" - found %d horizontal, %d vertical GT separators"
                % (len(lHi), len(lVi)))

    # create DOM node reflecting the grid
    # first clean (just in case!)
    n = doer.remove_grid_from_dom(root)
    if n > 0:
        traceln(" - removed %d existing grid lines" % n)

    # we add GridSeparator elements. Groundtruth ones have type="1"
    n = doer.add_grid_to_DOM(root, ltlHlV)
    traceln(" - added %d grid lines %s"
            % (n, (self.iGridStep_H, self.iGridStep_V)))

    # split the node types into classic ones and the others (special)
    lNodeType = self.getNodeTypeList()
    lClassicType = [nt for nt in lNodeType
                    if nt in self._lClassicNodeType]
    lSpecialType = [nt for nt in lNodeType
                    if nt not in self._lClassicNodeType]

    for pnum, page, domNdPage in self._iter_Page_DocNode(self.doc):
        # now that we have the page, let's create the node for each type!
        lClassicPageNode = [
            nd
            for nodeType in lClassicType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]
        lSpecialPageNode = [
            nd
            for nodeType in lSpecialType
            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
        ]

        self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
        self.lNodeBlock.extend(lClassicPageNode)

        self.lNode.extend(lSpecialPageNode)  # e.g. the grid lines!
        self.lNodeGridLine.extend(lSpecialPageNode)

        # no previous page to consider (for cross-page links...) => None
        lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
        self.lEdge.extend(lClassicPageEdge)

        # Now, compute edges between special and classic objects...
        lSpecialPageEdge = self.computeSpecialEdges(
            lClassicPageNode, lSpecialPageNode)
        self.lEdge.extend(lSpecialPageEdge)

        if iVerbose >= 2:
            traceln("\tPage %5d" % (pnum))
            # two placeholders => exactly two values (the page number is
            # already traced above; passing it again raised a TypeError)
            traceln("\t block: %6d nodes %7d edges (to block)"
                    % (len(lClassicPageNode), len(lClassicPageEdge)))
            traceln("\t line: %6d nodes %7d edges (from block)"
                    % (len(lSpecialPageNode), len(lSpecialPageEdge)))

    if iVerbose:
        traceln("\t\t (%d nodes, %d edges)"
                % (len(self.lNode), len(self.lEdge)))

    return self