def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Tag the XML nodes corresponding to those cells.
    Modifies the XML DOM.
    """
    for cell in lCells:
        lText = MultiPageXml.getChildByName(cell, 'TextLine')

        # HEADER WISE: D CH O
        if int(cell.get('row')) < maxRowSpan:
            [x.set(sDUHeader, lLabels_HEADER[1]) for x in lText]
        else:
            [x.set(sDUHeader, lLabels_HEADER[0]) for x in lText]

        # ROW WISE: B I E S O
        if len(lText) == 0:
            pass
        elif len(lText) == 1:
            lText[0].set(sDURow, lLabelsBIESO_R[3])
        else:
            # lText.sort(key=lambda x: float(x.prop('y')))
            lText[0].set(sDURow, lLabelsBIESO_R[0])
            [x.set(sDURow, lLabelsBIESO_R[1]) for x in lText[1:-1]]
            lText[-1].set(sDURow, lLabelsBIESO_R[2])
            # MultiPageXml.setCustomAttr(lText[0], "table", "rtype", lLabelsBIESO_R[0])
            # MultiPageXml.setCustomAttr(lText[-1], "table", "rtype", lLabelsBIESO_R[2])
            # [MultiPageXml.setCustomAttr(x, "table", "rtype", lLabelsBIESO_R[1]) for x in lText[1:-1]]

        # COLUMN WISE: M S O
        lCoords = cell.xpath("./a:%s" % "Coords", namespaces={"a": MultiPageXml.NS_PAGE_XML})
        coord = lCoords[0]
        sPoints = coord.get('points')
        plgn = Polygon.parsePoints(sPoints)
        (cx, cy, cx2, cy2) = plgn.getBoundingBox()
        for txt in lText:
            lCoords = txt.xpath("./a:%s" % "Coords", namespaces={"a": MultiPageXml.NS_PAGE_XML})
            coord = lCoords[0]
            sPoints = coord.get('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s IGNORED" % txt.get("id"))
            ## HOW to define a CM element!!!!
            if lXY:
                (x1, y1, x2, y2) = Polygon(lXY).getBoundingBox()
                if x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1):
                    txt.set(sDUCol, lLabelsSM_C[0])
                else:
                    txt.set(sDUCol, lLabelsSM_C[1])
            else:
                txt.set(sDUCol, lLabelsSM_C[-1])

    # textline outside table
    lRegions = MultiPageXml.getChildByName(root, 'TextRegion')
    for region in lRegions:
        lText = MultiPageXml.getChildByName(region, 'TextLine')
        [x.set(sDURow, lLabelsBIESO_R[-1]) for x in lText]
        [x.set(sDUCol, lLabelsSM_C[-1]) for x in lText]
        [x.set(sDUHeader, lLabels_HEADER[-1]) for x in lText]
    return
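# Illustration of the row-wise labelling rule above (B I E S O): a cell's
# text lines get S when alone, otherwise B ... I ... E. A minimal sketch,
# with plain strings standing in for lLabelsBIESO_R (an assumption; the
# real label constants are defined elsewhere in this module):
def bieso_for_cell(nLines):
    if nLines == 0:
        return []
    if nLines == 1:
        return ['S']
    return ['B'] + ['I'] * (nLines - 2) + ['E']

assert bieso_for_cell(1) == ['S']
assert bieso_for_cell(4) == ['B', 'I', 'I', 'E']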
try:
    if len(sys.argv) == 3:
        # COMPATIBILITY MODE
        # load mpxml
        sFilename = sys.argv[1]
        sOutFilename = sys.argv[2]
        lsFilename = [sFilename]
        lsOutFilename = [sOutFilename]
    else:
        # we expect a folder
        sInput = sys.argv[1]
        if os.path.isdir(sInput):
            lsFilename = [os.path.join(sInput, "col", s)
                          for s in os.listdir(os.path.join(sInput, "col"))
                          if s.endswith(".mpxml")]
            if not lsFilename:
                lsFilename = [os.path.join(sInput, "col", s)
                              for s in os.listdir(os.path.join(sInput, "col"))
                              if s.endswith(".pxml")]
            lsFilename.sort()
            lsOutFilename = [os.path.dirname(s) + os.sep + "c_" + os.path.basename(s)
                             for s in lsFilename]
        else:
            traceln("%s is not a folder" % sys.argv[1])
            raise IndexError()
except IndexError:
    traceln("Usage: %s ( input-file output-file | folder )" % sys.argv[0])
    exit(1)

traceln(lsFilename)
traceln("%d files to be processed" % len(lsFilename))
traceln(lsOutFilename)

main(lsFilename, lsOutFilename)
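# The output naming convention above, illustrated (pure path arithmetic):
# each annotated file is written next to its source, prefixed with "c_".
import os

s = os.path.join("data", "col", "doc.mpxml")
assert os.path.dirname(s) + os.sep + "c_" + os.path.basename(s) \
       == os.path.join("data", "col", "c_doc.mpxml")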
def main(lsFilename, lsOutFilename):
    # for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        lCells = MultiPageXml.getChildByName(root, 'TableCell')
        if not lCells:
            traceln("ERROR: no TableCell - SKIPPING THIS FILE!!!")
            continue

        # default: O for all cells: all cells must have all tags!
        for cell in lCells:
            lText = MultiPageXml.getChildByName(cell, 'TextLine')
            [x.set(sDURow, lLabelsBIESO_R[-1]) for x in lText]
            [x.set(sDUCol, lLabelsSM_C[-1]) for x in lText]
            [x.set(sDUHeader, lLabels_HEADER[-1]) for x in lText]

        if False:
            # Oct' 2018 RV and JL decided that we keep the binding TextLine (if any!)
            # ignore "binding" cells
            # dirty...
            # lCells = list(filter(lambda x: int(x.get('rowSpan')) < 5, lCells))
            # less dirty
            maxrow = max(int(x.get('row')) for x in lCells)
            binding_rowspan = max(5, maxrow * 0.8)
            traceln(" - max row = %d  => considering rowspan > %d as binding cells"
                    % (maxrow, binding_rowspan))
            lValidCell, lBindingCell = [], []
            for ndCell in lCells:
                if int(ndCell.get('rowSpan')) < binding_rowspan:
                    lValidCell.append(ndCell)
                else:
                    lBindingCell.append(ndCell)
            nDiscarded = len(lBindingCell)
            if nDiscarded > 1:
                traceln("****************   WARNING  ****************")
            traceln(" - %d cells discarded as binding cells" % nDiscarded)
            for ndCell in lBindingCell:
                ndCell.set("type", "table-binding")
            lCells = lValidCell

        # FOR COLUMN HEADER: get max(cell[0,i].span)
        maxRowSpan = computeMaxRowSpan(lCells)

        tag_DU_row_col_header(root, lCells, maxRowSpan)

        try:
            removeSeparator(root)
            addSeparator(root, lCells)
            doc.write(sOutFilename, encoding='utf-8', pretty_print=True, xml_declaration=True)
            traceln('annotation done for %s --> %s' % (sFilename, sOutFilename))
        except TableAnnotationException:
            traceln("No Table region in file ", sFilename, " IGNORED!!")
        del doc
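# computeMaxRowSpan is called above but defined elsewhere in the repo; per
# the comment "get max(cell[0,i].span)", a plausible minimal sketch (an
# assumption, for illustration only, not the repo's implementation):
def computeMaxRowSpan_sketch(lCells):
    """Maximum rowSpan among the cells of the first row (at least 1)."""
    return max([int(c.get('rowSpan')) for c in lCells
                if int(c.get('row')) == 0] + [1])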
         0- TOC-entry              5940 occurrences  (  2%)  (  2%)
         1- caption                 707 occurrences  (  0%)  (  0%)
         2- catch-word              201 occurrences  (  0%)  (  0%)
         3- footer                   11 occurrences  (  0%)  (  0%)
         4- footnote              36942 occurrences  ( 11%)  ( 11%)
         5- footnote-continued     1890 occurrences  (  1%)  (  1%)
         6- header                15910 occurrences  (  5%)  (  5%)
         7- heading               18032 occurrences  (  6%)  (  6%)
         8- marginalia             4292 occurrences  (  1%)  (  1%)
         9- page-number           40236 occurrences  ( 12%)  ( 12%)
        10- paragraph            194927 occurrences  ( 60%)  ( 60%)
        11- signature-mark         4894 occurrences  (  2%)  (  2%)
    """
    lActuallySeen = None
    if lActuallySeen:
        traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
        lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
        lLabels = [lLabels[i] for i in lActuallySeen]
        traceln(len(lLabels), lLabels)
        traceln(len(lIgnoredLabels), lIgnoredLabels)
    nbClass = len(lLabels) + 1  # because the ignored labels will become OTHER

    # DEFINING THE CLASS OF GRAPH WE USE
    DU_GRAPH = Graph_MultiPageXml
    nt = NodeType_PageXml_type_NestedText("gtb"  # some short prefix because labels below are prefixed with it
                                          , lLabels, lIgnoredLabels,
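# Toy illustration of the class-reduction logic above (plain lists stand
# in for the real label set; the indices are the positions kept):
_lLabels = ['TOC-entry', 'caption', 'footnote', 'paragraph']
_lSeen = [0, 3]
_lIgnored = [_lLabels[i] for i in range(len(_lLabels)) if i not in _lSeen]
_lKept = [_lLabels[i] for i in _lSeen]
assert _lIgnored == ['caption', 'footnote']
assert _lKept == ['TOC-entry', 'paragraph']
# nbClass would then be len(_lKept) + 1, the extra class being OTHER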
if __name__ == "__main__":
    version = "v.01"
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
    # parser.add_option("--annotate", dest='bAnnotate', action="store_true", default=False,
    #                   help="Annotate the textlines with BIES labels")

    # ---
    # parse the command line
    (options, args) = parser.parse_args()
    # ---
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    doer = DU_ABPTable(sModelName, sModelDir,
                       C=options.crf_C,
                       tol=options.crf_tol,
                       njobs=options.crf_njobs,
                       max_iter=options.crf_max_iter,
                       inference_cache=options.crf_inference_cache)

    if options.rm:
        doer.rm()
        sys.exit(0)
def parseXmlFile(self, sFilename, iVerbose=0):
    """
    Load that document as a CRF Graph.
    Also sets the self.doc variable!

    Return a CRF Graph object
    """
    self.doc = etree.parse(sFilename)
    self.lNode, self.lEdge = list(), list()
    self.lNodeBlock = []    # text node
    self.lNodeCutLine = []  # cut line node

    root = self.doc.getroot()

    doer = BaselineCutAnnotator()
    doer.setLabelScheme_SIO()   # use SIO instead of SO labels!
    # doer.setModulo(self.iModulo)  # this is optional

    # load the groundtruth table separators, if any, per page (1 in tABP)
    ltlYlX = doer.get_separator_YX_from_DOM(root, self.fMinPageCoverage)
    for (lHi, lVi) in ltlYlX:
        traceln(" - found %d horizontal, %d vertical GT separators" % (len(lHi), len(lVi)))

    # create DOM nodes reflecting the cuts
    # first clean (just in case!)
    n = doer.remove_cuts_from_dom(root)
    if n > 0:
        traceln(" - removed %d pre-existing cut lines" % n)

    # if GT, then we have labelled cut lines in DOM
    _ltlYCutXCut = doer.add_cut_to_DOM(root, ltlYlX=ltlYlX)

    lClassicType = [nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType]
    lSpecialType = [nt for nt in self.getNodeTypeList() if nt not in self._lClassicNodeType]

    for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
        # now that we have the page, let's create the node for each type!
        lClassicPageNode = [nd for nodeType in lClassicType
                            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)]
        lSpecialPageNode = [nd for nodeType in lSpecialType
                            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)]

        self.lNode.extend(lClassicPageNode)      # e.g. the TextLine objects
        self.lNodeBlock.extend(lClassicPageNode)

        self.lNode.extend(lSpecialPageNode)      # e.g. the cut lines!
        self.lNodeCutLine.extend(lSpecialPageNode)

        # no previous page to consider (for cross-page links...) => None
        lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
        self.lEdge.extend(lClassicPageEdge)

        # Now, compute edges between special and classic objects...
        lSpecialPageEdge = self.computeSpecialEdges(lClassicPageNode,
                                                    lSpecialPageNode,
                                                    doer.bCutIsBeforeText)
        self.lEdge.extend(lSpecialPageEdge)

        # if iVerbose >= 2: traceln("\tPage %5d  %6d nodes  %7d edges" % (pnum, len(lPageNode), len(lPageEdge)))
        if iVerbose >= 2:
            traceln("\tPage %5d" % (pnum))
            traceln("\t   block: %6d nodes   %7d edges (to block)"
                    % (len(lClassicPageNode), len(lClassicPageEdge)))
            traceln("\t   line:  %6d nodes   %7d edges (from block)"
                    % (len(lSpecialPageNode), len(lSpecialPageEdge)))

    if iVerbose:
        traceln("\t\t (%d nodes, %d edges)" % (len(self.lNode), len(self.lEdge)))

    return self
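# Shape of ltlYlX as consumed above: one (lYi, lXi) pair per page, where
# lYi holds the y-coordinates of the horizontal groundtruth separators and
# lXi the x-coordinates of the vertical ones. A sketch with made-up
# coordinates (an illustration, not real data):
ltlYlX_example = [
    ([455, 1203, 1980], [310, 887]),  # page 1: 3 horizontal, 2 vertical cuts
    ([460, 1210], [305]),             # page 2: 2 horizontal, 1 vertical cut
]
for (lHi, lVi) in ltlYlX_example:
    print(" - found %d horizontal, %d vertical GT separators" % (len(lHi), len(lVi)))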
def getConfiguredGraphClass(cls):
    """
    In this class method, we must return a configured graph class
    """
    lLabels = ['heading', 'header', 'page-number', 'resolution-number',
               'resolution-marginalia', 'resolution-paragraph', 'other']

    lIgnoredLabels = None

    """
    if you play with a toy collection, which does not have all expected
    classes, you can reduce those.
    """
    lActuallySeen = None
    if lActuallySeen:
        traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
        lIgnoredLabels = [lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen]
        lLabels = [lLabels[i] for i in lActuallySeen]
        traceln(len(lLabels), lLabels)
        traceln(len(lIgnoredLabels), lIgnoredLabels)

    # DEFINING THE CLASS OF GRAPH WE USE
    if cls.bPerPage:
        DU_GRAPH = Graph_MultiSinglePageXml  # consider each page as if independent from the others
    else:
        DU_GRAPH = Graph_MultiPageXml

    if cls.bHTR:
        ntClass = NodeType_PageXml_type
    else:
        # ignore text
        ntClass = NodeType_PageXml_type_woText

    nt = ntClass("bar"  # some short prefix because labels below are prefixed with it
                 , lLabels, lIgnoredLabels
                 , False  # no label means OTHER
                 , BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3))  # we reduce overlap in this way
                 )
    nt.setLabelAttribute("DU_sem")
    if cls.bTextLine:
        nt.setXpathExpr((".//pc:TextRegion/pc:TextLine"  # how to find the nodes
                         , "./pc:TextEquiv"))
    else:
        nt.setXpathExpr((".//pc:TextRegion"   # how to find the nodes
                         , "./pc:TextEquiv")  # how to get their text
                        )
    DU_GRAPH.addNodeType(nt)

    return DU_GRAPH
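# The BBoxDeltaFun above shrinks each node's bounding box to limit
# accidental overlaps between neighbouring boxes. Sample values (pure
# arithmetic, checkable by hand): small boxes lose a flat 5 pixels,
# tall ones lose 6.6% of their height.
f = lambda v: max(v * 0.066, min(5, v / 3))
assert f(30) == 5.0                # 30px  -> trim 5px   (min(5, 10) wins)
assert f(75) == 5.0                # 75px  -> trim 5px   (4.95 < 5)
assert abs(f(150) - 9.9) < 1e-9    # 150px -> trim 9.9px (6.6% wins)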
def computeSpecialEdges(self, lClassicPageNode, lSpecialPageNode, bCutIsBeforeText):
    """
    Compute:
    - edges between each block and the cut line above/across/below the block
    - edges between cut lines

    return a list of edges
    """
    # augment the block with the coordinates of its baseline central point
    for blk in lClassicPageNode:
        try:
            x, y = BaselineCutAnnotator.getDomBaselineXY(blk.node)
            blk.x_bslne = x
            blk.y_bslne = y
        except IndexError:
            traceln("** WARNING: no Baseline in ", blk.domid)
            traceln("** Using x2 and y2 instead... :-/")
            blk.x_bslne = blk.x2
            blk.y_bslne = blk.y2

    for cutBlk in lSpecialPageNode:
        assert cutBlk.y1 == cutBlk.y2
        cutBlk.y1 = int(round(cutBlk.y1))  # DeltaFun makes it a float
        cutBlk.y2 = cutBlk.y1

    # block to cut line edges
    lEdge = []
    for blk in lClassicPageNode:
        for cutBlk in lSpecialPageNode:
            if blk.y_bslne == cutBlk.y1:
                edge = Edge_BL(blk, cutBlk)
                edge.len = 0
                edge._type = 0  # Cut line is crossing the block
                lEdge.append(edge)
            elif abs(blk.y_bslne - cutBlk.y1) <= self.iBlockVisibility:
                edge = Edge_BL(blk, cutBlk)
                # experiments show that abs helps
                # edge.len = (blk.y_bslne - cutBlk.y1) / self.iBlockVisibility
                edge.len = abs(blk.y_bslne - cutBlk.y1) / self.iBlockVisibility
                edge._type = -1 if blk.y_bslne > cutBlk.y1 else +1
                lEdge.append(edge)

    # sort those edges from top to bottom
    lEdge.sort(key=lambda o: o.B.y1)  # o.B.y1 == o.B.y2 by construction

    # now filter those edges
    n0 = len(lEdge)
    if False:
        print("--- before filtering: %d edges" % len(lEdge))
        lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
        for edge in lSortedEdge:
            print("Block domid=%s y1=%s y2=%s yg=%s"
                  % (edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne)
                  + "  %s line %s " % (["↑", "-", "↓"][1 + edge._type], edge.B.y1)
                  + "domid=%s y1=%s " % (edge.B.domid, edge.B.y1)
                  + str(id(edge)))
    lEdge = self._filterBadEdge(lEdge, lSpecialPageNode, bCutIsBeforeText)
    traceln(" - filtering: removed %d edges due to obstruction." % (n0 - len(lEdge)))
    if False:
        print("--- After filtering: %d edges" % len(lEdge))
        lSortedEdge = sorted(lEdge, key=lambda x: x.A.domid)
        print(len(lSortedEdge))
        for edge in lSortedEdge:
            print("Block domid=%s y1=%s y2=%s yg=%s"
                  % (edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne)
                  + "  %s line %s " % (["↑", "-", "↓"][1 + edge._type], edge.B.y1)
                  + "domid=%s y1=%s " % (edge.B.domid, edge.B.y1)
                  + str(id(edge)))

    if self.iLineVisibility > 0:
        # Cut line to Cut line edges
        lSpecialPageNode.sort(key=lambda o: o.y1)
        for i, A in enumerate(lSpecialPageNode):
            for B in lSpecialPageNode[i + 1:]:
                if B.y1 - A.y1 <= self.iLineVisibility:
                    edge = Edge_LL(A, B)
                    edge.len = (B.y1 - A.y1) / self.iLineVisibility
                    assert edge.len >= 0
                    lEdge.append(edge)
                else:
                    break

    return lEdge
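# The block-to-cut-line rule above, isolated as a tiny pure function (a
# sketch; name and signature are illustrative only). Since y grows
# downward on a page, type -1 means the cut lies above the baseline:
def edge_type_and_len(y_baseline, y_cut, visibility):
    """Return (_type, normalized len), or None when the cut is out of sight."""
    d = y_baseline - y_cut
    if d == 0:
        return 0, 0.0  # cut crosses the block
    if abs(d) <= visibility:
        return (-1 if d > 0 else +1), abs(d) / visibility
    return None

assert edge_type_and_len(100, 100, 273) == (0, 0.0)
assert edge_type_and_len(300, 100, 273) == (-1, 200 / 273)  # cut above
assert edge_type_and_len(100, 300, 273) == (+1, 200 / 273)  # cut below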
def main(sModelDir, sModelName, options):
    doer = DU_ABPTableRCut(sModelName, sModelDir,
                           iBlockVisibility=options.iBlockVisibility,
                           iLineVisibility=options.iLineVisibility,
                           C=options.crf_C,
                           tol=options.crf_tol,
                           njobs=options.crf_njobs,
                           max_iter=options.max_iter,
                           inference_cache=options.crf_inference_cache)

    if options.rm:
        doer.rm()
        return

    lTrn, lTst, lRun, lFold = [_checkFindColDir(lsDir, bAbsolute=False)
                               for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]]
    # if options.bAnnotate:
    #     doer.annotateDocument(lTrn)
    #     traceln('annotation done')
    #     sys.exit(0)

    traceln("- classes: ", doer.getGraphClass().getLabelNameList())

    ## use a_mpxml files
    # doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern

    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
        if options.iFoldInitNum:
            """
            initialization of a cross-validation
            """
            splitter, ts_trn, lFilename_trn = doer._nfold_Init(lFold, options.iFoldInitNum,
                                                               bStoreOnDisk=True)
        elif options.iFoldRunNum:
            """
            Run one fold
            """
            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum, options.warm, options.pkl)
            traceln(oReport)
        elif options.bFoldFinish:
            tstReport = doer._nfold_Finish()
            traceln(tstReport)
        else:
            assert False, "Internal error"
        # no more processing!!
        exit(0)
        # -------------------

    if lFold:
        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
        sReportPickleFilename = os.path.join(sModelDir, sModelName + "__report.txt")
        traceln("Results are in %s" % sReportPickleFilename)
        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, loTstRpt)
    elif lTrn:
        doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
        try:
            traceln("Baseline best estimator: %s" % doer.bsln_mdl.best_params_)  # for CutSearch
        except:
            pass
        traceln(" --- CRF Model ---")
        traceln(doer.getModel().getModelInfo())
    elif lTst:
        doer.load()
        tstReport = doer.test(lTst)
        traceln(tstReport)
        if options.bDetailedReport:
            traceln(tstReport.getDetailledReport())
            sReportPickleFilename = os.path.join(sModelDir,
                                                 sModelName + "__detailled_report.txt")
            graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename, tstReport)

    if lRun:
        if options.storeX or options.applyY:
            try:
                doer.load()
            except:
                pass  # we only need the transformer
            lsOutputFilename = doer.runForExternalMLMethod(lRun, options.storeX,
                                                           options.applyY, options.bRevertEdges)
        else:
            doer.load()
            lsOutputFilename = doer.predict(lRun)
        traceln("Done, see in:\n  %s" % lsOutputFilename)
def main(sInputDir, sAlgoA, sAlgoB, bShape=False, bConvexHull=False, bVerbose=False):
    sAlgoC = sFMT % (sAlgoA, sAlgoB)

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir)
                  if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
    traceln(" - %d files to process, to produce clusters '%s'" % (len(lsFilename), sAlgoC))

    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" - FILE : ", sFullFilename)
        cntCluster, cntPage = 0, 0
        doc = etree.parse(sFullFilename)

        for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
            nRemoved = Cluster.remove(ndPage, sAlgoC)

            lClusterA = Cluster.load(ndPage, sAlgoA)
            lClusterB = Cluster.load(ndPage, sAlgoB)

            if bVerbose:
                trace("Page %d : (%d clusters REMOVED),  %d clusters '%s'  %d clusters '%s'"
                      % (iPage + 1, nRemoved, len(lClusterA), sAlgoA, len(lClusterB), sAlgoB))

            lClusterC = []
            for A in lClusterA:
                for B in lClusterB:
                    C = Cluster.intersect(A, B)
                    if C is not None:
                        lClusterC.append(C)

            if bVerbose:
                traceln("   -> %d clusters" % (len(lClusterC)))

            if bShape or bConvexHull:
                for c in lClusterC:
                    c.shape = Cluster.computeShape(ndPage, c.setID, bConvexHull=bConvexHull)

            cntCluster += len(lClusterC)
            cntPage += 1

            Cluster.store(ndPage, lClusterC, sAlgoC)

        doc.write(sFullFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  # compression=0,  # 0 to 9
                  )
        del doc
        traceln(" %d clusters over %d pages" % (cntCluster, cntPage))

    traceln(" done (%d files)" % len(lsFilename))
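# Cluster.intersect above behaves like set intersection on the TextLine
# ids of two clusters; a minimal stand-in (an assumption: the real method
# also carries cluster metadata and shapes, and returns None on an empty
# overlap):
def intersect_ids(setA, setB):
    common = setA & setB
    return common if common else None

assert intersect_ids({"l1", "l2", "l3"}, {"l2", "l3", "l4"}) == {"l2", "l3"}
assert intersect_ids({"l1"}, {"l2"}) is None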
action="store_true", default=False, help="report baseline method") parser.add_option( "--line_see_line", dest='iLineVisibility', action="store", type=int, default=0, help="seeline2line: how far in pixel can a line see another cut line?") parser.add_option( "--block_see_line", dest='iBlockVisibility', action="store", type=int, default=273, help="seeblock2line: how far in pixel can a block see a cut line?") # --- #parse the command line (options, args) = parser.parse_args() # --- try: sModelDir, sModelName = args except Exception as e: traceln("Specify a model folder and a model name!") _exit(usage, 1, e) main(sModelDir, sModelName, options)
    try:
        sInputDir, sA, sB = args
    except ValueError:
        sys.stderr.write(sUsage)
        sys.exit(1)

    # ... checking folders
    if not os.path.normpath(sInputDir).endswith("col"):
        sInputDir = os.path.join(sInputDir, "col")
    if not os.path.isdir(sInputDir):
        sys.stderr.write("Not a directory: %s\n" % sInputDir)
        sys.exit(2)

    # ok, go!
    traceln("Input  is : ", os.path.abspath(sInputDir))
    traceln("algo A is : ", sA)
    traceln("algo B is : ", sB)
    if options.bShape or options.bConvexHull:
        traceln("Shape of intersections based on content!")
    else:
        traceln("Shape of intersections is the intersection of shapes!")

    main(sInputDir, sA, sB, options.bShape, options.bConvexHull, options.bVerbose)

    traceln("Input  was : ", os.path.abspath(sInputDir))
    traceln("algo A was : ", sA)
    traceln("algo B was : ", sB)
    if options.bShape or options.bConvexHull:
        trace("Shape of intersections based on content: ")
def eval_cluster_of_files(lsFilename
                          , sClusterLevel  # either "row", "col", "cell", "region", "cluster"
                          , bIgnoreHeader=False
                          , bIgnoreOutOfTable=False
                          , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                          , xpSelector=".//pc:TextLine"
                          , sAlgo=None
                          , sGroupByAttr=""
                          , sClusterGTAttr=None  # used when sClusterLevel=="cluster"
                          ):
    bTable = sClusterLevel in ["row", "col", "cell"]
    # if not bTable: assert sClusterLevel in ['region', 'cluster']
    # sClusterLevel can be "cluster" or cluster_lvl1, 2, ...
    if not bTable:
        assert sClusterLevel == 'region' or sClusterLevel.startswith('cluster')

    dOkErrMiss = {fSimil: (0, 0, 0) for fSimil in lfSimil}
    lsRpt = []
    for sFilename in lsFilename:
        doc = etree.parse(sFilename)
        rootNd = doc.getroot()
        # assert len(PageXml.xpath(rootNd, "//pc:Page")) == 1, "NOT YET IMPLEMENTED: eval on multi-page files"
        for iPage, ndPage in enumerate(PageXml.xpath(rootNd, "//pc:Page")):
            lsRpt.append("PAGE %5d  OF FILE  %s" % (iPage + 1, sFilename))

            # cluster -> [node_id]
            dGT = defaultdict(list)
            dRun = defaultdict(list)

            for nd in PageXml.xpath(ndPage, xpSelector):
                # if bIgnoreHeader and nd.get("DU_header") != "D": continue
                if bIgnoreHeader and nd.getparent().get("custom") \
                        and "table-header" in nd.getparent().get("custom"):
                    continue

                ndparent = nd.getparent()
                ndid = nd.get("id")
                val_run = nd.get("DU_cluster")  # ok in most cases
                if bTable:
                    if sClusterLevel == "cell":
                        val_gt = "%s__%s" % (ndparent.get("row"), ndparent.get("col"))
                        if val_gt == 'None__None' and bIgnoreOutOfTable:
                            continue
                    else:  # "col" or "row"
                        val_gt = ndparent.get(sClusterLevel)
                        if val_gt is None and bIgnoreOutOfTable:
                            continue
                    # distinguish each table!
                    val_gt = "%s_%s" % (val_gt, ndparent.getparent().get("id"))
                else:
                    if sClusterLevel == 'region':
                        val_gt = ndparent.get("id")
                        # WHY???
                        if val_gt == 'None' and bIgnoreOutOfTable:
                            continue
                    elif sClusterLevel == 'cluster':
                        val_gt = nd.get(sClusterGTAttr)
                    elif sClusterLevel.startswith('cluster_lvl'):
                        val_gt = nd.get(sClusterGTAttr)
                        val_run = nd.get("DU_" + sClusterLevel)
                    else:
                        raise Exception("Unknown clustering level: %s" % sClusterLevel)

                dGT[val_gt].append(ndid)
                dRun[val_run].append(ndid)
                # assert ndparent.tag.endswith("TableCell"), "expected TableCell got %s" % nd.getparent().tag

            if sAlgo is not None:
                dRun = defaultdict(list)
                lNdCluster = PageXml.xpath(ndPage, ".//pc:Cluster[@algo='%s']" % sAlgo)
                # lNdCluster = PageXml.xpath(ndPage, ".//pc:Cluster[@algo='%s' and @rowSpan='1']" % sAlgo)
                traceln("Loaded %d clusters @algo='%s'" % (len(lNdCluster), sAlgo))
                for iCluster, ndCluster in enumerate(lNdCluster):
                    sIDs = ndCluster.get("content")
                    lndid = sIDs.split()
                    if lndid:
                        if sGroupByAttr:
                            # we group them by the value of an attribute
                            dRun[ndCluster.get(sGroupByAttr)].extend(lndid)
                        else:
                            dRun[str(iCluster)] = lndid

            for fSimil in lfSimil:
                _nOk, _nErr, _nMiss = evalPartitions(list(dRun.values())
                                                     , list(dGT.values())
                                                     , fSimil
                                                     , jaccard_distance)
                _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

                # traceln("simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" % (
                lsRpt.append("@simil %.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" % (
                    fSimil
                    , _fP, _fR, _fF
                    , _nOk, _nErr, _nMiss))
                # , os.path.basename(sFilename)))
                # sFilename = ""  # ;-)

                # global count
                nOk, nErr, nMiss = dOkErrMiss[fSimil]
                nOk += _nOk
                nErr += _nErr
                nMiss += _nMiss
                dOkErrMiss[fSimil] = (nOk, nErr, nMiss)

    lSummary = []
    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        sLine = "ALL_%s  @simil %.2f  P %5.2f  R %5.2f  F1 %5.2f " \
                % ("TABLES" if bTable else sClusterLevel, fSimil, fP, fR, fF) \
                + "        " \
                + "ok=%d  err=%d  miss=%d" % (nOk, nErr, nMiss)
        lSummary.append(sLine)

    sRpt = "\n".join(lSummary) + "\n\n" + "\n".join(lsRpt) + "\n\n" + "\n".join(lSummary)
    return nOk, nErr, nMiss, sRpt
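# How the @simil thresholds act above: evalPartitions matches a run
# cluster to a GT cluster when their Jaccard similarity reaches fSimil,
# and the ok/err/miss counts then feed precision/recall. A hand-checkable
# sketch:
def _jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b)

run = ["l1", "l2", "l3"]   # one predicted cluster
gt = ["l1", "l2"]          # the groundtruth cluster
assert abs(_jaccard(run, gt) - 2 / 3) < 1e-9
# -> counted ok at fSimil=0.66, but an err (and a miss) at fSimil=0.80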
def eval_direct(lCriteria, lsDocDir
                , bIgnoreHeader=False
                , bIgnoreOutOfTable=True
                , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                , xpSelector=".//pc:TextLine"):
    """
    use the row, col, DU_row, DU_col XML attributes to form the partitions

    lCriteria is a list containing "row" or "col" or both
    """
    assert lsDocDir

    llsFile = listParallelFiles(lsDocDir)
    traceln("- loaded %d files for each criterion" % len(llsFile[0]))

    dOkErrMiss = {fSimil: (0, 0, 0) for fSimil in lfSimil}

    def _reverseDictionary(d):
        rd = defaultdict(list)
        for k, v in d.items():
            rd[v].append(k)
        return rd

    for i, lsCritFile in enumerate(zip(*llsFile)):
        assert len(lCriteria) == len(lsCritFile)

        # node_id -> consolidated_criteria_values
        dIdValue = defaultdict(str)
        dIdValue_GT = defaultdict(str)
        for crit, sFilename, sDir in zip(lCriteria, lsCritFile, lsDocDir):
            doc = etree.parse(os.path.join(sDir, sFilename))
            rootNd = doc.getroot()
            assert len(PageXml.xpath(rootNd, "//pc:Page")) == 1, \
                "NOT YET IMPLEMENTED: eval on multi-page files"
            for nd in PageXml.xpath(rootNd, xpSelector):
                ndid = nd.get("id")
                val_gt = nd.getparent().get(crit)
                if val_gt is None:
                    if bIgnoreOutOfTable:
                        continue
                    else:
                        val_gt = "-1"
                # if bIgnoreHeader and nd.get("DU_header") != "D": continue
                if bIgnoreHeader and nd.getparent().get("custom") \
                        and "table-header" in nd.getparent().get("custom"):
                    continue
                assert nd.getparent().tag.endswith("TableCell"), \
                    "expected TableCell got %s" % nd.getparent().tag
                val = nd.get("DU_" + crit)
                # import random
                # if random.random() < 0.10:
                #     val = nd.get("DU_" + crit)
                # else:
                #     val = nd.getparent().get(crit)
                dIdValue[ndid] += "_%s_" % val
                dIdValue_GT[ndid] += "_%s_" % val_gt
        # print("**run ", str(dIdValue))
        # print("**GT  ", str(dIdValue_GT))

        # reverse the dictionaries
        dValue_lId = _reverseDictionary(dIdValue)
        dValue_lId_GT = _reverseDictionary(dIdValue_GT)
        # print("run ", list(dValue_lId.values()))
        # print("GT  ", list(dValue_lId_GT.values()))

        for fSimil in lfSimil:
            _nOk, _nErr, _nMiss = evalPartitions(list(dValue_lId.values())
                                                 , list(dValue_lId_GT.values())
                                                 , fSimil
                                                 , jaccard_distance)
            _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

            traceln("simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d  %s" % (
                fSimil
                , _fP, _fR, _fF
                , _nOk, _nErr, _nMiss
                , os.path.basename(sFilename)))
            sFilename = ""  # ;-)

            nOk, nErr, nMiss = dOkErrMiss[fSimil]
            nOk += _nOk
            nErr += _nErr
            nMiss += _nMiss
            dOkErrMiss[fSimil] = (nOk, nErr, nMiss)

    traceln()
    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        traceln("ALL  simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f "
                % (fSimil, fP, fR, fF)
                , "        "
                , "ok=%d  err=%d  miss=%d" % (nOk, nErr, nMiss))

    return (nOk, nErr, nMiss)
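# The composite-key trick above: concatenating the per-criterion values
# yields one partition equal to the intersection of the row and the col
# partitions. A toy illustration (hypothetical ids and values):
from collections import defaultdict

dIdValue = {"id1": "_r0__c0_", "id2": "_r0__c1_", "id3": "_r0__c0_"}
rd = defaultdict(list)
for k, v in dIdValue.items():
    rd[v].append(k)
assert sorted(rd["_r0__c0_"]) == ["id1", "id3"]  # same row AND same col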
def eval_oracle(lsRunDir, sClusterLevel
                , bIgnoreHeader=True
                , bIgnoreOutOfTable=True
                , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                , xpSelector=".//pc:TextLine"):
    """
    evaluate the cluster quality from a run folder

    We assume to have the groundtruth row and col in the files,
    as well as the predicted clusters
    """
    assert lsRunDir
    dOkErrMiss = {fSimil: (0, 0, 0) for fSimil in lfSimil}

    DU_GraphClass = getConfiguredGraphClass()
    for sRunDir in lsRunDir:
        lsFile = listFiles(sRunDir, ext='.pxml')
        traceln("- loaded %d files from %s" % (len(lsFile), sRunDir))
        for sFilename in lsFile:
            lg = DU_GraphClass.loadGraphs(DU_GraphClass,
                                          [os.path.join(sRunDir, sFilename)],
                                          bDetach=False, bLabelled=False, iVerbose=1)

            # cluster -> [node_id]
            dGT = defaultdict(list)
            dRun = defaultdict(list)
            # doc = etree.parse(os.path.join(sRunDir, sFilename))
            # assume 1 page per doc!
            g = lg[0]
            rootNd = g.doc.getroot()
            # assert len(PageXml.xpath(rootNd, "//pc:Page")) == 1, "NOT YET IMPLEMENTED: eval on multi-page files"
            for iPage, ndPage in enumerate(PageXml.xpath(rootNd, "//pc:Page")):
                traceln("PAGE %5d  OF FILE  %s" % (iPage + 1, sFilename))
                try:
                    g = lg[iPage]
                except IndexError:
                    continue
                Y = labelEdges(g, sClusterLevel)
                g.form_cluster(Y)
                g.addEdgeToDoc()
                for nd in PageXml.xpath(ndPage, xpSelector):
                    if bIgnoreHeader and nd.getparent().get("custom") \
                            and "table-header" in nd.getparent().get("custom"):
                        continue
                    # if bIgnoreHeader and nd.get("DU_header") != "D": continue
                    ndparent = nd.getparent()
                    ndid = nd.get("id")
                    if sClusterLevel == "cell":
                        val_gt = "%s__%s" % (ndparent.get("row"), ndparent.get("col"))
                        if val_gt == 'None__None' and bIgnoreOutOfTable:
                            continue
                    elif sClusterLevel == "col":
                        val_gt = ndparent.get("col")
                        if val_gt is None and bIgnoreOutOfTable:
                            continue
                    elif sClusterLevel == "row":
                        val_gt = ndparent.get("row")
                        if val_gt is None and bIgnoreOutOfTable:
                            continue
                    else:
                        raise Exception("Unknown clustering level: %s" % sClusterLevel)
                    # distinguish each table!
                    val_gt = val_gt + "_" + ndparent.getparent().get("id")
                    dGT[val_gt].append(ndid)

                    val_run = nd.get("DU_cluster")
                    dRun[val_run].append(ndid)
                    # assert ndparent.tag.endswith("TableCell"), "expected TableCell got %s" % nd.getparent().tag

                for fSimil in lfSimil:
                    _nOk, _nErr, _nMiss = evalPartitions(
                        # _nOk, _nErr, _nMiss, _lFound, _lErr, _lMissed = evalPartitions(
                        list(dRun.values())
                        , list(dGT.values())
                        , fSimil
                        , jaccard_distance)
                    _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

                    # traceln("simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" % (
                    traceln("@simil %.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" % (
                        fSimil
                        , _fP, _fR, _fF
                        , _nOk, _nErr, _nMiss))
                    # , os.path.basename(sFilename)))
                    # sFilename = ""  # ;-)

                    # global count
                    nOk, nErr, nMiss = dOkErrMiss[fSimil]
                    nOk += _nOk
                    nErr += _nErr
                    nMiss += _nMiss
                    dOkErrMiss[fSimil] = (nOk, nErr, nMiss)
                traceln()
            g.doc.write(os.path.join(sRunDir, sFilename) + '.oracle')

    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        traceln("ALL_TABLES  @simil %.2f  P %5.2f  R %5.2f  F1 %5.2f "
                % (fSimil, fP, fR, fF)
                , "        "
                , "ok=%d  err=%d  miss=%d" % (nOk, nErr, nMiss))

    return (nOk, nErr, nMiss)
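# computePRF is defined elsewhere in the repo; given the 0-100 "%5.2f"
# formatting used above, a plausible sketch (an assumption, not the repo's
# code) is:
def computePRF_sketch(nOk, nErr, nMiss):
    fP = 100.0 * nOk / (nOk + nErr) if (nOk + nErr) else 0.0
    fR = 100.0 * nOk / (nOk + nMiss) if (nOk + nMiss) else 0.0
    fF = 2 * fP * fR / (fP + fR) if (fP + fR) else 0.0
    return fP, fR, fF

assert computePRF_sketch(8, 2, 2) == (80.0, 80.0, 80.0)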
                      , type=float
                      , help="eval_col, eval_cell : Apply this ratio to the bounding box. This is normally useless as the baseline becomes a point (the centroid)"
                      , default=CutAnnotator.fRATIO)

    # ---
    # parse the command line
    (options, args) = parser.parse_args()

    options.bCutAbove = not options.bCutBelow

    # load mpxml
    try:
        op = args[0]
    except:
        traceln(usage)
        sys.exit(1)
    traceln("--- %s ---" % op)

    if op in ["eval", "eval_row", "eval_col", "eval_cell"]:
        if op == "eval":
            op = "eval_row"
        traceln("DEPRECATED: now use ", op[0:4] + "_cut" + op[4:])
        exit(1)

    # --------------------------------------
    if op == "cut":
        sFilename = args[1]
        sOutFilename = args[2]
        traceln("- cutting : %s --> %s" % (sFilename, sOutFilename))
        lDegAngle = [float(s) for s in options.lsAngle.split(",")]
        traceln("- Allowed angles (°): %s" % lDegAngle)
        op_cut(sFilename, sOutFilename, lDegAngle, options.bCutAbove,
               fCutHeight=options.fCutHeight)