def main(sInputDir, options):
    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir)
                  if name.endswith("_du.mpxml")]
    traceln("- %d .mpxml files to process" % len(lsFilename))

    for sMPXml in lsFilename:
        trace(".du_mpxml FILE : ", sMPXml)
        if options.bVerbose:
            traceln()

        # 0 - load input file
        doc = etree.parse(os.path.join(sInputDir, sMPXml))

        cluster2Region(doc, options.bVerbose)

        if options.bOverwrite:
            outfilename = sMPXml
        else:
            outfilename = sMPXml[:-len('_du.mpxml')] + "." + options.outext
        traceln(" written in %s" % os.path.join(sInputDir, outfilename))
        doc.write(os.path.join(sInputDir, outfilename),
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  # compression=0,  # 0 to 9
                  )
def split_by_max(crit, sDir):
    """
    Create sub-folders of sDir, in which all files share the same maximum
    number of rows (or columns).
    """
    assert crit in ["row", "col"]
    sColDir = os.path.join(sDir, "col")
    traceln("- looking at ", sColDir)
    lsFile = []
    for _fn in os.listdir(sColDir):
        _fnl = _fn.lower()
        if _fnl.endswith("_du.mpxml") or _fnl.endswith("_du.pxml"):
            continue
        if not (_fnl.endswith(".mpxml") or _fnl.endswith(".pxml")):
            continue
        lsFile.append(_fn)
    traceln("  %d files" % len(lsFile))

    dCnt = defaultdict(int)
    for sFilename in lsFile:
        trace("- %s" % sFilename)
        sInFile = os.path.join(sColDir, sFilename)
        doc = etree.parse(sInFile)
        rootNd = doc.getroot()
        xp = "//@%s" % crit
        try:
            vmax = max(int(_nd) for _nd in PageXml.xpath(rootNd, xp))
            assert vmax >= 0
            sToDir = "%s_%s_%d" % (sDir, crit, vmax)
        except ValueError:
            trace("  ERROR on file %s" % sInFile)
            vmax = None
            sToDir = "%s_%s_%s" % (sDir, crit, vmax)
        del doc
        sToColDir = os.path.join(sToDir, "col")
        try:
            os.mkdir(sToDir)
            os.mkdir(sToColDir)
        except FileExistsError:
            pass
        copyfile(sInFile, os.path.join(sToColDir, sFilename))
        traceln("  -> ", sToColDir)
        dCnt[vmax] += 1
    if None in dCnt:
        traceln("WARNING: %d invalid files" % dCnt[None])
        del dCnt[None]
    traceln(sorted(dCnt.items()))
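# A minimal usage sketch (hypothetical paths): given a collection directory
# laid out as <sDir>/col/*.pxml, this splits the files into sibling folders
# named <sDir>_<crit>_<N>/col/, one per maximum row (or col) index found in
# the PageXml attributes:
#
#   split_by_max("row", "/data/my_collection")  # -> /data/my_collection_row_12/col/...
#   split_by_max("col", "/data/my_collection")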
def DYNAMIC_IMPORT(name, package=None):
    chronoOn("import")
    trace("SETUP: Dynamic import of '%s' from '%s'" % (name, package))
    m = import_module(name, package)
    traceln(" done [%.1fs]" % chronoOff("import"))
    return m
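# A minimal usage sketch (hypothetical module names): DYNAMIC_IMPORT is a thin
# timed wrapper around importlib.import_module, so it is called the same way:
#
#   m = DYNAMIC_IMPORT("some_module")                    # absolute import
#   m = DYNAMIC_IMPORT(".some_module", package="my_pkg") # relative import, anchored at my_pkg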
def main(sInputDir, sAlgo, bCol=False, scale_H=None, scale_V=None, bVerbose=False):
    if scale_H is not None:
        TableCluster.scale_H = scale_H
    if scale_V is not None:
        TableCluster.scale_V = scale_V
    traceln("scale_H=", TableCluster.scale_H)
    traceln("scale_V=", TableCluster.scale_V)

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir)
                  if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
    traceln(" - %d files to process, to tabulate clusters '%s'" % (len(lsFilename), sAlgo))
    lsFilename.sort()

    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" -------- FILE : ", sFullFilename)
        cnt = 0
        doc = etree.parse(sFullFilename)

        for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
            lCluster = TableCluster.load(ndPage, sAlgo, bNode=True)  # True to keep a pointer to the DOM node
            if bVerbose:
                trace(" --- Page %d : %d clusters '%s' " % (iPage + 1, len(lCluster), sAlgo))
            if len(lCluster) == 0:
                traceln("*** NO cluster '%s' *** we keep this page unchanged" % sAlgo)
                continue
            _nbRm = TableCluster.removeEdgesFromXml(ndPage)
            if bVerbose:
                traceln("\n  %d ClusterEdge removed" % _nbRm)

            TableCluster.computeClusterBoundingBox(lCluster)

            if True:  # edges are better this way!
                # compute inter-cluster edges on temporarily scaled bounding boxes
                lBB = []
                for c in lCluster:
                    lBB.append(c.getBB())
                    c.scale(TableCluster.scale_H, TableCluster.scale_V)
                TableCluster.computeClusterEdge(ndPage, lCluster)
                # restore the original bounding boxes
                for c, bb in zip(lCluster, lBB):
                    c.setBB(bb)
                # for c in lCluster: c.scale(1.0/TableCluster.scale_H, 1.0/TableCluster.scale_V)
            else:
                # compute inter-cluster edges from inter-cluster-item edges
                TableCluster.induceClusterEdge(ndPage, lCluster)

            # store inter-cluster edges
            cntPage = TableCluster.addEdgesToXml(ndPage, sAlgo, lCluster)
            if bVerbose:
                traceln("  %d inter-cluster edges" % cntPage)

            # compute min/max row/col for each cluster
            # WARNING - side effect on lCluster content and edges
            if bCol:
                TableCluster.tabulate(ndPage, lCluster, bVerbose=bVerbose)
            else:
                TableCluster.tabulate_rows(ndPage, lCluster, bVerbose=bVerbose)
                TableCluster.use_cut_columns(ndPage)

            cnt += cntPage
        traceln("%d inter-cluster edges" % cnt)

        doc.write(sFullFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  # compression=0,  # 0 to 9
                  )
        del doc
    traceln(" done (%d files)" % len(lsFilename))
def main(sInputDir, sGTDir, sOutputDir
         , xpElement1, xpElement2
         , xpArea1, xpArea2
         , bNorm, iNorm, bNormOnly
         , bSep
         , lsRmId
         , bEval
         , bWarm
         , sExt=".mpxml"
         , bVerbose=False):
    lSkippedFile = []
    nOK = 0

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir)
                  if name.endswith(sExt) and not name.endswith("_du%s" % sExt)]
    traceln(" - %d %s files to process" % (len(lsFilename), sExt))
    lsFilename.sort()

    for sMPXml in lsFilename:
        trace(" - %s FILE : " % sExt, sMPXml)
        if bVerbose:
            traceln()

        # -- find individual subfiles
        sSubDir = os.path.join(sInputDir, sMPXml[:-len(sExt)])
        if os.path.isdir(sSubDir):
            traceln("  (-> ", sSubDir, ")")
            lsPXml = [os.path.basename(name) for name in os.listdir(sSubDir)
                      if name.endswith(".pxml")]
            if bVerbose:
                traceln("\t%d files to process" % len(lsPXml))
        else:
            sSubDir = sInputDir
            lsPXml = [sMPXml]
            if bVerbose:
                traceln("\tprocessing the %s file" % sExt)

        # -- find GT...
        for sInputXml in lsPXml:
            trace("\t", sMPXml, " -- ", sInputXml)
            sGTFN = os.path.join(sGTDir, sInputXml)
            if not os.path.isfile(sGTFN):
                # maybe it is also a folder downloaded from Transkribus?
                if os.path.isfile(os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)):
                    sGTFN = os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)
                else:
                    # hummm, maybe it is a mpxml instead... :-/
                    sGTFN = sGTFN[:-len(".pxml")] + ".mpxml"
                    if not os.path.isfile(sGTFN):
                        traceln(" *** NO GT *** file skipped")
                        lSkippedFile.append(sInputXml)
                        continue
            # ok, GT file found
            trace(" ...")

            # input XML
            sInFN = os.path.join(sSubDir, sInputXml)
            sOutFN = os.path.join(sOutputDir, sInputXml)
            if bWarm and os.path.exists(sOutFN):
                # check existence and freshness
                t_in = os.path.getmtime(sInFN)
                t_gt = os.path.getmtime(sGTFN)
                t_out = os.path.getmtime(sOutFN)
                if t_out > t_in and t_out > t_gt:
                    traceln("\t\t fresh output file found on disk: %s - skipping it!" % sOutFN)
                    continue

            # 0 - load input file
            doc = etree.parse(sInFN)

            # 1 - normalize input elements
            if bNorm:
                doc = normaliseDocElements(doc, xpElement2, iNorm)

            # 2 - project GT
            try:
                if not bNormOnly:
                    gtdoc = etree.parse(sGTFN)
                    if True:
                        doc = project_Elt_to_GT(gtdoc, doc
                                                , xpElement1, xpElement2
                                                , xpArea2, bSep, lsRmId, bEval)
                    else:
                        doc = project_Areas_to_Input(gtdoc, doc
                                                     , xpElement1, xpElement2, xpArea1, xpArea2
                                                     , bSep, lsRmId, bEval)
                # 3 - save
                doc.write(sOutFN,
                          xml_declaration=True,
                          encoding="utf-8",
                          pretty_print=True
                          # compression=0,  # 0 to 9
                          )
                nOK += 1
            except ProjectException as e:
                traceln("Exception: ", e)
                lSkippedFile.append(sInputXml)

            # done
            del doc
    traceln(" done")
    traceln(" - %d files produced, %d .pxml files skipped" % (nOK, len(lSkippedFile)))
def project_Elt_to_GT(gtdoc, doc
                      , xpElement1, xpElement2
                      , xpArea2
                      , bSep, lsRmId, bEval
                      , fTH=0.5):
    """
    Here we take the elements out of the production file and put them in the
    GT doc.
    WE IGNORE xpArea1 (no need for it).
    We return the GT doc.
    """
    gtroot = gtdoc.getroot()

    # Evaluation
    # we build a table of lists of TextLine ids from the GT, to check this software
    # table_id -> row -> col -> list of element ids
    dTable = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    nOk, nTot = 0, 0

    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            # for _nd in gtroot.xpath('//pg:*[@id="%s"]' % sRmId, namespaces=dNS):
            for _nd in gtroot.xpath('//*[@id="%s"]' % sRmId):
                _nd.getparent().remove(_nd)
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine; outside, we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
                    dTable[None][None][None].append(ndElt2.get("id"))
            ndElt.getparent().remove(ndElt)
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        if bEval:
            dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
        ndCell.remove(ndElt)
    if bEval:
        traceln("\nEvaluation mode")

    if bSep:
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            _nd.getparent().remove(_nd)
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)

    # project the GT areas, page by page
    lNdPage = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT = gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)

        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]
            # where to add it?
            if vMax > 0 and vMax / oElt.area > fTH:
                # ok, this is a match
                ndCell = lNdArea2[iMax]
                # add it directly to the area2 (TableCell)
                ndCell.append(deepcopy(ndElt))
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
                        nOk += 1
                    else:
                        try:
                            traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"),
                                    ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:
                            traceln('FAILED:in table: id="%s" NOTEXT' % ndElt.get("id"))
            else:
                # add it outside of any area
                bestNd = ndPageGT
                # add it in its own TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                ndTR.append(deepcopy(ndElt))
                bestNd.append(ndTR)
                if bEval:
                    if ndElt.get("id") in dTable[None][None][None]:
                        nOk += 1
                    else:
                        try:
                            traceln('FAILED:out of table: id="%s" "%s"' % (ndElt.get("id"),
                                    ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:
                            traceln('FAILED:out of table: id="%s" NOTEXT' % ndElt.get("id"))
            nTot += 1

        if bSep:
            for _nd in ndPage.xpath('//pg:SeparatorRegion', namespaces=dNS):
                ndPageGT.append(deepcopy(_nd))
                nbSepAdded += 1
    if bSep:
        trace(", %d added.)\n" % nbSepAdded)

    if bEval:
        traceln("-" * 40)
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100 * nOk / (nTot + 0.0001)))

    if nNdArea2 == 0:
        raise ProjectException("Empty GT")

    return gtdoc
def main(sInputDir, sAlgoA, sAlgoB, bShape=False, bConvexHull=False, bVerbose=False):
    sAlgoC = sFMT % (sAlgoA, sAlgoB)

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir)
                  if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
    traceln(" - %d files to process, to produce clusters '%s'" % (len(lsFilename), sAlgoC))

    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" - FILE : ", sFullFilename)
        cntCluster, cntPage = 0, 0
        doc = etree.parse(sFullFilename)

        for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
            nRemoved = Cluster.remove(ndPage, sAlgoC)
            lClusterA = Cluster.load(ndPage, sAlgoA)
            lClusterB = Cluster.load(ndPage, sAlgoB)
            if bVerbose:
                trace("Page %d : (%d clusters REMOVED), %d clusters '%s', %d clusters '%s'"
                      % (iPage + 1, nRemoved, len(lClusterA), sAlgoA, len(lClusterB), sAlgoB))

            lClusterC = []
            for A in lClusterA:
                for B in lClusterB:
                    C = Cluster.intersect(A, B)
                    if C is not None:
                        lClusterC.append(C)
            if bVerbose:
                traceln("  -> %d clusters" % len(lClusterC))

            if bShape or bConvexHull:
                for c in lClusterC:
                    c.shape = Cluster.computeShape(ndPage, c.setID, bConvexHull=bConvexHull)

            cntCluster += len(lClusterC)
            cntPage += 1
            Cluster.store(ndPage, lClusterC, sAlgoC)

        doc.write(sFullFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  # compression=0,  # 0 to 9
                  )
        del doc
        traceln(" %d clusters over %d pages" % (cntCluster, cntPage))
    traceln(" done (%d files)" % len(lsFilename))
sInputDir = os.path.join(sInputDir, "col")
if not os.path.isdir(sInputDir):
    sys.stderr.write("Not a directory: %s\n" % sInputDir)
    sys.exit(2)

# ok, go!
traceln("Input is  : ", os.path.abspath(sInputDir))
traceln("algo A is : ", sA)
traceln("algo B is : ", sB)
if options.bShape or options.bConvexHull:
    traceln("Shape of intersections based on content!")
else:
    traceln("Shape of intersections is the intersection of shapes!")

main(sInputDir, sA, sB, options.bShape, options.bConvexHull, options.bVerbose)

traceln("Input was  : ", os.path.abspath(sInputDir))
traceln("algo A was : ", sA)
traceln("algo B was : ", sB)
if options.bShape or options.bConvexHull:
    trace("Shape of intersections based on content: ")
    if options.bConvexHull:
        traceln(" as a convex hull")
    else:
        traceln(" as a minimum rotated rectangle")
else:
    traceln("Shape of intersections is the intersection of shapes!")
traceln("Done.")