def main(sInputDir, options):
    """Apply cluster2Region to every *_du.mpxml file found in sInputDir.

    Each file is parsed, transformed in memory, and written back — either
    over the original file (options.bOverwrite) or under a new name built
    from options.outext.
    """
    # candidate files: basenames of the *_du.mpxml files in the folder
    lsFilename = [os.path.basename(s)
                  for s in os.listdir(sInputDir)
                  if s.endswith("_du.mpxml")]
    traceln("- %d .mpxml files to process" % len(lsFilename))

    for sMPXml in lsFilename:
        trace(".du_mpxml FILE : ", sMPXml)
        if options.bVerbose: traceln()

        # 0 - load input file
        doc = etree.parse(os.path.join(sInputDir, sMPXml))
        cluster2Region(doc, options.bVerbose)

        # output name: same file, or swap the _du.mpxml suffix for outext
        if options.bOverwrite:
            outfilename = sMPXml
        else:
            outfilename = sMPXml[:-len('_du.mpxml')] + "." + options.outext
        traceln(" written in %s" % (os.path.join(sInputDir, outfilename)))
        doc.write(os.path.join(sInputDir, outfilename),
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True)
def split_by_max(crit, sDir):
    """
    here, we create sub-folders, where the files have the same number of row or col

    For each .pxml/.mpxml file under <sDir>/col (ignoring *_du.* output
    files), compute the maximum value of the @row or @col attribute and copy
    the file into <sDir>_<crit>_<max>/col. Files whose attribute values
    cannot be parsed go into a "<sDir>_<crit>_None" folder.

    crit : "row" or "col" - attribute whose maximum defines the grouping
    sDir : root folder; input files are expected in <sDir>/col
    """
    assert crit in ["row", "col"]

    sColDir = os.path.join(sDir, "col")
    traceln("- looking at ", sColDir)
    lsFile = []
    for _fn in os.listdir(sColDir):
        _fnl = _fn.lower()
        # skip DU output files
        if _fnl.endswith("_du.mpxml") or _fnl.endswith("_du.pxml"):
            continue
        # keep only PageXml files
        if not (_fnl.endswith(".mpxml") or _fnl.endswith(".pxml")):
            continue
        lsFile.append(_fn)
    traceln(" %d files" % len(lsFile))

    dCnt = defaultdict(int)  # max value (or None on error) -> number of files

    for sFilename in lsFile:
        trace("- %s" % sFilename)
        sInFile = os.path.join(sColDir, sFilename)
        doc = etree.parse(sInFile)
        rootNd = doc.getroot()
        xp = "//@%s" % crit
        try:
            # ValueError raised by max() on an empty sequence, or by int()
            # on a non-numeric attribute value
            vmax = max(int(_nd) for _nd in PageXml.xpath(rootNd, xp))
            assert vmax >= 0
            sToDir = "%s_%s_%d" % (sDir, crit, vmax)
        except ValueError:
            trace("  ERROR on file %s" % sInFile)
            vmax = None
            sToDir = "%s_%s_%s" % (sDir, crit, vmax)
        del doc
        sToColDir = os.path.join(sToDir, "col")
        try:
            os.mkdir(sToDir)
            os.mkdir(sToColDir)
        except FileExistsError:
            # target folders already created by a previous file
            pass
        copyfile(sInFile, os.path.join(sToColDir, sFilename))
        traceln("   -> ", sToColDir)
        dCnt[vmax] += 1

    # FIX: the original did `dCnt[None]` then `del dCnt[None]`, which only
    # worked because the defaultdict lookup implicitly created the key;
    # pop() is explicit, and the WARNING is now printed only when needed.
    nInvalid = dCnt.pop(None, 0)
    if nInvalid:
        traceln("WARNING: %d invalid files" % nInvalid)
    traceln(sorted(dCnt.items()))
# ---------- Example #3 ----------
 def DYNAMIC_IMPORT(name, package=None):
     chronoOn("import")
     trace("SETUP: Dynamic import of '%s' from '%s'" % (name, package))
     m = import_module(name, package)
     traceln("  done [%.1fs]" % chronoOff("import"))
     return m
def main(sInputDir, sAlgo, bCol=False, scale_H=None, scale_V=None, bVerbose=False):
    """Compute inter-cluster edges and tabulate the clusters named sAlgo in
    every *_du.pxml / *_du.mpxml file of sInputDir (files updated in place).

    sInputDir : folder containing the files to process
    sAlgo     : name of the clustering whose clusters are tabulated
    bCol      : if True tabulate rows and columns, else rows only + cut columns
    scale_H   : optional override of TableCluster.scale_H
    scale_V   : optional override of TableCluster.scale_V
    bVerbose  : trace per-page details
    """
    # FIX: idiomatic 'x is not None' (was 'not x is None')
    if scale_H is not None: TableCluster.scale_H = scale_H
    if scale_V is not None: TableCluster.scale_V = scale_V

    traceln("scale_H=", TableCluster.scale_H)
    traceln("scale_V=", TableCluster.scale_V)

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith("_du.pxml") or name.endswith("_du.mpxml")]
    traceln(" - %d files to process, to tabulate clusters '%s'" % (
        len(lsFilename)
        , sAlgo))
    lsFilename.sort()
    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" -------- FILE : ", sFullFilename)
        cnt = 0
        doc = etree.parse(sFullFilename)

        for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage, namespaces=dNS)):
            lCluster = TableCluster.load(ndPage, sAlgo, bNode=True)  # True to keep a pointer to the DOM node

            if bVerbose:
                trace(" --- Page %d : %d cluster '%s' " %(iPage+1, len(lCluster), sAlgo))
            if len(lCluster) == 0:
                traceln("*** NO cluster '%s' *** we keep this page unchanged"%sAlgo)
                continue
            # drop any pre-existing ClusterEdge nodes before recomputing them
            _nbRm = TableCluster.removeEdgesFromXml(ndPage)
            if bVerbose:
                traceln("\n  %d ClusterEdge removed"%_nbRm)

            TableCluster.computeClusterBoundingBox(lCluster)

            if True:
                # edges are better this way!
                # temporarily scale each cluster to compute the edges, then
                # restore the saved bounding boxes
                lBB = []
                for c in lCluster:
                    lBB.append(c.getBB())
                    c.scale(TableCluster.scale_H, TableCluster.scale_V)
                TableCluster.computeClusterEdge(ndPage, lCluster)
                for c, bb in zip(lCluster, lBB):
                    c.setBB(bb)
                # for c in lCluster: c.scale(1.0/TableCluster.scale_H, 1.0/TableCluster.scale_V)
            else:
                # compute inter- cluster edges from inter- cluster-item edges
                TableCluster.induceClusterEdge(ndPage, lCluster)

            # store inter-cluster edges
            cntPage = TableCluster.addEdgesToXml(ndPage, sAlgo, lCluster)
            if bVerbose:
                traceln("    %d inter-cluster edges   " %(cntPage))

            # compute min/max row/col for each cluster
            # WARNING - side effect on lCluster content and edges
            if bCol:
                TableCluster.tabulate(ndPage, lCluster, bVerbose=bVerbose)
            else:
                TableCluster.tabulate_rows(ndPage, lCluster, bVerbose=bVerbose)
                TableCluster.use_cut_columns(ndPage)

            cnt += cntPage
        traceln("%d inter-cluster edges" %(cnt))

        # write back in place
        doc.write(sFullFilename,
          xml_declaration=True,
          encoding="utf-8",
          pretty_print=True
          #compression=0,  #0 to 9
          )

        del doc

    traceln(" done   (%d files)" % len(lsFilename))
def main(sInputDir, sGTDir, sOutputDir
         , xpElement1, xpElement2
         , xpArea1, xpArea2
         , bNorm, iNorm, bNormOnly
         , bSep
         , lsRmId
         , bEval
         , bWarm
         , sExt = ".mpxml"
         , bVerbose=False):
    """Project GT annotations onto each input file and write the result.

    For every <name><sExt> file of sInputDir (ignoring *_du<sExt> outputs),
    find a matching GT file in sGTDir (same-name file, a same-name sub-folder,
    or a .mpxml fallback), optionally normalize the input elements, project
    the input elements into the GT (project_Elt_to_GT), and write the result
    to sOutputDir. Files without GT, or raising ProjectException, are skipped
    and counted. With bWarm, up-to-date outputs already on disk are kept.
    """
    lSkippedFile = []
    nOK = 0

    # filenames without the path
    lsFilename = [os.path.basename(name) for name in os.listdir(sInputDir) if name.endswith(sExt) and not name.endswith("_du%s"%sExt)]
    traceln(" - %d %s files to process" % (len(lsFilename), sExt))
    lsFilename.sort()
    for sMPXml in lsFilename:
        trace(" - %s FILE : " % sExt, sMPXml)
        if bVerbose: traceln()
        
        # -- find individual subfiles
        # a Transkribus download may have one sub-folder of .pxml per document
        sSubDir = os.path.join(sInputDir, sMPXml[:-len(sExt)])
        if os.path.isdir(sSubDir):
            traceln("  (->  ", sSubDir, ")")
            lsPXml = [os.path.basename(name) for name in os.listdir(sSubDir) if name.endswith(".pxml")]
            if bVerbose: traceln("\t%d files to process"%len(lsPXml))
        else:
            # no sub-folder: process the <sExt> file itself
            sSubDir = sInputDir
            lsPXml = [sMPXml]
            if bVerbose: traceln("\tprocessing the %s file"%sExt)
        
        # -- find GT...
        for sInputXml in lsPXml:
            trace("\t", sMPXml, " -- ", sInputXml)
                
            sGTFN = os.path.join(sGTDir, sInputXml)
            if not os.path.isfile(sGTFN):
                # maybe it is also a folder downloaded from Transkribus?
                if os.path.isfile(os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)):
                    sGTFN = os.path.join(sGTDir, sMPXml[:-len(".mpxml")], sInputXml)
                else:
                    # hummm, maybe it is a mpxml instead... :-/
                    sGTFN = sGTFN[:-len(".pxml")] + ".mpxml"
                    if not os.path.isfile(sGTFN):
                        traceln("  *** NO GT *** file skipped ")
                        lSkippedFile.append(sInputXml)
                        continue
            # ok GT file found
            trace(" ...")

            # input Xml
            sInFN = os.path.join(sSubDir, sInputXml)
            sOutFN = os.path.join(sOutputDir, sInputXml)

            if bWarm and os.path.exists(sOutFN):
                # check existence and freshness
                # (skip when the output is newer than both input and GT)
                t_in  = os.path.getmtime(sInFN)
                t_gt  = os.path.getmtime(sGTFN)
                t_out =  os.path.getmtime(sOutFN)
                if t_out > t_in and t_out > t_gt:
                    traceln("\t\t fresh output file found on disk: %s  - skipping it!"%sOutFN)
                    continue
            
            # 0 - load input file
            doc = etree.parse(sInFN)
            
            # 1 - normalize input elements
            if bNorm: 
                doc = normaliseDocElements(doc, xpElement2, iNorm)
            
            # 2 - project GT
            try:
                if not bNormOnly:
                    gtdoc = etree.parse(sGTFN)
                    # NOTE(review): the alternative branch is disabled by
                    # the constant condition below
                    if True:
                        doc = project_Elt_to_GT(gtdoc, doc
                                                , xpElement1, xpElement2
                                                , xpArea2, bSep, lsRmId, bEval)
                    else:
                        doc = project_Areas_to_Input(gtdoc, doc
                                                     , xpElement1, xpElement2, xpArea1, xpArea2
                                                     , bSep, lsRmId, bEval)

                # 3 - save
                doc.write(sOutFN,
                          xml_declaration=True,
                          encoding="utf-8",
                          pretty_print=True
                          #compression=0,  #0 to 9
                          )
                nOK += 1
            except ProjectException as e:
                # projection failed: count the file as skipped
                traceln("Exception: ", e)
                lSkippedFile.append(sInputXml)
            # done
            
            del doc
            traceln(" done")
            
    
    traceln(" - %d file produced,  %d .pxml files skipped" % (nOK, len(lSkippedFile)))
def project_Elt_to_GT(gtdoc, doc
                      , xpElement1, xpElement2
                      , xpArea2
                      , bSep, lsRmId, bEval
                      , fTH=0.5):
    """
    Here we take the element out of the production file to put them in the GT
    doc

    WE IGNORE xpArea1 (no need for it)

    Each xpElement2 node of the input doc is attached to the GT xpArea2 node
    (e.g. TableCell) it overlaps by more than fTH of its own area, otherwise
    it is wrapped in a new TextRegion appended to the GT page.
    With bEval, the original GT assignment is used to score the projection.
    With bSep, SeparatorRegion nodes are replaced by those of the input doc.

    We return the GT doc.  Raises ProjectException when the GT has no area.
    """
    gtroot = gtdoc.getroot()

    # Evaluation
    # we build a table of list of TextLineId from the GT to check this SW
    # table_id -> row -> col -> list of element id
    dTable = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
    nOk, nTot = 0, 0

    # optional removal of specific GT elements by @id
    if lsRmId:
        nbEltRemoved = 0
        for sRmId in lsRmId:
            # for _nd in gtroot.xpath('//pg:*[@id="%s"]'%sRmId, namespaces=dNS):
            for _nd in gtroot.xpath('//*[@id="%s"]'%sRmId):
                _nd.getparent().remove(_nd)
                nbEltRemoved += 1
        trace(" (Rm by ID: %d elements removed)" % nbEltRemoved)

    # remove all elements of interest from GT
    # inside TableRegion, we have TextLine, outside we have TextRegion
    if xpElement1 != xpArea2:
        for ndElt in gtroot.xpath(xpElement1, namespaces=dNS):
            if bEval:
                # elements outside any table are recorded under the None keys
                for ndElt2 in ndElt.xpath(xpElement2, namespaces=dNS):
                    dTable[None][None][None].append(ndElt2.get("id"))
            ndElt.getparent().remove(ndElt)
    for ndElt in gtroot.xpath(xpElement2, namespaces=dNS):
        ndCell = ndElt.getparent()
        if bEval: dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")].append(ndElt.get("id"))
        ndCell.remove(ndElt)
    # FIX: typo in the message ("\npEvaluation" -> "\nEvaluation")
    if bEval: traceln("\nEvaluation mode")

    if bSep:
        # remove all GT separators; input separators are added per page below
        nbSepRemoved, nbSepAdded = 0, 0
        for _nd in gtroot.xpath('//pg:SeparatorRegion', namespaces=dNS):
            _nd.getparent().remove(_nd)
            nbSepRemoved += 1
        trace(" (Separators: %d removed" % nbSepRemoved)

    # project the GT areas, page by page
    lNdPage   = doc.getroot().xpath("//pg:Page", namespaces=dNS)
    lNdPageGT =        gtroot.xpath("//pg:Page", namespaces=dNS)
    if len(lNdPage) != len(lNdPageGT):
        # NOTE(review): this raises GTProjectionException while the caller
        # catches ProjectException - confirm the exception hierarchy links them
        raise GTProjectionException("GT and input have different numbers of pages")
    assert len(lNdPage) > 0, "No page??"

    uniqID = 1
    nNdArea2 = 0
    for ndPage, ndPageGT in zip(lNdPage, lNdPageGT):
        lNdArea2 = ndPageGT.xpath(xpArea2, namespaces=dNS)
        loArea2 = [ShapeLoader.node_to_Polygon(nd) for nd in lNdArea2]
        nNdArea2 += len(lNdArea2)
        for ndElt in ndPage.xpath(xpElement2, namespaces=dNS):
            oElt = ShapeLoader.node_to_Polygon(ndElt)

            # overlap area with each GT area of this page; best match wins
            lOvrl = [oElt.intersection(o).area for o in loArea2]
            iMax = argmax(lOvrl) if lOvrl else None
            vMax = -1 if iMax is None else lOvrl[iMax]

            # where to add it?
            if vMax > 0 and vMax / oElt.area > fTH:
                # ok, this is a match
                ndCell = lNdArea2[iMax]
                # add it directly to the area2 (TableCell)
                ndCell.append(deepcopy(ndElt))
                if bEval:
                    if ndElt.get("id") in dTable[ndCell.getparent().get("id")][ndCell.get("row")][ndCell.get("col")]:
                        nOk += 1
                    else:
                        try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

            else:
                # add it outside of any area
                bestNd = ndPageGT
                # add it in its own TextRegion
                ndTR = etree.Element("TextRegion")
                ndTR.set("id", "prjct_region_%d" % uniqID)
                uniqID += 1
                ndTR.set("custom", "")
                ndTR.append(deepcopy(ndElt.xpath("./pg:Coords", namespaces=dNS)[0]))
                ndTR.append(deepcopy(ndElt))
                bestNd.append(ndTR)
                if bEval:
                    if ndElt.get("id") in dTable[None][None][None]:
                        nOk += 1
                    else:
                        try: traceln('FAILED:in table: id="%s" "%s"' % (ndElt.get("id"), ndElt.xpath(".//pg:Unicode", namespaces=dNS)[0].text))
                        except IndexError:traceln('FAILED:in table: id="%s" NOTEXT"' % (ndElt.get("id")))

            nTot += 1

        if bSep:
            # FIX: use a relative XPath ('.//') - in lxml, '//' from an
            # element node is ABSOLUTE and searches the whole document, so
            # every GT page would have received ALL separators of all pages
            for _nd in ndPage.xpath('.//pg:SeparatorRegion', namespaces=dNS):
                ndPageGT.append(deepcopy(_nd))
                nbSepAdded += 1
    if bSep: trace(", %d added.)  " % nbSepAdded)

    if bEval:
        traceln("-"*40)
        # +0.0001 avoids division by zero when nTot == 0
        trace(" - evaluation: %d ok out of %d = %.2f%%\n" % (nOk, nTot, 100*nOk / (nTot+0.0001)))

    if nNdArea2 == 0: raise ProjectException("Empty GT")
    return gtdoc
# ---------- Example #7 ----------
def main(sInputDir,
         sAlgoA,
         sAlgoB,
         bShape=False,
         bConvexHull=False,
         bVerbose=False):
    """Intersect, page by page, the clusters of sAlgoA with those of sAlgoB
    and store the result under the combined name sFMT % (sAlgoA, sAlgoB).

    sInputDir   : folder of *_du.pxml / *_du.mpxml files (updated in place)
    sAlgoA/B    : names of the two clusterings to intersect
    bShape      : compute the shape of each intersection from its content
    bConvexHull : use a convex hull for that shape
    bVerbose    : trace per-page details
    """
    sAlgoC = sFMT % (sAlgoA, sAlgoB)

    # filenames without the path
    lsFilename = [
        os.path.basename(name) for name in os.listdir(sInputDir)
        if name.endswith("_du.pxml") or name.endswith("_du.mpxml")
    ]
    traceln(" - %d files to process, to produce clusters '%s'" %
            (len(lsFilename), sAlgoC))

    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" - FILE : ", sFullFilename)
        cntCluster, cntPage = 0, 0
        doc = etree.parse(sFullFilename)

        for iPage, ndPage in enumerate(doc.getroot().xpath(xpPage,
                                                           namespaces=dNS)):
            # drop any previous result of the same combined algo
            nRemoved = Cluster.remove(ndPage, sAlgoC)

            lClusterA = Cluster.load(ndPage, sAlgoA)
            lClusterB = Cluster.load(ndPage, sAlgoB)

            if bVerbose:
                trace(
                    "Page %d : (%d clusters REMOVED),   %d cluster '%s'   %d clusters '%s'"
                    % (iPage + 1, nRemoved, len(lClusterA), sAlgoA,
                       len(lClusterB), sAlgoB))

            # pairwise intersection; keep the non-empty ones
            lClusterC = []
            for A in lClusterA:
                for B in lClusterB:
                    C = Cluster.intersect(A, B)
                    # FIX: idiomatic 'is not None' (was 'not C is None')
                    if C is not None:
                        lClusterC.append(C)

            if bVerbose: traceln("    -> %d clusters" % (len(lClusterC)))
            if bShape or bConvexHull:
                for c in lClusterC:
                    c.shape = Cluster.computeShape(ndPage,
                                                   c.setID,
                                                   bConvexHull=bConvexHull)

            cntCluster += len(lClusterC)
            cntPage += 1

            Cluster.store(ndPage, lClusterC, sAlgoC)

        # write back in place
        doc.write(sFullFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True
                  #compression=0,  #0 to 9
                  )

        del doc
        traceln(" %d clusters over %d pages" % (cntCluster, cntPage))

    traceln(" done   (%d files)" % len(lsFilename))
# ---------- Example #8 ----------
        # NOTE(review): this span starts mid-scope - its enclosing definition
        # (presumably the __main__ / option-parsing section) is not visible
        # here, so sInputDir, sA, sB and options come from that outer scope.
        # Append the Transkribus "col" sub-folder to the input path.
        sInputDir = os.path.join(sInputDir, "col")

    if not os.path.isdir(sInputDir):
        sys.stderr.write("Not a directory: %s\n" % sInputDir)
        sys.exit(2)

    # ok, go!
    traceln("Input  is : ", os.path.abspath(sInputDir))
    traceln("algo A is : ", sA)
    traceln("algo B is : ", sB)
    if options.bShape or options.bConvexHull:
        traceln("Shape of intersections based on content!")
    else:
        traceln("Shape of intersections is the intersection of shapes!")

    # run the cluster-intersection main
    main(sInputDir, sA, sB, options.bShape, options.bConvexHull,
         options.bVerbose)

    # recap the settings after the run
    traceln("Input  was : ", os.path.abspath(sInputDir))
    traceln("algo A was : ", sA)
    traceln("algo B was : ", sB)
    if options.bShape or options.bConvexHull:
        trace("Shape of intersections based on content: ")
        if options.bConvexHull:
            traceln(" as a convex hull")
        else:
            traceln(" as a minimum rotated rectangle")
    else:
        traceln("Shape of intersections is the intersection of shapes!")

    traceln("Done.")