Beispiel #1
0
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Tag the TextLine nodes with their header, row and column labels.
    Modify the XML DOM in place, return nothing.

    root        DOM node under which the TextRegion nodes are looked for
    lCells      table cell DOM nodes whose TextLine children get tagged
    maxRowSpan  a cell whose 'row' attribute is below this value gets
                lLabels_HEADER[1], any other cell gets lLabels_HEADER[0]
    """
    sXpCoords = "./a:%s" % ("Coords")

    for ndCell in lCells:
        lTextLine = MultiPageXml.getChildByName(ndCell, 'TextLine')

        # --- HEADER WISE: D CH O
        iHeader = 1 if int(ndCell.get('row')) < maxRowSpan else 0
        for ndLine in lTextLine:
            ndLine.set(sDUHeader, lLabels_HEADER[iHeader])

        # --- ROW WISE: B I E S O
        # a lone line gets label #3, otherwise first=#0, inner=#1, last=#2
        nLine = len(lTextLine)
        if nLine == 1:
            lTextLine[0].set(sDURow, lLabelsBIESO_R[3])
        elif nLine > 1:
            lTextLine[0].set(sDURow, lLabelsBIESO_R[0])
            for ndLine in lTextLine[1:-1]:
                ndLine.set(sDURow, lLabelsBIESO_R[1])
            lTextLine[-1].set(sDURow, lLabelsBIESO_R[2])

        # --- COLUMN WISE: M S O
        # only the right-hand side of the cell bounding box is needed
        dNS = {"a": MultiPageXml.NS_PAGE_XML}
        ndCellCoords = ndCell.xpath(sXpCoords, namespaces=dNS)[0]
        plgnCell = Polygon.parsePoints(ndCellCoords.get('points'))
        _, _, xCellRight, _ = plgnCell.getBoundingBox()

        for ndLine in lTextLine:
            ndLineCoords = ndLine.xpath(sXpCoords, namespaces=dNS)[0]
            lXY = []
            for sPair in ndLineCoords.get('points').split(' '):
                try:
                    sx, sy = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s  IGNORED"%ndLine.get("id"))

            if not lXY:
                # no usable coordinate at all for this line
                ndLine.set(sDUCol, lLabelsSM_C[-1])
                continue

            # (open question from the original author: how to define a CM element?)
            # the line gets label #0 when it sticks out on the right of the cell
            # by more than 75% of the part of it lying left of the cell edge
            x1, _, x2, _ = Polygon(lXY).getBoundingBox()
            bSticksOut = x2 > xCellRight and (x2 - xCellRight) > 0.75 * (xCellRight - x1)
            ndLine.set(sDUCol, lLabelsSM_C[0] if bSticksOut else lLabelsSM_C[1])

    # textline outside table: last label of each label set
    for ndRegion in MultiPageXml.getChildByName(root, 'TextRegion'):
        for ndLine in MultiPageXml.getChildByName(ndRegion, 'TextLine'):
            ndLine.set(sDURow, lLabelsBIESO_R[-1])
            ndLine.set(sDUCol, lLabelsSM_C[-1])
            ndLine.set(sDUHeader, lLabels_HEADER[-1])
Beispiel #2
0
        if len(sys.argv) == 3: 
            # COMPATIBILITY MODE
            #load mpxml 
            sFilename = sys.argv[1]
            sOutFilename = sys.argv[2]
            lsFilename = [sFilename]
            lsOutFilename = [sOutFilename]
        else:
            #we expect a folder 
            sInput = sys.argv[1]
            if os.path.isdir(sInput):
                lsFilename = [os.path.join(sInput, "col", s) for s in os.listdir(os.path.join(sInput, "col")) if s.endswith(".mpxml") ]
                if not lsFilename:
                    lsFilename = [os.path.join(sInput, "col", s) for s in os.listdir(os.path.join(sInput, "col")) if s.endswith(".pxml") ]
                lsFilename.sort()
                lsOutFilename = [ os.path.dirname(s) + os.sep + "c_" + os.path.basename(s) for s in lsFilename]
            else:
                traceln("%s is not a folder"%sys.argv[1])
                raise IndexError()
    except IndexError:
        traceln("Usage: %s ( input-file output-file | folder )" % sys.argv[0])
        exit(1)
            
    traceln(lsFilename)
    traceln("%d files to be processed" % len(lsFilename))
    traceln(lsOutFilename)

    main(lsFilename, lsOutFilename)


Beispiel #3
0
def main(lsFilename, lsOutFilename):
    """
    Annotate each input file and write the result in the paired output file.

    lsFilename     list of input XML filenames
    lsOutFilename  list of output filenames, paired with lsFilename by position

    A file without any TableCell is skipped. When a TableAnnotationException
    is raised while dealing with the separators, nothing is written for that
    file and a message is traced instead.
    """
    # blank text is removed for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)

    # Oct' 2018 RV and JL decided that we keep the binding TextLine (if any!)
    # so discarding the "binding" cells is disabled.
    bDiscardBindingCells = False

    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        lCells = MultiPageXml.getChildByName(root, 'TableCell')
        if not lCells:
            traceln("ERROR: no TableCell - SKIPPING THIS FILE!!!")
            continue

        # default: O for all cells: all cells must have all tags!
        for ndCell in lCells:
            for ndLine in MultiPageXml.getChildByName(ndCell, 'TextLine'):
                ndLine.set(sDURow, lLabelsBIESO_R[-1])
                ndLine.set(sDUCol, lLabelsSM_C[-1])
                ndLine.set(sDUHeader, lLabels_HEADER[-1])

        if bDiscardBindingCells:
            # a cell spanning "many" rows is considered as a binding cell
            maxrow = max(int(x.get('row')) for x in lCells)
            binding_rowspan = max(5, maxrow * 0.8)
            traceln(" - max row = %d  => considering rowspan > %d as binding cells"
                    % (maxrow, binding_rowspan))
            lValidCell, lBindingCell = [], []
            for ndCell in lCells:
                bValid = int(ndCell.get('rowSpan')) < binding_rowspan
                (lValidCell if bValid else lBindingCell).append(ndCell)
            nDiscarded = len(lBindingCell)
            if nDiscarded > 1:
                traceln("****************   WARNING  ****************")
            traceln(" - %d cells discarded as binding cells" % nDiscarded)
            for ndCell in lBindingCell:
                ndCell.set("type", "table-binding")
            lCells = lValidCell

        # FOR COLUMN HEADER: get max(cell[0,i].span)
        maxRowSpan = computeMaxRowSpan(lCells)
        tag_DU_row_col_header(root, lCells, maxRowSpan)

        try:
            removeSeparator(root)
            addSeparator(root, lCells)
            doc.write(sOutFilename, encoding='utf-8',
                      pretty_print=True, xml_declaration=True)
            traceln('annotation done for %s  --> %s' % (sFilename, sOutFilename))
        except TableAnnotationException:
            traceln("No Table region in file ", sFilename, "  IGNORED!!")

        del doc
Beispiel #4
0
                0-            TOC-entry    5940 occurences       (   2%)  (   2%)
                1-              caption     707 occurences       (   0%)  (   0%)
                2-           catch-word     201 occurences       (   0%)  (   0%)
                3-               footer      11 occurences       (   0%)  (   0%)
                4-             footnote   36942 occurences       (  11%)  (  11%)
                5-   footnote-continued    1890 occurences       (   1%)  (   1%)
                6-               header   15910 occurences       (   5%)  (   5%)
                7-              heading   18032 occurences       (   6%)  (   6%)
                8-           marginalia    4292 occurences       (   1%)  (   1%)
                9-          page-number   40236 occurences       (  12%)  (  12%)
               10-            paragraph  194927 occurences       (  60%)  (  60%)
               11-       signature-mark    4894 occurences       (   2%)  (   2%)
"""
lActuallySeen = None
if lActuallySeen:
    traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
    lIgnoredLabels = [
        lLabels[i] for i in range(len(lLabels)) if i not in lActuallySeen
    ]
    lLabels = [lLabels[i] for i in lActuallySeen]
    traceln(len(lLabels), lLabels)
    traceln(len(lIgnoredLabels), lIgnoredLabels)
    nbClass = len(lLabels) + 1  #because the ignored labels will become OTHER

    #DEFINING THE CLASS OF GRAPH WE USE
    DU_GRAPH = Graph_MultiPageXml
    nt = NodeType_PageXml_type_NestedText(
        "gtb"  #some short prefix because labels below are prefixed with it
        ,
        lLabels,
        lIgnoredLabels,
Beispiel #5
0
              
    
if __name__ == "__main__":
    # Script entry point: parse the command line, build the task object and
    # handle the "rm" option.

    version = "v.01"
    # the option parser comes from DU_CRF_Task, together with usage and description texts
    usage, description, parser = DU_CRF_Task.getBasicTrnTstRunOptionParser(sys.argv[0], version)
#     parser.add_option("--annotate", dest='bAnnotate',  action="store_true",default=False,  help="Annotate the textlines with BIES labels")    
    # --- 
    #parse the command line
    (options, args) = parser.parse_args()
    
    # --- 
    # exactly two positional arguments are expected: model folder, then model name
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        # NOTE(review): _exit is defined elsewhere; presumably it shows the usage and exits - confirm
        _exit(usage, 1, e)
        
    # the CRF settings are taken from the command line options
    doer = DU_ABPTable(sModelName, sModelDir,
                      C                 = options.crf_C,
                      tol               = options.crf_tol,
                      njobs             = options.crf_njobs,
                      max_iter          = options.crf_max_iter,
                      inference_cache   = options.crf_inference_cache)
    
    
    
    # when the rm option is set: call doer.rm() and exit with status 0
    if options.rm:
        doer.rm()
        sys.exit(0)
Beispiel #6
0
    def parseXmlFile(self, sFilename, iVerbose=0):
        """
        Load that document as a CRF Graph.
        Also set the self.doc variable!

        sFilename  name of the XML file to parse
        iVerbose   any true value traces the total node/edge count at the end,
                   a value >= 2 also traces the per-page counts

        The pre-existing cut lines are removed from the DOM, then the cut
        lines are (re)created, using the groundtruth separators if any.
        The nodes are stored in self.lNode (all), self.lNodeBlock (classic
        node types) and self.lNodeCutLine (other node types), the edges in
        self.lEdge.

        Return a CRF Graph object (actually self)
        """
        self.doc = etree.parse(sFilename)
        self.lNode, self.lEdge = list(), list()
        self.lNodeBlock = []  # text node
        self.lNodeCutLine = []  # cut line node

        root = self.doc.getroot()

        doer = BaselineCutAnnotator()
        doer.setLabelScheme_SIO()  #use SIO instead of SO labels!

        #doer.setModulo(self.iModulo)  # this is optional

        #load the groundtruth table separators, if any, per page (1 in tABP)
        ltlYlX = doer.get_separator_YX_from_DOM(root, self.fMinPageCoverage)
        for (lHi, lVi) in ltlYlX:
            traceln(" - found %d horizontal,  %d vertical  GT separators" %
                    (len(lHi), len(lVi)))

        #create DOM node reflecting the cuts
        #first clean (just in case!)
        n = doer.remove_cuts_from_dom(root)
        if n > 0:
            traceln(" - removed %d pre-existing cut lines" % n)

        # if GT, then we have labelled cut lines in DOM
        # (the returned value is not needed here)
        doer.add_cut_to_DOM(root, ltlYlX=ltlYlX)

        # split the node types: the "classic" ones versus the others (special)
        lClassicType = [
            nt for nt in self.getNodeTypeList() if nt in self._lClassicNodeType
        ]
        lSpecialType = [
            nt for nt in self.getNodeTypeList()
            if nt not in self._lClassicNodeType
        ]

        for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
            #now that we have the page, let's create the node for each type!
            lClassicPageNode = [
                nd for nodeType in lClassicType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]
            lSpecialPageNode = [
                nd for nodeType in lSpecialType
                for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)
            ]

            self.lNode.extend(lClassicPageNode)  # e.g. the TextLine objects
            self.lNodeBlock.extend(lClassicPageNode)

            self.lNode.extend(lSpecialPageNode)  # e.g. the cut lines!
            self.lNodeCutLine.extend(lSpecialPageNode)

            #no previous page to consider (for cross-page links...) => None
            lClassicPageEdge = Edge.computeEdges(None, lClassicPageNode)
            self.lEdge.extend(lClassicPageEdge)

            # Now, compute edges between special and classic objects...
            lSpecialPageEdge = self.computeSpecialEdges(
                lClassicPageNode, lSpecialPageNode, doer.bCutIsBeforeText)
            self.lEdge.extend(lSpecialPageEdge)

            if iVerbose >= 2:
                traceln("\tPage %5d" % (pnum))
                # exactly one value per %d: the page number is traced above,
                # passing it again here would raise a TypeError
                traceln("\t   block: %6d nodes    %7d edges (to block)" %
                        (len(lClassicPageNode), len(lClassicPageEdge)))
                traceln("\t   line: %6d nodes    %7d edges (from block)" %
                        (len(lSpecialPageNode), len(lSpecialPageEdge)))

        if iVerbose:
            traceln("\t\t (%d nodes,  %d edges)" %
                    (len(self.lNode), len(self.lEdge)))

        return self
    def getConfiguredGraphClass(cls):
        """
        Return the graph class to be used, configured with one node type.

        The graph class depends on cls.bPerPage, the node type class on
        cls.bHTR and the xpath selecting the nodes on cls.bTextLine.
        """
        lLabels = [
            'heading', 'header', 'page-number', 'resolution-number',
            'resolution-marginalia', 'resolution-paragraph', 'other'
        ]
        lIgnoredLabels = None

        # if you play with a toy collection, which does not have all expected
        # classes, you can reduce those, by listing here the indices of the
        # labels actually seen.
        lActuallySeen = None
        if lActuallySeen:
            traceln("REDUCING THE CLASSES TO THOSE SEEN IN TRAINING")
            lIgnoredLabels = [
                sLabel for i, sLabel in enumerate(lLabels)
                if i not in lActuallySeen
            ]
            lLabels = [lLabels[i] for i in lActuallySeen]
            traceln(len(lLabels), lLabels)
            traceln(len(lIgnoredLabels), lIgnoredLabels)

        # DEFINING THE CLASS OF GRAPH WE USE
        # per page: consider each page as if independent from each other
        DU_GRAPH = Graph_MultiSinglePageXml if cls.bPerPage else Graph_MultiPageXml

        # without HTR, the text is ignored
        ntClass = NodeType_PageXml_type if cls.bHTR else NodeType_PageXml_type_woText

        nt = ntClass(
            "bar",  # some short prefix because labels below are prefixed with it
            lLabels,
            lIgnoredLabels,
            False,  # no label means OTHER
            # we reduce overlap in this way
            BBoxDeltaFun=lambda v: max(v * 0.066, min(5, v / 3)))
        nt.setLabelAttribute("DU_sem")

        # first xpath: how to find the nodes, second one: how to get their text
        sXpNode = ".//pc:TextRegion/pc:TextLine" if cls.bTextLine else ".//pc:TextRegion"
        nt.setXpathExpr((sXpNode, "./pc:TextEquiv"))

        DU_GRAPH.addNodeType(nt)

        return DU_GRAPH
Beispiel #8
0
    def computeSpecialEdges(self, lClassicPageNode, lSpecialPageNode,
                            bCutIsBeforeText):
        """
        Build the edges involving the cut lines of one page:
        - block to cut line edges, for the cut lines at most
          self.iBlockVisibility away from the block baseline point
        - cut line to cut line edges, if self.iLineVisibility > 0

        Side effects: each block gets the x_bslne and y_bslne attributes, the
        y1 and y2 of each cut line are rounded to an integer, and
        lSpecialPageNode may get sorted in place.

        Return the list of edges
        """
        bDebug = False  # set to True to dump the edges around the filtering

        def _dumpEdges(sTitleFormat, lDumped, bShowCount):
            # debugging aid: print the edges ordered by block id
            print(sTitleFormat % len(lDumped))
            lSortedEdge = sorted(lDumped, key=lambda x: x.A.domid)
            if bShowCount:
                print(len(lSortedEdge))
            for edge in lSortedEdge:
                print("Block domid=%s y1=%s y2=%s yg=%s" %
                      (edge.A.domid, edge.A.y1, edge.A.y2, edge.A.y_bslne) +
                      "  %s line %s " %
                      (["↑", "-", "↓"][1 + edge._type], edge.B.y1) +
                      "domid=%s y1=%s  " % (edge.B.domid, edge.B.y1) +
                      str(id(edge)))

        # each block is given the coordinates of its baseline point
        for blk in lClassicPageNode:
            try:
                xBase, yBase = BaselineCutAnnotator.getDomBaselineXY(blk.node)
            except IndexError:
                traceln("** WARNING: no Baseline in ", blk.domid)
                traceln("** Using x2 and y2 instead... :-/")
                xBase, yBase = blk.x2, blk.y2
            blk.x_bslne = xBase
            blk.y_bslne = yBase

        # the cut lines get integer ordinates (DeltaFun makes them float)
        for cutBlk in lSpecialPageNode:
            assert cutBlk.y1 == cutBlk.y2
            yCut = int(round(cutBlk.y1))
            cutBlk.y1 = yCut
            cutBlk.y2 = yCut

        # --- block to cut line edges
        lEdge = []
        for blk in lClassicPageNode:
            for cutBlk in lSpecialPageNode:
                if blk.y_bslne == cutBlk.y1:
                    # Cut line is crossing the block
                    edge = Edge_BL(blk, cutBlk)
                    edge.len = 0
                    edge._type = 0
                elif abs(blk.y_bslne - cutBlk.y1) <= self.iBlockVisibility:
                    edge = Edge_BL(blk, cutBlk)
                    # experiments show that abs helps (versus a signed length)
                    edge.len = abs(blk.y_bslne -
                                   cutBlk.y1) / self.iBlockVisibility
                    edge._type = -1 if blk.y_bslne > cutBlk.y1 else +1
                else:
                    # the block does not see that cut line
                    continue
                lEdge.append(edge)

        # from top to bottom (o.B.y1 == o.B.y2 by construction)
        lEdge.sort(key=lambda o: o.B.y1)

        # --- filtering of those edges
        nBefore = len(lEdge)
        if bDebug:
            _dumpEdges("--- before filtering: %d edges", lEdge, False)
        lEdge = self._filterBadEdge(lEdge, lSpecialPageNode, bCutIsBeforeText)
        traceln(" - filtering: removed %d edges due to obstruction." %
                (nBefore - len(lEdge)))
        if bDebug:
            _dumpEdges("--- After filtering: %d edges", lEdge, True)

        if self.iLineVisibility <= 0:
            return lEdge

        # --- Cut line to Cut line edges
        lSpecialPageNode.sort(key=lambda o: o.y1)
        for i, A in enumerate(lSpecialPageNode):
            for B in lSpecialPageNode[i + 1:]:
                if B.y1 - A.y1 > self.iLineVisibility:
                    # sorted list: the next ones are even further away
                    break
                edge = Edge_LL(A, B)
                edge.len = (B.y1 - A.y1) / self.iLineVisibility
                assert edge.len >= 0
                lEdge.append(edge)

        return lEdge
Beispiel #9
0
def main(sModelDir, sModelName, options):
    """
    Run the task selected by the command line options.

    sModelDir   folder of the model
    sModelName  name of the model
    options     parsed command line options

    Depending on the options, do one of: remove the model, run one step of a
    cross-validation (then exit the process with status 0), evaluate by
    n-fold, train, or test. In addition, if some run collections are given,
    run the model on them.
    """
    doer = DU_ABPTableRCut(sModelName,
                           sModelDir,
                           iBlockVisibility=options.iBlockVisibility,
                           iLineVisibility=options.iLineVisibility,
                           C=options.crf_C,
                           tol=options.crf_tol,
                           njobs=options.crf_njobs,
                           max_iter=options.max_iter,
                           inference_cache=options.crf_inference_cache)

    if options.rm:
        doer.rm()
        return

    lTrn, lTst, lRun, lFold = [
        _checkFindColDir(lsDir, bAbsolute=False)
        for lsDir in [options.lTrn, options.lTst, options.lRun, options.lFold]
    ]

    traceln("- classes: ", doer.getGraphClass().getLabelNameList())

    ## use. a_mpxml files
    #doer.sXmlFilenamePattern = doer.sLabeledXmlFilenamePattern

    if options.iFoldInitNum or options.iFoldRunNum or options.bFoldFinish:
        if options.iFoldInitNum:
            # initialization of a cross-validation
            doer._nfold_Init(lFold, options.iFoldInitNum, bStoreOnDisk=True)
        elif options.iFoldRunNum:
            # Run one fold
            oReport = doer._nfold_RunFoldFromDisk(options.iFoldRunNum,
                                                  options.warm, options.pkl)
            traceln(oReport)
        elif options.bFoldFinish:
            tstReport = doer._nfold_Finish()
            traceln(tstReport)
        else:
            assert False, "Internal error"
        #no more processing!!
        # (exit() is a helper of the site module meant for interactive use,
        #  so we raise the exception it would raise)
        raise SystemExit(0)
        #-------------------

    if lFold:
        loTstRpt = doer.nfold_Eval(lFold, 3, .25, None, options.pkl)
        sReportPickleFilename = os.path.join(sModelDir,
                                             sModelName + "__report.txt")
        traceln("Results are in %s" % sReportPickleFilename)
        graph.GraphModel.GraphModel.gzip_cPickle_dump(sReportPickleFilename,
                                                      loTstRpt)
    elif lTrn:
        doer.train_save_test(lTrn, lTst, options.warm, options.pkl)
        try:
            traceln("Baseline best estimator: %s" %
                    doer.bsln_mdl.best_params_)  #for CutSearch
        except Exception:
            # best effort: this information is not always available.
            # (a bare except would also swallow KeyboardInterrupt and SystemExit)
            pass
        traceln(" --- CRF Model ---")
        traceln(doer.getModel().getModelInfo())
    elif lTst:
        doer.load()
        tstReport = doer.test(lTst)
        traceln(tstReport)
        if options.bDetailedReport:
            traceln(tstReport.getDetailledReport())
            sReportPickleFilename = os.path.join(
                sModelDir, sModelName + "__detailled_report.txt")
            graph.GraphModel.GraphModel.gzip_cPickle_dump(
                sReportPickleFilename, tstReport)

    if lRun:
        if options.storeX or options.applyY:
            try:
                doer.load()
            except Exception:
                # a failure to load is tolerated: we only need the transformer.
                # (a bare except would also swallow KeyboardInterrupt and SystemExit)
                pass
            lsOutputFilename = doer.runForExternalMLMethod(
                lRun, options.storeX, options.applyY, options.bRevertEdges)
        else:
            doer.load()
            lsOutputFilename = doer.predict(lRun)

        traceln("Done, see in:\n  %s" % lsOutputFilename)
Beispiel #10
0
def main(sInputDir,
         sAlgoA,
         sAlgoB,
         bShape=False,
         bConvexHull=False,
         bVerbose=False):
    """
    For each "_du.pxml" or "_du.mpxml" file of the folder, and for each of
    its pages, intersect the clusters produced by algo A with those produced
    by algo B, and store the intersections in the page. The file is then
    overwritten.

    sInputDir    folder containing the files to process
    sAlgoA       name of the first clustering algo
    sAlgoB       name of the second clustering algo
    bShape       if True, the shape of each new cluster is computed by
                 Cluster.computeShape
    bConvexHull  same, the flag being passed to Cluster.computeShape
    bVerbose     if True, trace the number of clusters per page
    """
    # the name of the new clusters is derived from the names of A and B
    sAlgoC = sFMT % (sAlgoA, sAlgoB)
    bComputeShape = bShape or bConvexHull

    # filenames without the path
    lsFilename = [
        os.path.basename(sName) for sName in os.listdir(sInputDir)
        if sName.endswith(("_du.pxml", "_du.mpxml"))
    ]
    traceln(" - %d files to process, to produce clusters '%s'" %
            (len(lsFilename), sAlgoC))

    for sFilename in lsFilename:
        sFullFilename = os.path.join(sInputDir, sFilename)
        traceln(" - FILE : ", sFullFilename)
        cntCluster = 0
        cntPage = 0
        doc = etree.parse(sFullFilename)

        lNdPage = doc.getroot().xpath(xpPage, namespaces=dNS)
        for iPage, ndPage in enumerate(lNdPage):
            # first get rid of the clusters of a previous run, if any
            nRemoved = Cluster.remove(ndPage, sAlgoC)

            lClusterA = Cluster.load(ndPage, sAlgoA)
            lClusterB = Cluster.load(ndPage, sAlgoB)

            if bVerbose:
                trace(
                    "Page %d : (%d clusters REMOVED),   %d cluster '%s'   %d clusters '%s'"
                    % (iPage + 1, nRemoved, len(lClusterA), sAlgoA,
                       len(lClusterB), sAlgoB))

            # all the pairs (A, B), keeping the intersections which exist
            lClusterC = [
                C for C in (Cluster.intersect(A, B)
                            for A in lClusterA for B in lClusterB)
                if C is not None
            ]

            if bVerbose:
                traceln("    -> %d clusters" % (len(lClusterC)))

            if bComputeShape:
                for c in lClusterC:
                    c.shape = Cluster.computeShape(ndPage,
                                                   c.setID,
                                                   bConvexHull=bConvexHull)

            cntCluster += len(lClusterC)
            cntPage += 1

            Cluster.store(ndPage, lClusterC, sAlgoC)

        # the input file is overwritten
        doc.write(sFullFilename,
                  xml_declaration=True,
                  encoding="utf-8",
                  pretty_print=True)

        del doc
        traceln(" %d clusters over %d pages" % (cntCluster, cntPage))

    traceln(" done   (%d files)" % len(lsFilename))
Beispiel #11
0
                      action="store_true",
                      default=False,
                      help="report baseline method")
    parser.add_option(
        "--line_see_line",
        dest='iLineVisibility',
        action="store",
        type=int,
        default=0,
        help="seeline2line: how far in pixel can a line see another cut line?")
    parser.add_option(
        "--block_see_line",
        dest='iBlockVisibility',
        action="store",
        type=int,
        default=273,
        help="seeblock2line: how far in pixel can a block see a cut line?")

    # ---
    #parse the command line
    (options, args) = parser.parse_args()

    # ---
    try:
        sModelDir, sModelName = args
    except Exception as e:
        traceln("Specify a model folder and a model name!")
        _exit(usage, 1, e)

    main(sModelDir, sModelName, options)
Beispiel #12
0
    try:
        sInputDir, sA, sB = args
    except ValueError:
        sys.stderr.write(sUsage)
        sys.exit(1)

    # ... checking folders
    if not os.path.normpath(sInputDir).endswith("col"):
        sInputDir = os.path.join(sInputDir, "col")

    if not os.path.isdir(sInputDir):
        sys.stderr.write("Not a directory: %s\n" % sInputDir)
        sys.exit(2)

    # ok, go!
    traceln("Input  is : ", os.path.abspath(sInputDir))
    traceln("algo A is : ", sA)
    traceln("algo B is : ", sB)
    if options.bShape or options.bConvexHull:
        traceln("Shape of intersections based on content!")
    else:
        traceln("Shape of intersections is the intersection of shapes!")

    main(sInputDir, sA, sB, options.bShape, options.bConvexHull,
         options.bVerbose)

    traceln("Input  was : ", os.path.abspath(sInputDir))
    traceln("algo A was : ", sA)
    traceln("algo B was : ", sB)
    if options.bShape or options.bConvexHull:
        trace("Shape of intersections based on content: ")
Beispiel #13
0
def eval_cluster_of_files(lsFilename
                , sClusterLevel  # either "row", "col", "cell", "region", "cluster"
                , bIgnoreHeader=False
                , bIgnoreOutOfTable=False
                , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                , xpSelector=".//pc:TextLine"
                , sAlgo=None
                , sGroupByAttr=""
                , sClusterGTAttr=None  # used when sClusterLevel=="cluster"
                ):
    """
    Evaluate the clusters found in the files against the groundtruth ones,
    page per page, at each of the given similarity thresholds.

    lsFilename        the XML files to evaluate
    sClusterLevel     "row", "col" or "cell": the groundtruth cluster of a
                          node is given by the attribute(s) of its parent
                      "region": it is given by the id of its parent
                      "cluster" or "cluster_lvl...": it is given by the
                          attribute sClusterGTAttr of the node
    bIgnoreHeader     ignore the nodes whose parent has "table-header" in its
                      'custom' attribute
    bIgnoreOutOfTable for "row", "col" and "cell", ignore the nodes whose
                      parent has no such attribute
    lfSimil           the similarity thresholds (this list is only read)
    xpSelector        xpath selecting the nodes in a page
    sAlgo             if not None, the predicted clusters are read from the
                      Cluster elements having this @algo value, instead of
                      from the DU_cluster attribute of the nodes
    sGroupByAttr      if set, the Cluster elements sharing the same value for
                      this attribute are merged
    sClusterGTAttr    see sClusterLevel

    Return nOk, nErr, nMiss, sRpt where the counts are those cumulated for
    the LAST threshold of lfSimil (0, 0, 0 if lfSimil is empty) and sRpt is
    the textual report.
    """
    bTable = sClusterLevel in ["row", "col", "cell"]
    # sClusterLevel can be "cluster" or "cluster_lvl1", "cluster_lvl2", ...
    if not bTable: assert sClusterLevel == 'region' or sClusterLevel.startswith('cluster')

    dOkErrMiss = { fSimil:(0,0,0) for fSimil in lfSimil }
    lsRpt = []
    for sFilename in lsFilename:
        doc = etree.parse(sFilename)
        rootNd = doc.getroot()
        for iPage, ndPage in enumerate(PageXml.xpath(rootNd, "//pc:Page")):
            lsRpt.append("PAGE %5d  OF FILE    %s" % (iPage+1, sFilename))
            # cluster  -> [node_id]
            dGT  = defaultdict(list)
            dRun = defaultdict(list)
            for nd in PageXml.xpath(ndPage, xpSelector):
                ndparent = nd.getparent()
                if bIgnoreHeader:
                    sCustom = ndparent.get("custom")
                    if sCustom and "table-header" in sCustom: continue

                ndid    = nd.get("id")
                val_run = nd.get("DU_cluster")  # ok in most cases
                if bTable:
                    if sClusterLevel == "cell":
                        val_gt = "%s__%s" % (ndparent.get("row"), ndparent.get("col"))
                        if val_gt == 'None__None' and bIgnoreOutOfTable: continue
                    else: # "col" or "row"
                        val_gt = ndparent.get(sClusterLevel)
                        if val_gt is None and bIgnoreOutOfTable: continue
                    # distinguish each table!
                    val_gt = "%s_%s" % (val_gt, ndparent.getparent().get("id"))
                elif sClusterLevel == 'region':
                    val_gt = ndparent.get("id")
                elif sClusterLevel == 'cluster':
                    val_gt = nd.get(sClusterGTAttr)
                elif sClusterLevel.startswith('cluster_lvl'):
                    val_gt = nd.get(sClusterGTAttr)
                    val_run = nd.get("DU_"+sClusterLevel)
                else:
                    raise Exception("Unknown clustering level: %s"%sClusterLevel)

                dGT[val_gt].append(ndid)
                dRun[val_run].append(ndid)

            if sAlgo is not None:
                # the predicted clusters are those stored by the given algo
                dRun = defaultdict(list)
                lNdCluster = PageXml.xpath(ndPage, ".//pc:Cluster[@algo='%s']"%sAlgo)
                traceln("Loaded %d cluster @algo='%s'"%(len(lNdCluster), sAlgo))
                for iCluster, ndCluster in enumerate(lNdCluster):
                    # a Cluster without @content is dealt with as an empty one
                    lndid = (ndCluster.get("content") or "").split()
                    if lndid:
                        if sGroupByAttr:
                            # we group them by the value of an attribute
                            dRun[ndCluster.get(sGroupByAttr)].extend(lndid)
                        else:
                            dRun[str(iCluster)] = lndid

            for fSimil in lfSimil:
                _nOk, _nErr, _nMiss = evalPartitions(
                      list(dRun.values())
                    , list(dGT.values())
                    , fSimil
                    , jaccard_distance)

                _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

                lsRpt.append("@simil %.2f   P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" %(
                      fSimil
                    , _fP, _fR, _fF
                    , _nOk, _nErr, _nMiss
                    ))

                # global count
                nOkSoFar, nErrSoFar, nMissSoFar = dOkErrMiss[fSimil]
                dOkErrMiss[fSimil] = (nOkSoFar + _nOk, nErrSoFar + _nErr, nMissSoFar + _nMiss)

    # make sure the returned counts are defined, even without any threshold
    nOk, nErr, nMiss = 0, 0, 0
    lSummary = []
    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        sLine = "ALL_%s  @simil %.2f   P %5.2f  R %5.2f  F1 %5.2f " % (
                     "TABLES" if bTable else sClusterLevel, fSimil, fP, fR, fF ) \
                + "        "                                                                    \
                +"ok=%d  err=%d  miss=%d" %(nOk, nErr, nMiss)
        lSummary.append(sLine)

    # the summary is given both before and after the per-page details
    sRpt = "\n".join(lSummary) + "\n\n" + "\n".join(lsRpt) + "\n\n" + "\n".join(lSummary)

    return nOk, nErr, nMiss, sRpt
Beispiel #14
0
def eval_direct(lCriteria, lsDocDir
                , bIgnoreHeader=False
                , bIgnoreOutOfTable=True
                , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                , xpSelector=".//pc:TextLine"):
    """
    Score predicted partitions against ground-truth partitions, both read
    directly from XML attributes.

    For every criterion of lCriteria (typically "row" and/or "col"):
      - the ground-truth value is the attribute of that name on the PARENT of
        each selected node,
      - the predicted value is the "DU_<criterion>" attribute of the node.
    The values of all criteria are concatenated per node id, and the nodes
    sharing the same concatenated key form one cluster.

    lCriteria         : attribute names, one per folder of lsDocDir
    lsDocDir          : non-empty list of folders
                        (NOTE: listParallelFiles is assumed to return one list
                        of file names per folder, aligned across folders)
    bIgnoreHeader     : skip nodes whose parent has "table-header" in its
                        "custom" attribute
    bIgnoreOutOfTable : skip nodes whose parent lacks the criterion attribute;
                        otherwise their ground-truth value is "-1"
    lfSimil           : similarity thresholds passed to evalPartitions
    xpSelector        : XPath selecting the nodes to cluster

    Traces per-file and global P/R/F1.
    Return (nOk, nErr, nMiss) accumulated for the LAST threshold of lfSimil.
    """
    assert lsDocDir

    llsFile = listParallelFiles(lsDocDir)
    traceln("-loaded %d files for each criteria"%len(llsFile[0]))

    # similarity threshold -> running (ok, err, miss) over all files
    dOkErrMiss = dict((fSimil, (0, 0, 0)) for fSimil in lfSimil)

    def _groupIdsByKey(dIdKey):
        # invert   node_id -> key   into   key -> [node_id, ...]
        dKeyIds = defaultdict(list)
        for sId, sKey in dIdKey.items():
            dKeyIds[sKey].append(sId)
        return dKeyIds

    for lsCritFile in zip(*llsFile):
        assert len(lCriteria) == len(lsCritFile)

        # node_id -> concatenation of the values of all criteria
        dIdKey_Run = defaultdict(str)
        dIdKey_GT  = defaultdict(str)
        for crit, sFilename, sDir in zip(lCriteria, lsCritFile, lsDocDir):
            rootNd = etree.parse(os.path.join(sDir, sFilename)).getroot()
            assert len(PageXml.xpath(rootNd, "//pc:Page")) == 1, "NOT YET IMPLEMENTED: eval on multi-page files"

            for nd in PageXml.xpath(rootNd, xpSelector):
                ndParent = nd.getparent()

                val_gt = ndParent.get(crit)
                if val_gt is None:
                    if bIgnoreOutOfTable:
                        continue
                    val_gt = "-1"

                sCustom = ndParent.get("custom")
                if bIgnoreHeader and sCustom and "table-header" in sCustom:
                    continue

                assert ndParent.tag.endswith("TableCell"), "expected TableCell got %s" % ndParent.tag

                ndid = nd.get("id")
                dIdKey_Run[ndid] += "_%s_" % nd.get("DU_"+crit)
                dIdKey_GT[ndid]  += "_%s_" % val_gt

        dKeyIds_Run = _groupIdsByKey(dIdKey_Run)
        dKeyIds_GT  = _groupIdsByKey(dIdKey_GT)

        for fSimil in lfSimil:
            _nOk, _nErr, _nMiss = evalPartitions(list(dKeyIds_Run.values()),
                                                 list(dKeyIds_GT.values()),
                                                 fSimil,
                                                 jaccard_distance)
            _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

            sScores = "simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d  %s" % (
                fSimil, _fP, _fR, _fF, _nOk, _nErr, _nMiss, os.path.basename(sFilename))
            traceln(sScores)
            # the file name is shown on the first line only
            sFilename = ""

            nOkSoFar, nErrSoFar, nMissSoFar = dOkErrMiss[fSimil]
            dOkErrMiss[fSimil] = (nOkSoFar + _nOk, nErrSoFar + _nErr, nMissSoFar + _nMiss)
        traceln()

    # global report, one line per threshold
    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        traceln("ALL simil:%.2f  P %5.2f  R %5.2f  F1 %5.2f " % (fSimil, fP, fR, fF )
                , "        "
                ,"ok=%d  err=%d  miss=%d" %(nOk, nErr, nMiss))

    return (nOk, nErr, nMiss)
Beispiel #15
0
def eval_oracle(lsRunDir, sClusterLevel
                , bIgnoreHeader=True
                , bIgnoreOutOfTable=True
                , lfSimil=[i / 100.0 for i in [66, 80, 100]]
                , xpSelector=".//pc:TextLine"):
    """
    Evaluate the cluster quality from run folders.

    Each '.pxml' file of each folder is loaded as graphs with the configured
    graph class; for each page, edges are labelled by labelEdges, clusters are
    formed from those labels and written into the document. The "DU_cluster"
    attribute of each selected node is then compared with the ground truth
    read on the node's parent ("row", "col", or both for "cell").
    (NOTE: presumably labelEdges derives the labels from the ground truth,
    hence "oracle" - confirm against labelEdges.)

    lsRunDir          : non-empty list of run folders
    sClusterLevel     : "cell", "col" or "row"; anything else raises Exception
    bIgnoreHeader     : skip nodes whose parent has "table-header" in its
                        "custom" attribute
    bIgnoreOutOfTable : skip nodes whose parent has no row/col attribute;
                        when False, those nodes get a ground-truth key built
                        from the string "None"
    lfSimil           : similarity thresholds passed to evalPartitions
    xpSelector        : XPath, relative to a page, selecting the nodes

    Side effects: traces per-page and global P/R/F1, and writes each document
    back to "<file>.oracle" in its run folder.

    Return (nOk, nErr, nMiss) accumulated for the LAST threshold of lfSimil,
    or (0, 0, 0) if lfSimil is empty.
    """
    assert lsRunDir
    # similarity threshold -> running (ok, err, miss) over all pages
    dOkErrMiss = { fSimil:(0,0,0) for fSimil in lfSimil }

    DU_GraphClass = getConfiguredGraphClass()

    for sRunDir in lsRunDir:
        lsFile = listFiles(sRunDir,ext='.pxml')
        traceln("-loaded %d files from %s" % (len(lsFile), sRunDir))

        for sFilename in lsFile:
            sPath = os.path.join(sRunDir, sFilename)
            lg = DU_GraphClass.loadGraphs(DU_GraphClass, [sPath], bDetach=False, bLabelled=False, iVerbose=1)

            g = lg[0]
            rootNd = g.doc.getroot()
            for iPage, ndPage in enumerate(PageXml.xpath(rootNd, "//pc:Page")):
                traceln("PAGE %5d  OF FILE    %s" % (iPage+1, sFilename))

                # pages without a graph are not evaluated
                try:
                    g = lg[iPage]
                except IndexError:
                    continue
                Y = labelEdges(g,sClusterLevel)
                g.form_cluster(Y)
                g.addEdgeToDoc()

                # cluster -> [node_id]
                # Reset for EACH page: the evaluation below is per page, so
                # keeping the clusters of previous pages would score them
                # again and count them several times in the global totals.
                dGT  = defaultdict(list)
                dRun = defaultdict(list)

                for nd in PageXml.xpath(ndPage, xpSelector):
                    ndparent = nd.getparent()
                    sCustom  = ndparent.get("custom")
                    if bIgnoreHeader and sCustom and "table-header" in sCustom:
                        continue

                    if sClusterLevel == "cell":
                        val_gt = "%s__%s" % (ndparent.get("row"), ndparent.get("col"))
                        bOutOfTable = (val_gt == 'None__None')
                    elif sClusterLevel == "col":
                        val_gt = ndparent.get("col")
                        bOutOfTable = val_gt is None
                    elif sClusterLevel == "row":
                        val_gt = ndparent.get("row")
                        bOutOfTable = val_gt is None
                    else:
                        raise Exception("Unknown clustering level: %s"%sClusterLevel)
                    if bOutOfTable and bIgnoreOutOfTable:
                        continue

                    # distinguish each table!
                    # %-formatting rather than '+', so that a missing row/col
                    # (kept when bIgnoreOutOfTable is False) or a missing id
                    # does not raise a TypeError
                    val_gt = "%s_%s" % (val_gt, ndparent.getparent().get("id"))

                    ndid = nd.get("id")
                    dGT[val_gt].append(ndid)
                    dRun[nd.get("DU_cluster")].append(ndid)

                for fSimil in lfSimil:
                    _nOk, _nErr, _nMiss = evalPartitions(
                          list(dRun.values())
                        , list(dGT.values())
                        , fSimil
                        , jaccard_distance)

                    _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)

                    traceln("@simil %.2f   P %5.2f  R %5.2f  F1 %5.2f   ok=%6d  err=%6d  miss=%6d" %(
                          fSimil
                        , _fP, _fR, _fF
                        , _nOk, _nErr, _nMiss
                        ))

                    # global count
                    nOk, nErr, nMiss = dOkErrMiss[fSimil]
                    dOkErrMiss[fSimil] = (nOk + _nOk, nErr + _nErr, nMiss + _nMiss)

            traceln()
            g.doc.write(sPath+'.oracle')

    # defined even when lfSimil is empty
    nOk = nErr = nMiss = 0
    for fSimil in lfSimil:
        nOk, nErr, nMiss = dOkErrMiss[fSimil]
        fP, fR, fF = computePRF(nOk, nErr, nMiss)
        traceln("ALL_TABLES  @simil %.2f   P %5.2f  R %5.2f  F1 %5.2f " % (fSimil, fP, fR, fF )
                , "        "
                ,"ok=%d  err=%d  miss=%d" %(nOk, nErr, nMiss))

    return (nOk, nErr, nMiss)
Beispiel #16
0
                      , type=float
                      , help="eval_col, eval_cell : Apply this ratio to the bounding box. This is normally useless as the baseline becomes a point (the centroid)"
                      , default=CutAnnotator.fRATIO) 

    
    # --- 
    #parse the command line
    (options, args) = parser.parse_args()
    
    options.bCutAbove = not(options.bCutBelow)
    
    #load mpxml 
    try:
        op = args[0]
    except:
        traceln(usage)
        sys.exit(1)
    
    traceln("--- %s ---"%op)
    if op in ["eval", "eval_row", "eval_col", "eval_cell"]:
        if op == "eval": op = "eval_row"
        traceln("DEPRECATED: now use ", op[0:4] + "_cut" + op[4:])
        exit(1)
    # --------------------------------------
    if op == "cut":
        sFilename = args[1]
        sOutFilename = args[2]
        traceln("- cutting : %s  --> %s" % (sFilename, sOutFilename))
        lDegAngle = [float(s) for s in options.lsAngle.split(",")]
        traceln("- Allowed angles (°): %s" % lDegAngle)
        op_cut(sFilename, sOutFilename, lDegAngle, options.bCutAbove, fCutHeight=options.fCutHeight)