Code example #1
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = ""
            
            #now we need to infer the bounding box of that object
            (x1, y1), (x2, y2) = PageXml.getPointList(ndBlock)  #the polygon
            
            orientation = 0 
            classIndex = 0   #is computed later on

            #and create a Block
            # we pass the coordinates, not x1,y1,w,h !!
            cutBlk = Block(page, ((x1, y1), (x2, y2)), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            # Create the shapely shape
            cutBlk.shape = geom.LineString([(x1, y1), (x2, y2)])
            cutBlk.angle = float(ndBlock.get("DU_angle"))
            cutBlk.angle_freq       = float(ndBlock.get("DU_angle_freq"))
            cutBlk.angle_cumul_freq = float(ndBlock.get("DU_angle_cumul_freq"))
            cutBlk.set_support      = literal_eval(ndBlock.get("DU_set_support"))
            
            yield cutBlk
            
        return        
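For reference, a minimal sketch of what PageXml.getPointList presumably does for a two-point separator: PAGE XML stores coordinates as a space-separated list of "x,y" pairs, which can be parsed and handed to shapely directly. parse_points and the sample string are hypothetical stand-ins, not the project's helper.

from shapely import geometry as geom

def parse_points(s_points):
    """Parse a PAGE-style points string 'x1,y1 x2,y2 ...' into (x, y) tuples."""
    return [tuple(float(v) for v in pair.split(",")) for pair in s_points.split()]

pts = parse_points("100,200 900,210")
cut = geom.LineString(pts)
print(cut.length, cut.centroid.x, cut.centroid.y)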
Code example #2
    def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes
        """    
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)

        #order blocks from top to bottom of page
        lOrderedNdBlock = list()
        for ndBlock in lNdBlock:
            
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                raise ValueError("Node %x has invalid coordinates" % str(ndBlock))
            
            plg = Polygon(lXY)
            _, (xg, yg) = plg.getArea_and_CenterOfMass()
            
            lOrderedNdBlock.append( (yg, ndBlock))  #we want to order from top to bottom, so that TextRegions of different resolution are not interleaved
            
        lOrderedNdBlock.sort(key=lambda o: o[0])  # sort on y only: DOM nodes are not comparable
        
        for _, ndBlock in lOrderedNdBlock: yield ndBlock
            
        return        
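The same top-to-bottom ordering can be sketched with shapely's centroid in place of the project's Polygon.getArea_and_CenterOfMass(); the region names and coordinates below are made up for illustration.

from shapely.geometry import Polygon as ShapelyPolygon

regions = {
    "r1": [(0, 500), (100, 500), (100, 600), (0, 600)],   # lower on the page
    "r2": [(0, 10), (100, 10), (100, 90), (0, 90)],        # near the top
}
ordered = sorted(regions.items(), key=lambda kv: ShapelyPolygon(kv[1]).centroid.y)
print([name for name, _ in ordered])   # ['r2', 'r1'] -> top of page first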
Code example #3
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText is None:
                sText = ""
                traceln("Warning: no text in node %s"%domid) 
                #raise ValueError, "No text in node: %s"%ndBlock 
            
            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue
            
            plg = Polygon(lXY)
            try:
                x1,y1, x2,y2 = plg.fitRectangle()
            except ZeroDivisionError:
#                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
#                 continue
#             if True:
#                 #we reduce a bit this rectangle, to avoid overlap
#                 w,h = x2-x1, y2-y1
#                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/3rd of the width will remain!
#                 dy = max(h * 0.066, min(20, w/3))
#                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1,y1,x2,y2 = plg.getBoundingBox()
            except ValueError:
                x1,y1,x2,y2 = plg.getBoundingBox()
                
                
            #we reduce this rectangle a bit, to avoid overlaps
            if self.BBoxDeltaFun is not None:
                w,h = x2-x1, y2-y1
                dx = self.BBoxDeltaFun(w)
                dy = self.BBoxDeltaFun(h)
                x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]
                
            
            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0   #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            yield blk
            
        return  # raising StopIteration inside a generator is a RuntimeError since Python 3.7 (PEP 479)
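The bounding-box shrinking step above is easy to isolate. A small sketch, with the "historical" delta heuristic quoted in the commented-out code passed in as the delta function; the numbers are illustrative.

def shrink_bbox(x1, y1, x2, y2, delta_fun):
    """Shave delta_fun(width) / delta_fun(height) off each side of the box."""
    w, h = x2 - x1, y2 - y1
    dx, dy = delta_fun(w), delta_fun(h)
    return tuple(int(round(v)) for v in (x1 + dx, y1 + dy, x2 - dx, y2 - dy))

# the "historical" heuristic mentioned in the comments above
delta = lambda v: max(v * 0.066, min(20, v / 3))
print(shrink_bbox(0, 0, 300, 60, delta))   # (20, 20, 280, 40)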
Code example #4
def convertTR2Sep(filename):
    """
    """
    print (filename)
    tagname='TextRegion'
    xml = etree.parse(filename)
    ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"})
    
    for x in ltextsep:
        if "separator" in x.get('custom'):
            x.tag = '{%s}SeparatorRegion' % PageXml.NS_PAGE_XML  # rename, keeping the PAGE namespace
            
            #now we need to convert that object to a line
            lXY = PageXml.getPointList(x)  #the polygon
            assert lXY, "Separator without Coord??"
            
            plg = Polygon(lXY)
            try:
                x1,y1, x2,y2 = plg.fitRectangle()
            except ValueError:
                print("Warning: Coords might be bad, taking bounding box: ", lXY)
                x1,y1,x2,y2 = plg.getBoundingBox()
#             try:
#                 x1,y1, x2,y2 = plg.fitRectangle()
#             except ZeroDivisionError:
#                 x1,y1,x2,y2 = plg.getBoundingBox()
#             except ValueError:
#                 x1,y1,x2,y2 = plg.getBoundingBox()            
            if abs(x2-x1) > abs(y2-y1): # horizontal
                y1 = (y1+y2)/2
                y2 = y1
            else: # vertical
                x1 = (x1+x2)/2
                x2 = x1
            
            ndCoord = x.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
            PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])
                
    return xml
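A hedged sketch of the same pattern with plain lxml: select namespaced elements by attribute, check their custom annotation, and rename them while keeping the PAGE namespace. The input and output filenames are hypothetical.

from lxml import etree

NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"

xml = etree.parse("page.xml")                       # hypothetical input file
for nd in xml.getroot().findall(".//pc:TextRegion[@custom]", {"pc": NS}):
    if "separator" in nd.get("custom", ""):
        nd.tag = "{%s}SeparatorRegion" % NS         # rename, keeping the PAGE namespace
xml.write("page_sep.xml")                           # hypothetical output file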
Code example #5
def get_col_partition(doer,
                      sxpCut,
                      dNS,
                      sFilename,
                      lFilterFun,
                      fRatio,
                      bVerbose=False,
                      funIndex=lambda x: x._du_index):
    """
    return the GT partition in columns, as well as 1 partition per filter function
    """
    global global_maxHeaderRowSpan

    if bVerbose: traceln("- loading %s" % sFilename)
    parser = etree.XMLParser()
    doc = etree.parse(sFilename, parser)
    root = doc.getroot()

    llsetRun = []

    pnum = 0
    lndPage = MultiPageXml.getChildByName(root, 'Page')
    assert len(lndPage) == 1, "NOT SUPPORTED: file has many pages - sorry"
    for ndPage in lndPage:
        pnum += 1
        if bVerbose: traceln("   - page %s - loading table GT" % pnum)

        loBaseline, dsetTableByCol, dsetTableDataByCol, global_maxHeaderRowSpan = doer.loadPageCol(
            ndPage, fRatio, funIndex=funIndex)

        if bVerbose:
            traceln("   - found %d objects on page" % (len(loBaseline)))

        # make a dictionary of cumulative sets, and the set of all objects
        lTableColK = sorted(dsetTableByCol.keys())
        lTableDataColK = sorted(dsetTableDataByCol.keys())
        if bVerbose:
            traceln("   - found %d cols" % (len(lTableColK)))
            traceln("   - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByCol.values())))
            traceln("   - found %d objects in the table data" %
                    (sum(len(v) for v in dsetTableDataByCol.values())))
        lNdCut = ndPage.xpath(sxpCut, namespaces=dNS)
        if bVerbose:
            traceln("   - found %d cuts" % (len(lNdCut)))
        else:
            traceln(
                "- loaded %40s " % sFilename,
                " %6d cols %6d 'S' cuts" % (len(lTableColK), len(lNdCut)),
                " %6d objects %6d table objects" %
                (len(loBaseline), sum(len(v)
                                      for v in dsetTableByCol.values())))
        loCut = []
        for ndCut in lNdCut:
            #now we need to infer the bounding box of that object
            (x1, y1), (x2, y2) = PageXml.getPointList(ndCut)  #the polygon
            # Create the shapely shape
            loCut.append(geom.LineString([(x1, y1), (x2, y2)]))

        w, h = float(ndPage.get("imageWidth")), float(
            ndPage.get("imageHeight"))
        #             # Add a fictive cut at top of page
        #             loCut.append(geom.LineString([(0, 0), (w, 0)]))
        #             # Add a fictive cut at end of page
        #             loCut.append(geom.LineString([(0, h), (w, h)]))

        # order it by line centroid x
        loCut.sort(key=lambda o: o.centroid.x)

        # dcumset is the GT!!
        lsetGT = [dsetTableByCol[k]
                  for k in lTableColK]  # list of set of du_index
        lsetDataGT = [dsetTableDataByCol[k] for k in lTableDataColK]

        # NOW, look at predictions
        for filterFun in lFilterFun:
            loBaselineInTable = [o for o in loBaseline if filterFun(o._du_nd)]
            if bVerbose:
                traceln("   - %d objects on page predicted in table (%d out)" %
                        (len(loBaselineInTable),
                         len(loBaseline) - len(loBaselineInTable)))

            # Now create the list of partitions created by the Cuts
            lsetRun = []
            partition = PolygonPartition(loBaselineInTable)
            if True:  # or bCutOnLeft:
                #cut if above the text that led to its creation
                setAllPrevIds = set(
                    [])  # cumulative set of what was already taken
                for oCut in loCut:
                    lo = partition.getObjectOnRightOfLine(oCut)
                    setIds = set(funIndex(o) for o in lo)
                    #print(oCut.centroid.x, setIds)
                    if setAllPrevIds:
                        prevColIds = setAllPrevIds.difference(
                            setIds)  # content of previous row
                        if prevColIds:
                            #an empty set is denoting alternative cuts leading to same partition
                            lsetRun.append(prevColIds)
                    setAllPrevIds = setIds
            else:
                assert False, "look at this code..."


#                     #cut if below the text that led to its creation
#                     cumSetIds = set([]) # cumulative set
#                     for oCut in loCut:
#                         lo = partition.getObjectAboveLine(oCut)
#                         setIds = set(o._du_index for o in lo)
#                         rowIds = setIds.difference(cumSetIds) # only last row!
#                         if rowIds:
#                             #an empty set is denoting alternative cuts leading to same partition
#                             lsetRun.append(rowIds)
#                         cumSetIds = setIds
#             _debugPartition("run", lsetRun)
#             _debugPartition("ref", lsetGT)
            llsetRun.append(lsetRun)
    return lsetGT, lsetDataGT, llsetRun
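The column-partitioning loop above boils down to set differences between consecutive "objects right of the cut" sets. A pure-Python sketch of just that step, with made-up object ids.

def partition_from_cuts(sets_right_of_each_cut):
    """Each consecutive set difference is the content of one column."""
    parts, prev = [], set()
    for cur in sets_right_of_each_cut:
        if prev:
            col = prev - cur            # what the last cut "closed off"
            if col:
                parts.append(col)
        prev = cur
    return parts

print(partition_from_cuts([{1, 2, 3, 4}, {3, 4}, set()]))   # [{1, 2}, {3, 4}]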
Code example #6
    def resizeCell(self, cell, ns):
        """
            replace the cell region by a BB for textlines: better for transcriber
        """
        xpath = "./a:%s" % ("TextLine")
        lTextLines = cell.xpath(xpath, namespaces={'a': PageXml.NS_PAGE_XML})
        if lTextLines == []:
            return True

        ## get minx,maxy  for the cell
        lXY = PageXml.getPointList(cell)  #the polygon
        plg = Polygon(lXY)
        x1, y1, x2, y2 = plg.fitRectangle()    # note: result overridden just below
        x1, y1, x2, y2 = plg.getBoundingBox()  # the cell bounding box is what is kept
        cellX1 = x1
        cellX2 = x2
        cellY1 = y1
        cellY2 = y2

        ## get all the textlines of the cell
        minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
        for line in lTextLines:
            lXY = PageXml.getPointList(line)  #the polygon
            # in case empty cell
            if lXY == []:
                continue
            plg = Polygon(lXY)
            try:
                # fitRectangle() only serves as a sanity check here: a degenerate
                # polygon raises ZeroDivisionError and the line is skipped
                x1, y1, x2, y2 = plg.fitRectangle()
            except ZeroDivisionError:
                continue
            x1, y1, x2, y2 = plg.getBoundingBox()
            minx = min(minx, x1)
            miny = min(miny, y1)
            maxx = max(maxx, x2)
            maxy = max(maxy, y2)
        """
            finally: simply use the BB of the textlines + padding
        """

        #         # new request: height= max(cell,text)
        #         ## new new request: (12/10/2017): min (cell, text)!
        #         HCell = cellY2 - cellY1
        #         HBBText = maxy - miny
        #
        miny -= self.vpadding  # vertical padding (top)
        maxy += self.vpadding  # vertical padding (bottom)
        #
        #         # Height computation
        #         ## if HBBText <= self.HeightTH * HCell: take HBBText as height for TextREgion
        #         if HBBText > self.HeightTH * HCell:
        #             miny = max(miny,cellY1)
        #             maxy = min(maxy,cellY2)
        #         # else : don't touch miny, maxy  : use only Text for computing Height
        #
        #         # Width computation
        minx -= self.hpadding  # horizontal padding
        maxx += self.hpadding  # horizontal padding

        #         minx = min(cellX1,minx)
        #         maxx = max(cellX2, maxx)
        #         print cellX2, maxx

        corner = cell[0]   # first child of the cell: its Coords node
        #         print minx,miny,maxx,miny,maxx,maxy,minx,maxy
        corner.set(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
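A minimal sketch of the resizing idea: union bounding box of the text-line boxes, plus padding, emitted as a PAGE-style points string. The boxes and padding values are illustrative.

def padded_bbox_points(boxes, hpad, vpad):
    """Union bounding box of (x1, y1, x2, y2) boxes, padded, as a points string."""
    minx = min(b[0] for b in boxes) - hpad
    miny = min(b[1] for b in boxes) - vpad
    maxx = max(b[2] for b in boxes) + hpad
    maxy = max(b[3] for b in boxes) + vpad
    return "%d,%d %d,%d %d,%d %d,%d" % (minx, miny, maxx, miny, maxx, maxy, minx, maxy)

print(padded_bbox_points([(10, 20, 200, 40), (12, 50, 180, 80)], hpad=5, vpad=10))
# 5,10 205,10 205,90 5,90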
Code example #7
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(
            self.sxpNode, namespaces=self.dNS)  #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText is None:
                sText = ""
                NodeType_PageXml.nbNoTextWarning += 1
                if NodeType_PageXml.nbNoTextWarning < 33:
                    traceln("Warning: no text in node %s" % domid)
                elif NodeType_PageXml.nbNoTextWarning == 33:
                    traceln(
                        "Warning: no text in node %s  - *** %d repetition : I STOP WARNING ***"
                        % (domid, NodeType_PageXml.nbNoTextWarning))
                #raise ValueError, "No text in node: %s"%ndBlock

            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue

            plg = Polygon(lXY)
            try:
                x1, y1, x2, y2 = plg.fitRectangle(
                    bPreserveWidth=self.bPreserveWidth)
            except ZeroDivisionError:
                #                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
                #                 continue
                #             if True:
                #                 #we reduce a bit this rectangle, to avoid overlap
                #                 w,h = x2-x1, y2-y1
                #                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/3rd of the width will remain!
                #                 dy = max(h * 0.066, min(20, w/3))
                #                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1, y1, x2, y2 = plg.getBoundingBox()
            except ValueError:
                x1, y1, x2, y2 = plg.getBoundingBox()

            #we reduce this rectangle a bit, to avoid overlaps
            if not (self.BBoxDeltaFun is None):
                if type(self.BBoxDeltaFun) is tuple and len(
                        self.BBoxDeltaFun) == 2:
                    xFun, yFun = self.BBoxDeltaFun
                    if xFun is not None:
                        dx = xFun(x2 - x1)
                        x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
                    if yFun is not None:
                        dy = yFun(y2 - y1)
                        y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
                else:
                    # historical code
                    w, h = x2 - x1, y2 - y1
                    dx = self.BBoxDeltaFun(w)
                    dy = self.BBoxDeltaFun(h)
                    x1, y1, x2, y2 = [
                        int(round(v))
                        for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy]
                    ]

            # store the rectangle"
            ndBlock.set(
                "DU_points", " ".join([
                    "%d,%d" % (int(x), int(y))
                    for x, y in [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
                ]))

            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0  #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2 - x1, y2 - y1),
                        sText,
                        orientation,
                        classIndex,
                        self,
                        ndBlock,
                        domid=domid)

            yield blk

        return
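The tuple form of BBoxDeltaFun handled above (independent shrink functions for width and height, either possibly None) can be sketched standalone; the values are illustrative.

def shrink_bbox_xy(x1, y1, x2, y2, x_fun=None, y_fun=None):
    """Shrink a box with independent delta functions for width and height."""
    if x_fun is not None:
        dx = x_fun(x2 - x1)
        x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
    if y_fun is not None:
        dy = y_fun(y2 - y1)
        y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
    return x1, y1, x2, y2

print(shrink_bbox_xy(0, 0, 400, 100, x_fun=lambda w: w * 0.05))   # (20, 0, 380, 100)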
Code example #8
    def mergeBaselineCells(self, coldir, colid, docid):
        """
        
            Take a file (pxml) with stuff processed on Transkribus
            Tale the CVL template tool xml (xml)
            
            merge them
            
            regenerate a mpxml
             
        """

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        #         print (xmlpath)

        mpxml = xmlpath + ".mpxml"
        mpxmldoc = etree.parse(mpxml)

        lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

        assert len(lXMLPage) == len(lPXMLPage)
        for i, cvlpage in enumerate(lXMLPage):
            ## remove TextRegion from xcvlpage
            lTextRegions = PageXml.getChildByName(cvlpage, 'TextRegion')
            for tr in lTextRegions:
                tr.getparent().remove(tr)

            pxmlpage = lPXMLPage[i]
            lTL = []
            lTextRegions = PageXml.getChildByName(pxmlpage, 'TextRegion')
            for x in lTextRegions:
                lTL.extend(PageXml.getChildByName(x, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                raise "NO TABLE"
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')

            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            for j, tl in enumerate(lT):
                ## normalization
                lCoordsPoints = PageXml.getChildByName(lTL[j], 'Coords')
                lCoordsB = PageXml.getChildByName(lTL[j], 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]
                iHeight = 30  # in pixel
                x1, y1, x2, y2 = Polygon(
                    PageXml.getPointList(coordB)).getBoundingBox()
                if coord is not None:
                    coord.set(
                        'points', "%d,%d %d,%d %d,%d %d,%d" %
                        (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
                    tl = Polygon(PageXml.getPointList(coordB))
                lOverlap = []
                for c in lC:
                    #                     print (lCells[j].get('row'),lCells[j].get('col'),  self.signedOverlap(c,tl),tl.getBoundingBox(),c.getBoundingBox())
                    lOverlap.append(self.signedOverlap(c, tl))  #.getBoundingBox()))
                ## region of the same size as the textline
#                 print (j,max(lOverlap),lOverlap.index(max(lOverlap)))
                if max(lOverlap) == 0:
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(lTL[j])
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(lTL[j])
#                     print (cell.get('row'),cell.get('col'),''.join(lTL[j].itertext()))

        pxmldoc.write(mpxml)
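A hedged sketch of the assignment step using shapely instead of the project's signedOverlap: each text line is attached to the cell whose polygon it overlaps the most, or to a fresh region if it overlaps none. The geometry is made up.

from shapely.geometry import Polygon as ShapelyPolygon

cells = [ShapelyPolygon([(0, 0), (100, 0), (100, 50), (0, 50)]),
         ShapelyPolygon([(0, 50), (100, 50), (100, 100), (0, 100)])]
line = ShapelyPolygon([(10, 55), (90, 55), (90, 70), (10, 70)])

overlaps = [line.intersection(c).area for c in cells]
best = overlaps.index(max(overlaps)) if max(overlaps) > 0 else None
print(best)   # 1 -> the line is attached to the second cell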
Code example #9
def op_eval_old(lsFilename, fSimil, bDetail=False):
    """
    We load the XML
    - get the cut with @type="S"
    - get the text objects (geometry=Baseline)
    - 
    """
    nOk, nErr, nMiss = 0, 0, 0

    # OLD STYLE (May'18)
    sxpCut = './/pc:CutSeparator[@orient="0" and @type="S"]'  #how to find the cuts

    dNS = "./pc:TextEquiv"

    doer = SkewedCutAnnotator(True)

    traceln(" - Cut selector = ", sxpCut)

    def getPolylineAverageXY(ndPolyline):
        """
        COPIED FROM  tasks.DU_ABPTableCutAnnotator.BaselineCutAnnotator
        weighted average X and average Y of a polyline
        the weight indicate how long each segment at a given X, or Y, was.
        """
        sPoints = ndPolyline.get('points')
        lXY = Polygon.parsePoints(sPoints).lXY

        # list of X and Y values and respective weights
        lXYWxWy = [((x1+x2)/2.0, abs(y2-y1),    # for how long at this X?
                    (y1+y2)/2.0, abs(x2-x1)) \
                 for (x1,y1), (x2, y2) in zip(lXY, lXY[1:])]
        fWeightedSumX = sum(x * wx for x, wx, _, _ in lXYWxWy)
        fWeightedSumY = sum(y * wy for _, _, y, wy in lXYWxWy)
        fSumWeightX = sum(wx for _, wx, _, _ in lXYWxWy)
        fSumWeightY = sum(wy for _, _, _, wy in lXYWxWy)

        Xavg = int(round(fWeightedSumX /
                         fSumWeightX)) if fSumWeightX > 0 else 0
        Yavg = int(round(fWeightedSumY /
                         fSumWeightY)) if fSumWeightY > 0 else 0

        #         Xavg, Yavg = self.moduloSnap(Xavg, Yavg)

        return (Xavg, Yavg)

    def baseline_loader(nd):
        """
        load the baseline as done in DU_ABPTableCutAnnotator
        """
        x, y = getPolylineAverageXY(nd)
        # make a short horizontal line out of a point
        return geom.LineString([(x - 10, y), (x + 10, y)])

    # load objects: Baseline and Cuts
    for sFilename in lsFilename:
        traceln("- loading %s" % sFilename)
        parser = etree.XMLParser()
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        pnum = 0
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            pnum += 1
            traceln("   - page %s - loading table GT" % pnum)
            loBaseline, dsetTableByRow = doer.loadPage(
                ndPage, shaper_fun=baseline_loader)
            traceln("   - found %d objects on page" % (len(loBaseline)))
            # make a dictionary of cumulative sets, and the set of all objects
            lTableRowK = sorted(dsetTableByRow.keys(),
                                reverse=True)  # bottom to top
            traceln("   - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByRow.values())))

            lNdCut = ndPage.xpath(sxpCut,
                                  namespaces={"pc": PageXml.NS_PAGE_XML})
            traceln("   - found %d 'S' cut" % (len(lNdCut)))
            loCut = []
            for ndCut in lNdCut:
                #now we need to infer the bounding box of that object
                (x1, y1), (x2, y2) = PageXml.getPointList(ndCut)  #the polygon
                # make sure that the cut is above the baseline that created it
                y1 -= 1
                y2 -= 1
                assert y1 == y2  # in this version, the cuts were horizontal
                # Create the shapely shape
                loCut.append(geom.LineString([(x1, y1), (x2, y2)]))
            # order it by line centroid Y
            loCut.sort(key=lambda o: o.centroid.y,
                       reverse=True)  # from bottom to top

            # dcumset is the GT!!
            lsetGT = [dsetTableByRow[k]
                      for k in lTableRowK]  # list of set of du_index

            # Now create the list of partitions created by the Cuts, excluding the 'O'
            lsetRun = []
            partition = PolygonPartition(loBaseline)
            cumSetIds = set([])  # cumulative set
            for oCut in loCut:
                lo = partition.getObjectBelowLine(oCut)
                setIds = set(o._du_index for o in lo
                             if _isBaselineInTable(o._du_nd))
                rowIds = setIds.difference(cumSetIds)  # only last row!
                if rowIds:
                    #an empty set is denoting alternative cuts leading to same partition
                    lsetRun.append(rowIds)
                cumSetIds = setIds
#             _debugPartition("run", lsetRun)
#             _debugPartition("ref", lsetGT)
            _nOk, _nErr, _nMiss, _lFound, _lErr, _lMissed = evalPartitions(
                lsetRun, lsetGT, fSimil, jaccard_distance)
            nOk += _nOk
            nErr += _nErr
            nMiss += _nMiss
            if bDetail:
                _fP, _fR, _fF = computePRF(_nOk, _nErr, _nMiss)
                traceln(
                    "ok=%d  err=%d  miss=%d  P=%.1f  R=%.1f  F1=%.1f %s page=%d"
                    % (_nOk, _nErr, _nMiss, _fP, _fR, _fF, sFilename, pnum))

    fP, fR, fF = computePRF(nOk, nErr, nMiss)

    traceln("SUMMARY == P=%.1f%%\tR=%.1f%%\tF1=%.1f" % (fP, fR, fF))
    traceln("ok=%d  err=%d  miss=%d  P=%.1f  R=%.1f  F1=%.1f" %
            (nOk, nErr, nMiss, fP, fR, fF))
    return (nOk, nErr, nMiss)
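computePRF itself is not shown here; assuming it is the usual precision/recall/F1 in percent over the ok / error / miss counts, it would look roughly like this sketch (prf is a hypothetical stand-in).

def prf(n_ok, n_err, n_miss):
    """Precision / recall / F1 in percent from ok, error and miss counts."""
    p = 100.0 * n_ok / (n_ok + n_err) if (n_ok + n_err) else 0.0
    r = 100.0 * n_ok / (n_ok + n_miss) if (n_ok + n_miss) else 0.0
    f = 2 * p * r / (p + r) if (p + r) else 0.0
    return p, r, f

print("P=%.1f  R=%.1f  F1=%.1f" % prf(80, 10, 20))   # P=88.9  R=80.0  F1=84.2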
Code example #10
def get_row_partition(doer,
                      sxpCut,
                      dNS,
                      sFilename,
                      lFilterFun,
                      bCutAbove=True,
                      bVerbose=False,
                      funIndex=lambda x: x._du_index,
                      bIgnoreHeader=False):
    """
    return the GT partition in rows, as well as 1 partition per filter function
    """
    # load objects: Baseline and Cuts
    if bVerbose: traceln("- loading %s" % sFilename)
    parser = etree.XMLParser()
    doc = etree.parse(sFilename, parser)
    root = doc.getroot()

    llsetRun = []

    pnum = 0
    lndPage = MultiPageXml.getChildByName(root, 'Page')
    assert len(lndPage) == 1, "NOT SUPPORTED: file has many pages - sorry"
    for ndPage in lndPage:
        pnum += 1
        if bVerbose: traceln("   - page %s - loading table GT" % pnum)
        loBaseline, dsetTableByRow = doer.loadPage(ndPage,
                                                   funIndex=funIndex,
                                                   bIgnoreHeader=bIgnoreHeader)
        if bVerbose:
            traceln("   - found %d objects on page" % (len(loBaseline)))

        # make a dictionary of cumulative sets, and the set of all objects
        lTableRowK = sorted(dsetTableByRow.keys())
        if bVerbose:
            traceln("   - found %d rows" % (len(lTableRowK)))
            traceln("   - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByRow.values())))
        lNdCut = ndPage.xpath(sxpCut, namespaces=dNS)
        if bVerbose:
            traceln("   - found %d 'S' cut" % (len(lNdCut)))
        else:
            traceln(
                "- loaded %40s " % sFilename,
                " %6d rows %6d 'S' cuts" % (len(lTableRowK), len(lNdCut)),
                " %6d objects %6d table objects" %
                (len(loBaseline), sum(len(v)
                                      for v in dsetTableByRow.values())))
        loCut = []
        for ndCut in lNdCut:
            #now we need to infer the bounding box of that object
            (x1, y1), (x2, y2) = PageXml.getPointList(ndCut)  #the polygon
            # Create the shapely shape
            loCut.append(geom.LineString([(x1, y1), (x2, y2)]))

        w, h = float(ndPage.get("imageWidth")), float(
            ndPage.get("imageHeight"))
        # Add a fictive cut at top of page
        loCut.append(geom.LineString([(0, 0), (w, 0)]))
        # Add a fictive cut at end of page
        loCut.append(geom.LineString([(0, h), (w, h)]))

        # order it by line centroid Y
        loCut.sort(key=lambda o: o.centroid.y)

        # dcumset is the GT!!
        lsetGT = [dsetTableByRow[k]
                  for k in lTableRowK]  # list of set of du_index

        # NOW, look at predictions
        for filterFun in lFilterFun:

            loBaselineInTable = [o for o in loBaseline if filterFun(o._du_nd)]
            if bVerbose:
                traceln("   - %d objects on page predicted in table (%d out)" %
                        (len(loBaselineInTable),
                         len(loBaseline) - len(loBaselineInTable)))

            # Now create the list of partitions created by the Cuts
            lsetRun = []
            partition = PolygonPartition(loBaselineInTable)
            if bCutAbove:
                #cut if above the text that led to its creation
                setAllPrevIds = set(
                    [])  # cumulative set of what was already taken
                for oCut in loCut:
                    lo = partition.getObjectBelowLine(oCut)
                    setIds = set(funIndex(o) for o in lo)
                    if setAllPrevIds:
                        prevRowIds = setAllPrevIds.difference(
                            setIds)  # content of previous row
                        if prevRowIds:
                            #an empty set is denoting alternative cuts leading to same partition
                            lsetRun.append(prevRowIds)
                    setAllPrevIds = setIds
            else:
                #cut if below the text that led to its creation
                cumSetIds = set([])  # cumulative set
                for oCut in loCut:
                    lo = partition.getObjectAboveLine(oCut)
                    setIds = set(funIndex(o) for o in lo)
                    rowIds = setIds.difference(cumSetIds)  # only last row!
                    if rowIds:
                        #an empty set is denoting alternative cuts leading to same partition
                        lsetRun.append(rowIds)
                    cumSetIds = setIds
#             _debugPartition("run", lsetRun)
            llsetRun.append(lsetRun)
#             _debugPartition("ref", lsetGT)

    return lsetGT, llsetRun
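The bCutAbove=False branch is the mirror image of the sketch given after code example #5: objects above each cut form a growing set, and each increment over the previous cut is one row. A pure-Python sketch with made-up ids.

def rows_from_cuts_below(sets_above_each_cut):
    """Each increment over the previous cumulative set is one row."""
    rows, cum = [], set()
    for cur in sets_above_each_cut:
        new_row = cur - cum             # only the content added since the last cut
        if new_row:
            rows.append(new_row)
        cum = cur
    return rows

print(rows_from_cuts_below([set(), {1, 2}, {1, 2, 3, 4}]))   # [{1, 2}, {3, 4}]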