コード例 #1
0
    def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes
        """    
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)

        #order blocks from top to bottom of page
        lOrderedNdBlock = list()
        for ndBlock in lNdBlock:
            
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                raise ValueError("Node %x has invalid coordinates" % str(ndBlock))
            
            plg = Polygon(lXY)
            _, (xg, yg) = plg.getArea_and_CenterOfMass()
            
            lOrderedNdBlock.append( (yg, ndBlock))  #we want to order from top to bottom, so that TextRegions of different resolution are not interleaved
            
        lOrderedNdBlock.sort()
        
        for _, ndBlock in lOrderedNdBlock: yield ndBlock
            
        return        
コード例 #2
0
ファイル: ABP_LA.py プロジェクト: Transkribus/TranskribusDU
    def regularTextLines(self, doc):
        """
        From a baseline: create a regular TextLine, a rectangle along the baseline.

        For each TextLine found under the root element of doc, the 'points'
        property of the baseline node is parsed into integer (x, y) pairs and
        the 'points' property of the coordinates node is overwritten with the
        rectangle (x1, y1-iHeight) (x2, y1-iHeight) (x2, y2) (x1, y2), where
        x1, y1, x2, y2 is the bounding box of the baseline.

        doc: the document, modified in place. Nothing is returned.
        """
        iHeight = 50  # not points!!  300 dpi : 62.5

        lTextLines = PageXml.getChildByName(doc.getRootElement(), 'TextLine')
        for tl in lTextLines:
            # NOTE(review): this takes the first child as the coordinates node
            # and its next sibling as the baseline - confirm against the input
            coord = tl.children
            baseline = tl.children.next

            lXY = list()
            for sPair in baseline.prop('points').split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    # malformed pair: show the TextLine and ignore the pair.
                    # print() form is valid in both Python 2 and Python 3
                    print(tl)

            x1, y1, x2, y2 = Polygon(lXY).getBoundingBox()
            coord.setProp(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
コード例 #3
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """    
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS) #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText == None:
                sText = ""
                traceln("Warning: no text in node %s"%domid) 
                #raise ValueError, "No text in node: %s"%ndBlock 
            
            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue
            
            plg = Polygon(lXY)
            try:
                x1,y1, x2,y2 = plg.fitRectangle()
            except ZeroDivisionError:
#                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
#                 continue
#             if True:
#                 #we reduce a bit this rectangle, to ovoid overlap
#                 w,h = x2-x1, y2-y1
#                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/"rd of te width will remain!
#                 dy = max(h * 0.066, min(20, w/3))
#                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1,y1,x2,y2 = plg.getBoundingBox()
            except ValueError:
                x1,y1,x2,y2 = plg.getBoundingBox()
                
                
            #we reduce a bit this rectangle, to ovoid overlap
            if not(self.BBoxDeltaFun is None):
                w,h = x2-x1, y2-y1
                dx = self.BBoxDeltaFun(w)
                dy = self.BBoxDeltaFun(h)
                x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]
                
            
            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0   #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)
            
            yield blk
            
        raise StopIteration()        
コード例 #4
0
    def getPolylineAverageXY(cls, ndPolyline):
        """
        Weighted average X and weighted average Y of a polyline.

        Each segment contributes the X of its middle weighted by its vertical
        extent (for how long the polyline stays at this X), and the Y of its
        middle weighted by its horizontal extent.

        ndPolyline: node whose 'points' attribute is parsed by Polygon.parsePoints

        Return the tuple (Xavg, Yavg) of rounded integers. An average is 0
        when the sum of its weights is not positive.
        """
        lXY = Polygon.parsePoints(ndPolyline.get('points')).lXY

        fWeightedSumX, fSumWeightX = 0, 0
        fWeightedSumY, fSumWeightY = 0, 0
        # walk the consecutive pairs of points, i.e. the segments
        for (xa, ya), (xb, yb) in zip(lXY, lXY[1:]):
            fWeightX = abs(yb - ya)  # for how long at this X?
            fWeightY = abs(xb - xa)  # for how long at this Y?
            fWeightedSumX += ((xa + xb) / 2.0) * fWeightX
            fWeightedSumY += ((ya + yb) / 2.0) * fWeightY
            fSumWeightX += fWeightX
            fSumWeightY += fWeightY

        if fSumWeightX > 0:
            Xavg = int(round(fWeightedSumX / fSumWeightX))
        else:
            Xavg = 0

        if fSumWeightY > 0:
            Yavg = int(round(fWeightedSumY / fSumWeightY))
        else:
            Yavg = 0

        return (Xavg, Yavg)
コード例 #5
0
def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLine of each page based on the vertical cuts.

    root: root node, whose 'Page' nodes are enumerated
    llX: one list of cut X per page, paired with the pages in order.
         Each list is modified in place: the page width is appended and
         the list is sorted.

    Each TextLine for which a rectangle can be fitted gets a "DU_cluster"
    attribute: the index of the first cut at or after the horizontal middle
    of the line, or len(cuts) when the middle lies beyond every cut.
    The clusters are then given to addClusterToDom, and each returned cluster
    node whose name is the index of a cut gets that cut as "cut_X" attribute.
    """

    # hack to use addClusterToDom
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        lX.append(w)
        lX.sort()
        imax = len(lX)
        # cluster index -> list of TextLine indices
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline,
                                                  'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                # no rectangle for this line: it is left out of the clusters
                continue

            xm = (x1 + x2) / 2.0
            # first cut at or after the middle; imax when beyond all the cuts
            i = next((k for k, xi in enumerate(lX) if xm <= xi), imax)
            # setdefault: the key imax is not pre-created, and indexing it
            # directly raised a KeyError for a line beyond the last cut
            dCluster.setdefault(i, []).append(iNd)
            ndTextline.set("DU_cluster", str(i))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster,
                                       bMoveContent=False,
                                       sAlgo="cut",
                                       pageNode=ndPage)

        # add a cut_X attribute to the clusters
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                # the cluster beyond the last cut (index imax) has no cut
                ndCluster.set("cut_X", str(lX[i]))
コード例 #6
0
    def regularTextLinesold(self, doc):
        """
        From a baseline: create a regular TextLine.

        Set self.xmlns to the PAGE 2013-07-15 namespace. Then, for each
        TextLine found under the root of doc, parse the 'points' of its
        Baseline child into integer (x, y) pairs and overwrite the 'points'
        of its Coords child with the rectangle
        (x1, y1-iHeight) (x2, y1-iHeight) (x2, y2) (x1, y2), where
        x1, y1, x2, y2 is the bounding box of the baseline.

        A TextLine without Coords child is printed and left unchanged.

        doc: the document, modified in place. Nothing is returned.
        """
        self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
        dNS = {"a": self.xmlns}
        # 50 seems too large: the manual GT is 30  ? not always!
        iHeight = 30  # in pixel

        lTextLines = PageXml.getChildByName(doc.getroot(), 'TextLine')
        for tl in lTextLines:
            lCoords = tl.xpath("./a:%s" % ("Coords"), namespaces=dNS)
            # None when there is no Coords child, so that the case is reported
            # below instead of raising an IndexError here
            coord = lCoords[0] if lCoords else None
            baseline = tl.xpath("./a:%s" % ("Baseline"), namespaces=dNS)[0]

            lXY = list()
            for sPair in baseline.get('points').split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    # malformed pair: show the TextLine and ignore the pair
                    print(tl)

            x1, y1, x2, y2 = Polygon(lXY).getBoundingBox()
            if coord is not None:
                coord.set(
                    'points', "%d,%d %d,%d %d,%d %d,%d" %
                    (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
            else:
                print(tl)
コード例 #7
0
def convertTR2Sep(filename):
    """
    Turn the TextRegion marked as separator into SeparatorRegion lines.

    The file is parsed, and each TextRegion having a 'custom' attribute that
    contains "separator" is renamed 'SeparatorRegion'. Its Coords are replaced
    by a 2-point line: the horizontal or the vertical median of the rectangle
    fitted to its polygon (of its bounding box if fitting raises ValueError),
    depending on which side of the rectangle is the longest.

    filename: name of the file to parse

    Return the parsed and modified tree.
    """
    print (filename)
    dNS = {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}
    xml = etree.parse(filename)

    for nd in xml.getroot().findall(".//pc:TextRegion[@custom]", dNS):
        if "separator" not in nd.get('custom'):
            continue

        nd.tag = 'SeparatorRegion'

        # now we need to convert that object to a line
        lXY = PageXml.getPointList(nd)  # the polygon
        assert lXY, "Separator without Coord??"

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1,y1,x2,y2 = plg.getBoundingBox()

        if abs(x2-x1) > abs(y2-y1):
            # horizontal: collapse the rectangle onto its median Y
            y1 = y2 = (y1+y2)/2
        else:
            # vertical: collapse the rectangle onto its median X
            x1 = x2 = (x1+x2)/2

        ndCoord = nd.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])

    return xml
コード例 #8
0
    def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
        """
        Get the x and y of the GT table separators.

        For each Page under root, the SeparatorRegion of every TableRegion are
        read as 2-point lines. A separator wider than high is kept as
        horizontal if its width exceeds fMinPageCoverage times the page
        width; any other one is kept as vertical if its height exceeds
        fMinPageCoverage times the page height.

        A warning is traced when a page does not have exactly 1 TableRegion.

        Return, per page, the (y1, y2) pairs of the horizontal separators and
        the (x1, x2) pairs of the vertical ones:
        [(y_pair_list, x_pair_list), ...]
        """
        ltlYlX = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lYi, lXi = [], []

            lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            if not lndTable:
                traceln("** warning ** no TableRegion, expected 1")
            elif len(lndTable) != 1:
                traceln(
                    "** warning ** %d TableRegion instead of expected 1" %
                    len(lndTable))

            for ndTR in lndTable:
                # enumerate the table separators
                for ndSep in MultiPageXml.getChildByName(ndTR,
                                                         'SeparatorRegion'):
                    ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                    (x1, y1), (x2, y2) = Polygon.parsePoints(
                        ndCoords.get('points')).lXY

                    dx, dy = abs(x2 - x1), abs(y2 - y1)
                    if dx > dy:
                        # horizontal table line
                        if dx > (fMinPageCoverage * w):
                            lYi.append((y1, y2))
                    elif dy > (fMinPageCoverage * h):
                        # vertical table line
                        lXi.append((x1, x2))
            ltlYlX.append((lYi, lXi))

        return ltlYlX
コード例 #9
0
    def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
        """
        Get the index in our grid of the table lines.

        For each Page under root, the SeparatorRegion of its TableRegion (at
        most 1 is accepted) are read as 2-point lines. A separator wider than
        high is kept as horizontal if its width exceeds fMinPageCoverage times
        the page width; any other one is kept as vertical if its height
        exceeds fMinPageCoverage times the page height. The middle Y
        (respectively X) of a kept separator is converted by
        self.snapToGridIndex, using self.iGridVertiStep (respectively
        self.iGridHorizStep).

        Return the lists of index, for horizontal and for vertical grid
        lines, per page: [(h_list, v_list), ...]
        """
        ltlHlV = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lHi, lVi = [], []

            lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            if lndTable:
                assert len(lndTable) == 1, "More than 1 TableRegion??"
                lndSep = MultiPageXml.getChildByName(lndTable[0],
                                                     'SeparatorRegion')
            else:
                lndSep = []

            # enumerate the table separators
            for ndSep in lndSep:
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                (x1, y1), (x2, y2) = Polygon.parsePoints(
                    ndCoords.get('points')).lXY

                dx, dy = abs(x2 - x1), abs(y2 - y1)
                if dx > dy:
                    # horizontal table line
                    if dx > (fMinPageCoverage * w):
                        ym = (y1 + y2) / 2.0  # 2.0 to support python2
                        lHi.append(
                            self.snapToGridIndex(ym, self.iGridVertiStep))
                elif dy > (fMinPageCoverage * h):
                    # vertical table line
                    xm = (x1 + x2) / 2.0
                    lVi.append(self.snapToGridIndex(xm, self.iGridHorizStep))
            ltlHlV.append((lHi, lVi))

        return ltlHlV
コード例 #10
0
    def getHisto(self,
                 lNd,
                 w,
                 _fMinHorizProjection,
                 h,
                 _fMinVertiProjection,
                 fRatio=1.0,
                 fMinHLen=None):
        """
        Return two Numpy arrays reflecting the histogram of projections of
        objects: first array along the Y axis (horizontal projection), of
        length h, 2nd along the X axis (vertical projection), of length w.

        lNd: nodes, each with a 'Coords' child holding a 'points' attribute
        w, h: width and height, i.e. the lengths of the X and Y histograms
        _fMinHorizProjection, _fMinVertiProjection: not used here
        fRatio: ratio given to self.scale to scale the extent of each object
        fMinHLen: when given, we do not scale horizontally the objects whose
                  width is not above fMinHLen

        Each object adds its width divided by w to the Y histogram over its
        (scaled) vertical extent, and its height divided by h to the X
        histogram over its (scaled) horizontal extent.
        An object raising ZeroDivisionError or ValueError is ignored.
        """

        # builtin float: the np.float alias has been removed from NumPy
        hy = np.zeros((h, ), float)
        hx = np.zeros((w, ), float)

        for nd in lNd:
            sPoints = MultiPageXml.getChildByName(nd,
                                                  'Coords')[0].get('points')
            try:
                x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

                if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                    _x1, _x2 = self.scale(x1, x2, fRatio)
                else:
                    _x1, _x2 = x1, x2
                _y1, _y2 = self.scale(y1, y2, fRatio)
                hy[_y1:_y2 + 1] += float(x2 - x1) / w
                hx[_x1:_x2 + 1] += float(y2 - y1) / h
            except (ZeroDivisionError, ValueError):
                # best effort: an object without usable rectangle is ignored
                pass

        return hy, hx
コード例 #11
0
def getTBLRBorders(lNodes):
    """
    Return the top, right, bottom and left borders of a list of cells.

    lNodes: nodes, each with a 'Coords' child (PAGE XML namespace) holding a
            'points' attribute

    The polygon of each node is partitioned into its top, right, bottom and
    left segments. A node whose polygon cannot be partitioned is ignored.

    Return a 4-tuple, in the order (top, right, bottom, left): for each side,
    the convex hull of the MultiLineString made of all the segments
    collected for that side.
    """
    llT, llR, llB, llL = [], [], [], []
    for bordercell in lNodes:
        coord = bordercell.xpath("./a:%s" % ("Coords"),
                                 namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except Exception:
            # skip this cell: going on would either re-add the segments of
            # the previous cell or fail on names that are not bound yet
            continue
        for llSide, lSide in ((llT, lT), (llR, lR), (llB, lB), (llL, lL)):
            llSide.extend([(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lSide)

    return (MultiLineString(llT).convex_hull,
            MultiLineString(llR).convex_hull,
            MultiLineString(llB).convex_hull,
            MultiLineString(llL).convex_hull)
コード例 #12
0
    def reintegrateTextIntoCells(self, doc, lLTextLines=[]):
        """
        Put the TextLine of each page into the TableCell they overlap the most.
        (from XMLDSTABLE)

        doc: the document; its 'Page' nodes are enumerated from doc.getroot()
        lLTextLines: optional list, per page, of the TextLine nodes to
                     consider. When empty, all the TextLine of each page are
                     taken. The default list is never modified here.

        For each TextLine, the cell whose bounding box covers the largest
        share of the bounding box of the line gets a copy of the line; lines
        with no overlapping cell are left untouched. The parents of the
        selected cells are removed at the end.

        self.xmlns must be set: it is the namespace used to find the Coords.

        NOTE(review): c.parent, c.doc, docCopyNode and getParent look like
        libxml2-style calls, while getroot, xpath(namespaces=...) and get look
        like lxml ones - confirm which DOM API this method is expected to run
        with.
        """
        def overlapX(zone1, zone2):
            # True if the two [start, end] intervals overlap or touch
            [a1, a2] = zone1  # self.getX(), self.getX() + self.getWidth()
            [b1, b2] = zone2  # zone.getX(), zone.getX() + zone.getWidth()
            return min(a2, b2) >= max(a1, b1)

        def overlapY(zone1, zone2):
            # same test as overlapX, applied to vertical intervals
            [a1, a2] = zone1  # self.getY(), self.getY() + self.getHeight()
            [b1, b2] = zone2  # zone.getY(), zone.getY() + zone.getHeight()
            return min(a2, b2) >= max(a1, b1)

        def signedRatioOverlap(zone1, zone2):
            """
            Overlap of zone1 and zone2, both given as (x1, y1, x2, y2).

            Return the area of their intersection divided by the area of
            zone1 (by 1.0 if that area is 0); 0.0 if they do not overlap.
            """

            [
                x1, y1, x12, y12
            ] = zone1  # left, top, right, bottom of the first zone
            [
                x2, y2, x22, y22
            ] = zone2  # left, top, right, bottom of the second zone

            w1, h1 = x12 - x1, y12 - y1
            w2, h2 = x22 - x2, y22 - y2
            fOverlap = 0.0

            if overlapX((x1, x12), (x2, x22)) and overlapY((y1, y12),
                                                           (y2, y22)):
                # corners recomputed from origin and size: same values as above
                [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
                [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]

                s1 = w1 * h1

                # possible ?  (avoids a division by zero below)
                if s1 == 0: s1 = 1.0

                # intersection
                nx1 = max(x11, x21)
                nx2 = min(x12, x22)
                ny1 = max(y11, y21)
                ny2 = min(y12, y22)
                # h holds the horizontal extent and w the vertical one: the
                # names are swapped, which does not change their product
                h = abs(nx2 - nx1)
                w = abs(ny2 - ny1)

                inter = h * w
                if inter > 0:
                    # NOTE(review): an integer division under Python 2 when
                    # both operands are integers - confirm the target version
                    fOverlap = inter / s1
                else:
                    # zones that only touch have an empty intersection
                    fOverlap = 0.0

            return fOverlap

        def bestRegionsAssignment(plgtl, lRegions):
            """
            Find the best region for plgtl: the one with the max overlap.

            plgtl: a polygon
            lRegions: list of (node, polygon)

            Return the (node, polygon) of the best region, or None when no
            region overlaps plgtl.
            """

            lOverlap = []
            for _, plg in lRegions:
                lOverlap.append(
                    signedRatioOverlap(plgtl.getBoundingBox(),
                                       plg.getBoundingBox()))
            # NOTE(review): max() raises ValueError when lRegions is empty,
            # i.e. for a page with TextLine but no TableCell - confirm
            if max(lOverlap) == 0: return None
            return lRegions[lOverlap.index(max(lOverlap))]

        lPages = PageXml.getChildByName(doc.getroot(), 'Page')
        lRegionsToBeDeleted = []
        for i, page in enumerate(lPages):
            if lLTextLines == []:
                lTextLines = PageXml.getChildByName(page, 'TextLine')
            else:
                lTextLines = lLTextLines[i]

            lCells = PageXml.getChildByName(page, 'TableCell')

            # list of (cell node, polygon of the cell)
            lOCells = []
            for cell in lCells:
                # get Coords
                xpath = "./a:%s" % ("Coords")
                lCoords = cell.xpath(xpath, namespaces={"a": self.xmlns})
                coord = lCoords[0]
                sPoints = coord.get('points')
                lsPair = sPoints.split(' ')
                lXY = list()
                for sPair in lsPair:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                plg = Polygon(lXY)
                lOCells.append((cell, plg))

            # find the best assignment of each text
            for tl in lTextLines:
                # get Coords
                xpath = "./a:%s" % ("Coords")
                lCoords = tl.xpath(xpath, namespaces={"a": self.xmlns})
                coord = lCoords[0]
                sPoints = coord.get('points')
                lsPair = sPoints.split(' ')
                lXY = list()
                for sPair in lsPair:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                plg = Polygon(lXY)
                cell = bestRegionsAssignment(plg, lOCells)
                if cell:
                    c, _ = cell
                    # NOTE(review): this registers the parent of the CELL for
                    # deletion, possibly several times; the original remark
                    # below suggests the parent TextRegion of the line was
                    # meant - confirm the intent
                    lRegionsToBeDeleted.append(c.parent)
                    ## what about parent  TextRegion  delete at least TextRegion/TextEquiv
                    # the line is copied into the cell, the original one is
                    # not unlinked
                    tlcp = tl.docCopyNode(c.doc, True)
                    c.append(tlcp)

        for region in lRegionsToBeDeleted:
            region.getParent().remove(region)
コード例 #13
0
    def mergeBaselineCells(self, coldir, colid, docid):
        """
        Take the files (*.pxml) with stuff processed on Transkribus.
        Take the CVL template tool files (*.xml).
        Merge them, and regenerate a mpxml.

        coldir: collection folder; the files are read from the folder
                <coldir>/<sCOL>/<docid>
        colid: not used here
        docid: document identifier; the result is written to
               <coldir>/<sCOL>/<docid>.mpxml

        Page by page, the TextRegion of the CVL page are removed, the Coords
        of each TextLine of the pxml page are replaced by a rectangle built
        on the bounding box of its Baseline, and the TextLine is moved into
        the TableCell for which self.signedOverlap is maximal, or into a new
        TextRegion appended to the page when no cell overlaps it.

        Raise ValueError if a CVL page has no TableRegion.
        """

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))

        mpxml = xmlpath + ".mpxml"
        # NOTE(review): the result of this parse is superseded just below; it
        # is kept because it makes the method fail early when the existing
        # mpxml cannot be parsed - confirm whether that is wanted
        mpxmldoc = etree.parse(mpxml)

        lxml = glob.glob(os.path.join(xmlpath, "*.xml"))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        lhtrxml = glob.glob(os.path.join(xmlpath, "*.pxml"))
        mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

        assert len(lXMLPage) == len(lPXMLPage)
        for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
            ## remove TextRegion from cvlpage
            for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
                tr.getparent().remove(tr)

            lTL = []
            for x in PageXml.getChildByName(pxmlpage, 'TextRegion'):
                lTL.extend(PageXml.getChildByName(x, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if len(ltable) == 0:
                # an exception instance: raising a bare string is itself a
                # TypeError in Python 3
                raise ValueError("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')

            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            # dedicated loop names: the former inner index reused the name of
            # the page index
            for ndTL, tl in zip(lTL, lT):
                ## normalization
                lCoordsPoints = PageXml.getChildByName(ndTL, 'Coords')
                lCoordsB = PageXml.getChildByName(ndTL, 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]
                iHeight = 30  # in pixel
                plgBaseline = Polygon(PageXml.getPointList(coordB))
                x1, y1, x2, y2 = plgBaseline.getBoundingBox()
                if coord is not None:
                    coord.set(
                        'points', "%d,%d %d,%d %d,%d %d,%d" %
                        (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
                    # the overlap is computed with the baseline polygon
                    tl = plgBaseline

                lOverlap = [self.signedOverlap(c, tl) for c in lC]
                if max(lOverlap) == 0:
                    # no cell for this line: it goes into a new TextRegion
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(ndTL)
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(ndTL)

        pxmldoc.write(mpxml)
コード例 #14
0
    def _iter_GraphNode(self, doc, domNdPage, page):
        """
        Get the DOM, the DOM page node, the page object

        iterator on the DOM, that returns nodes  (of class Block)
        """
        #--- XPATH contexts
        assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
        lNdBlock = domNdPage.xpath(
            self.sxpNode, namespaces=self.dNS)  #all relevant nodes of the page

        for ndBlock in lNdBlock:
            domid = ndBlock.get("id")
            sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
            if sText == None:
                sText = ""
                NodeType_PageXml.nbNoTextWarning += 1
                if NodeType_PageXml.nbNoTextWarning < 33:
                    traceln("Warning: no text in node %s" % domid)
                elif NodeType_PageXml.nbNoTextWarning == 33:
                    traceln(
                        "Warning: no text in node %s  - *** %d repetition : I STOP WARNING ***"
                        % (domid, NodeType_PageXml.nbNoTextWarning))
                #raise ValueError, "No text in node: %s"%ndBlock

            #now we need to infer the bounding box of that object
            lXY = PageXml.getPointList(ndBlock)  #the polygon
            if lXY == []:
                continue

            plg = Polygon(lXY)
            try:
                x1, y1, x2, y2 = plg.fitRectangle(
                    bPreserveWidth=self.bPreserveWidth)
            except ZeroDivisionError:
                #                 traceln("Warning: ignoring invalid polygon id=%s page=%s"%(ndBlock.prop("id"), page.pnum))
                #                 continue
                #             if True:
                #                 #we reduce a bit this rectangle, to ovoid overlap
                #                 w,h = x2-x1, y2-y1
                #                 dx = max(w * 0.066, min(20, w/3))  #we make sure that at least 1/"rd of te width will remain!
                #                 dy = max(h * 0.066, min(20, w/3))
                #                 x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

                x1, y1, x2, y2 = plg.getBoundingBox()
            except ValueError:
                x1, y1, x2, y2 = plg.getBoundingBox()

            #we reduce a bit this rectangle, to ovoid overlap
            if not (self.BBoxDeltaFun is None):
                if type(self.BBoxDeltaFun) is tuple and len(
                        self.BBoxDeltaFun) == 2:
                    xFun, yFun = self.BBoxDeltaFun
                    if xFun is not None:
                        dx = xFun(x2 - x1)
                        x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
                    if yFun is not None:
                        dy = yFun(y2 - y1)
                        y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
                else:
                    # historical code
                    w, h = x2 - x1, y2 - y1
                    dx = self.BBoxDeltaFun(w)
                    dy = self.BBoxDeltaFun(h)
                    x1, y1, x2, y2 = [
                        int(round(v))
                        for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy]
                    ]

            # store the rectangle"
            ndBlock.set(
                "DU_points", " ".join([
                    "%d,%d" % (int(x), int(y))
                    for x, y in [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
                ]))

            #TODO
            orientation = 0  #no meaning for PageXml
            classIndex = 0  #is computed later on

            #and create a Block
            blk = Block(page, (x1, y1, x2 - x1, y2 - y1),
                        sText,
                        orientation,
                        classIndex,
                        self,
                        ndBlock,
                        domid=domid)

            yield blk

        return
コード例 #15
0
    def predict(self, sFilename):
        """
        
        return Y_pred, YGT   (shape=(nb_node,) dtype=np.int)
        """         
        
        #for the pretty printer to format better...
        assert os.path.isfile(sFilename), sFilename
        doc  = etree.parse(sFilename, self.parser)
        #doc  = etree.parse(sFilename)
        root = doc.getroot()
        
        # Find cuts
        lY, lX = self.add_cut_to_DOM(root,
                            fMinHorizProjection=self.fMinHorizProjection,
                            fMinVertiProjection=self.fMinVertiProjection)
        
        # ################################################################
        # NOTE : we will assumes first and last row/column contain Other
        # ################################################################
        
        lyy = list(zip(lY[:-1], lY[1:])) # list of intervals
        lxx = list(zip(lX[:-1], lX[1:])) # list of intervals
        
        dTable = collections.defaultdict(lambda : collections.defaultdict(list) )
        # dTable[i][j] --> list of TExLine in that cell
        
        def getTableIndex(v, lvv):
            """
            index in the table row or columns. The 1st border is at 0
            """
            for i, (v1, v2) in enumerate(lvv):
                if v1 <= v and v <= v2:
                    return i+1
            if v < lvv[0][0]:
                return 0
            else:
                return len(lvv)+1
        
        
        #place each TextLine in the table rows and columns
        ndPage = MultiPageXml.getChildByName(root, 'Page')[0]
        w, h = int(ndPage.get("imageWidth")), int(ndPage.get("imageHeight"))
        
        lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine') 
        
        imax, jmax = -1, -1
        for nd in lndTexLine:
            sPoints=MultiPageXml.getChildByName(nd,'Coords')[0].get('points')
            #x1,y1,x2,y2 = Polygon.parsePoints(sPoints).fitRectangle()
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()
            
            i = getTableIndex((y1+y2)/2.0, lyy)
            j = getTableIndex((x1+x2)/2.0, lxx)
            
            dTable[i][j].append( (y1, y2, x1, x2, nd) )
            imax = max(i, imax)
            jmax = max(j, jmax)

        def getGT(nd):
            try:
                return CutPredictor.dLblIndex[nd.get('DU_row')]
            except:
                return 0

        def overlap(a, b):
            _y1, _y2, ax1, ax2, _nd = a
            _y1, _y2, bx1, bx2, _nd = b
            return min(ax2, bx2) - max(ax1, bx1)
        
        def label(lt):
            """
            set the attribute cutDU_row to each node for BIESO labelling
            """
            lt.sort()
            
            #the 'O'
            newlt = []
            for (y1, y2, x1, x2, nd) in lt:
                bInMargin = x2 < 350 or x1 > 4600
                
                if bInMargin:
                    nd.set('cutDU_row',  'O')
                else:
                    newlt.append((y1, y2, x1, x2, nd))
            
            for i, t in enumerate(newlt):
                (y1, y2, x1, x2, nd) = t
                
                #is there someone above?? if yes get the closest node above
                nd_just_above = None
                for j in range(i-1, -1, -1):
                    tt = newlt[j]
                    if overlap(tt, t) > 0:
                        nd_just_above = tt[4]
                        if nd_just_above.get('cutDU_row') != 'O': break
                
                if nd_just_above is None:
                    #S by default
                    nd.set('cutDU_row',  'S')
                else:
                    if nd_just_above.get('cutDU_row') == 'S':
                        nd_just_above.set('cutDU_row', 'B')
                        nd           .set('cutDU_row', 'E')
                    elif nd_just_above.get('cutDU_row') == 'E':
                        nd_just_above.set('cutDU_row', 'I')
                        nd           .set('cutDU_row', 'E')
                    elif nd_just_above.get('cutDU_row') == 'I':
                        nd.set('cutDU_row',  'E')
                    elif nd_just_above.get('cutDU_row') == 'B':
                        #bad luck, we do not see the intermediary node
                        nd           .set('cutDU_row', 'E')
                    elif nd_just_above.get('cutDU_row') == 'O':
                        raise Exception('Internal error')
                
        #now set the BIESO labels... (only vertically for now)
        liGTLabel = list()
        liLabel = list()
        for i in range(0, imax+1):
            dRow = dTable[i]
            for j in range(0, jmax+1):
                lt = dRow[j]
                if not lt: continue
                
                label(lt)

                liGTLabel.extend([getGT(nd) for y1, y2, x1, x2, nd in lt])
                liLabel.extend([self.dLblIndex[nd.get('cutDU_row')] for y1, y2, x1, x2, nd in lt])

        Y_pred = np.array(liLabel  , dtype=np.int)
        YGT    = np.array(liGTLabel, dtype=np.int)
                     
        sOutFilename = sFilename[:-6] + "_cut.mpxml"
        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        print('Annotated cut separators in %s'%sOutFilename)   
        
        del doc
        
        return Y_pred, YGT
コード例 #16
0
    def resizeCell(self, cell, ns):
        """
        Replace the cell region by the padded bounding box of its TextLines
        (better for the transcriber).

        cell: the cell node; the 'points' attribute of its first child is
              overwritten with the new rectangle
        ns:   unused, kept for interface compatibility

        Return True, leaving the cell untouched, when the cell contains no
        usable TextLine (none at all, or none with a valid polygon).
        Otherwise the cell is updated in place and None is returned.
        """
        lTextLines = cell.xpath("./a:TextLine",
                                namespaces={'a': PageXml.NS_PAGE_XML})
        if not lTextLines:
            return True

        # bounding box of all the usable textlines of the cell
        minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
        bAnyLine = False
        for line in lTextLines:
            lXY = PageXml.getPointList(line)  # the polygon
            if not lXY:
                continue  # TextLine without coordinates
            plg = Polygon(lXY)
            try:
                # only called to detect degenerate polygons, which are ignored
                plg.fitRectangle()
            except ZeroDivisionError:
                continue
            x1, y1, x2, y2 = plg.getBoundingBox()
            minx = min(minx, x1)
            miny = min(miny, y1)
            maxx = max(maxx, x2)
            maxy = max(maxy, y2)
            bAnyLine = True

        if not bAnyLine:
            # nothing usable: keep the cell as it is, instead of writing the
            # 9e9 / 0 initial values into its coordinates
            return True

        # simply use the bounding box of the textlines + padding
        miny -= self.vpadding  # vertical padding (top)
        maxy += self.vpadding  # vertical padding (bottom)
        minx -= self.hpadding  # horizontal padding (left)
        maxx += self.hpadding  # horizontal padding (right)

        # rectangle, clockwise from the top-left corner
        corner = cell[0]
        corner.set(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
コード例 #17
0
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Label the TextLine nodes of the given cells, header-wise, row-wise and
    column-wise, then label the TextLine nodes found in the TextRegion nodes
    with the last label of each label list.
    The XML DOM is modified in place, nothing is returned.
    """
    dNS = {"a": MultiPageXml.NS_PAGE_XML}

    def getCoordsPoints(nd):
        """the 'points' attribute of the first Coords child of the node"""
        return nd.xpath("./a:Coords", namespaces=dNS)[0].get('points')

    for ndCell in lCells:
        lndLine = MultiPageXml.getChildByName(ndCell, 'TextLine')

        # --- header-wise: cells in the first maxRowSpan rows get label 1
        bInHeader = int(ndCell.get('row')) < maxRowSpan
        for ndLine in lndLine:
            if bInHeader:
                ndLine.set(sDUHeader, lLabels_HEADER[1])
            else:
                ndLine.set(sDUHeader, lLabels_HEADER[0])

        # --- row-wise: a single line gets label 3, otherwise first / inner /
        # last lines get labels 0 / 1 / 2 (an empty cell gets nothing)
        nbLine = len(lndLine)
        if nbLine == 1:
            lndLine[0].set(sDURow, lLabelsBIESO_R[3])
        elif nbLine > 1:
            lndLine[0].set(sDURow, lLabelsBIESO_R[0])
            for ndLine in lndLine[1:-1]:
                ndLine.set(sDURow, lLabelsBIESO_R[1])
            lndLine[-1].set(sDURow, lLabelsBIESO_R[2])

        # --- column-wise: compare each line with the right side of the cell
        plgCell = Polygon.parsePoints(getCoordsPoints(ndCell))
        (_cx, _cy, cx2, _cy2) = plgCell.getBoundingBox()

        for ndLine in lndLine:
            lXY = []
            for sPair in getCoordsPoints(ndLine).split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s  IGNORED"%ndLine.get("id"))

            if not lXY:
                # no valid point at all
                ndLine.set(sDUCol, lLabelsSM_C[-1])
                continue

            (x1, _y1, x2, _y2) = Polygon(lXY).getBoundingBox()
            # the line goes beyond the right side of the cell, by more than
            # 0.75 times the width of its part lying before that side
            bFarBeyond = x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1)
            ndLine.set(sDUCol, lLabelsSM_C[0] if bFarBeyond else lLabelsSM_C[1])

    # textlines of the TextRegion nodes
    for ndRegion in MultiPageXml.getChildByName(root, 'TextRegion'):
        for ndLine in MultiPageXml.getChildByName(ndRegion, 'TextLine'):
            ndLine.set(sDURow, lLabelsBIESO_R[-1])
            ndLine.set(sDUCol, lLabelsSM_C[-1])
            ndLine.set(sDUHeader, lLabels_HEADER[-1])
コード例 #18
0
def getCellsSeparators(lCell):
    """
    Fit one straight line per row separator and per column separator, from
    the border segments of the cells.

    return two dictionaries
    row -> ((x1, y1), (x2, y2))    NOTE: top of row
    col -> ((x1, y1), (x2, y2))    NOTE: left of column
    """
    # collect the segments forming the cell borders, by row, by col
    dRowSgmt = collections.defaultdict(list)
    dColSgmt = collections.defaultdict(list)
    for ndCell in lCell:
        row = int(ndCell.get("row"))
        col = int(ndCell.get("col"))
        rowSpan = int(ndCell.get("rowSpan"))
        colSpan = int(ndCell.get("colSpan"))
        ndCoord = ndCell.xpath("./a:Coords", namespaces={"a":MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(ndCoord.get('points'))
        try:
            lTop, lRight, lBottom, lLeft = plgn.partitionSegmentTopRightBottomLeft()
        except ZeroDivisionError:
            traceln("ERROR: cell %s row=%d col=%d has empty area and is IGNORED"
                    % (ndCell.get("id"), row, col))
            continue
        # top segments belong to the separator of index row, bottom segments
        # to the separator of index row+rowSpan; same logic for the columns
        dRowSgmt[row].extend(lTop)
        dRowSgmt[row+rowSpan].extend(lBottom)
        dColSgmt[col].extend(lLeft)
        dColSgmt[col+colSpan].extend(lRight)

    def getEndPoints(lSegment):
        """the X list and the Y list of the two end points of each segment"""
        lX = [x for x1, _y1, x2, _y2 in lSegment for x in (x1, x2)]
        lY = [y for _x1, y1, _x2, y2 in lSegment for y in (y1, y2)]
        return lX, lY

    def fitLine(lU, lV, lSegment):
        """
        weighted linear regression v = a + b * u, return the coefficients
        (a, b); each end point is weighted by the normalized square root of
        the length of its segment
        """
        lfW = [math.sqrt(np.linalg.norm((x2 - x1, y2 - y1))) for x1,y1,x2,y2 in lSegment]
        fTotal = sum(lfW) * 2
        lW = []
        for fW in lfW:
            # one weight per end point, so each weight appears twice
            lW.append(fW/fTotal)
            lW.append(fW/fTotal)
        return np.polynomial.polynomial.polyfit(lU, lV, 1, w=lW)

    # row separators: y as a function of x
    dRowSep = {}
    for row, lSegment in dRowSgmt.items():
        lX, lY = getEndPoints(lSegment)
        a, b = fitLine(lX, lY, lSegment)
        xmin, xmax = min(lX), max(lX)
        dRowSep[row] = ((xmin, a + b * xmin), (xmax, a + b * xmax))

    # column separators: x as a function of y
    dColSep = {}
    for col, lSegment in dColSgmt.items():
        lX, lY = getEndPoints(lSegment)
        a, b = fitLine(lY, lX, lSegment)
        ymin, ymax = min(lY), max(lY)
        dColSep[col] = ((a + b * ymin, ymin), (a + b * ymax, ymax))

    return dRowSep, dColSep
コード例 #19
0
    # keep only the pixels of the frame selected by the mask
    res = cv2.bitwise_and(frame, frame, mask=mask)
    # res = cv2.flip(res,0)

    # find the contours of the mask
    # NOTE(review): the three-value return presumably targets OpenCV 3.x - confirm
    im2, contours, heirarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Rank the contours by area: 'high' tracks the index of the largest one,
    # 'sec' the index of the second largest one.
    # Both indices are initialised before this fragment.
    for c in range(0, len(contours)):
        if cv2.contourArea(contours[c]) >= cv2.contourArea(contours[high]):
            # new largest contour (ties included): the former one becomes second
            sec = high
            high = c
        elif (cv2.contourArea(contours[c]) > cv2.contourArea(contours[sec]) and cv2.contourArea(
                contours[c]) <= cv2.contourArea(contours[high])):
            sec = c

    # Create the two Polygon objects, both from the same frame and mask
    top_tape = Polygon(frame, mask)
    bottom_tape = Polygon(frame, mask)

    # Run the bottom tape on the largest contour, when there is at least one contour
    if len(contours) > 0:
        bottom_tape.run(contours[high])

    # Run the top tape on the second largest contour, when there are at least two contours
    if len(contours) > 1:
        top_tape.run(contours[sec])
        hasRun = True

    # Convert both centers to lists, once the top tape has been run
    if hasRun:
        top_tape.center = list(top_tape.center)
        bottom_tape.center = list(bottom_tape.center)
コード例 #20
0
def addSeparator(root, lCells):
    """
    Add the separators that correspond to the cell boundaries.
    Modify the XML DOM.

    root:   node under which the first TableRegion is looked up
    lCells: the cell nodes, with row, col, rowSpan, colSpan attributes and a
            Coords child

    For each row separator and each column separator, a line is fitted on
    the border segments of the cells, by weighted linear regression, and a
    SeparatorRegion node is appended to the first TableRegion.
    Statistics on the slopes (in percent) are appended as XML comments and
    printed, one for the horizontal and one for the vertical separators,
    whenever at least one separator of that orientation was produced.
    """
    # let's collect the segment forming the separators
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)
    for cell in lCells:
        row, col, rowSpan, colSpan = [int(cell.get(sProp)) for sProp
                                      in ["row", "col", "rowSpan", "colSpan"]]
        coord = cell.xpath("./a:%s" % ("Coords"),
                           namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except ValueError:
            # best effort: a cell whose border cannot be partitioned does not
            # contribute to any separator
            continue
        # the top segments contribute to row separator of index: row
        dRowSep_lSgmt[row].extend(lT)
        # the bottom segments contribute to row separator of index: row+rowSpan
        dRowSep_lSgmt[row + rowSpan].extend(lB)

        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col + colSpan].extend(lR)

    ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

    def getEndPoints(lSegment):
        """the X list and the Y list of the two end points of each segment"""
        lX = [x for x1, _y1, x2, _y2 in lSegment for x in (x1, x2)]
        lY = [y for _x1, y1, _x2, y2 in lSegment for y in (y1, y2)]
        return lX, lY

    def fitLine(lU, lV, lSegment):
        """
        weighted linear regression v = a + b * u
        return (a, b): a is the intercept, b is the slope
        """
        # polyfit applies the weights to the unsquared residuals, so the
        # square root of the segment length makes each segment count
        # proportionally to its length in the squared error.
        # (the norm must be taken on the difference of the end points, not on
        # the end points themselves)
        lfW = [np.sqrt(np.linalg.norm((x2 - x1, y2 - y1)))
               for x1, y1, x2, y2 in lSegment]
        # one weight per end point, so each weight appears twice
        W = [fW for fW in lfW for _ in (0, 1)]
        if not any(W):
            # only zero-length segments: fall back to an unweighted fit
            W = None
        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=W)
        return a, b

    def appendSeparator(sOrient, a, b, lPoint):
        """append a SeparatorRegion with its Coords to the TableRegion"""
        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("orient", "%s %.1f %.3f" % (sOrient, a, b))
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, lPoint)
        ndSep.append(ndCoord)

    def appendStat(sLabel, lSlope):
        """append and print the slope statistics, if there is any slope"""
        if not lSlope:
            return  # min() and max() are undefined on an empty list
        sStat = "\t%s: Average=%.1f%%  stdev=%.2f%%  min=%.1f%% max=%.1f%%" % (
            sLabel, np.average(lSlope), np.std(lSlope), min(lSlope), max(lSlope))
        ndTR.append(etree.Comment(sStat))
        print(sStat)

    # horizontal separators: y as a function of x
    lSlope = []
    for lSegment in dRowSep_lSgmt.values():
        if not lSegment:
            continue  # nothing to fit
        X, Y = getEndPoints(lSegment)
        a, b = fitLine(X, Y, lSegment)
        xmin, xmax = min(X), max(X)
        lSlope.append(b * 100)
        appendSeparator("horizontal", a, b,
                        [(xmin, a + b * xmin), (xmax, a + b * xmax)])
    appendStat("HORIZONTAL", lSlope)

    # vertical separators: x as a function of y
    lSlope = []
    for lSegment in dColSep_lSgmt.values():
        if not lSegment:
            continue  # nothing to fit
        X, Y = getEndPoints(lSegment)
        a, b = fitLine(Y, X, lSegment)
        ymin, ymax = min(Y), max(Y)
        lSlope.append(b * 100)
        appendSeparator("vertical", a, b,
                        [(a + b * ymin, ymin), (a + b * ymax, ymax)])
    appendStat("VERTICAL  ", lSlope)

    return
コード例 #21
0
    def _iter_GraphNode(self, doc, sFilename, page=None):
        """
        Iterate over the areas listed in the json dict, under
        doc['GlynchResults']['Areas'], and yield one Block per area.

        Areas without any point are skipped. The page argument is ignored: a
        Page(1, 1, 1, 1) object is shared by all the yielded blocks.
        """
        lArea = doc['GlynchResults']['Areas']
        page = Page(1, 1, 1, 1)

        for ndBlock in lArea:
            # --- the text of the block, "" when missing
            try:
                sText = ndBlock['label']
            except KeyError:
                sText = ""
                traceln("WARNING: no 'label' in : %s" % ndBlock)
            else:
                if sText is None:
                    sText = ""
                    traceln("Warning: no text in node")

            # --- the polygon, then the bounding box of that object
            lXY = self.getPointList(ndBlock)
            if lXY == []:
                continue

            plg = Polygon(lXY)
            try:
                x1, y1, x2, y2 = plg.fitRectangle(
                    bPreserveWidth=self.bPreserveWidth)
            except (ZeroDivisionError, ValueError):
                # no rectangle could be fitted
                x1, y1, x2, y2 = plg.getBoundingBox()

            # --- we reduce a bit this rectangle, to avoid overlap
            deltaFun = self.BBoxDeltaFun
            if deltaFun is not None:
                if type(deltaFun) is tuple and len(deltaFun) == 2:
                    # one function per axis, each of them possibly None
                    xFun, yFun = deltaFun
                    if xFun is not None:
                        dx = xFun(x2 - x1)
                        x1 = int(round(x1 + dx))
                        x2 = int(round(x2 - dx))
                    if yFun is not None:
                        dy = yFun(y2 - y1)
                        y1 = int(round(y1 + dy))
                        y2 = int(round(y2 - dy))
                else:
                    # historical code: the same function for both axes
                    dx = deltaFun(x2 - x1)
                    dy = deltaFun(y2 - y1)
                    x1, y1, x2, y2 = (int(round(x1 + dx)),
                                      int(round(y1 + dy)),
                                      int(round(x2 - dx)),
                                      int(round(y2 - dy)))

            # --- and create a Block
            orientation = 0  # no meaning for PageXml
            classIndex = 0  # is computed later on
            tXYWH = (x1, y1, x2 - x1, y2 - y1)
            blk = Block(page, tXYWH, sText, orientation, classIndex, self,
                        ndBlock, domid=None)
            blk.setShape(geom.Polygon(lXY))
            yield blk
コード例 #22
0
    def computeSkewing(self):
        """
            input: self
            compute the text skewing in the row

            A straight line is fitted, by weighted linear regression, on the
            top segments of the polygon of the first object of each cell.
            A XMLDSTABLEROWClass object is then built from the two end points
            of that line, and tagged.
            Nothing is done when there is no cell, or no top segment.

            NOTE(review): the visible code does not return any skewing angle.
        """
        def getX(lSegment):
            # X of the two end points of each segment
            lX = list()
            for x1, y1, x2, y2 in lSegment:
                lX.append(x1)
                lX.append(x2)
            return lX

        def getY(lSegment):
            # Y of the two end points of each segment
            lY = list()
            for x1, y1, x2, y2 in lSegment:
                lY.append(y1)
                lY.append(y2)
            return lY

        import numpy as np
        from util.Polygon import Polygon
        if self.getCells():
            # the top segments of all the cells (a list, despite its name)
            dRowSep_lSgmt = []
            # alternative: compute the real top ones? for wedding more robust!!
            for cell in self.getCells():
                #                 lTopText = filter(lambda x:x.getAttribute('DU_row') == 'B', [text for text in cell.getObjects()])
                # only the first object of the cell is considered, if any
                try:
                    lTopText = [cell.getObjects()[0]]
                except IndexError:
                    lTopText = []
                for text in lTopText:
                    # 'points' is a flat comma-separated list x,y,x,y,...
                    sPoints = text.getAttribute('points')
                    # regroup the values two by two: "x,y x,y ..."
                    spoints = ' '.join(
                        "%s,%s" % ((x, y))
                        for x, y in zip(*[iter(sPoints.split(','))] * 2))
                    it_sXsY = (sPair.split(',')
                               for sPair in spoints.split(' '))
                    plgn = Polygon(
                        (float(sx), float(sy)) for sx, sy in it_sXsY)

                    # only the top segments are kept
                    lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
                    dRowSep_lSgmt.extend(lT)
            if dRowSep_lSgmt != []:
                X = getX(dRowSep_lSgmt)
                Y = getY(dRowSep_lSgmt)
                # one weight per segment: the norm of the 2x2 matrix of its end points
                # NOTE(review): presumably the segment length was intended,
                # i.e. the norm of (x2 - x1, y2 - y1) - confirm
                lfNorm = [
                    np.linalg.norm([[x1, y1], [x2, y2]])
                    for x1, y1, x2, y2 in dRowSep_lSgmt
                ]
                #duplicate each element, since each segment contributes two points
                W = [fN for fN in lfNorm for _ in (0, 1)]

                # fit y = a + b * x  (a: intercept, b: slope)
                a, b = np.polynomial.polynomial.polyfit(X, Y, 1, w=W)
                xmin, xmax = min(X), max(X)
                y1 = a + b * xmin
                y2 = a + b * xmax
                # build the row object from the two end points of the fitted line
                ro = XMLDSTABLEROWClass(self.getIndex())
                #[(x1, ymin), (x2, ymax)
                ro.setX(xmin)
                ro.setY(y1)
                ro.setWidth(xmax - xmin)
                # signed: negative when the fitted line goes up, i.e. y2 < y1
                ro.setHeight(y2 - y1)
                ro.setPage(self.getPage())
                ro.setParent(self.getParent())
                ro.addAttribute(
                    'points',
                    ','.join([str(xmin),
                              str(y1), str(xmax),
                              str(y2)]))
                ro.tagMe()