def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLines of each page based on the vertical cuts.

    root is the DOM root under which the 'Page' nodes are looked up.
    llX holds one list of cut abscissas per page, paired with the pages in
    document order. NOTE: each list is modified in place (the page width is
    appended as a last cut and the list is sorted).

    Each TextLine gets a "DU_cluster" attribute holding the index of the first
    cut located at, or to the right of, the centre of its bounding rectangle.
    The clusters are then added to the page by
    GraphBinaryConjugateSegmenter.addClusterToDom, and each cluster node gets
    a "cut_X" attribute holding the abscissa of its cut.
    """

    # hack to use addClusterToDom, which reads the DOM node from .node
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        # the right border of the page acts as the last cut
        lX.append(w)
        lX.sort()
        imax = len(lX)
        # cluster index -> list of TextLine indices
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline,
                                                  'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                # no usable rectangle for this TextLine: leave it unclustered
                continue
            xm = (x1 + x2) / 2.0

            # lX is sorted: take the first cut at or beyond the centre.
            # imax is the overflow cluster for a centre beyond every cut
            # (i.e. beyond the page width).
            i = next((k for k, xi in enumerate(lX) if xm <= xi), imax)
            # setdefault because the overflow cluster has no pre-built entry
            # (it used to raise a KeyError)
            dCluster.setdefault(i, list()).append(iNd)
            ndTextline.set("DU_cluster", str(i))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster,
                                       bMoveContent=False,
                                       sAlgo="cut",
                                       pageNode=ndPage)

        # add a cut_X attribute to the clusters
        # (assumes the cluster 'name' is its key in dCluster - TODO confirm)
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                ndCluster.set("cut_X", str(lX[i]))
            # else: overflow cluster, there is no cut to its right
    def getPolylineAverageXY(cls, ndPolyline):
        """
        Return (Xavg, Yavg): the weighted average X and Y of a polyline,
        rounded to integers.

        Each segment contributes the X and Y of its middle point. The X is
        weighted by the vertical extent of the segment (for how long the
        polyline stays at this X), the Y by its horizontal extent.
        An average whose total weight is not positive is reported as 0.
        """
        lPoint = Polygon.parsePoints(ndPolyline.get('points')).lXY

        # consecutive points form the segments of the polyline
        lSegment = list(zip(lPoint, lPoint[1:]))

        lMidX = [(xa + xb) / 2.0 for (xa, _ya), (xb, _yb) in lSegment]
        lMidY = [(ya + yb) / 2.0 for (_xa, ya), (_xb, yb) in lSegment]
        lWeightX = [abs(yb - ya) for (_xa, ya), (_xb, yb) in lSegment]
        lWeightY = [abs(xb - xa) for (xa, _ya), (xb, _yb) in lSegment]

        fNumX = sum(xm * wx for xm, wx in zip(lMidX, lWeightX))
        fNumY = sum(ym * wy for ym, wy in zip(lMidY, lWeightY))
        fDenX = sum(lWeightX)
        fDenY = sum(lWeightY)

        if fDenX > 0:
            Xavg = int(round(fNumX / fDenX))
        else:
            Xavg = 0

        if fDenY > 0:
            Yavg = int(round(fNumY / fDenY))
        else:
            Yavg = 0

        return (Xavg, Yavg)
    def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
        """
        Collect the table separators (SeparatorRegion of the TableRegion
        nodes) of each page.

        A separator wider than high is horizontal and is kept if its width
        exceeds fMinPageCoverage * page width; otherwise it is vertical and is
        kept if its height exceeds fMinPageCoverage * page height.

        return one pair per page: [(y_list, x_list), ...]
        where y_list holds the (y1, y2) of the horizontal separators and
        x_list the (x1, x2) of the vertical ones.
        A warning is traced for any page not having exactly 1 TableRegion.
        """
        lPageResult = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            iPageW = int(ndPage.get("imageWidth"))
            iPageH = int(ndPage.get("imageHeight"))

            lHorizY, lVertiX = [], []

            lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            nTable = len(lndTable)
            if nTable == 0:
                traceln("** warning ** no TableRegion, expected 1")
            elif nTable != 1:
                traceln(
                    "** warning ** %d TableRegion instead of expected 1" %
                    nTable)

            for ndTable in lndTable:
                lndSep = MultiPageXml.getChildByName(ndTable,
                                                     'SeparatorRegion')
                for ndSep in lndSep:
                    ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                    # a separator is expected to be made of exactly 2 points
                    (x1, y1), (x2, y2) = Polygon.parsePoints(
                        ndCoords.get('points')).lXY

                    fWidth = abs(x2 - x1)
                    fHeight = abs(y2 - y1)
                    if fWidth > fHeight:
                        # horizontal table line
                        if fWidth > (fMinPageCoverage * iPageW):
                            lHorizY.append((y1, y2))
                    elif fHeight > (fMinPageCoverage * iPageH):
                        # vertical table line
                        lVertiX.append((x1, x2))

            lPageResult.append((lHorizY, lVertiX))

        return lPageResult
Example #4
0
    def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
        """
        Compute, per page, the index in our grid of the table separators.

        The middle of each long-enough separator of the TableRegion is snapped
        to the grid by self.snapToGridIndex, using self.iGridVertiStep for the
        horizontal separators and self.iGridHorizStep for the vertical ones.
        A separator is long enough when its length exceeds fMinPageCoverage
        times the page width (horizontal) or page height (vertical).

        return one pair per page: [(h_list, v_list), ...]
        Fail on an assertion if a page has more than 1 TableRegion.
        """
        lPageResult = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            iPageW = int(ndPage.get("imageWidth"))
            iPageH = int(ndPage.get("imageHeight"))

            liHoriz, liVerti = [], []

            # the separators of the (single) table of the page, if any
            lndSep = []
            lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            if lndTable:
                assert len(lndTable) == 1, "More than 1 TableRegion??"
                lndSep = MultiPageXml.getChildByName(lndTable[0],
                                                     'SeparatorRegion')

            for ndSep in lndSep:
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                # a separator is expected to be made of exactly 2 points
                (x1, y1), (x2, y2) = Polygon.parsePoints(
                    ndCoords.get('points')).lXY

                fWidth = abs(x2 - x1)
                fHeight = abs(y2 - y1)
                if fWidth > fHeight:
                    # horizontal table line
                    if fWidth > (fMinPageCoverage * iPageW):
                        fMidY = (y1 + y2) / 2.0  # 2.0 to support python2
                        liHoriz.append(
                            self.snapToGridIndex(fMidY, self.iGridVertiStep))
                elif fHeight > (fMinPageCoverage * iPageH):
                    # vertical table line
                    fMidX = (x1 + x2) / 2.0
                    liVerti.append(
                        self.snapToGridIndex(fMidX, self.iGridHorizStep))

            lPageResult.append((liHoriz, liVerti))

        return lPageResult
    def getHisto(self,
                 lNd,
                 w,
                 _fMinHorizProjection,
                 h,
                 _fMinVertiProjection,
                 fRatio=1.0,
                 fMinHLen=None):
        """
        Return two Numpy arrays reflecting the histogram of projections of the
        objects: first array (length h) along the Y axis (horizontal
        projection), 2nd array (length w) along the X axis (vertical
        projection).

        lNd is the list of DOM nodes to project; the rectangle fitted on the
        points of the 'Coords' child of each node is used.
        w and h are the page width and height.
        _fMinHorizProjection and _fMinVertiProjection are not used.
        fRatio is passed to self.scale to scale each rectangle before
        projecting it. When fMinHLen is given, we do not scale horizontally
        the objects that are not longer than fMinHLen.

        Each object adds its width/w to the Y bins it covers, and its
        height/h to the X bins it covers. An object raising a
        ZeroDivisionError or a ValueError is ignored.
        """

        # builtin float: the np.float alias was removed from NumPy (1.24)
        hy = np.zeros((h, ), dtype=float)
        hx = np.zeros((w, ), dtype=float)

        for nd in lNd:
            sPoints = MultiPageXml.getChildByName(nd,
                                                  'Coords')[0].get('points')
            try:
                x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

                # NOTE(review): self.scale presumably returns integer bounds,
                # since they are used as slice indices below - confirm
                if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                    _x1, _x2 = self.scale(x1, x2, fRatio)
                else:
                    _x1, _x2 = x1, x2
                _y1, _y2 = self.scale(y1, y2, fRatio)
                # the weights rely on the unscaled width and height
                hy[_y1:_y2 + 1] += float(x2 - x1) / w
                hx[_x1:_x2 + 1] += float(y2 - y1) / h
            except ZeroDivisionError:
                pass
            except ValueError:
                pass

        return hy, hx
Example #6
0
def getTBLRBorders(lNodes):
    """
    Compute the borders formed by a list of (border cell) nodes.

    The polygon of the 'Coords' child of each node is partitioned into its
    top, right, bottom and left segments; a node whose polygon cannot be
    partitioned is ignored.

    return the convex hull of all top segments, of all right segments, of
    all bottom segments and of all left segments, in this order
    (top, right, bottom, left).
    """
    llT, llR, llB, llL = [], [], [], []
    for bordercell in lNodes:
        coord = bordercell.xpath("./a:%s" % ("Coords"),
                                 namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except Exception:
            # best effort: skip this cell. Falling through would either fail
            # on undefined names or add the segments of the previous cell
            # a second time.
            continue
        # turn each (x1, y1, x2, y2) segment into a 2-point line
        for llLine, lSegment in ((llT, lT), (llR, lR), (llB, lB), (llL, lL)):
            llLine.extend([(x1, y1), (x2, y2)]
                          for x1, y1, x2, y2 in lSegment)
    return (MultiLineString(llT).convex_hull,
            MultiLineString(llR).convex_hull,
            MultiLineString(llB).convex_hull,
            MultiLineString(llL).convex_hull)
Example #7
0
    def predict(self, sFilename):
        """
        Find the cuts of a file, label the TextLines of its first page with
        BIESO row labels and save the annotated document.

        The cuts computed by self.add_cut_to_DOM define a grid. Each TextLine
        of the first 'Page' is placed in the grid cell containing the centre
        of its fitted rectangle. Within each cell, the TextLines are labelled
        from top to bottom (attribute 'cutDU_row').
        The document is written to the name of the input file with its last
        6 characters (e.g. ".mpxml") replaced by "_cut.mpxml".

        return Y_pred, YGT   (shape=(nb_node,) dtype=int)
        Y_pred holds the index in self.dLblIndex of the computed label.
        YGT holds the index in CutPredictor.dLblIndex of the 'DU_row'
        attribute, or 0 if this attribute is missing or unknown.
        """

        assert os.path.isfile(sFilename), sFilename
        doc = etree.parse(sFilename, self.parser)
        root = doc.getroot()

        # Find cuts
        lY, lX = self.add_cut_to_DOM(root,
                            fMinHorizProjection=self.fMinHorizProjection,
                            fMinVertiProjection=self.fMinVertiProjection)

        # ################################################################
        # NOTE : we will assumes first and last row/column contain Other
        # ################################################################

        lyy = list(zip(lY[:-1], lY[1:]))  # list of intervals
        lxx = list(zip(lX[:-1], lX[1:]))  # list of intervals

        dTable = collections.defaultdict(lambda: collections.defaultdict(list))
        # dTable[i][j] --> list of TextLine in that cell

        def getTableIndex(v, lvv):
            """
            index in the table row or columns. The 1st border is at 0
            """
            for i, (v1, v2) in enumerate(lvv):
                if v1 <= v <= v2:
                    return i + 1
            # outside all intervals: before the first, or after the last one
            if v < lvv[0][0]:
                return 0
            return len(lvv) + 1

        # place each TextLine in the table rows and columns
        ndPage = MultiPageXml.getChildByName(root, 'Page')[0]
        lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')

        imax, jmax = -1, -1
        for nd in lndTexLine:
            sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

            i = getTableIndex((y1 + y2) / 2.0, lyy)
            j = getTableIndex((x1 + x2) / 2.0, lxx)

            dTable[i][j].append((y1, y2, x1, x2, nd))
            imax = max(i, imax)
            jmax = max(j, jmax)

        def getGT(nd):
            """
            index of the groundtruth label, 0 if missing or unknown
            """
            try:
                return CutPredictor.dLblIndex[nd.get('DU_row')]
            except KeyError:
                return 0

        def overlap(a, b):
            """
            length of the horizontal overlap of two items (<= 0 if none)
            """
            _y1, _y2, ax1, ax2, _nd = a
            _y1, _y2, bx1, bx2, _nd = b
            return min(ax2, bx2) - max(ax1, bx1)

        def label(lt):
            """
            set the attribute cutDU_row to each node for BIESO labelling
            """
            # top to bottom. Sort on the coordinates only: on ties, a plain
            # tuple sort would compare the DOM nodes themselves.
            lt.sort(key=lambda t: t[:4])

            # the 'O'
            # NOTE(review): hard-coded margin abscissas - confirm they fit
            # the processed collection
            newlt = []
            for (y1, y2, x1, x2, nd) in lt:
                bInMargin = x2 < 350 or x1 > 4600

                if bInMargin:
                    nd.set('cutDU_row', 'O')
                else:
                    newlt.append((y1, y2, x1, x2, nd))

            for i, t in enumerate(newlt):
                nd = t[4]

                # is there someone above?? if yes get the closest node above
                nd_just_above = None
                for j in range(i - 1, -1, -1):
                    tt = newlt[j]
                    if overlap(tt, t) > 0:
                        nd_just_above = tt[4]
                        if nd_just_above.get('cutDU_row') != 'O':
                            break

                if nd_just_above is None:
                    # S by default
                    nd.set('cutDU_row', 'S')
                    continue

                # the node above is not the last of its group anymore
                sAbove = nd_just_above.get('cutDU_row')
                if sAbove == 'S':
                    nd_just_above.set('cutDU_row', 'B')
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'E':
                    nd_just_above.set('cutDU_row', 'I')
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'I':
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'B':
                    # bad luck, we do not see the intermediary node
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'O':
                    raise Exception('Internal error')

        # now set the BIESO labels... (only vertically for now)
        liGTLabel = list()
        liLabel = list()
        for i in range(0, imax + 1):
            dRow = dTable[i]
            for j in range(0, jmax + 1):
                lt = dRow[j]
                if not lt:
                    continue

                label(lt)

                liGTLabel.extend(getGT(t[4]) for t in lt)
                liLabel.extend(self.dLblIndex[t[4].get('cutDU_row')]
                               for t in lt)

        # builtin int: the np.int alias was removed from NumPy (1.24)
        Y_pred = np.array(liLabel, dtype=int)
        YGT = np.array(liGTLabel, dtype=int)

        # replace the 6-character extension of the input filename
        sOutFilename = sFilename[:-6] + "_cut.mpxml"
        doc.write(sOutFilename, encoding='utf-8', pretty_print=True,
                  xml_declaration=True)
        print('Annotated cut separators in %s' % sOutFilename)

        del doc

        return Y_pred, YGT
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Tag the TextLine nodes of the given cells, and of the TextRegion nodes,
    with the header, row and column labels.
    Modify the XML DOM

    For each cell of lCells:
    - header (attribute sDUHeader): lLabels_HEADER[1] if the row of the cell
      is below maxRowSpan, else lLabels_HEADER[0]
    - row (attribute sDURow): the labels of lLabelsBIESO_R, depending on the
      number of TextLine and on the position of the TextLine in the cell
    - column (attribute sDUCol): lLabelsSM_C[0] if the TextLine clearly goes
      beyond the right side of the cell, else lLabelsSM_C[1]; lLabelsSM_C[-1]
      if the TextLine has no valid point.
    Then each TextLine of each TextRegion found under root gets the last label
    of each of those three label lists.
    """
    dNS = {"a": MultiPageXml.NS_PAGE_XML}

    for cell in lCells:
        lTextLine = MultiPageXml.getChildByName(cell, 'TextLine')
        nTextLine = len(lTextLine)

        # HEADER WISE: D CH O
        if int(cell.get('row')) < maxRowSpan:
            sHeader = lLabels_HEADER[1]
        else:
            sHeader = lLabels_HEADER[0]
        for ndText in lTextLine:
            ndText.set(sDUHeader, sHeader)

        # ROW WISE: B I E S O
        if nTextLine == 1:
            lTextLine[0].set(sDURow, lLabelsBIESO_R[3])
        elif nTextLine > 1:
            lTextLine[0].set(sDURow, lLabelsBIESO_R[0])
            for ndText in lTextLine[1:-1]:
                ndText.set(sDURow, lLabelsBIESO_R[1])
            lTextLine[-1].set(sDURow, lLabelsBIESO_R[2])

        # COLUMN WISE: M S O
        # only the right side of the bounding box of the cell matters
        ndCellCoords = cell.xpath("./a:%s" % ("Coords"), namespaces=dNS)[0]
        plgnCell = Polygon.parsePoints(ndCellCoords.get('points'))
        (_cx1, _cy1, cx2, _cy2) = plgnCell.getBoundingBox()

        for ndText in lTextLine:
            ndTextCoords = ndText.xpath("./a:%s" % ("Coords"),
                                        namespaces=dNS)[0]

            # parse the points by hand, to skip the invalid ones
            lPoint = list()
            for sXY in ndTextCoords.get('points').split(' '):
                try:
                    (sx, sy) = sXY.split(',')
                    lPoint.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s  IGNORED"%ndText.get("id"))

            if not lPoint:
                ndText.set(sDUCol, lLabelsSM_C[-1])
                continue

            (x1, _y1, x2, _y2) = Polygon(lPoint).getBoundingBox()
            # the part of the text beyond the right side of the cell is
            # compared to the part inside the cell
            bSpanning = x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1)
            if bSpanning:
                ndText.set(sDUCol, lLabelsSM_C[0])
            else:
                ndText.set(sDUCol, lLabelsSM_C[1])

    # textline outside table
    for ndRegion in MultiPageXml.getChildByName(root, 'TextRegion'):
        for ndText in MultiPageXml.getChildByName(ndRegion, 'TextLine'):
            ndText.set(sDURow, lLabelsBIESO_R[-1])
            ndText.set(sDUCol, lLabelsSM_C[-1])
            ndText.set(sDUHeader, lLabels_HEADER[-1])

    return
Example #9
0
def addSeparator(root, lCells):
    """
    Add the separators that correspond to the cell boundaries
    modify the XML DOM

    The top (resp. bottom) segments of each cell contribute to the row
    separator of index row (resp. row+rowSpan); the left (resp. right)
    segments to the column separator of index col (resp. col+colSpan).
    A line is fitted on the segments of each separator by a weighted linear
    regression, and is added as a SeparatorRegion to the first TableRegion
    found under root, with an "orient" attribute giving its orientation and
    the two coefficients of the fitted line.
    A comment giving statistics on the slopes (in %) is added to the
    TableRegion, and printed, for each orientation having separators.

    A cell whose polygon cannot be partitioned (ValueError or
    ZeroDivisionError) is ignored.
    """
    # let's collect the segment forming the separators
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)
    for cell in lCells:
        row, col, rowSpan, colSpan = [int(cell.get(sProp)) for sProp \
                                      in ["row", "col", "rowSpan", "colSpan"] ]
        coord = cell.xpath("./a:%s" % ("Coords"),
                           namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except (ValueError, ZeroDivisionError):
            # ZeroDivisionError: what getCellsSeparators catches for a cell
            # with an empty area
            continue
        #now the top segments contribute to row separator of index: row
        dRowSep_lSgmt[row].extend(lT)
        #now the bottom segments contribute to row separator of index: row+rowSpan
        dRowSep_lSgmt[row + rowSpan].extend(lB)

        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col + colSpan].extend(lR)

    ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

    def addFittedSeparators(dSep_lSgmt, bHorizontal):
        """
        fit one line per separator, add it to the table
        return the list of slopes, in %
        """
        lSlope = []
        for lSegment in dSep_lSgmt.values():
            if not lSegment:
                continue  # nothing to fit

            # u is the coordinate along the separator, v the other one
            lU, lV, W = [], [], []
            for x1, y1, x2, y2 in lSegment:
                if bHorizontal:
                    lU.extend((x1, x2))
                    lV.extend((y1, y2))
                else:
                    lU.extend((y1, y2))
                    lV.extend((x1, x2))
                # Same weighting as getCellsSeparators: square root of the
                # segment length (polyfit applies the weight to the
                # residual, before squaring it).
                # It used to be the norm of the coordinates themselves, which
                # depends on where the segment is on the page.
                fW = np.sqrt(np.linalg.norm((x2 - x1, y2 - y1)))
                # one weight for each of the two points of the segment
                W.extend((fW, fW))

            if not any(W):
                continue  # only zero-length segments: no line to fit

            # v = a + b * u
            a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=W)
            lSlope.append(b * 100)

            umin, umax = min(lU), max(lU)
            v1 = a + b * umin
            v2 = a + b * umax
            if bHorizontal:
                sOrient, lXY = "horizontal", [(umin, v1), (umax, v2)]
            else:
                sOrient, lXY = "vertical", [(v1, umin), (v2, umax)]

            ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
            ndSep.set("orient", "%s %.1f %.3f" % (sOrient, a, b))
            ndTR.append(ndSep)
            ndCoord = MultiPageXml.createPageXmlNode("Coords")
            MultiPageXml.setPoints(ndCoord, lXY)
            ndSep.append(ndCoord)
        return lSlope

    def addStat(sName, lSlope):
        """
        add to the table, and print, the statistics on the slopes
        """
        if not lSlope:
            return  # no separator: min and max are undefined
        sStat = "\t%s: Average=%.1f%%  stdev=%.2f%%  min=%.1f%% max=%.1f%%" % (
            sName, np.average(lSlope), np.std(lSlope), min(lSlope),
            max(lSlope))
        ndTR.append(etree.Comment(sStat))
        print(sStat)

    addStat("HORIZONTAL", addFittedSeparators(dRowSep_lSgmt, True))
    addStat("VERTICAL  ", addFittedSeparators(dColSep_lSgmt, False))

    return
def getCellsSeparators(lCell):
    """
    Fit one line per row separator and per column separator of the cells.

    return two dictionaries
    row -> ((x1, y1), (x2, y2))    NOTE: top of row
    col -> ((x1, y1), (x2, y2))    NOTE: left of column

    A cell whose polygon has an empty area is reported and ignored.
    """
    # let's collect the segments forming the cell borders, by row, by col
    dRow_lSegment = collections.defaultdict(list)
    dCol_lSegment = collections.defaultdict(list)
    for cell in lCell:
        row, col, rowSpan, colSpan = (int(cell.get(sProp)) for sProp
                                      in ("row", "col", "rowSpan", "colSpan"))
        ndCoords = cell.xpath("./a:%s" % ("Coords"),
                              namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(ndCoords.get('points'))
        try:
            lTop, lRight, lBottom, lLeft = \
                plgn.partitionSegmentTopRightBottomLeft()
        except ZeroDivisionError:
            traceln("ERROR: cell %s row=%d col=%d has empty area and is IGNORED"
                    % (cell.get("id"), row, col))
            continue
        # the top of a cell belongs to the separator of index row, and its
        # bottom to the separator of index row+rowSpan
        dRow_lSegment[row].extend(lTop)
        dRow_lSegment[row + rowSpan].extend(lBottom)
        # same for the left and right sides, with col and col+colSpan
        dCol_lSegment[col].extend(lLeft)
        dCol_lSegment[col + colSpan].extend(lRight)

    def fitSeparator(lSegment, bRow):
        """
        weighted linear regression on the points of the segments
        return the two extreme points of the fitted line
        """
        lX = [x for x1, _y1, x2, _y2 in lSegment for x in (x1, x2)]
        lY = [y for _x1, y1, _x2, y2 in lSegment for y in (y1, y2)]

        # square root of the length of each segment
        lfRootLen = [math.sqrt(np.linalg.norm((x2 - x1, y2 - y1)))
                     for x1, y1, x2, y2 in lSegment]
        # normalised weights, each segment counting for its two points
        fTotal = sum(lfRootLen) * 2
        W = [fRootLen / fTotal for fRootLen in lfRootLen for _ in (0, 1)]

        # the line is a + b * u, where u is x for a row and y for a column
        lU, lV = (lX, lY) if bRow else (lY, lX)
        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=W)

        umin, umax = min(lU), max(lU)
        v1 = a + b * umin
        v2 = a + b * umax
        if bRow:
            return ((umin, v1), (umax, v2))
        return ((v1, umin), (v2, umax))

    dRowSep = {row: fitSeparator(lSegment, True)
               for row, lSegment in dRow_lSegment.items()}
    dColSep = {col: fitSeparator(lSegment, False)
               for col, lSegment in dCol_lSegment.items()}

    return dRowSep, dColSep