def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLine nodes of each page based on the vertical cuts.

    root is the DOM root node.
    llX gives, per page, the list of abscissa of the vertical cuts. It is
    zipped with the 'Page' nodes found under root, so extra pages or extra
    lists are silently ignored.

    Side effects:
    - each list of llX is modified in place: the page width is appended to it
      and it is sorted
    - each clustered TextLine gets a "DU_cluster" attribute holding the index
      of the first cut located at, or on the right of, its horizontal centre
    - each cluster node returned by addClusterToDom gets a "cut_X" attribute
    """

    # hack to use addClusterToDom, which reads the DOM node from a .node attribute
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        # the right border of the page acts as the last cut
        lX.append(w)
        lX.sort()
        imax = len(lX)
        # one (possibly empty) cluster per cut
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline,
                                                  'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                # this TextLine is left unclustered
                continue
            xm = (x1 + x2) / 2.0
            # first cut at or on the right of the centre, imax if there is none
            i = next((j for j, xj in enumerate(lX) if xm <= xj), imax)
            # the overflow cluster (imax) is created on demand only: indexing
            # dCluster[imax] directly used to raise a KeyError
            dCluster.setdefault(i, list()).append(iNd)
            ndTextline.set("DU_cluster", str(i))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster,
                                       bMoveContent=False,
                                       sAlgo="cut",
                                       pageNode=ndPage)

        # add a cut_X attribute to the clusters
        # NOTE(review): assumes the cluster 'name' is its key in dCluster - confirm
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                ndCluster.set("cut_X", str(lX[i]))
            # else: overflow cluster, there is no cut on its right
def main(lsFilename, lsOutFilename):
    """
    Annotate each input file and write the result in the paired output file.

    lsFilename and lsOutFilename are zipped: the i-th input is written to the
    i-th output.

    For each file:
    - a file without any TableCell is skipped (nothing is written)
    - every TextLine of every cell first receives the last label of
      lLabelsBIESO_R, lLabelsSM_C and lLabels_HEADER as default
    - tag_DU_row_col_header is then called on the cells
    - the separators are removed then re-created (removeSeparator,
      addSeparator) and the document is written
    - on TableAnnotationException a message is traced and nothing is written
    """
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        lCells = MultiPageXml.getChildByName(root, 'TableCell')
        if not lCells:
            traceln("ERROR: no TableCell - SKIPPING THIS FILE!!!")
            continue

        # default: O for all cells: all cells must have all tags!
        for cell in lCells:
            for ndText in MultiPageXml.getChildByName(cell, 'TextLine'):
                ndText.set(sDURow, lLabelsBIESO_R[-1])
                ndText.set(sDUCol, lLabelsSM_C[-1])
                ndText.set(sDUHeader, lLabels_HEADER[-1])

        # Oct' 2018 RV and JL decided that we keep the binding TextLine (if any!)
        # so the "binding" cells are not filtered out: all cells are annotated

        # FOR COLUMN HEADER: get max(cell[0,i].span)
        maxRowSpan = computeMaxRowSpan(lCells)

        tag_DU_row_col_header(root, lCells, maxRowSpan)

        try:
            removeSeparator(root)
            addSeparator(root, lCells)
            doc.write(sOutFilename,
                      encoding='utf-8',
                      pretty_print=True,
                      xml_declaration=True)
            traceln('annotation done for %s  --> %s' % (sFilename, sOutFilename))
        except TableAnnotationException:
            traceln("No Table region in file ", sFilename, "  IGNORED!!")

        del doc
 def remove_cuts_from_dom(self, root):
     """
     Detach every CutSeparator node found under root from its parent.
     return the number of removed cut lines
     """
     lndCut = MultiPageXml.getChildByName(root, 'CutSeparator')
     nRemoved = len(lndCut)
     for ndCut in lndCut:
         ndParent = ndCut.getparent()
         ndParent.remove(ndCut)
     # sanity check: no cut must be left in the DOM
     assert len(MultiPageXml.getChildByName(root, 'CutSeparator')) == 0
     return nRemoved
Example #4
0
    def add_grid_to_DOM(self, root, ltlHlV=None):
        """
        Add the grid lines to the DOM, as GridSeparator nodes
        Tag them (attribute "type") if ltlHlV is given: it is indexed by page
        number and gives a pair (horizontal indices, vertical indices).
        A missing ltlHlV, or a page beyond its length, means: no tagging.
        Modify the XML DOM
        return the number of grid lines created

        Raise IndexError if there is at least one Page but no TableRegion.
        """

        def addPageXmlSeparator(nd, i, lGTi, x1, y1, x2, y2, domid):
            """create one GridSeparator node, with its Coords, under nd"""
            ndSep = MultiPageXml.createPageXmlNode("GridSeparator")
            if lGTi:
                # propagate the groundtruth info we have
                sLabel = self.getLabel(i, lGTi)
                ndSep.set("type", sLabel)
            # the longest extent tells the orientation
            if abs(x2 - x1) > abs(y2 - y1):
                ndSep.set("orient", "0")
            else:
                ndSep.set("orient", "90")
            ndSep.set("id", "s_%d" % domid)
            nd.append(ndSep)
            ndCoord = MultiPageXml.createPageXmlNode("Coords")
            MultiPageXml.setPoints(ndCoord, [(x1, y1), (x2, y2)])
            ndSep.append(ndCoord)
            return ndSep

        # the default None used to raise a TypeError when indexed by page
        ltlGT = [] if ltlHlV is None else ltlHlV

        domid = 0  #to add unique separator id and count them

        for iPage, ndPage in enumerate(
                MultiPageXml.getChildByName(root, 'Page')):
            try:
                lHi, lVi = ltlGT[iPage]
            except IndexError:
                lHi, lVi = [], []

            w, h = int(ndPage.get("imageWidth")), int(
                ndPage.get("imageHeight"))

            # NOTE(review): the TableRegion is searched from root, not from
            # ndPage, so the grid of every page goes under the first
            # TableRegion of the document - confirm this is intended
            ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

            #Vertical grid lines
            for i, (x1, y1, x2,
                    y2) in enumerate(self.iterGridVerticalLines(w, h)):
                domid += 1
                addPageXmlSeparator(ndTR, i, lVi, x1, y1, x2, y2, domid)

            #horizontal grid lines
            for i, (x1, y1, x2,
                    y2) in enumerate(self.iterGridHorizontalLines(w, h)):
                domid += 1
                addPageXmlSeparator(ndTR, i, lHi, x1, y1, x2, y2, domid)

        return domid
def op_gt_recall(lsFilename,
                 bCutAbove,
                 lDegAngle,
                 fMinHorizProjection=0.05,
                 fCutHeight=25):
    """
    Load each GT file, compute the horizontal cut candidates of each page and
    trace the number of cuts per label, all files together.

    lDegAngle is a list of angles in degrees, passed in radians to
    SkewedCutAnnotator. fMinHorizProjection is not used here.
    """
    cntByLabel = Counter()
    for sFilename in lsFilename:
        traceln("- loading GT: %s" % sFilename)

        # remove_blank_text: for the pretty printer to format better...
        xmlParser = etree.XMLParser(remove_blank_text=True)
        tree = etree.parse(sFilename, xmlParser)
        ndRoot = tree.getroot()

        lRadAngle = [math.radians(fDeg) for fDeg in lDegAngle]
        annotator = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

        lndPage = MultiPageXml.getChildByName(ndRoot, 'Page')
        for iPageNum, ndPage in enumerate(lndPage, 1):
            traceln(" --- page %s - constructing separator candidates" %
                    iPageNum)

            #load the page objects and the GT partition (defined by the table) if any
            loBaseline, dsetTableByRow = annotator.loadPage(ndPage)
            traceln(" - found %d objects on page" % (len(loBaseline)))

            # find almost-horizontal cuts and tag them if GT is available
            loHCut = annotator.findHCut(ndPage, loBaseline, dsetTableByRow,
                                        fCutHeight)
            cntByLabel.update(Counter(oCut._du_label for oCut in loHCut))

    sCounts = "  ".join("%s:%d" % (sLabel, cntByLabel[sLabel])
                        for sLabel in sorted(cntByLabel.keys()))
    traceln("GT: ALL CUT Label count:  ", sCounts)
Example #6
0
    def remove_grid_from_dom(self, root):
        """
        clean the DOM from any existing grid (useful to choose at run-time the 
        grid increment (called step)
        return the number of removed grid lines

        The GridSeparator nodes are searched from root, so a single pass
        covers all pages. (Looping over the pages and searching from root each
        time used to return the count of the last page pass, i.e. 0 for a
        multi-page document, and to fail when the document had no Page.)
        """
        lnd = MultiPageXml.getChildByName(root, 'GridSeparator')
        n = len(lnd)
        for nd in lnd:
            nd.getparent().remove(nd)
        #check...
        lnd = MultiPageXml.getChildByName(root, 'GridSeparator')
        assert len(lnd) == 0
        return n
    def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
        """
        Collect, per page, the GT table separators (SeparatorRegion nodes
        under the TableRegion nodes) that are long enough.

        A separator wider than high is horizontal: its (y1, y2) is kept when
        its width exceeds fMinPageCoverage * page width. Otherwise it is
        vertical: its (x1, x2) is kept when its height exceeds
        fMinPageCoverage * page height.

        return [(y_list, x_list), ...]  with one pair per page
        A warning is traced for any page not having exactly 1 TableRegion.
        """
        ltlYlX = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lYi, lXi = [], []

            lndTR = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            nTR = len(lndTR)
            if nTR != 1:
                if not lndTR:
                    traceln("** warning ** no TableRegion, expected 1")
                else:
                    traceln(
                        "** warning ** %d TableRegion instead of expected 1" %
                        nTR)

            for ndTR in lndTR:
                #enumerate the table separators
                lndSep = MultiPageXml.getChildByName(ndTR, 'SeparatorRegion')
                for ndSep in lndSep:
                    ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                    oPolygon = Polygon.parsePoints(ndCoords.get('points'))
                    # exactly 2 points are expected
                    (x1, y1), (x2, y2) = oPolygon.lXY

                    dx = abs(x2 - x1)
                    dy = abs(y2 - y1)
                    if dx > dy:
                        #horizontal table line
                        if dx > (fMinPageCoverage * w):
                            lYi.append((y1, y2))
                    elif dy > (fMinPageCoverage * h):
                        #vertical table line
                        lXi.append((x1, x2))
            ltlYlX.append((lYi, lXi))

        return ltlYlX
def tagSeparatorRegion(lPages):
    """
        tag separatorRegion

        For each page, the attribute sDUSep of each SeparatorRegion is set to:
        - lLabels_OI[1] if the separator intersects a TableRegion of the page
        - lLabels_OI[0] otherwise
        Then ("fix bindings"), a separator higher than wide which intersects
        a TableCell with rowSpan > 6 (the cell being scaled by 0.5
        horizontally) is set back to lLabels_OI[0].

        The process exits (sys.exit(0)) at the first page without TableRegion.
    """
    for page in lPages:
        lSeparators = MultiPageXml.getChildByName(page, 'SeparatorRegion')
        lTables = MultiPageXml.getChildByName(page, 'TableRegion')
        if lTables == []:
            print("no table for %s" % sys.argv[1])
            sys.exit(0)
        # default O
        for ndSep in lSeparators:
            ndSep.set(sDUSep, lLabels_OI[0])

        # shapes are built once per page (they used to be rebuilt, and the
        # whole tagging redone, once per table)
        lPolygonSep = [
            LineString(MultiPageXml.getPointList(x)) for x in lSeparators
        ]

        for ndTable in lTables:
            table_prep = prep(ShapePo(MultiPageXml.getPointList(ndTable)))
            for ndSep, oSep in zip(lSeparators, lPolygonSep):
                if table_prep.intersects(oSep):
                    ndSep.set(sDUSep, lLabels_OI[1])

        ## fix bindings
        for ndTable in lTables:
            lCells = [
                x for x in MultiPageXml.getChildByName(ndTable, 'TableCell')
                if int(x.get('rowSpan')) > 6
            ]
            for ndCell in lCells:
                oCell = ShapePo(MultiPageXml.getPointList(ndCell))
                cell_prep = prep(scale(oCell, xfact=0.5))
                for ndSep, oSep in zip(lSeparators, lPolygonSep):
                    if cell_prep.intersects(oSep):
                        (minx, miny, maxx, maxy) = oSep.bounds
                        # only the separators higher than wide are reset
                        if (maxy - miny) > (maxx - minx):
                            ndSep.set(sDUSep, lLabels_OI[0])
def isBaselineHorizontal(ndText):
    """
    Tell if the first Baseline found under ndText is at least as wide as high.

    return True as well when there is no Baseline, or when it cannot be
    loaded as a LineString (horizontal is the default assumption).
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText, 'Baseline')
    if not lNdBaseline:
        return True
    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # not a bare except: KeyboardInterrupt and SystemExit must propagate
        return True
    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx - minx >= maxy - miny)
Example #10
0
    def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
        """
        Snap the GT table separators of each page onto our grid.

        A separator wider than high is horizontal: the middle of its y is
        snapped with self.iGridVertiStep, provided its width exceeds
        fMinPageCoverage * page width. Otherwise it is vertical: the middle
        of its x is snapped with self.iGridHorizStep, provided its height
        exceeds fMinPageCoverage * page height.

        return [(h_list, v_list), ...]  lists of grid index, one pair per page
        At most 1 TableRegion per page is accepted (assertion).
        """
        ltlHlV = []
        for ndPage in MultiPageXml.getChildByName(root, 'Page'):
            w = int(ndPage.get("imageWidth"))
            h = int(ndPage.get("imageHeight"))

            lHi, lVi = [], []

            lndSep = []
            lndTR = MultiPageXml.getChildByName(ndPage, 'TableRegion')
            if lndTR:
                assert len(lndTR) == 1, "More than 1 TableRegion??"
                lndSep = MultiPageXml.getChildByName(lndTR[0],
                                                     'SeparatorRegion')

            #enumerate the table separators
            for ndSep in lndSep:
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                oPolygon = Polygon.parsePoints(ndCoords.get('points'))
                # exactly 2 points are expected
                (x1, y1), (x2, y2) = oPolygon.lXY

                dx = abs(x2 - x1)
                dy = abs(y2 - y1)
                if dx > dy:
                    #horizontal table line
                    if dx > (fMinPageCoverage * w):
                        ym = (y1 + y2) / 2.0  # 2.0 to support python2
                        lHi.append(
                            self.snapToGridIndex(ym, self.iGridVertiStep))
                elif dy > (fMinPageCoverage * h):
                    #vertical table line
                    xm = (x1 + x2) / 2.0
                    lVi.append(self.snapToGridIndex(xm, self.iGridHorizStep))
            ltlHlV.append((lHi, lVi))

        return ltlHlV
 def getDomBaselineXY(cls, domNode):
     """
     Take the first Baseline node found under domNode and return the (x, y)
     computed on it by cls.getPolylineAverageXY
     Trace a warning and re-raise the IndexError if there is no Baseline
     """
     try:
         ndBaseline = MultiPageXml.getChildByName(domNode, 'Baseline')[0]
     except IndexError:
         traceln("WARNING:  No Baseline child in ", domNode.get('id'))
         raise
     # modulo should be done only after the GT assigns labels.
     fAvgX, fAvgY = cls.getPolylineAverageXY(ndBaseline)
     return (fAvgX, fAvgY)
def _atanDegrees(fNum, fDen):
    """
    return atan(fNum / fDen) in degrees, without failing when fDen is 0
    (in which case it is 90, -90 or 0 depending on the sign of fNum)
    """
    if fDen == 0:
        return math.degrees(math.atan2(fNum, fDen))
    return math.degrees(math.atan(fNum / fDen))


def addSeparator(root, lCells):
    """
    Add separator that correspond to cell boundaries
    modify the XML DOM

    The row and column separators computed by getCellsSeparators are added
    as SeparatorRegion nodes under the first TableRegion found under root,
    each with an "orient" attribute giving its angle in degrees, and a "row"
    or "col" attribute.
    Angle statistics are appended as an XML comment and traced, separately
    for rows and columns, when there is at least one such separator.

    Raise TableAnnotationException if there is no TableRegion.
    """
    dRow, dCol = getCellsSeparators(lCells)

    try:
        ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]
    except IndexError:
        raise TableAnnotationException("No TableRegion!!! ")

    def addSeparatorNode(sOrient, sAttr, iIndex, lXY):
        """append one SeparatorRegion, with its Coords, to the TableRegion"""
        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("orient", sOrient)
        ndSep.set(sAttr, "%d" % iIndex)
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, lXY)
        ndSep.append(ndCoord)

    def addStat(sFormat, lB):
        """record the angle statistics, if there is any angle"""
        if not lB:
            # nothing to report (min/max of an empty list would fail)
            return
        sStat = sFormat % (np.average(lB), np.std(lB), min(lB), max(lB))
        ndTR.append(etree.Comment(sStat))
        traceln(sStat)

    lB = []
    for row in sorted(dRow.keys()):
        (x1, y1), (x2, y2) = dRow[row]
        # angle with the horizontal axis
        b = _atanDegrees(y2 - y1, x2 - x1)
        lB.append(b)
        addSeparatorNode("horizontal angle=%.2f" % b, "row", row,
                         [(x1, y1), (x2, y2)])
    # the statistics are computed once, after the loop, over all rows
    addStat("\tHORIZONTAL: Average=%.1f°  stdev=%.2f°  min=%.1f° max=%.1f°",
            lB)

    lB = []
    for col in sorted(dCol.keys()):
        (x1, y1), (x2, y2) = dCol[col]
        # 90 for a perfectly vertical separator
        b = 90 - _atanDegrees(x2 - x1, y2 - y1)
        lB.append(b)
        addSeparatorNode("vertical %.2f" % b, "col", col,
                         [(x1, y1), (x2, y2)])
    addStat("\tVERTICAL  : Average=%.1f°  stdev=%.2f°  min=%.1f° max=%.1f°",
            lB)

    return
Example #13
0
 def _shapeFromNodePoints(cls, nd, ShapeClass):
     """
     Build a shape of the given class from the points of a node.
     The points are read from the 'points' attribute of the node itself or,
     if it has none, from the one of its first Coords node
      e.g. <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
             <Coords points="3324,392 3638,394"/>
           </SeparatorRegion>
     returns what cls._shapeFromPoints returns for those points
     """
     sPoints = nd.get('points')
     if sPoints is not None:
         return cls._shapeFromPoints(sPoints, ShapeClass)
     # no points on the node itself: look at its Coords
     ndCoords = MultiPageXml.getChildByName(nd, 'Coords')[0]
     return cls._shapeFromPoints(ndCoords.get('points'), ShapeClass)
def getDocSeparators(sFilename):
    """
    Parse the file and compute the separators of its TableCell nodes.
    return the two values computed by getCellsSeparators (row separators,
    column separators)
    Raise DocSeparatorException if the file contains no TableCell.
    """
    tree = etree.parse(sFilename, etree.XMLParser())
    lndCell = MultiPageXml.getChildByName(tree.getroot(), 'TableCell')
    if lndCell:
        dRowSep, dColSep = getCellsSeparators(lndCell)
        del tree
        return dRowSep, dColSep
    raise DocSeparatorException("No TableCell element in %s" %sFilename)
Example #15
0
def defineTableBordersFromCells(table):
    """
        Table points are too loose: redefine them with cells regions

        The borders are taken from the cells of the first and last column,
        and of the first and last row, using getTBLRBorders.
        return lColSep_1, lColSep_N, lRowSep_1, lRowSep_N
        Raise ValueError (from max) if the table contains no TableCell.
    """
    lCells = MultiPageXml.getChildByName(table, 'TableCell')
    sMaxRow = str(max(int(x.get('row')) for x in lCells))
    sMaxCol = str(max(int(x.get('col')) for x in lCells))

    def cellsWhere(sAttr, sValue):
        # a real list (filter gives a one-shot iterator with Python 3)
        return [x for x in lCells if x.get(sAttr) == sValue]

    # index in the result of getTBLRBorders: T R B L
    lColSep_1 = getTBLRBorders(cellsWhere('col', "0"))[3]
    lColSep_N = getTBLRBorders(cellsWhere('col', sMaxCol))[1]
    lRowSep_1 = getTBLRBorders(cellsWhere('row', "0"))[0]
    lRowSep_N = getTBLRBorders(cellsWhere('row', sMaxRow))[2]

    return lColSep_1, lColSep_N, lRowSep_1, lRowSep_N
def op_cut(sFilename,
           sOutFilename,
           lDegAngle,
           bCutAbove,
           fMinHorizProjection=0.05,
           fCutHeight=25):
    """
    Compute the horizontal cut candidates of each page of sFilename, replace
    any pre-existing cut of the page by them, and write the document to
    sOutFilename.

    lDegAngle is a list of angles in degrees, passed in radians to
    SkewedCutAnnotator. fMinHorizProjection is not used here.
    """
    # remove_blank_text: for the pretty printer to format better...
    xmlParser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse(sFilename, xmlParser)
    ndRoot = tree.getroot()

    lRadAngle = [math.radians(fDeg) for fDeg in lDegAngle]
    annotator = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

    iDomId = 0
    lndPage = MultiPageXml.getChildByName(ndRoot, 'Page')
    for iPageNum, ndPage in enumerate(lndPage, 1):
        traceln(" --- page %s - constructing separator candidates" % iPageNum)

        #load the page objects and the GT partition (defined by the table) if any
        loBaseline, dsetTableByRow = annotator.loadPage(ndPage)
        traceln(" - found %d objects on page" % (len(loBaseline)))

        # find almost-horizontal cuts and tag them if GT is available
        loHCut = annotator.findHCut(ndPage, loBaseline, dsetTableByRow,
                                    fCutHeight)

        #create DOM node reflecting the cuts
        #first clean (just in case!)
        nRemoved = annotator.remove_cuts_from_dom(ndPage)
        if nRemoved > 0:
            traceln(" - removed %d pre-existing cut lines" % nRemoved)

        # if GT, then we have labelled cut lines in DOM
        iDomId = annotator.add_Hcut_to_Page(ndPage, loHCut, iDomId)

    tree.write(sOutFilename,
               encoding='utf-8',
               pretty_print=True,
               xml_declaration=True)
    print('Annotated cut separators added to %s' % sOutFilename)
    def getHisto(self,
                 lNd,
                 w,
                 _fMinHorizProjection,
                 h,
                 _fMinVertiProjection,
                 fRatio=1.0,
                 fMinHLen=None):
        """
        
        return two Numpy array reflecting the histogram of projections of objects
        first array along Y axis (horizontal projection), 2nd along X axis 
        (vertical projection)
        The first array has shape (h,), the second has shape (w,), both are
        arrays of float.
        
        Each node contributes its width / w to the Y histogram, and its
        height / h to the X histogram, over the extent of its bounding
        rectangle scaled by fRatio (see self.scale).
        when fMinHLen is given , we do not scale horizontally text shorter than fMinHLen
        
        A node whose processing raises ZeroDivisionError or ValueError is
        ignored. _fMinHorizProjection and _fMinVertiProjection are not used.
        """

        # plain float: the np.float alias does not exist in recent NumPy
        hy = np.zeros((h, ), float)
        hx = np.zeros((w, ), float)

        for nd in lNd:
            sPoints = MultiPageXml.getChildByName(nd,
                                                  'Coords')[0].get('points')
            try:
                x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

                if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                    _x1, _x2 = self.scale(x1, x2, fRatio)
                else:
                    # short text: horizontal extent kept as is
                    _x1, _x2 = x1, x2
                _y1, _y2 = self.scale(y1, y2, fRatio)
                hy[_y1:_y2 + 1] += float(x2 - x1) / w
                hx[_x1:_x2 + 1] += float(y2 - y1) / h
            except (ZeroDivisionError, ValueError):
                pass

        return hy, hx
Example #18
0
    def _doShape_getChildByName(cls, node, name, ShapeClass, fun=None):
        """
        Get the nodes of given name under node, using
        MultiPageXml.getChildByName
            e.g. "SeparatorRegion"
        and build a shape of the given Shapely class from the coordinates of
        each of them
            e.g.  <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
                <Coords points="3324,392 3638,394"/>
              </SeparatorRegion>
        A node that cannot be loaded is reported on stdout and skipped.
        if fun is given, it is called with arguments: shape, current_node
        return the list of shapes, in the order given by getChildByName
        """
        loShape = []
        for ndChild in MultiPageXml.getChildByName(node, name):
            try:
                oShape = cls._shapeFromNodePoints(ndChild, ShapeClass)
            except Exception as e:
                print('ERROR: cannot load this element "%s"' % str(ndChild))
                print('  because "%s"' % e)
            else:
                if fun is not None:
                    fun(oShape, ndChild)
                loShape.append(oShape)

        return loShape
def main(lsFilename, lsOutFilename):
    """
    Tag each SeparatorRegion of each file as consistent ("S") or inconsistent
    ("I") with the table, in attribute "DU_Sep", and write the result in the
    paired output file.

    Each TextLine of each TableCell is reduced to a point carrying the
    rows/columns spanned by its cell. A separator wider than high is
    inconsistent if some text above it belongs to a row at or after the row
    of some text below it. Similarly with columns for the other separators.

    Side effect: each TextLine gets a "Horizontal" attribute ("TRUE" or
    "nope") reflecting isBaselineHorizontal.
    A file without separator is written untagged, after a warning.
    """
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)
    cnt, cntS = 0, 0
    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        cntDoc, cntDocS = 0, 0

        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        # Separators are not under tableRegion... :-/
        lNdSep = MultiPageXml.getChildByName(root, 'SeparatorRegion')
        loSep = [ShapeLoader.node_to_LineString(ndSep) for ndSep in lNdSep]
        # consistent until proven otherwise
        for _o in loSep:
            _o._bConsistent = True

        if not lNdSep:
            traceln("Warning: no separator in %s"%sFilename)
        else:
            traceln("%25s  %d separators" % (sFilename, len(lNdSep)))
            lNdTR = MultiPageXml.getChildByName(root, 'TableRegion')
            for ndTR in lNdTR:
                lNdCells = MultiPageXml.getChildByName(ndTR, 'TableCell')
                if not lNdCells:
                    continue

                # build a list of Shapely objects augmented with our own table attributes
                loText = []
                for ndCell in lNdCells:
                    minRow = int(ndCell.get('row'))
                    minCol = int(ndCell.get('col'))
                    maxRow = minRow + int(ndCell.get('rowSpan')) - 1
                    maxCol = minCol + int(ndCell.get('colSpan')) - 1
                    for ndText in MultiPageXml.getChildByName(ndCell, 'TextLine'):
                        try:
                            oText = ShapeLoader.node_to_Polygon(ndText)
                        except Exception:
                            # not a bare except: KeyboardInterrupt and
                            # SystemExit must propagate
                            traceln("WARNING: SKIPPING 1 TExtLine: cannot make a polygon from: %s" % etree.tostring(ndText))
                            continue
                        # reflecting the textbox as a single point
                        (minx, miny, maxx, maxy) = oText.bounds

                        # is the baseline horizontal or vertical??
                        fDelta = min((maxx-minx) / 2.0, (maxy-miny) / 2.0)
                        if isBaselineHorizontal(ndText):
                            # supposed Horizontal text
                            oText = geom.Point(minx + fDelta, (maxy + miny)/2.0)
                            ndText.set("Horizontal", "TRUE")
                        else:
                            ndText.set("Horizontal", "nope")
                            oText = geom.Point((minx + maxx)/2.0, miny + fDelta)

                        # considering it as a point, using its centroid
                        # does not work well due to loooong texts oText = oText.centroid
                        oText._minRow, oText._minCol = minRow, minCol
                        oText._maxRow, oText._maxCol = maxRow, maxCol
                        if DEBUG:
                            oText._domnd = ndText
                        loText.append(oText)

                traceln("    TableRegion  %d texts" % (len(loText)))

                if loText:
                    # checking in turn each separator for table-consistency
                    sp = ShapePartition(loText)

                    for oSep in loSep:
                        (minx, miny, maxx, maxy) = oSep.bounds
                        if maxx - minx >= maxy - miny:
                            # supposed Horizontal
                            lAbove = sp.getObjectAboveLine(oSep)
                            if lAbove:
                                maxRowBefore = max(_o._maxRow for _o in lAbove)
                                lBelow = sp.getObjectBelowLine(oSep)
                                if lBelow:
                                    minRowAfter = min(_o._minRow for _o in lBelow)
                                    if maxRowBefore >= minRowAfter:
                                        oSep._bConsistent = False
                        else:
                            # supposed Vertical
                            l1 = sp.getObjectOnLeftOfLine(oSep)
                            if l1:
                                maxColBefore = max(_o._maxCol for _o in l1)
                                l2 = sp.getObjectOnRightOfLine(oSep)
                                if l2:
                                    minColAfter = min(_o._minCol for _o in l2)
                                    if maxColBefore >= minColAfter:
                                        oSep._bConsistent = False
                                        if DEBUG:
                                            for o in l1:
                                                if o._maxCol >= minColAfter: print("too much on right", etree.tostring(o._domnd))
                                            for o in l2:
                                                if o._minCol <= maxColBefore: print("too much on left", etree.tostring(o._domnd))
                # end of TableRegion
            # end of document
            for ndSep, oSep in zip(lNdSep, loSep):
                if oSep._bConsistent:
                    ndSep.set("DU_Sep", "S")
                    cntDocS += 1
                else:
                    ndSep.set("DU_Sep", "I")
                cntDoc += 1

        doc.write(sOutFilename, encoding='utf-8',pretty_print=True,xml_declaration=True)
        # +0.000001 to avoid dividing by 0 when there is no separator
        traceln('%.2f%% consistent separators - annotation done for %s  --> %s' % (100*float(cntDocS)/(cntDoc+0.000001), sFilename, sOutFilename))

        del doc
        cnt, cntS = cnt+cntDoc, cntS+cntDocS
    traceln('%.2f%% consistent separators - annotation done for %d files' % (100*float(cntS)/(cnt+0.000001), cnt))
    def check(self, sFilename, scale, bVerbose=False):
        """
        Check that the horizontal separators of the first page of the file do
        not cross any TextLine.

        For each SeparatorRegion, the "orient" attribute gives the GT class:
        0 if it starts with "horizontal", 2 otherwise. Separators whose
        "orient" starts with "vertical" are ignored. The predicted class is 2
        if the separator crosses a TextLine, the GT class otherwise.

        If scale is not None, each TextLine shape is first scaled vertically
        by this factor (shapely.affinity.scale).

        return a TestReport
        An accuracy score below 0.999 is reported on stdout.
        """
        lY, lYGT = [], []

        assert os.path.isfile(sFilename), sFilename
        doc = etree.parse(sFilename, self.parser)
        root = doc.getroot()

        # only the first page is considered
        ndPage = MultiPageXml.getChildByName(root, 'Page')[0]

        def storeNode(oShape, nd):
            # keep track of the DOM node on the shape
            oShape.duNode = nd

        loTxt = ShapeLoader.children_to_LinearRing(ndPage, 'TextLine',
                                                   storeNode)

        if scale is not None:
            scaled_loTxt = []
            for o in loTxt:
                scaled_o = shapely.affinity.scale(o, 1.0, scale)
                # the scaled shape is a new object
                scaled_o.duNode = o.duNode
                scaled_loTxt.append(scaled_o)
            loTxt = scaled_loTxt

        if bVerbose: print("%d TextLines" % len(loTxt))
        loSep = ShapeLoader.children_to_LineString(ndPage, 'SeparatorRegion',
                                                   storeNode)
        if bVerbose: print("%d SeparatorRegion" % len(loSep))

        # brute-force code: each separator against each text
        for oSep in loSep:
            ndSep = oSep.duNode
            if bVerbose:
                print("%35s %4s %4s %s" %
                      (oSep, ndSep.get("row"), ndSep.get("col"),
                       ndSep.get("orient")))
            sInfo = ndSep.get("orient")
            if sInfo.startswith("horizontal"):
                YGT = 0
            elif sInfo.startswith("vertical"):
                # vertical separators are not checked
                continue
            else:
                YGT = 2

            # the GT class is predicted, unless some text is crossed
            Y = YGT
            for oTxt in loTxt:
                if oSep.crosses(oTxt):
                    Y = 2
                    if bVerbose:
                        nd = oTxt.duNode
                        ndCell = nd.getparent()
                        print("\tCrossing %s row=%s col=%s" %
                              (nd.get("id"), ndCell.get("row"),
                               ndCell.get("col")))
            lY.append(Y)
            lYGT.append(YGT)

        oTstRpt = TestReport("SeparatorChecker", [lY], [lYGT],
                             self.lsClassName, [sFilename])
        fScore, sClassificationReport = oTstRpt.getClassificationReport()
        if fScore < 0.999: print("\t *** Accuracy score = %f" % fScore)
        return oTstRpt
def get_col_partition(doer,
                      sxpCut,
                      dNS,
                      sFilename,
                      lFilterFun,
                      fRatio,
                      bVerbose=False,
                      funIndex=lambda x: x._du_index):
    """
    Return the GT partition in columns, as well as 1 partition per filter
    function.

    doer       : object providing loadPageCol(ndPage, fRatio, funIndex=...)
    sxpCut     : XPath expression selecting the cut nodes below a Page node
    dNS        : namespace dictionary used to evaluate sxpCut
    sFilename  : XML file to load; it must contain exactly one Page
    lFilterFun : list of predicates called on the DOM node (o._du_nd) of each
                 loaded object; one run partition is computed per predicate,
                 using only the objects it accepts
    fRatio     : ratio forwarded to doer.loadPageCol
    bVerbose   : if True, trace detailed statistics instead of a 1-line summary
    funIndex   : function returning the identifier of a loaded object

    Return (lsetGT, lsetDataGT, llsetRun):
    - lsetGT     : sets of the 1st dictionary returned by doer.loadPageCol,
                   listed by increasing key
    - lsetDataGT : same, for the 2nd dictionary returned by doer.loadPageCol
    - llsetRun   : for each filter function, the list of sets of identifiers
                   induced by the cuts (sorted by the x of their centroid).
                   Each set contains what is on the right of a cut but no
                   longer on the right of the next cut.

    Side effect: the module-level global_maxHeaderRowSpan is set to the last
    value returned by doer.loadPageCol.
    """
    global global_maxHeaderRowSpan

    if bVerbose: traceln("- loading %s" % sFilename)
    parser = etree.XMLParser()
    doc = etree.parse(sFilename, parser)
    root = doc.getroot()

    llsetRun = []

    lndPage = MultiPageXml.getChildByName(root, 'Page')
    assert len(lndPage) == 1, "NOT SUPPORTED: file has many pages - soorry"
    for pnum, ndPage in enumerate(lndPage, start=1):
        if bVerbose: traceln("   - page %s - loading table GT" % pnum)

        (loBaseline, dsetTableByCol, dsetTableDataByCol,
         global_maxHeaderRowSpan) = doer.loadPageCol(ndPage,
                                                     fRatio,
                                                     funIndex=funIndex)

        if bVerbose:
            traceln("   - found %d objects on page" % (len(loBaseline)))

        # sorted keys of both GT dictionaries
        lTableColK = sorted(dsetTableByCol.keys())
        lTableDataColK = sorted(dsetTableDataByCol.keys())
        if bVerbose:
            traceln("   - found %d cols" % (len(lTableColK)))
            traceln("   - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByCol.values())))
            traceln("   - found %d objects in the table data" %
                    (sum(len(v) for v in dsetTableDataByCol.values())))
        lNdCut = ndPage.xpath(sxpCut, namespaces=dNS)
        if bVerbose:
            traceln("   - found %d cuts" % (len(lNdCut)))
        else:
            traceln(
                "- loaded %40s " % sFilename,
                " %6d cols %6d 'S' cuts" % (len(lTableColK), len(lNdCut)),
                " %6d objects %6d table objects" %
                (len(loBaseline), sum(len(v)
                                      for v in dsetTableByCol.values())))

        # each cut node must have exactly 2 points: make it a shapely line
        loCut = []
        for ndCut in lNdCut:
            (x1, y1), (x2, y2) = PageXml.getPointList(ndCut)
            loCut.append(geom.LineString([(x1, y1), (x2, y2)]))

        # order the cuts by line centroid x
        loCut.sort(key=lambda o: o.centroid.x)

        # the GT: list of sets of identifiers, by increasing key
        lsetGT = [dsetTableByCol[k] for k in lTableColK]
        lsetDataGT = [dsetTableDataByCol[k] for k in lTableDataColK]

        # NOW, look at predictions
        for filterFun in lFilterFun:
            loBaselineInTable = [o for o in loBaseline if filterFun(o._du_nd)]
            if bVerbose:
                traceln("   - %d objects on page predicted in table (%d out)" %
                        (len(loBaselineInTable),
                         len(loBaseline) - len(loBaselineInTable)))

            # Now create the list of partitions created by the Cuts
            lsetRun = []
            partition = PolygonPartition(loBaselineInTable)
            setPrevIds = set()  # ids on the right of the previous cut
            for oCut in loCut:
                lo = partition.getObjectOnRightOfLine(oCut)
                setIds = set(funIndex(o) for o in lo)
                if setPrevIds:
                    # what was right of the previous cut but is not right of
                    # this one
                    prevColIds = setPrevIds.difference(setIds)
                    if prevColIds:
                        # an empty set is denoting alternative cuts leading
                        # to same partition: it is not recorded
                        lsetRun.append(prevColIds)
                setPrevIds = setIds
            llsetRun.append(lsetRun)
    return lsetGT, lsetDataGT, llsetRun
    def add_cut_to_DOM(self, root, ltlYlX=[]):
        """
        for each page:
        - collect the (x, y) returned by self.getDomBaselineXY for each
          TextLine
        - the sorted list of distinct Ys (resp. Xs) defines the cuts, unless
          self._getLabelFromSeparator provides labelled cuts.

        Tag them if ltlYlX is given
            ltlYlX is a list of (ltY1Y2, ltX1X2) per page.
            ltY1Y2 is the list of (Y1, Y2) of horizontal separators,
            ltX1X2 is the list of (X1, X2) of vertical separators.
        When ltlYlX has no entry for a page, or when NoSeparatorException is
        raised, the cuts of that page get None as label.

        Modify the XML DOM by adding a separator cut, annotated if GT given

        return one ([y, ...], [x, ...]) tuple per page, built from the
        collected (x, y)
        """
        domid = 0  #to add unique separator id

        ltlYCutXCut = []
        for iPage, ndPage in enumerate(
                MultiPageXml.getChildByName(root, 'Page')):
            w, h = int(ndPage.get("imageWidth")), int(
                ndPage.get("imageHeight"))

            #list of (X,Y), one per TextLine
            ltXY = []
            lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')
            for ndBlock in lndTexLine:
                try:
                    ltXY.append(self.getDomBaselineXY(ndBlock))
                except Exception:
                    # best effort: a TextLine without usable baseline is
                    # ignored
                    pass

            # let's collect the segment forming the separators
            bLabelled = False
            if iPage < len(ltlYlX):
                try:
                    lY, lYLbl, lX, lXLbl = self._getLabelFromSeparator(
                        ltXY, ltlYlX[iPage], w, h)
                    bLabelled = True
                except NoSeparatorException:
                    pass
            if not bLabelled:
                # no GT for this page: unlabelled cuts
                lX = sorted(set(x for x, _ in ltXY))  # zero or 1 cut per X
                lY = sorted(set(y for _, y in ltXY))  # zero or 1 cut per Y
                lXLbl = [None] * len(lX)
                lYLbl = [None] * len(lY)

            # NOTE(review): the TableRegion is looked up from root, not from
            # ndPage, so all pages add to the first TableRegion - confirm
            ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

            # horizontal grid lines: from (0, y) to (w, y)
            for y, ylbl in zip(lY, lYLbl):
                domid += 1
                self.addPageXmlSeparator(ndTR, ylbl, 0, y, w, y, domid)
            traceln(" - added %d horizontal cuts" % len(lY))

            # vertical grid lines: from (x, 0) to (x, h)
            for x, xlbl in zip(lX, lXLbl):
                domid += 1
                self.addPageXmlSeparator(ndTR, xlbl, x, 0, x, h, domid)
            traceln(" - added %d vertical   cuts" % len(lX))

            ltlYCutXCut.append(([y for _, y in ltXY], [x for x, _ in ltXY]))

        return ltlYCutXCut
Example #23
0
                    nbRows + 1), lCells))
    except ValueError:
        return 1


# ------------------------------------------------------------------
# load the mpxml given as 1st command line argument
# (the 2nd argument is the output filename)
sFilename = sys.argv[1]
sOutFilename = sys.argv[2]

# removing the blank text lets the pretty printer format better
parser = etree.XMLParser(remove_blank_text=True)
doc = etree.parse(sFilename, parser)
root = doc.getroot()

lPages = MultiPageXml.getChildByName(root, 'Page')
lCells = MultiPageXml.getChildByName(root, 'TableCell')
if lCells == []:
    # nothing to do without any table cell
    print('no table')
    sys.exit(1)

# default: last label of each label list for the TextLine of all cells:
# all cells must have all tags!
for cell in lCells:
    lText = MultiPageXml.getChildByName(cell, 'TextLine')
    for ndText in lText:
        ndText.set(sDURow, lLabelsBIEOS_R[-1])
    for ndText in lText:
        ndText.set(sDUCol, lLabelsSM_C[-1])
    for ndText in lText:
        ndText.set(sDUHeader, lLabels_HEADER[-1])

# FOR COLUMN HEADER: get max(cell[0,i].span)
maxRowSpan = computeMaxRowSpan(lCells)
    def loadPageCol(self,
                    ndPage,
                    fRatio,
                    shaper_fun=ShapeLoader.node_to_Point,
                    funIndex=lambda x: x._du_index):
        """
        load the page, looking for Baseline

        Each Baseline node gets a "du_index" attribute (a counter starting at
        0) and is turned into a shape by shaper_fun. The shape is scaled by
        fRatio and decorated with:
        - _du_index : the counter value
        - _du_nd    : the Baseline node
        - _dom_id   : the "id" attribute of the parent of the Baseline node

        The Baseline found in the TableCell nodes are processed first, then
        the remaining Baseline of the page, i.e. those whose parent id was not
        met in any TableCell.

        NOTE: the column of a Baseline is the "col" of its TableCell; there is
        NO CHECK that the Baseline geometrically lies in that cell.

        return
        - the list of all shapes
        - a dict: col -> set of funIndex(shape), for the shapes in a TableCell
        - same dict, restricted to the cells such that row + rowSpan is
          greater than maxHeaderRowSpan
        - maxHeaderRowSpan, as given by computeMaxRowSpan on the TableCell
          nodes of the page
        """
        loBaseline = []  # list of Baseline shapes
        i = 0  # next du_index

        dsetTableByCol = defaultdict(set)  # sets of object ids, by col
        dsetTableDataByCol = defaultdict(set)  # sets of object ids, by col

        # id of the parent of the Baseline found in a TableCell
        setSeenParentId = set()

        lCells = MultiPageXml.getChildByName(ndPage, "TableCell")
        maxHeaderRowSpan = computeMaxRowSpan(lCells)
        traceln("   - maxHeaderRowSpan=", maxHeaderRowSpan)
        for ndCell in lCells:
            row, col = int(ndCell.get("row")), int(ndCell.get("col"))
            rowSpan = int(ndCell.get("rowSpan"))

            for nd in MultiPageXml.getChildByName(ndCell, "Baseline"):
                nd.set("du_index", "%d" % i)
                sParentId = nd.getparent().get('id')
                setSeenParentId.add(sParentId)

                # Baseline as a shapely object
                try:
                    o = shaper_fun(nd)
                except Exception:
                    traceln("ERROR: id=", sParentId)
                    raise  # keep the original traceback
                # scale the objects, as done when cutting!!
                # useless currently since we make a Point...
                o = shapely.affinity.scale(o, xfact=fRatio, yfact=fRatio)

                o._du_index = i
                o._du_nd = nd
                o._dom_id = sParentId
                loBaseline.append(o)

                dsetTableByCol[col].add(funIndex(o))

                if (row + rowSpan) > maxHeaderRowSpan:
                    dsetTableDataByCol[col].add(funIndex(o))

                i += 1

        # and (UGLY) process all Baseline outside any TableCell...
        for nd in MultiPageXml.getChildByName(ndPage, "Baseline"):
            sParentId = nd.getparent().get('id')
            if sParentId in setSeenParentId:
                continue  # already loaded from its TableCell

            nd.set("du_index", "%d" % i)

            # Baseline as a shapely object
            o = shaper_fun(nd)

            # scale the objects, as done when cutting!!
            # NOTE(review): only xfact is given here while the TableCell loop
            # above scales with both xfact and yfact - confirm this is wanted
            o = shapely.affinity.scale(o, xfact=fRatio)

            o._du_index = i
            o._du_nd = nd
            o._dom_id = sParentId
            loBaseline.append(o)

            i += 1

        return loBaseline, dsetTableByCol, dsetTableDataByCol, maxHeaderRowSpan
Example #25
0
def addSeparator(root, lCells):
    """
    Add separator that correspond to cell boundaries
    modify the XML DOM

    For each row (resp. column) boundary index, the top and bottom (resp.
    left and right) segments of the cell polygons are collected and a line is
    fitted on their end points, each segment being weighted by its length.
    One SeparatorRegion per fitted line is appended to the first TableRegion
    found under root, with an "orient" attribute "horizontal <a> <b>" or
    "vertical <a> <b>", where a is the intercept and b the slope of the fit
    (y = a + b.x for horizontal, x = a + b.y for vertical).

    A comment with statistics on the slopes is appended to the TableRegion
    and printed, for each orientation having at least one separator.

    root   : DOM node under which the TableRegion is searched
    lCells : TableCell nodes, with row, col, rowSpan, colSpan attributes and
             a Coords child

    Cells whose polygon cannot be partitioned (ValueError) are ignored.
    Raise IndexError if there is no TableRegion or if a cell has no Coords.
    """
    # let's collect the segment forming the separators
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)
    for cell in lCells:
        row, col, rowSpan, colSpan = [int(cell.get(sProp)) for sProp \
                                      in ["row", "col", "rowSpan", "colSpan"] ]
        coord = cell.xpath("./a:%s" % ("Coords"),
                           namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except ValueError:
            # best effort: ignore this cell
            continue
        # the top segments contribute to row separator of index: row
        dRowSep_lSgmt[row].extend(lT)
        # the bottom segments contribute to row separator of index: row+rowSpan
        dRowSep_lSgmt[row + rowSpan].extend(lB)

        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col + colSpan].extend(lR)

    ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

    def fitAndAddSeparator(lSegment, bHorizontal):
        """
        make a linear regression on the end points of the segments, add the
        resulting SeparatorRegion to the TableRegion, return the slope
        """
        lU, lV, lW = [], [], []  # abscissas, ordinates, weights of the fit
        for x1, y1, x2, y2 in lSegment:
            if bHorizontal:
                lU.extend((x1, x2))
                lV.extend((y1, y2))
            else:
                lU.extend((y1, y2))
                lV.extend((x1, x2))
            # each end point is weighted by the length of its segment
            fLen = float(np.hypot(x2 - x1, y2 - y1))
            lW.extend((fLen, fLen))
        if not any(lW):
            lW = None  # only zero-length segments: unweighted fit

        # v = a + b * u
        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=lW)
        umin, umax = min(lU), max(lU)
        v1, v2 = a + b * umin, a + b * umax

        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        if bHorizontal:
            ndSep.set("orient", "horizontal %.1f %.3f" % (a, b))
            lPoint = [(umin, v1), (umax, v2)]
        else:
            ndSep.set("orient", "vertical %.1f %.3f" % (a, b))
            lPoint = [(v1, umin), (v2, umax)]
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, lPoint)
        ndSep.append(ndCoord)
        return b

    for sFormat, dSep_lSgmt, bHorizontal in (
        ("\tHORIZONTAL: Average=%.1f%%  stdev=%.2f%%  min=%.1f%% max=%.1f%%",
         dRowSep_lSgmt, True),
        ("\tVERTICAL  : Average=%.1f%%  stdev=%.2f%%  min=%.1f%% max=%.1f%%",
         dColSep_lSgmt, False),
    ):
        # slopes, in percent; a boundary without any segment is skipped
        lSlope = [
            fitAndAddSeparator(lSegment, bHorizontal) * 100
            for lSegment in dSep_lSgmt.values() if lSegment
        ]
        if not lSlope:
            continue  # no separator, no statistics
        sStat = sFormat % (np.average(lSlope), np.std(lSlope), min(lSlope),
                           max(lSlope))
        ndTR.append(etree.Comment(sStat))
        print(sStat)

    return
    def add_cut_to_DOM(self,
                       root,
                       fMinHorizProjection=0.05,
                       fMinVertiProjection=0.05,
                       ltlYlX=[],
                       fRatio=1.0,
                       fMinHLen=None):
        """
        for each page, compute the histogram of projection of text on Y then X
        axis.
        From this histogram, find cuts.
        fMinProjection determines the thresholds as a percentage of width (resp
        height) of page. Any bin lower than it is considered as zero.
        Map cuts to table separators to annotate them
        Tag them if ltlYlX is given

        ltlYlX is a list of (ltY1Y2, ltX1X2) per page.
        ltY1Y2 is the list of (Y1, Y2) of horizontal separators,
        ltX1X2 is the list of (X1, X2) of vertical separators.
        A page without a usable entry in ltlYlX is processed with empty lists
        of separators.

        fRatio and fMinHLen are forwarded to self.getHisto.

        Modify the XML DOM by adding a separator cut, annotated if GT given

        return (lY, lX), the cuts of the last page, or ([], []) if the
        document has no page
        """
        domid = 0  #to add unique separator id

        # so that the return value is defined even without any page
        lY, lX = [], []

        for iPage, ndPage in enumerate(
                MultiPageXml.getChildByName(root, 'Page')):
            try:
                lYi, lXi = ltlYlX[iPage]
            except (IndexError, KeyError, TypeError, ValueError):
                # no (or unusable) separator information for this page
                lYi, lXi = [], []

            w, h = int(ndPage.get("imageWidth")), int(
                ndPage.get("imageHeight"))

            #Histogram of projections
            lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')
            aYHisto, aXHisto = self.getHisto(lndTexLine,
                                             w,
                                             fMinHorizProjection,
                                             h,
                                             fMinVertiProjection,
                                             fRatio,
                                             fMinHLen=fMinHLen)

            # shift the histograms by the thresholds
            aYHisto = aYHisto - fMinHorizProjection
            aXHisto = aXHisto - fMinVertiProjection

            # Y cuts: centre of each area of 0s, and its label
            lY, lYLbl = self.getCentreOfZeroAreas(aYHisto, lYi)
            # X cuts: obtained by getLowestOfZeroAreas instead
            lX, lXLbl = self.getLowestOfZeroAreas(aXHisto, lXi)

            traceln(lY)
            traceln(lX)

            traceln(" - %d horizontal cuts" % len(lY))
            traceln(" - %d vertical cuts" % len(lX))

            # NOTE(review): the TableRegion is looked up from root, not from
            # ndPage, so all pages add to the first TableRegion - confirm
            ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

            # horizontal grid lines: from (0, y) to (w, y)
            for y, ylbl in zip(lY, lYLbl):
                domid += 1
                self.addPageXmlSeparator(ndTR, ylbl, 0, y, w, y, domid)

            # Vertical grid lines: from (x, 0) to (x, h)
            for x, xlbl in zip(lX, lXLbl):
                domid += 1
                self.addPageXmlSeparator(ndTR, xlbl, x, 0, x, h, domid)

        return (lY, lX)
Example #27
0
    def predict(self, sFilename):
        """
        Compute the cuts of the document, place each TextLine of the first
        page in the grid defined by the cuts, and label the TextLine of each
        grid cell (attribute cutDU_row, BIESO labelling).

        The annotated document is written next to the input file: the last
        6 characters of sFilename are replaced by "_cut.mpxml".

        return Y_pred, YGT   (shape=(nb_node,) dtype=int)
        - Y_pred: index, in self.dLblIndex, of the cutDU_row label
        - YGT   : index, in CutPredictor.dLblIndex, of the DU_row attribute,
                  or 0 if that value is not in the dictionary
        """
        assert os.path.isfile(sFilename), sFilename
        doc = etree.parse(sFilename, self.parser)
        root = doc.getroot()

        # Find cuts
        lY, lX = self.add_cut_to_DOM(
            root,
            fMinHorizProjection=self.fMinHorizProjection,
            fMinVertiProjection=self.fMinVertiProjection)

        # ################################################################
        # NOTE : we will assumes first and last row/column contain Other
        # ################################################################

        lyy = list(zip(lY[:-1], lY[1:]))  # list of intervals
        lxx = list(zip(lX[:-1], lX[1:]))  # list of intervals

        # dTable[i][j] --> list of TextLine in that cell
        dTable = collections.defaultdict(
            lambda: collections.defaultdict(list))

        def getTableIndex(v, lvv):
            """
            index in the table row or columns. The 1st border is at 0

            NOTE(review): raise IndexError when lvv is empty (less than 2
            cuts) - confirm what is wanted in that case
            """
            for i, (v1, v2) in enumerate(lvv):
                if v1 <= v and v <= v2:
                    return i + 1
            if v < lvv[0][0]:
                return 0
            else:
                return len(lvv) + 1

        #place each TextLine in the table rows and columns
        ndPage = MultiPageXml.getChildByName(root, 'Page')[0]
        lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')

        imax, jmax = -1, -1
        for nd in lndTexLine:
            sPoints = MultiPageXml.getChildByName(nd,
                                                  'Coords')[0].get('points')
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

            # the centre of the rectangle decides of the row and column
            i = getTableIndex((y1 + y2) / 2.0, lyy)
            j = getTableIndex((x1 + x2) / 2.0, lxx)

            dTable[i][j].append((y1, y2, x1, x2, nd))
            imax = max(i, imax)
            jmax = max(j, jmax)

        def getGT(nd):
            try:
                return CutPredictor.dLblIndex[nd.get('DU_row')]
            except KeyError:
                # missing or unknown DU_row
                return 0

        def overlap(a, b):
            """
            length of the overlap on the X axis (negative if no overlap)
            """
            _y1, _y2, ax1, ax2, _nd = a
            _y1, _y2, bx1, bx2, _nd = b
            return min(ax2, bx2) - max(ax1, bx1)

        def label(lt):
            """
            set the attribute cutDU_row to each node for BIESO labelling
            lt is sorted in place
            """
            # sort on the coordinates only: the DOM nodes are not comparable
            lt.sort(key=lambda t: t[:4])

            #the 'O'
            newlt = []
            for (y1, y2, x1, x2, nd) in lt:
                # hard-coded X limits of the margins
                bInMargin = x2 < 350 or x1 > 4600

                if bInMargin:
                    nd.set('cutDU_row', 'O')
                else:
                    newlt.append((y1, y2, x1, x2, nd))

            for i, t in enumerate(newlt):
                (y1, y2, x1, x2, nd) = t

                #is there someone above?? if yes get the closest node above
                nd_just_above = None
                for j in range(i - 1, -1, -1):
                    tt = newlt[j]
                    if overlap(tt, t) > 0:
                        nd_just_above = tt[4]
                        if nd_just_above.get('cutDU_row') != 'O': break

                if nd_just_above is None:
                    #S by default
                    nd.set('cutDU_row', 'S')
                else:
                    sAbove = nd_just_above.get('cutDU_row')
                    if sAbove == 'S':
                        nd_just_above.set('cutDU_row', 'B')
                        nd.set('cutDU_row', 'E')
                    elif sAbove == 'E':
                        nd_just_above.set('cutDU_row', 'I')
                        nd.set('cutDU_row', 'E')
                    elif sAbove == 'I':
                        nd.set('cutDU_row', 'E')
                    elif sAbove == 'B':
                        #bad luck, we do not see the intermediary node
                        nd.set('cutDU_row', 'E')
                    elif sAbove == 'O':
                        raise Exception('Internal error')

        #now set the BIESO labels... (only vertically for now)
        liGTLabel = list()
        liLabel = list()
        for i in range(0, imax + 1):
            dRow = dTable[i]
            for j in range(0, jmax + 1):
                lt = dRow[j]
                if not lt: continue

                label(lt)

                liGTLabel.extend([getGT(nd) for y1, y2, x1, x2, nd in lt])
                liLabel.extend([
                    self.dLblIndex[nd.get('cutDU_row')]
                    for y1, y2, x1, x2, nd in lt
                ])

        # np.int is no longer available in recent NumPy: use the builtin int
        Y_pred = np.array(liLabel, dtype=int)
        YGT = np.array(liGTLabel, dtype=int)

        sOutFilename = sFilename[:-6] + "_cut.mpxml"
        doc.write(sOutFilename,
                  encoding='utf-8',
                  pretty_print=True,
                  xml_declaration=True)
        print('Annotated cut separators in %s'%sOutFilename)

        del doc

        return Y_pred, YGT
Example #28
0
def tagSeparatorRegion(lPages):
    """
    tag separatorRegion

    For each page, set the sDUSep attribute of each SeparatorRegion to:
    - lLabels_OI[0] by default
    - lLabels_OI[1] if the separator intersects a table
    - lLabels_OI[2] if it is a vertical (resp. horizontal) separator that
      intersects the first or last column (resp. row) border of a table, as
      given by defineTableBordersFromCells
    and back to lLabels_OI[0] for the vertical separators intersecting a cell
    with a rowSpan above 6, the cell being shrunk horizontally by half.

    A separator, buffered by 10, is vertical if its bounding box is higher
    than wide, horizontal if it is wider than high.

    Print a message and exit the process (status 0) when a page has no
    TableRegion.
    """
    for page in lPages:
        lSeparators = MultiPageXml.getChildByName(page, 'SeparatorRegion')
        lTables = MultiPageXml.getChildByName(page, 'TableRegion')
        if not lTables:
            print("no table for %s" % sys.argv[1])
            sys.exit(0)
        # default O
        for ndSep in lSeparators:
            ndSep.set(sDUSep, lLabels_OI[0])

        # the shape of the separators does not depend on the table:
        # compute them once per page
        lPolygonSep = [
            ShapePo(LineString(MultiPageXml.getPointList(x)).buffer(10))
            for x in lSeparators
        ]
        # bounds is (minx, miny, maxx, maxy)
        lWH = [(x.bounds[2] - x.bounds[0], x.bounds[3] - x.bounds[1])
               for x in lPolygonSep]
        lbVertical = [fH > fW for fW, fH in lWH]
        lbHorizontal = [fH < fW for fW, fH in lWH]

        for table in lTables:
            polygonTable = ShapePo(MultiPageXml.getPointList(table))

            table_prep = prep(polygonTable)
            for i, x in enumerate(lPolygonSep):
                if table_prep.intersects(x):
                    lSeparators[i].set(sDUSep, lLabels_OI[1])

            col1, colN, row1, RowN = defineTableBordersFromCells(table)

            ## vertical borders
            # intersection of vertical sep with vertical border
            for v in [col1, colN]:
                v_prep = prep(v)
                for i, x in enumerate(lPolygonSep):
                    if v_prep.intersects(x) and lbVertical[i]:
                        lSeparators[i].set(sDUSep, lLabels_OI[2])

            ## horizontal borders
            for h in [row1, RowN]:
                h_prep = prep(h)
                for i, x in enumerate(lPolygonSep):
                    if h_prep.intersects(x) and lbHorizontal[i]:
                        lSeparators[i].set(sDUSep, lLabels_OI[2])

        ## fix bindings
        for table in lTables:
            lCells = MultiPageXml.getChildByName(table, 'TableCell')
            lPolygonCells = [
                ShapePo(MultiPageXml.getPointList(x)) for x in lCells
                if int(x.get('rowSpan')) > 6
            ]
            for cell in lPolygonCells:
                cell_prep = prep(scale(cell, xfact=0.5))
                for i, x in enumerate(lPolygonSep):
                    if cell_prep.intersects(x) and lbVertical[i]:
                        lSeparators[i].set(sDUSep, lLabels_OI[0])
def removeSeparator(root):
    """
    Detach from the DOM all the SeparatorRegion nodes found under root
    return the number of nodes found
    """
    lndSep = MultiPageXml.getChildByName(root, 'SeparatorRegion')
    nbSep = len(lndSep)
    for ndSep in lndSep:
        ndParent = ndSep.getparent()
        ndParent.remove(ndSep)
    return nbSep
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Set the sDUHeader, sDURow and sDUCol attributes of the TextLine nodes of
    the given cells, then of the TextLine nodes of the TextRegion under root.
    Modify the XML DOM

    root       : DOM node under which the TextRegion nodes are searched
    lCells     : the TableCell nodes
    maxRowSpan : the cells with a row strictly below it are header cells
    """

    def getPoints(nd):
        """
        return the points attribute of the first Coords child of the node
        """
        ndCoords = nd.xpath("./a:%s" % ("Coords"),
                            namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        return ndCoords.get('points')

    for cell in lCells:
        lText = MultiPageXml.getChildByName(cell, 'TextLine')
        nbText = len(lText)

        # HEADER WISE: label 1 if the cell row is below maxRowSpan, else 0
        iHeader = 1 if int(cell.get('row')) < maxRowSpan else 0
        for ndText in lText:
            ndText.set(sDUHeader, lLabels_HEADER[iHeader])

        # ROW WISE: B I E S O
        # (nothing to do for an empty cell)
        if nbText == 1:
            # single line
            lText[0].set(sDURow, lLabelsBIESO_R[3])
        elif nbText > 1:
            # first, inner and last lines, in DOM order
            lText[0].set(sDURow, lLabelsBIESO_R[0])
            for ndText in lText[1:-1]:
                ndText.set(sDURow, lLabelsBIESO_R[1])
            lText[-1].set(sDURow, lLabelsBIESO_R[2])

        # COLUMN WISE: M S O
        # only the 3rd value of the cell bounding box is used
        _cx, _cy, cx2, _cy2 = Polygon.parsePoints(
            getPoints(cell)).getBoundingBox()

        for txt in lText:
            lXY = []
            for sPair in getPoints(txt).split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s  IGNORED"%txt.get("id"))

            if not lXY:
                # no valid coordinate at all
                txt.set(sDUCol, lLabelsSM_C[-1])
                continue

            ## HOW to define a CM element!!!!
            (x1, _y1, x2, _y2) = Polygon(lXY).getBoundingBox()
            # does the line go far enough beyond cx2, compared to its part
            # before cx2?
            bSpanning = x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1)
            if bSpanning:
                txt.set(sDUCol, lLabelsSM_C[0])
            else:
                txt.set(sDUCol, lLabelsSM_C[1])

    # textline outside table: last label of each label list
    for region in MultiPageXml.getChildByName(root, 'TextRegion'):
        lText = MultiPageXml.getChildByName(region, 'TextLine')
        for ndText in lText:
            ndText.set(sDURow, lLabelsBIESO_R[-1])
        for ndText in lText:
            ndText.set(sDUCol, lLabelsSM_C[-1])
        for ndText in lText:
            ndText.set(sDUHeader, lLabels_HEADER[-1])

    return