Example #1
0
def ajustRectangleShapes(pageNd, lClusterPoly):
    """
        Reduce each cluster shape to the area which does not overlap the
        shapes located after it in lClusterPoly.

        A later shape is subtracted only when its intersection with the
        current shape covers more than 1% of that later shape's area. When
        the subtraction produces a multi-part geometry, only the part with
        the largest area is kept.

        One 'COLX' element per input shape is appended to pageNd; its
        'points' attribute is set only when the reduced shape is not empty.

        pageNd: XML node receiving the 'COLX' elements
        lClusterPoly: list of geometries (shapely-like API); the list itself
                      is not modified
        return the list of reduced shapes, in the same order as lClusterPoly
    """
    lNewPoly = []
    for i, pol1 in enumerate(lClusterPoly):
        # only the shapes located after the current one are subtracted
        for pol2 in lClusterPoly[i + 1:]:
            if pol1.intersection(pol2).area > pol2.area * 0.01:
                pol1 = pol1.symmetric_difference(pol2).difference(pol2)
                # the cut may split the shape: keep its largest piece only
                if not pol1.is_empty and isinstance(
                        pol1, (MultiPolygon, GeometryCollection)):
                    pol1 = max(pol1.geoms, key=lambda p: p.area)

        cellNd = etree.Element('COLX')
        pageNd.append(cellNd)
        if not pol1.is_empty:
            cellNd.set('points', ShapeLoader.getCoordsString(pol1))
        lNewPoly.append(pol1)

    return lNewPoly
def processRegions(ndPage, bVerbose=False):
    """
        Clean up the TextRegion nodes found under ndPage:
        - a region without any TextLine is removed from its parent
        - any other region gets the 'points' of its first 'Coords' child
          reset to the convex hull of its TextLines

        With bVerbose, the number of deleted and updated regions is traced.
    """
    lndRegions = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)

    lEmptyRegions = []
    for ndRegion in lndRegions:
        lTextLines = ndRegion.xpath(".//pg:TextLine", namespaces=dNS)
        if not lTextLines:
            # nothing inside: removed once the scan is over
            lEmptyRegions.append(ndRegion)
            continue

        # shrink/grow the region to the hull of its text lines
        oHull = ShapeLoader.convex_hull(lTextLines, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndRegion, 'Coords')[0]
        sPoints = ShapeLoader.getCoordsString(oHull, bFailSafe=True)
        ndCoords.set("points", sPoints)

    # delete empty regions
    for ndRegion in lEmptyRegions:
        ndRegion.getparent().remove(ndRegion)

    if bVerbose:
        nDeleted = len(lEmptyRegions)
        traceln(" - %d regions deleted" % (nDeleted))
        traceln(" - %d regions updated" % (len(lndRegions) - nDeleted))
Example #3
0
def tableCreation(ndPage, dCellsIJ, dCellTxt, dCoordCell):
    """
        Find the TABLE node of the page and add one CELL element to it per
        entry of dCellsIJ.

        ndPage: node under which the TABLE elements are searched; the last
                one found is used (indexing fails if there is none)
        dCellsIJ: maps a cell key to its (row, col) pair
        dCellTxt: maps a cell key to the list of text nodes of the cell;
                  each list is sorted IN PLACE by increasing Y and its nodes
                  are appended to the CELL element
        dCoordCell: maps a cell key to the cell geometry; its bounds give the
                    x, y, width and height attributes of the CELL

        Each CELL also gets a 'points' attribute and colSpan/rowSpan of "1".
        Returns None.
    """
    # several TABLE nodes may exist: earlier ones are fake, take the last
    tableNd = ndPage.xpath(".//TABLE")[-1]

    for cellwkt, (i, j) in dCellsIJ.items():
        oCellShape = dCoordCell[cellwkt]
        x1, y1, x2, y2 = oCellShape.bounds

        cellNd = etree.Element('CELL')
        cellNd.set('row', f'{i}')
        cellNd.set('col', f'{j}')
        cellNd.set('x', str(x1))
        cellNd.set('y', str(y1))
        cellNd.set('height', str(abs(y2 - y1)))
        cellNd.set('width', str(abs(x2 - x1)))

        try:
            sPoints = ShapeLoader.getCoordsString(oCellShape)
        except Exception:
            # Best-effort fallback, presumably for multi-part geometries:
            # use the largest part. Exception (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            pol = max(oCellShape.geoms, key=lambda p: p.area)
            sPoints = ShapeLoader.getCoordsString(pol)
        cellNd.set('points', sPoints)

        # populate with the text nodes, sorted by increasing Y
        lCellTxt = dCellTxt[cellwkt]
        lCellTxt.sort(key=lambda x: ShapeLoader.node_to_Point(x).bounds[1])
        for ndTxt in lCellTxt:
            cellNd.append(ndTxt)

        cellNd.set('colSpan', "1")
        cellNd.set('rowSpan', "1")
        tableNd.append(cellNd)
Example #4
0
def shakeRectangleShapes(pageNd, lClusterPoly, dCellCndPerRow, dTextInCells):
    """
        Recompute the shape of each row after ignoring its first cell.

        For each key i of dCellCndPerRow (in sorted order), the text lines of
        all cells of the row but the first are collected, and their minimum
        rotated rectangle is computed.
        - if this rectangle is not empty and the height of its bounds is
          smaller than the height of the bounds of lClusterPoly[i], it is
          scaled with xfact=2, emitted as a 'COL' element under pageNd, and
          kept
        - otherwise lClusterPoly[i] is scaled with xfact=1.5, stored back
          into lClusterPoly (in place), and kept

        return the list of kept shapes, one per row
    """
    lNewPoly = []
    for iRow in sorted(dCellCndPerRow):
        # text lines of every cell of the row, the first cell excepted
        lRowTL = []
        for oCell in dCellCndPerRow[iRow][1:]:
            if oCell.wkt in dTextInCells:
                for ndTL in dTextInCells[oCell.wkt]:
                    assert ShapeLoader.node_to_Polygon(ndTL).is_valid
                    lRowTL.append(ndTL)
        oRect = ShapeLoader.minimum_rotated_rectangle(lRowTL,
                                                      bShapelyObject=True)

        # the rectangle wins only if it is flatter than the current shape
        bUseRect = False
        if not oRect.is_empty:
            fRectHeight = abs(oRect.bounds[1] - oRect.bounds[3])
            fClusterHeight = abs(lClusterPoly[iRow].bounds[1] -
                                 lClusterPoly[iRow].bounds[3])
            bUseRect = fRectHeight < fClusterHeight

        if bUseRect:
            cellNd = etree.Element('COL')
            pageNd.append(cellNd)
            oRect = scale(oRect, xfact=2)
            lNewPoly.append(oRect)
            cellNd.set('points', ShapeLoader.getCoordsString(oRect))
        else:
            lClusterPoly[iRow] = scale(lClusterPoly[iRow], xfact=1.5)
            lNewPoly.append(lClusterPoly[iRow])

    return lNewPoly
Example #5
0
 def makeClusterNode(self, sAlgo):
     """
     Build the XML 'Cluster' node describing this cluster.

     The node carries the cluster name, the algorithm name sAlgo and, in
     'content', the space-separated items of self.setID. It has a single
     'Coords' child whose 'points' is the coordinate string of self.shape,
     or an empty string when self.shape is None.
     """
     ndCluster = PageXml.createPageXmlNode('Cluster')
     lAttr = (
         ("name", self.name),
         ("algo", sAlgo),
         # the space separated list of node ids
         ("content", " ".join(self.setID)),
     )
     for sName, sValue in lAttr:
         ndCluster.set(sName, sValue)

     ndCoords = PageXml.createPageXmlNode('Coords')
     ndCluster.append(ndCoords)
     sPoints = "" if self.shape is None else ShapeLoader.getCoordsString(
         self.shape)
     ndCoords.set('points', sPoints)

     ndCluster.tail = "\n"
     return ndCluster
Example #6
0
def transformMinima2envelop(pageNd, dClusterWithTL):
    """
        Compute the convex hull of the contour object of each entry of
        dClusterWithTL.

        Each non-empty hull is also emitted as a 'LINE' element appended to
        pageNd, with the hull coordinates in its 'points' attribute.

        return the list of hulls (empty ones included), in iteration order
    """
    lClustersPoly = []
    for row in dClusterWithTL:
        oHull = ShapeLoader.contourObject(dClusterWithTL[row]).convex_hull
        lClustersPoly.append(oHull)
        if oHull.is_empty:
            # kept in the result, but nothing to draw
            continue
        ndLine = etree.Element('LINE')
        pageNd.append(ndLine)
        ndLine.set('points', ShapeLoader.getCoordsString(oHull))

    return lClustersPoly
    def makeTableNode(self):
        """
        Build the 'TableRegion' DOM tree of this table.

        self._dCellNd is read with (row, col) keys; every (row, col) pair of
        the known rows and columns is visited in increasing order, and a
        'TableCell' is created for each pair holding a non-empty list of
        nodes. A cell contains, in this order: a 'Coords' child (convex hull
        of its nodes), a 'CornerPts' child, then the nodes themselves.

        The table 'Coords' is the minimum rotated rectangle of the union of
        the cell hulls; it precedes the cells in the returned node.
        """
        lKeys = self._dCellNd.keys()
        lRow = sorted({_row for _row, _col in lKeys})
        lCol = sorted({_col for _row, _col in lKeys})

        ndTable = PageXml.createPageXmlNode("TableRegion")
        ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
        ndTable.tail = "\n"

        lCellShape = []  # hull of each cell, kept to compute table contour
        lNdCell = []
        for row in lRow:
            for col in lCol:
                lNdText = self._dCellNd[(row, col)]
                if not lNdText:
                    # no cell is generated for an empty position
                    continue

                ndCell = PageXml.createPageXmlNode("TableCell")
                sCellId = "p%s_t%s_r%s_c%s" % (self.pagenum, self.tablenum,
                                               row, col)
                ndCell.set("id", sCellId)

                # shape of the cell
                oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
                lCellShape.append(oHull)

                # Coords sub-element
                ndCellCoords = PageXml.createPageXmlNode("Coords")
                sCellPoints = ShapeLoader.getCoordsString(oHull,
                                                          bFailSafe=True)
                ndCellCoords.set("points", sCellPoints)
                ndCellCoords.tail = "\n"
                ndCell.append(ndCellCoords)

                # position of the cell in the grid, no spanning
                ndCell.set("row", str(row))
                ndCell.set("rowSpan", "1")
                ndCell.set("col", str(col))
                ndCell.set("colSpan", "1")
                ndCell.tail = "\n"

                # corner points
                ndCorner = PageXml.createPageXmlNode("CornerPts")
                ndCorner.text = "0 1 2 3"
                ndCell.append(ndCorner)

                # content of the cell
                for ndText in lNdText:
                    ndCell.append(ndText)

                lNdCell.append(ndCell)

        # Table geometry
        ndTableCoords = PageXml.createPageXmlNode("Coords")
        lValidShape = [p if p.is_valid else p.convex_hull for p in lCellShape]
        oTableRect = cascaded_union(lValidShape).minimum_rotated_rectangle
        ndTableCoords.set(
            "points", ShapeLoader.getCoordsString(oTableRect, bFailSafe=True))
        ndTableCoords.tail = "\n"
        ndTable.append(ndTableCoords)

        for ndCell in lNdCell:
            ndTable.append(ndCell)

        return ndTable
Example #8
0
def checkClusteringPerRow(pageNd, dCellCndPerRow, dTextInCells, dClusterWithTL,
                          dEdges):
    """
        Recompute the clusters of each row.

        For each row, the text ids of every cell present in dTextInCells are
        grouped with connected_components (TH=0.5). When more than two cells
        have text and the most frequent number of groups per cell is not 1
        (2 is used if the mode cannot be computed), the row is marked for
        deletion and split:
        - cells having exactly that number of groups contribute their groups,
          sorted by the centroid Y of their first text line, to the new
          sub-rows of same rank
        - the text lines of the other cells are returned apart
        Each new sub-row gets the envelope of its text lines, scaled with
        xfact=2, which is also emitted as a 'LINE' element under pageNd.

        return (lRowtodel, lNewPoly, lNewClusterWithTL, lNewClusterWithCC,
                lTLtoBeAdded)
    """
    lRowtodel = []
    lNewPoly = []
    lNewClusterWithTL = []
    lNewClusterWithCC = []
    lTLtoBeAdded = []

    for rowid, rowCells in dCellCndPerRow.items():
        # groups of text ids, for each cell of the row which has text
        lClusterPerCol = []
        for cell in rowCells:
            if cell.wkt not in dTextInCells:
                continue  # empty cell
            lIds = [tl.get('id') for tl in dTextInCells[cell.wkt]]
            lClusterPerCol.append(connected_components(dEdges, lIds, TH=0.5))
        lNbClusterPerCol = [len(lGroups) for lGroups in lClusterPerCol]

        if len(lNbClusterPerCol) <= 2:
            continue
        try:
            nbRows = mode(lNbClusterPerCol)
        except statistics.StatisticsError:
            nbRows = 2
        if nbRows == 1:
            continue

        print('WARNING CUT ', rowid, "mode", nbRows)
        lRowtodel.append(rowid)

        dClusterCells = defaultdict(list)
        for colcluster in lClusterPerCol:
            bExpectedSize = len(colcluster) == nbRows
            # text lines (instead of ids) of each group of this cell
            lTLPerGroup = [[
                tl for tl in dClusterWithTL[rowid] if tl.get('id') in cc
            ] for cc in colcluster]
            if not bExpectedSize:
                for lTL in lTLPerGroup:
                    lTLtoBeAdded.extend(lTL)
                continue
            # only the first element of a group is used to compare positions
            lSortedGroups = sorted(
                lTLPerGroup,
                key=lambda x: ShapeLoader.node_to_Polygon(x[0]).centroid.y)
            for i, lTL in enumerate(lSortedGroups):
                dClusterCells[i].append(lTL)

        for i, lcc in dClusterCells.items():
            lRowTL = [x for cc in lcc for x in cc]
            rect = ShapeLoader.contourObject(list(lRowTL)).envelope
            rect = scale(rect, xfact=2)
            lNewPoly.append(rect)
            lNewClusterWithCC.append(lcc)
            lNewClusterWithTL.append(lRowTL)

            rNd = etree.Element('LINE')
            rNd.set('points', ShapeLoader.getCoordsString(rect))
            pageNd.append(rNd)

    return lRowtodel, lNewPoly, lNewClusterWithTL, lNewClusterWithCC, lTLtoBeAdded