def processRegions(ndPage, bVerbose=False):
    """
        Tidy up the TextRegion elements found below ndPage:
        - a region containing no TextLine is removed from its parent
        - any other region gets the "points" attribute of the first node
          returned by PageXml.getChildByName(region, 'Coords') replaced by
          the convex hull of its TextLine elements

        When bVerbose is true, trace how many regions were deleted and
        how many were updated.
    """
    lndAll = ndPage.xpath(".//pg:TextRegion", namespaces=dNS)
    lndEmpty = []
    for ndCur in lndAll:
        lndLine = ndCur.xpath(".//pg:TextLine", namespaces=dNS)
        if not lndLine:
            # nothing inside: remember it, the removal is done after the scan
            lndEmpty.append(ndCur)
            continue

        # make the region fit the lines it contains
        oShape = ShapeLoader.convex_hull(lndLine, bShapelyObject=True)
        ndCoords = PageXml.getChildByName(ndCur, 'Coords')[0]
        sPoints = ShapeLoader.getCoordsString(oShape, bFailSafe=True)
        ndCoords.set("points", sPoints)

    # now detach the empty regions from the tree
    for ndCur in lndEmpty:
        ndCur.getparent().remove(ndCur)

    if bVerbose:
        nDel = len(lndEmpty)
        traceln(" - %d regions deleted" % nDel)
        traceln(" - %d regions updated" % (len(lndAll) - nDel))
Esempio n. 2
0
 def computeShape(cls, ndPage, setID, bConvexHull=False):
     """
     Compute the shape of the cluster whose member ids are given in setID.

     Each id is looked up below ndPage on its @id attribute, and the first
     matching node is kept. The shape is computed by ShapeLoader (called with
     bShapelyObject=True) from those nodes:
     - their convex hull when bConvexHull is true
     - their minimum rotated rectangle otherwise (the default)

     An id without any matching node raises an IndexError.
     """
     # collect one node per id
     lndMember = []
     for sID in setID:
         lndMatch = ndPage.xpath(".//*[@id='%s']" % sID, namespaces=dNS)
         lndMember.append(lndMatch[0])

     # then build the requested kind of shape
     if bConvexHull:
         return ShapeLoader.convex_hull(lndMember, bShapelyObject=True)
     return ShapeLoader.minimum_rotated_rectangle(lndMember,
                                                  bShapelyObject=True)
    def makeTableNode(self):
        """
        Make a DOM tree for this table and return its TableRegion node.

        The (row, col) keys of self._dCellNd define the grid: every distinct
        row index is combined with every distinct column index, in sorted
        order. For each grid position holding a non-empty list of nodes, a
        TableCell is created with:
        - an id built from self.pagenum, self.tablenum, row and col
        - a Coords child whose points are the convex hull of those nodes
        - row / col attributes, with rowSpan and colSpan fixed to "1"
        - a CornerPts child with text "0 1 2 3"
        - the nodes themselves, appended after CornerPts

        Grid positions that are empty, or absent from self._dCellNd, produce
        no TableCell.

        The Coords of the TableRegion is the minimum rotated rectangle of the
        union of all the cell hulls.
        """
        dCellNd = self._dCellNd
        lK = dCellNd.keys()
        lRow = sorted({_row for _row, _col in lK})
        lCol = sorted({_col for _row, _col in lK})

        ndTable = PageXml.createPageXmlNode("TableRegion")
        ndTable.set("id", "p%s_%s" % (self.pagenum, self.tablenum))
        ndTable.tail = "\n"

        lCellShape = []  # hull of each cell, kept to compute the table contour
        lNdCell = []
        for row in lRow:
            for col in lCol:
                # The rows x columns product may contain positions that were
                # never filled. .get() avoids a KeyError on a plain dict, and
                # avoids inserting empty entries if the mapping is a
                # defaultdict.
                lNdText = dCellNd.get((row, col))
                if not lNdText:
                    continue

                # Target structure:
                #     <TableCell row="0" col="1" rowSpan="1" colSpan="1" id="...">
                #        <Coords points="480,42 485,323 878,323 874,38"/>
                ndCell = PageXml.createPageXmlNode("TableCell")
                ndCell.set(
                    "id", "p%s_t%s_r%s_c%s" %
                    (self.pagenum, self.tablenum, row, col))

                # shape of the cell
                oHull = ShapeLoader.convex_hull(lNdText, bShapelyObject=True)
                lCellShape.append(oHull)

                # Coords sub-element
                ndCoords = PageXml.createPageXmlNode("Coords")
                ndCoords.set(
                    "points",
                    ShapeLoader.getCoordsString(oHull, bFailSafe=True))
                ndCoords.tail = "\n"
                ndCell.append(ndCoords)

                # position of the cell in the grid; spans are always 1 here
                ndCell.set("row", str(row))
                ndCell.set("rowSpan", "1")
                ndCell.set("col", str(col))
                ndCell.set("colSpan", "1")
                ndCell.tail = "\n"

                # add corner
                cornerNode = PageXml.createPageXmlNode("CornerPts")
                cornerNode.text = "0 1 2 3"
                ndCell.append(cornerNode)

                # NOTE(review): if these are lxml elements, append() moves
                # them out of their current parent - confirm this is intended
                for nd in lNdText:
                    ndCell.append(nd)

                lNdCell.append(ndCell)

        # Table geometry: invalid cell shapes are replaced by their convex
        # hull before computing the union
        ndCoords = PageXml.createPageXmlNode("Coords")
        contour = cascaded_union(
            [p if p.is_valid else p.convex_hull for p in lCellShape])
        o = contour.minimum_rotated_rectangle
        ndCoords.set("points", ShapeLoader.getCoordsString(o, bFailSafe=True))
        ndCoords.tail = "\n"
        ndTable.append(ndCoords)

        # the cells come after the table Coords
        for nd in lNdCell:
            ndTable.append(nd)

        return ndTable