def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Generator over the cut nodes of one page, yielding one Block per node.

    doc       : not used by this implementation
    domNdPage : DOM node of the page; the nodes are selected on it with the
                self.sxpNode XPath expression and the self.dNS namespaces
    page      : page object handed over to each Block

    Each selected node must carry exactly two points. It must also carry the
    DU_angle, DU_angle_freq, DU_angle_cumul_freq and DU_set_support attributes,
    which are copied onto the Block (the first three as floats, the last one
    through literal_eval).
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"

    for nd in domNdPage.xpath(self.sxpNode, namespaces=self.dNS):
        node_id = nd.get("id")

        # the geometry of the node: a segment given by its two end points
        (xa, ya), (xb, yb) = PageXml.getPointList(nd)

        # NB: the Block receives the two end points, not x,y,w,h !!
        # empty text, orientation 0, class index 0 (computed later on)
        cut = Block(page, ((xa, ya), (xb, yb)), "", 0, 0, self, nd, domid=node_id)

        # the shapely shape of the segment
        cut.shape = geom.LineString([(xa, ya), (xb, yb)])

        # values found as attributes of the DOM node
        cut.angle = float(nd.get("DU_angle"))
        cut.angle_freq = float(nd.get("DU_angle_freq"))
        cut.angle_cumul_freq = float(nd.get("DU_angle_cumul_freq"))
        cut.set_support = literal_eval(nd.get("DU_set_support"))

        yield cut
def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
    """
    Iterate over the DOM nodes selected by self.sxpNode, from top to bottom
    of the page.

    domNdPage : DOM node of the page; the nodes are selected on it with the
                self.sxpNode XPath expression and the self.dNS namespaces
    page      : not used by this implementation

    The nodes are ordered by the Y coordinate of the center of mass of their
    polygon; nodes with the same Y keep the order in which xpath returned them.

    Yield the DOM nodes themselves.
    Raise ValueError if a selected node has no point.
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
    lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)

    # order blocks from top to bottom of page, so that TextRegions of
    # different resolution are not interleaved
    lOrderedNdBlock = []
    for ndBlock in lNdBlock:
        lXY = PageXml.getPointList(ndBlock)  # the polygon
        if not lXY:
            # %s and not %x: formatting a string with %x raises a TypeError,
            # which was hiding this ValueError
            raise ValueError("Node %s has invalid coordinates" % str(ndBlock))
        _, (_xg, yg) = Polygon(lXY).getArea_and_CenterOfMass()
        lOrderedNdBlock.append((yg, ndBlock))

    # sort on Y only: comparing the whole tuples would compare the DOM nodes
    # themselves whenever two Y values are equal
    lOrderedNdBlock.sort(key=lambda t: t[0])

    for _, ndBlock in lOrderedNdBlock:
        yield ndBlock
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Generator over the nodes of one page, yielding one Block per node.

    doc       : document, passed to self._get_GraphNodeText
    domNdPage : DOM node of the page; the nodes are selected on it with the
                self.sxpNode XPath expression and the self.dNS namespaces
    page      : page object handed over to each Block

    For each selected node:
    - its text is obtained from self._get_GraphNodeText (a missing text is
      replaced by "" and a warning is traced)
    - nodes without any point are skipped
    - its rectangle is the one fitted by Polygon.fitRectangle, or the
      bounding box of the polygon when the fitting fails
    - if self.BBoxDeltaFun is not None, the rectangle is shrunk on each side
      by BBoxDeltaFun(width) horizontally and BBoxDeltaFun(height) vertically
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
    lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)  # all relevant nodes of the page

    for ndBlock in lNdBlock:
        domid = ndBlock.get("id")
        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText is None:
            sText = ""
            traceln("Warning: no text in node %s"%domid)

        # now we need to infer the bounding box of that object
        lXY = PageXml.getPointList(ndBlock)  # the polygon
        if not lXY:
            continue

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle()
        except (ZeroDivisionError, ValueError):
            # the fitting failed: fall back on the bounding box
            x1, y1, x2, y2 = plg.getBoundingBox()

        # we reduce a bit this rectangle, to avoid overlap
        if self.BBoxDeltaFun is not None:
            w, h = x2 - x1, y2 - y1
            dx = self.BBoxDeltaFun(w)
            dy = self.BBoxDeltaFun(h)
            x1, y1, x2, y2 = [int(round(v)) for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy]]

        orientation = 0  # no meaning for PageXml
        classIndex = 0   # is computed later on

        # and create a Block, given as x, y, width, height
        blk = Block(page, (x1, y1, x2 - x1, y2 - y1), sText, orientation, classIndex, self, ndBlock, domid=domid)

        yield blk

    # NB: a generator must simply return when exhausted. Raising StopIteration
    # here is turned into a RuntimeError since Python 3.7 (PEP 479).
def convertTR2Sep(filename):
    """
    Parse the given file and convert its separator-like TextRegion elements
    into SeparatorRegion lines.

    A TextRegion is converted when the word "separator" occurs in its
    'custom' attribute:
    - its tag becomes 'SeparatorRegion'
    - its points are replaced by the two end points of a horizontal or
      vertical line, placed in the middle of the rectangle fitted on the
      original polygon (or of its bounding box if the fitting raises a
      ValueError)

    Return the parsed, and possibly modified, document.
    """
    print (filename)
    tagname='TextRegion'
    xml = etree.parse(filename)
    dNS = {"pc":"http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"}

    for ndRegion in xml.getroot().findall(f".//pc:{tagname}[@custom]", dNS):
        if "separator" not in ndRegion.get('custom'):
            continue

        ndRegion.tag = 'SeparatorRegion'

        # now we need to convert that object to a line
        lPoints = PageXml.getPointList(ndRegion)  # the polygon
        assert lPoints, "Separator without Coord??"
        poly = Polygon(lPoints)
        try:
            xa, ya, xb, yb = poly.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lPoints)
            xa, ya, xb, yb = poly.getBoundingBox()

        if abs(xb - xa) > abs(yb - ya):
            # wider than high: horizontal line at mid-height
            ya = yb = (ya + yb) / 2
        else:
            # vertical line at mid-width
            xa = xb = (xa + xb) / 2

        ndCoord = ndRegion.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(xa, ya), (xb, yb)])

    return xml
def get_col_partition(doer, sxpCut, dNS, sFilename, lFilterFun, fRatio, bVerbose=False, funIndex=lambda x: x._du_index):
    """
    Return the GT partition in columns, the GT partition of the table data in
    columns, and one predicted partition per filter function.

    doer       : object whose loadPageCol(ndPage, fRatio, funIndex=...) method
                 loads the objects and the GT of a page
    sxpCut     : XPath expression selecting the cut nodes of the page
    dNS        : namespaces for that XPath expression
    sFilename  : file to load; it must contain exactly one Page
    lFilterFun : list of functions, each applied to the _du_nd of an object,
                 telling if the object is to be taken into account
    fRatio     : passed as is to doer.loadPageCol
    bVerbose   : trace more details
    funIndex   : function giving the identifier of an object

    Side effect: the fourth value returned by doer.loadPageCol is stored in
    the module-level variable global_maxHeaderRowSpan.

    Return (lsetGT, lsetDataGT, llsetRun), where llsetRun contains one list
    of sets per filter function.
    """
    global global_maxHeaderRowSpan

    if bVerbose:
        traceln("- loading %s" % sFilename)
    root = etree.parse(sFilename, etree.XMLParser()).getroot()

    llsetRun = []

    lndPage = MultiPageXml.getChildByName(root, 'Page')
    assert len(lndPage) == 1, "NOT SUPPORTED: file has many pages - soorry"

    for pnum, ndPage in enumerate(lndPage, 1):
        if bVerbose:
            traceln(" - page %s - loading table GT" % pnum)

        (loBaseline, dsetTableByCol, dsetTableDataByCol,
         global_maxHeaderRowSpan) = doer.loadPageCol(ndPage, fRatio, funIndex=funIndex)
        if bVerbose:
            traceln(" - found %d objects on page" % (len(loBaseline)))

        # the column keys, in ascending order
        lTableColK = sorted(dsetTableByCol.keys())
        lTableDataColK = sorted(dsetTableDataByCol.keys())

        if bVerbose:
            traceln(" - found %d cols" % (len(lTableColK)))
            traceln(" - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByCol.values())))
            traceln(" - found %d objects in the table data" %
                    (sum(len(v) for v in dsetTableDataByCol.values())))

        lNdCut = ndPage.xpath(sxpCut, namespaces=dNS)
        if bVerbose:
            traceln(" - found %d cuts" % (len(lNdCut)))
        else:
            traceln("- loaded %40s " % sFilename,
                    " %6d cols %6d 'S' cuts" % (len(lTableColK), len(lNdCut)),
                    " %6d objects %6d table objects" %
                    (len(loBaseline), sum(len(v) for v in dsetTableByCol.values())))

        # one shapely line per cut; each cut must have exactly two points
        loCut = [geom.LineString([(xa, ya), (xb, yb)])
                 for (xa, ya), (xb, yb) in map(PageXml.getPointList, lNdCut)]

        # the page size is not used below (no fictive cut is added on the
        # page borders here); the conversion is retained so that a page
        # without these attributes still fails at this point
        w, h = float(ndPage.get("imageWidth")), float(ndPage.get("imageHeight"))

        # order the cuts by line centroid x
        loCut.sort(key=lambda o: o.centroid.x)

        # the GT: one set of identifiers per column key
        lsetGT = [dsetTableByCol[k] for k in lTableColK]
        lsetDataGT = [dsetTableDataByCol[k] for k in lTableDataColK]

        # NOW, look at predictions
        for filterFun in lFilterFun:
            loBaselineInTable = [o for o in loBaseline if filterFun(o._du_nd)]
            if bVerbose:
                traceln(" - %d objects on page predicted in table (%d out)" %
                        (len(loBaselineInTable), len(loBaseline) - len(loBaselineInTable)))

            # the list of partitions created by the cuts
            lsetRun = []
            partition = PolygonPartition(loBaselineInTable)

            setPrevIds = set()  # what was on the right of the previous cut
            for oCut in loCut:
                setCurIds = {funIndex(o) for o in partition.getObjectOnRightOfLine(oCut)}
                # what is no longer on the right is the content of the
                # previous column. An empty set denotes alternative cuts
                # leading to the same partition (and the difference is
                # always empty for the first cut).
                setGoneIds = setPrevIds - setCurIds
                if setGoneIds:
                    lsetRun.append(setGoneIds)
                setPrevIds = setCurIds

            llsetRun.append(lsetRun)

    return lsetGT, lsetDataGT, llsetRun
def resizeCell(self, cell, ns):
    """
    Replace the cell region by a BB for textlines: better for transcriber.

    cell : DOM node of the cell. The 'points' attribute of its first child
           is overwritten (presumably its Coords element - TODO confirm).
    ns   : not used by this implementation

    The new region is the union of the bounding boxes of the TextLine
    children of the cell, enlarged by self.hpadding on the left and on the
    right, and by self.vpadding at the top and at the bottom. Text lines
    without any point, or for which Polygon.fitRectangle raises a
    ZeroDivisionError, are ignored.

    Return True, leaving the cell untouched, if the cell has no usable text
    line; return None once the cell has been resized.
    """
    lTextLines = cell.xpath("./a:TextLine", namespaces={'a': PageXml.NS_PAGE_XML})
    if not lTextLines:
        return True

    # NB: the rectangle of the cell itself is not needed: only the text lines
    # define the new region.

    ## get all the textlines of the cell
    minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
    bFound = False
    for line in lTextLines:
        lXY = PageXml.getPointList(line)  # the polygon
        # in case empty cell
        if not lXY:
            continue
        plg = Polygon(lXY)
        try:
            # called only to discard the polygons on which the fitting fails
            plg.fitRectangle()
        except ZeroDivisionError:
            continue
        x1, y1, x2, y2 = plg.getBoundingBox()
        minx = min(minx, x1)
        miny = min(miny, y1)
        maxx = max(maxx, x2)
        maxy = max(maxy, y2)
        bFound = True

    if not bFound:
        # every text line was ignored: do not write the initial (9e9, 0)
        # values into the cell
        return True

    # finally: simply use the BB of the textlines + padding
    miny -= self.vpadding  # vertical padding (top)
    maxy += self.vpadding  # vertical padding (bottom)
    minx -= self.hpadding  # horizontal padding
    maxx += self.hpadding  # horizontal padding

    corner = cell[0]
    corner.set(
        'points', "%d,%d %d,%d %d,%d %d,%d" %
        (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Generator over the nodes of one page, yielding one Block per node.

    doc       : document, passed to self._get_GraphNodeText
    domNdPage : DOM node of the page; the nodes are selected on it with the
                self.sxpNode XPath expression and the self.dNS namespaces
    page      : page object handed over to each Block

    For each selected node:
    - a missing text is replaced by "". The warning is traced for the first
      33 occurrences only, counted in NodeType_PageXml.nbNoTextWarning.
    - nodes without any point are skipped
    - the rectangle is fitted with self.bPreserveWidth, falling back on the
      bounding box of the polygon when the fitting fails
    - self.BBoxDeltaFun, when not None, shrinks the rectangle: either a pair
      (xFun, yFun) of optional functions, or a single function applied to
      both the width and the height
    - the resulting rectangle is stored on the DOM node as DU_points
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"

    # all relevant nodes of the page
    for ndBlock in domNdPage.xpath(self.sxpNode, namespaces=self.dNS):
        domid = ndBlock.get("id")

        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText is None:
            sText = ""
            NodeType_PageXml.nbNoTextWarning += 1
            nWarning = NodeType_PageXml.nbNoTextWarning
            if nWarning < 33:
                traceln("Warning: no text in node %s" % domid)
            elif nWarning == 33:
                traceln(
                    "Warning: no text in node %s - *** %d repetition : I STOP WARNING ***"
                    % (domid, nWarning))

        # the polygon of the node, from which its rectangle is inferred
        lXY = PageXml.getPointList(ndBlock)
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            left, top, right, bottom = plg.fitRectangle(
                bPreserveWidth=self.bPreserveWidth)
        except (ZeroDivisionError, ValueError):
            # the fitting failed: take the bounding box instead
            left, top, right, bottom = plg.getBoundingBox()

        # we reduce a bit this rectangle, to avoid overlap
        deltaFun = self.BBoxDeltaFun
        if deltaFun is not None:
            if type(deltaFun) is tuple and len(deltaFun) == 2:
                # one optional function per axis
                xFun, yFun = deltaFun
                if xFun is not None:
                    dx = xFun(right - left)
                    left, right = int(round(left + dx)), int(round(right - dx))
                if yFun is not None:
                    dy = yFun(bottom - top)
                    top, bottom = int(round(top + dy)), int(round(bottom - dy))
            else:
                # historical code: the same function for both axes
                dx = deltaFun(right - left)
                dy = deltaFun(bottom - top)
                left, top, right, bottom = [
                    int(round(v))
                    for v in (left + dx, top + dy, right - dx, bottom - dy)
                ]

        # store the rectangle, as its four corners
        lCorner = [(left, top), (right, top), (right, bottom), (left, bottom)]
        ndBlock.set(
            "DU_points",
            " ".join("%d,%d" % (int(x), int(y)) for x, y in lCorner))

        # orientation has no meaning for PageXml (0), and the class index (0)
        # is computed later on. The Block is given as x, y, width, height.
        yield Block(page, (left, top, right - left, bottom - top), sText,
                    0, 0, self, ndBlock, domid=domid)
def mergeBaselineCells(self, coldir, colid, docid):
    """
    Take the files (pxml) with stuff processed on Transkribus
    Take the CVL template tool files (xml)
    merge them
    regenerate a mpxml

    coldir : collection folder; the document folder is <coldir>/<sCOL>/<docid>
    colid  : not used by this implementation
    docid  : name of the document folder; the result is written in the file
             <document folder>.mpxml

    On each page, the TextRegion elements of the CVL page are removed. Then
    the Coords of each text line of the Transkribus page are replaced by a
    box built from its Baseline (extended upward by 30 pixels), and the text
    line is moved into the table cell for which self.signedOverlap with the
    baseline is maximal. When that maximum is 0, or when the table has no
    cell, the text line is put in a new TextRegion appended to the page.

    Raise ValueError if a CVL page has no TableRegion.
    """
    iHeight = 30  # in pixel: height given to the text lines above their baseline

    xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
    mpxml = xmlpath + ".mpxml"

    # Both lists are sorted so that the pages of the two documents are paired
    # in a deterministic way: glob returns the files in arbitrary order.
    # NB: the existing .mpxml is not read, since it is regenerated below.
    lxml = sorted(glob.glob(os.path.join(xmlpath, "*.xml")))
    pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

    lhtrxml = sorted(glob.glob(os.path.join(xmlpath, "*.pxml")))
    mpxmldoc = MultiPageXml.makeMultiPageXml(lhtrxml)

    lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
    lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

    assert len(lXMLPage) == len(lPXMLPage)

    for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
        ## remove TextRegion from the cvl page
        for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
            tr.getparent().remove(tr)

        # all the text lines of the Transkribus page
        lTL = []
        for ndRegion in PageXml.getChildByName(pxmlpage, 'TextRegion'):
            lTL.extend(PageXml.getChildByName(ndRegion, 'TextLine'))

        ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
        if len(ltable) == 0:
            # a string cannot be raised: it must be an exception instance
            raise ValueError("NO TABLE")
        lCells = PageXml.getChildByName(ltable[0], 'TableCell')
        lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

        for ndTL in lTL:
            ## normalization: the region of the text line is built from its baseline
            coord = PageXml.getChildByName(ndTL, 'Coords')[0]
            coordB = PageXml.getChildByName(ndTL, 'Baseline')[0]

            plgBaseline = Polygon(PageXml.getPointList(coordB))
            x1, y1, x2, y2 = plgBaseline.getBoundingBox()
            coord.set(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

            # the cell is chosen from its overlap with the baseline
            lOverlap = [self.signedOverlap(c, plgBaseline) for c in lC]
            # default=0: a table without any cell is handled like a text
            # line which overlaps no cell
            fBest = max(lOverlap, default=0)
            if fBest == 0:
                region = PageXml.createPageXmlNode('TextRegion')
                cvlpage.append(region)
                region.append(ndTL)
            else:
                lCells[lOverlap.index(fBest)].append(ndTL)

    pxmldoc.write(mpxml)
def op_eval_old(lsFilename, fSimil, bDetail=False):
    """
    OLD STYLE (May'18) evaluation of the cuts against the table rows.

    For each page of each file, we load the XML and
    - get the cuts with @orient="0" and @type="S"
    - get the text objects (geometry = a short horizontal line at the
      weighted average position of the Baseline)
    - compare the partition induced by the cuts with the GT rows

    lsFilename : the files to evaluate
    fSimil     : passed as is to evalPartitions
    bDetail    : trace the figures of each page

    Return the cumulated (nOk, nErr, nMiss).
    """
    sxpCut = './/pc:CutSeparator[@orient="0" and @type="S"]'  # how to find the cuts

    doer = SkewedCutAnnotator(True)

    traceln(" - Cut selector = ", sxpCut)

    def _average_xy(ndPolyline):
        """
        COPIED FROM tasks.DU_ABPTableCutAnnotator.BaselineCutAnnotator
        Weighted average X and average Y of a polyline. The weight of a
        segment for X is its extent along Y, and conversely.
        """
        lXY = Polygon.parsePoints(ndPolyline.get('points')).lXY

        # per segment: middle X, weight of X, middle Y, weight of Y
        lSegment = [((xa + xb) / 2.0, abs(yb - ya),
                     (ya + yb) / 2.0, abs(xb - xa))
                    for (xa, ya), (xb, yb) in zip(lXY, lXY[1:])]

        fSumX = sum(x * wx for x, wx, _, _ in lSegment)
        fSumY = sum(y * wy for _, _, y, wy in lSegment)
        fWeightX = sum(wx for _, wx, _, _ in lSegment)
        fWeightY = sum(wy for _, _, _, wy in lSegment)

        if fWeightX > 0:
            Xavg = int(round(fSumX / fWeightX))
        else:
            Xavg = 0
        if fWeightY > 0:
            Yavg = int(round(fSumY / fWeightY))
        else:
            Yavg = 0
        return (Xavg, Yavg)

    def _load_baseline(nd):
        """
        load the baseline as done in DU_ABPTableCutAnnotator:
        a short horizontal line made out of the average point
        """
        x, y = _average_xy(nd)
        return geom.LineString([(x - 10, y), (x + 10, y)])

    nOk, nErr, nMiss = 0, 0, 0

    # load objects: Baseline and Cuts
    for sFilename in lsFilename:
        traceln("- loading %s" % sFilename)
        root = etree.parse(sFilename, etree.XMLParser()).getroot()

        for pnum, ndPage in enumerate(MultiPageXml.getChildByName(root, 'Page'), 1):
            traceln(" - page %s - loading table GT" % pnum)
            loBaseline, dsetTableByRow = doer.loadPage(
                ndPage, shaper_fun=_load_baseline)
            traceln(" - found %d objects on page" % (len(loBaseline)))

            # the row keys, in descending order (bottom to top)
            lTableRowK = sorted(dsetTableByRow.keys(), reverse=True)
            traceln(" - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByRow.values())))

            lNdCut = ndPage.xpath(sxpCut, namespaces={"pc": PageXml.NS_PAGE_XML})
            traceln(" - found %d 'S' cut" % (len(lNdCut)))

            loCut = []
            for ndCut in lNdCut:
                (xa, ya), (xb, yb) = PageXml.getPointList(ndCut)
                # make sure that the cut is above the baseline that created it
                ya, yb = ya - 1, yb - 1
                assert ya == yb  # in this version, the cuts were horizontal
                loCut.append(geom.LineString([(xa, ya), (xb, yb)]))

            # order the cuts by line centroid Y, from bottom to top
            loCut = sorted(loCut, key=lambda o: o.centroid.y, reverse=True)

            # the GT: one set of du_index per row key
            lsetGT = [dsetTableByRow[k] for k in lTableRowK]

            # the list of partitions created by the cuts, excluding the
            # objects that are not in the table
            lsetRun = []
            partition = PolygonPartition(loBaseline)
            setSeenIds = set()  # what was below the previous cut
            for oCut in loCut:
                setBelowIds = {o._du_index
                               for o in partition.getObjectBelowLine(oCut)
                               if _isBaselineInTable(o._du_nd)}
                # only the last row! An empty set denotes alternative cuts
                # leading to the same partition.
                setRowIds = setBelowIds - setSeenIds
                if setRowIds:
                    lsetRun.append(setRowIds)
                setSeenIds = setBelowIds

            nOkPage, nErrPage, nMissPage, _lFound, _lErr, _lMissed = evalPartitions(
                lsetRun, lsetGT, fSimil, jaccard_distance)
            nOk += nOkPage
            nErr += nErrPage
            nMiss += nMissPage

            if bDetail:
                fPPage, fRPage, fFPage = computePRF(nOkPage, nErrPage, nMissPage)
                traceln(
                    "ok=%d err=%d miss=%d P=%.1f R=%.1f F1=%.1f %s page=%d"
                    % (nOkPage, nErrPage, nMissPage, fPPage, fRPage, fFPage,
                       sFilename, pnum))

    fP, fR, fF = computePRF(nOk, nErr, nMiss)
    traceln("SUMMARY == P=%.1f%%\tR=%.1f%%\tF1=%.1f" % (fP, fR, fF))
    traceln("ok=%d err=%d miss=%d P=%.1f R=%.1f F1=%.1f" %
            (nOk, nErr, nMiss, fP, fR, fF))

    return (nOk, nErr, nMiss)
def get_row_partition(doer, sxpCut, dNS, sFilename, lFilterFun, bCutAbove=True, bVerbose=False, funIndex=lambda x: x._du_index, bIgnoreHeader=False):
    """
    Return the GT partition in rows, as well as one predicted partition per
    filter function.

    doer          : object whose loadPage(ndPage, funIndex=..., bIgnoreHeader=...)
                    method loads the objects and the GT of a page
    sxpCut        : XPath expression selecting the cut nodes of the page
    dNS           : namespaces for that XPath expression
    sFilename     : file to load; it must contain exactly one Page
    lFilterFun    : list of functions, each applied to the _du_nd of an
                    object, telling if the object is to be taken into account
    bCutAbove     : True if a cut is above the text that led to its creation,
                    False if it is below
    bVerbose      : trace more details
    funIndex      : function giving the identifier of an object
    bIgnoreHeader : passed as is to doer.loadPage

    Two fictive cuts are added, at the top and at the end of the page, using
    the imageWidth and imageHeight attributes of the Page.

    Return (lsetGT, llsetRun), where llsetRun contains one list of sets per
    filter function.
    """
    # load objects: Baseline and Cuts
    if bVerbose:
        traceln("- loading %s" % sFilename)
    root = etree.parse(sFilename, etree.XMLParser()).getroot()

    llsetRun = []

    lndPage = MultiPageXml.getChildByName(root, 'Page')
    assert len(lndPage) == 1, "NOT SUPPORTED: file has many pages - soorry"

    for pnum, ndPage in enumerate(lndPage, 1):
        if bVerbose:
            traceln(" - page %s - loading table GT" % pnum)

        loBaseline, dsetTableByRow = doer.loadPage(
            ndPage, funIndex=funIndex, bIgnoreHeader=bIgnoreHeader)
        if bVerbose:
            traceln(" - found %d objects on page" % (len(loBaseline)))

        # the row keys, in ascending order
        lTableRowK = sorted(dsetTableByRow.keys())
        if bVerbose:
            traceln(" - found %d rows" % (len(lTableRowK)))
            traceln(" - found %d objects in the table" %
                    (sum(len(v) for v in dsetTableByRow.values())))

        lNdCut = ndPage.xpath(sxpCut, namespaces=dNS)
        if bVerbose:
            traceln(" - found %d 'S' cut" % (len(lNdCut)))
        else:
            traceln("- loaded %40s " % sFilename,
                    " %6d rows %6d 'S' cuts" % (len(lTableRowK), len(lNdCut)),
                    " %6d objects %6d table objects" %
                    (len(loBaseline), sum(len(v) for v in dsetTableByRow.values())))

        # one shapely line per cut; each cut must have exactly two points
        loCut = [geom.LineString([(xa, ya), (xb, yb)])
                 for (xa, ya), (xb, yb) in map(PageXml.getPointList, lNdCut)]

        w, h = float(ndPage.get("imageWidth")), float(ndPage.get("imageHeight"))
        # a fictive cut at top of page, and another one at end of page
        loCut.extend([geom.LineString([(0, 0), (w, 0)]),
                      geom.LineString([(0, h), (w, h)])])

        # order the cuts by line centroid Y
        loCut.sort(key=lambda o: o.centroid.y)

        # the GT: one set of identifiers per row key
        lsetGT = [dsetTableByRow[k] for k in lTableRowK]

        # NOW, look at predictions
        for filterFun in lFilterFun:
            loBaselineInTable = [o for o in loBaseline if filterFun(o._du_nd)]
            if bVerbose:
                traceln(" - %d objects on page predicted in table (%d out)" %
                        (len(loBaselineInTable), len(loBaseline) - len(loBaselineInTable)))

            # the list of partitions created by the cuts
            lsetRun = []
            partition = PolygonPartition(loBaselineInTable)
            setPrevIds = set()  # what the previous cut had selected

            if bCutAbove:
                # cut above the text that led to its creation: what is no
                # longer below the cut is the content of the previous row
                for oCut in loCut:
                    setCurIds = {funIndex(o) for o in partition.getObjectBelowLine(oCut)}
                    # an empty set denotes alternative cuts leading to the
                    # same partition (always the case for the first cut)
                    setRowIds = setPrevIds - setCurIds
                    if setRowIds:
                        lsetRun.append(setRowIds)
                    setPrevIds = setCurIds
            else:
                # cut below the text that led to its creation: what is newly
                # above the cut is the last row
                for oCut in loCut:
                    setCurIds = {funIndex(o) for o in partition.getObjectAboveLine(oCut)}
                    # an empty set denotes alternative cuts leading to the
                    # same partition
                    setRowIds = setCurIds - setPrevIds
                    if setRowIds:
                        lsetRun.append(setRowIds)
                    setPrevIds = setCurIds

            llsetRun.append(lsetRun)

    return lsetGT, llsetRun