def _iter_TextRegionNodeTop2Bottom(self, domNdPage, page):
    """
    Iterator on the DOM nodes selected by self.sxpNode, from top to bottom
    of the page.

    The nodes are ordered on the Y of the center of mass of their polygon,
    so that TextRegions of different resolution are not interleaved.
    Nodes with the same Y keep the order in which the xpath returned them.

    domNdPage: the DOM page node on which the xpath is evaluated
    page: the page object (not used here)
    yield the DOM nodes
    raise ValueError if a node has no coordinates
    """
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
    lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)

    # compute the Y of each block
    lOrderedNdBlock = list()
    for ndBlock in lNdBlock:
        lXY = PageXml.getPointList(ndBlock)  #the polygon
        if lXY == []:
            # %s and not %x: the value is a string, %x would raise a TypeError
            raise ValueError("Node %s has invalid coordinates" % str(ndBlock))
        plg = Polygon(lXY)
        _, (_xg, yg) = plg.getArea_and_CenterOfMass()
        lOrderedNdBlock.append((yg, ndBlock))

    # sort on Y only: sorting the tuples themselves would compare the DOM
    # nodes whenever two blocks have the same Y
    lOrderedNdBlock.sort(key=lambda tYNd: tYNd[0])

    for _, ndBlock in lOrderedNdBlock:
        yield ndBlock
def regularTextLines(self, doc):
    """
    from a baseline: create a regular TextLine: rectangle along the baseline

    For each TextLine of the document, the 'points' of its first child are
    replaced by a rectangle computed from the bounding box of the points of
    the next sibling of that child (the baseline), extended upward by a
    fixed height.

    doc: the document (accessed with getRootElement/children/prop/setProp)
    Modify the DOM, return nothing.
    """
    iHeight = 50  # not points!!  300 dpi : 62.5

    lTextLines = PageXml.getChildByName(doc.getRootElement(), 'TextLine')
    for tl in lTextLines:
        coord = tl.children
        baseline = tl.children.next
        sPoints = baseline.prop('points')

        lXY = list()
        for sPair in sPoints.split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                # malformed pair: show the text line and ignore the pair
                # (print as a function call: valid in Python 2 and Python 3)
                print(tl)

        x1, y1, x2, y2 = Polygon(lXY).getBoundingBox()
        coord.setProp(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Get the DOM, the DOM page node, the page object

    iterator on the DOM, that returns nodes (of class Block)

    For each DOM node selected by self.sxpNode, a Block is created from its
    text and from a rectangle fitted to its polygon (or its bounding box
    when the fit fails). Nodes without coordinates are skipped.
    """
    #--- XPATH contexts
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"
    lNdBlock = domNdPage.xpath(self.sxpNode, namespaces=self.dNS)  #all relevant nodes of the page

    for ndBlock in lNdBlock:
        domid = ndBlock.get("id")
        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText is None:
            sText = ""
            traceln("Warning: no text in node %s"%domid)

        #now we need to infer the bounding box of that object
        lXY = PageXml.getPointList(ndBlock)  #the polygon
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except (ZeroDivisionError, ValueError):
            # the rectangle cannot be fitted: fall back on the bounding box
            x1,y1,x2,y2 = plg.getBoundingBox()

        #we reduce a bit this rectangle, to avoid overlap
        if self.BBoxDeltaFun is not None:
            w,h = x2-x1, y2-y1
            dx = self.BBoxDeltaFun(w)
            dy = self.BBoxDeltaFun(h)
            x1,y1, x2,y2 = [ int(round(v)) for v in [x1+dx,y1+dy, x2-dx,y2-dy] ]

        #TODO
        orientation = 0  #no meaning for PageXml
        classIndex = 0   #is computed later on

        #and create a Block
        blk = Block(page, (x1, y1, x2-x1, y2-y1), sText, orientation, classIndex, self, ndBlock, domid=domid)

        yield blk

    # no explicit "raise StopIteration()": inside a generator it is turned
    # into a RuntimeError (PEP 479); falling off the end is enough
def getPolylineAverageXY(cls, ndPolyline):
    """
    Weighted average X and weighted average Y of a polyline.

    Each segment contributes its middle X, weighted by its extent along Y,
    and its middle Y, weighted by its extent along X.

    ndPolyline: node with a 'points' attribute
    return (Xavg, Yavg) as rounded integers; an average whose total weight
    is not positive is 0
    """
    lPoint = Polygon.parsePoints(ndPolyline.get('points')).lXY

    # consecutive points form the segments
    lSegment = list(zip(lPoint, lPoint[1:]))

    lMidX = [(xa + xb) / 2.0 for (xa, _ya), (xb, _yb) in lSegment]
    lWeightX = [abs(yb - ya) for (_xa, ya), (_xb, yb) in lSegment]  # for how long at this X?
    lMidY = [(ya + yb) / 2.0 for (_xa, ya), (_xb, yb) in lSegment]
    lWeightY = [abs(xb - xa) for (xa, _ya), (xb, _yb) in lSegment]

    fNumX = sum(x * wx for x, wx in zip(lMidX, lWeightX))
    fNumY = sum(y * wy for y, wy in zip(lMidY, lWeightY))
    fDenX = sum(lWeightX)
    fDenY = sum(lWeightY)

    if fDenX > 0:
        Xavg = int(round(fNumX / fDenX))
    else:
        Xavg = 0

    if fDenY > 0:
        Yavg = int(round(fNumY / fDenY))
    else:
        Yavg = 0

    return (Xavg, Yavg)
def add_cluster_to_dom(root, llX):
    """
    Cluster the Textline based on the vertical cuts

    root: DOM root, its 'Page' nodes are paired, in order, with the lists of llX
    llX: one list of cut X per page.
         NOTE: each list is modified in place (the page width is appended,
         then the list is sorted)

    Each TextLine whose rectangle can be fitted gets a "DU_cluster" attribute:
    the index of the first cut at or after the middle X of the rectangle.
    The clusters are then added to the page by addClusterToDom, and each
    cluster node gets a "cut_X" attribute.
    Modify the DOM, return nothing.
    """

    # hack to use addClusterToDom
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        lX.append(w)
        lX.sort()

        # cluster of objects, one per cut
        imax = len(lX)
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline, 'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                # no rectangle for this line: it is left out of the clusters
                continue
            xm = (x1 + x2) / 2.0

            # index of the first cut at or after xm; imax when the line is
            # beyond every cut (e.g. beyond the declared page width)
            iCluster = imax
            for i, xi in enumerate(lX):
                if xm <= xi:
                    iCluster = i
                    break

            # setdefault: the key imax is not part of the initial dictionary
            dCluster.setdefault(iCluster, list()).append(iNd)
            ndTextline.set("DU_cluster", str(iCluster))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster, bMoveContent=False, sAlgo="cut", pageNode=ndPage)

        # add a cut_X attribute to the clusters
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                ndCluster.set("cut_X", str(lX[i]))
            # else: cluster of the lines beyond every cut, there is no cut X
def regularTextLinesold(self, doc):
    """
    from a baseline: create a regular TextLine

    For each TextLine of the document, the 'points' of its Coords child are
    replaced by a rectangle computed from the bounding box of its Baseline,
    extended upward by a fixed height (in pixel).

    NOTE: sets self.xmlns to the PAGE 2013-07-15 namespace.
    Modify the DOM, return nothing.
    """
    self.xmlns = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
    dNS = {"a": self.xmlns}

    # 50 seems to large: the manual GT is 30 ? not always!
    iHeight = 30  # in pixel

    lTextLines = PageXml.getChildByName(doc.getroot(), 'TextLine')
    for tl in lTextLines:
        # an IndexError is raised if the TextLine has no Coords or no Baseline
        coord = tl.xpath("./a:Coords", namespaces=dNS)[0]
        baseline = tl.xpath("./a:Baseline", namespaces=dNS)[0]

        lXY = list()
        for sPair in baseline.get('points').split(' '):
            try:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            except ValueError:
                # malformed pair: show the text line and ignore the pair
                print(tl)

        # the bounding box is all we need: no shapely geometry is built
        x1, y1, x2, y2 = Polygon(lXY).getBoundingBox()
        coord.set(
            'points', "%d,%d %d,%d %d,%d %d,%d" %
            (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
def convertTR2Sep(filename):
    """
    Convert the TextRegion tagged as separator into SeparatorRegion.

    A TextRegion is converted when its 'custom' attribute contains
    "separator". Its Coords are replaced by a 2-point line: the horizontal
    or vertical median of the rectangle fitted to its polygon (bounding box
    if the fit fails), depending on which side is the longest.

    filename: the file to parse
    return the parsed and modified tree
    """
    print (filename)

    sNS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"
    tagname='TextRegion'
    xml = etree.parse(filename)
    ltextsep = xml.getroot().findall(f".//pc:{tagname}[@custom]", {"pc": sNS})

    for x in ltextsep:
        if "separator" not in x.get('custom'):
            continue

        # keep the element in the PAGE namespace: a bare tag name would
        # create an element without namespace in the tree
        x.tag = "{%s}SeparatorRegion" % sNS

        #now we need to convert that object to a line
        lXY = PageXml.getPointList(x)  #the polygon
        assert lXY, "Separator without Coord??"

        plg = Polygon(lXY)
        try:
            x1,y1, x2,y2 = plg.fitRectangle()
        except ValueError:
            print("Warning: Coords might be bad, taking bounding box: ", lXY)
            x1,y1,x2,y2 = plg.getBoundingBox()

        if abs(x2-x1) > abs(y2-y1):
            # horizontal
            y1 = (y1+y2)/2
            y2 = y1
        else:
            # vertical
            x1 = (x1+x2)/2
            x2 = x1

        ndCoord = x.xpath(".//pc:Coords", namespaces={"pc":PageXml.NS_PAGE_XML})[0]
        PageXml.setPoints(ndCoord, [(x1,y1), (x2,y2)])

    return xml
def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
    """
    get the x and y of the GT table separators

    A separator (2 points) is horizontal when its extent along X is larger
    than along Y, vertical otherwise. It is kept only if its extent is
    larger than fMinPageCoverage times the page width (horizontal) or the
    page height (vertical).

    return, per page, the list of (y1, y2) of the horizontal separators and
    the list of (x1, x2) of the vertical ones:
        [(y_list, x_list), ...]
    """
    ltlYlX = []
    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        w = int(ndPage.get("imageWidth"))
        h = int(ndPage.get("imageHeight"))
        lYi = []
        lXi = []

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        nbTable = len(lndTable)
        if nbTable != 1:
            if nbTable:
                traceln(
                    "** warning ** %d TableRegion instead of expected 1" %
                    nbTable)
            else:
                traceln("** warning ** no TableRegion, expected 1")

        for ndTR in lndTable:
            #enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(ndTR, 'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                [(x1, y1), (x2, y2)] = Polygon.parsePoints(ndCoords.get('points')).lXY

                dx = abs(x2 - x1)
                dy = abs(y2 - y1)
                if dx > dy:
                    #horizontal table line
                    if dx > (fMinPageCoverage * w):
                        lYi.append((y1, y2))
                elif dy > (fMinPageCoverage * h):
                    #vertical table line
                    lXi.append((x1, x2))

        ltlYlX.append((lYi, lXi))

    return ltlYlX
def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
    """
    get the index in our grid of the table lines

    A separator (2 points) is horizontal when its extent along X is larger
    than along Y, vertical otherwise. It is kept only if its extent is
    larger than fMinPageCoverage times the page width (horizontal) or the
    page height (vertical). Its middle coordinate is snapped to the grid
    with self.snapToGridIndex.

    return lists of index, for horizontal and for vertical grid lines, per page
        [(h_list, v_list), ...]
    """
    ltlHlV = []
    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        w = int(ndPage.get("imageWidth"))
        h = int(ndPage.get("imageHeight"))
        lHi = []
        lVi = []

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        if lndTable:
            assert len(lndTable) == 1, "More than 1 TableRegion??"

            #enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(lndTable[0], 'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                [(x1, y1), (x2, y2)] = Polygon.parsePoints(ndCoords.get('points')).lXY

                dx = abs(x2 - x1)
                dy = abs(y2 - y1)
                if dx > dy:
                    #horizontal table line
                    if dx > (fMinPageCoverage * w):
                        ym = (y1 + y2) / 2.0  # 2.0 to support python2
                        lHi.append(self.snapToGridIndex(ym, self.iGridVertiStep))
                elif dy > (fMinPageCoverage * h):
                    #vertical table line
                    xm = (x1 + x2) / 2.0
                    lVi.append(self.snapToGridIndex(xm, self.iGridHorizStep))

        ltlHlV.append((lHi, lVi))

    return ltlHlV
def getHisto(self, lNd, w, _fMinHorizProjection, h, _fMinVertiProjection,
             fRatio=1.0, fMinHLen=None):
    """
    return two Numpy array reflecting the histogram of projections of objects
    first array along Y axis (horizontal projection), 2nd along X axis
    (vertical projection)

    lNd: the nodes, each with a 'Coords' child holding the 'points'
    w, h: length of the X histogram and of the Y histogram
    _fMinHorizProjection, _fMinVertiProjection: not used
    fRatio: passed to self.scale to scale each object
    when fMinHLen is given, we do not scale horizontally text shorter than fMinHLen

    Objects for which the computation raises ZeroDivisionError or ValueError
    are ignored.
    """
    # builtin float: the np.float alias no longer exists in recent Numpy
    hy = np.zeros((h, ), float)
    hx = np.zeros((w, ), float)

    for nd in lNd:
        sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
        try:
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

            if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                _x1, _x2 = self.scale(x1, x2, fRatio)
            else:
                _x1, _x2 = x1, x2
            _y1, _y2 = self.scale(y1, y2, fRatio)

            # each object weighs its (unscaled) width, resp. height,
            # relative to the page
            hy[_y1:_y2 + 1] += float(x2 - x1) / w
            hx[_x1:_x2 + 1] += float(y2 - y1) / h
        except (ZeroDivisionError, ValueError):
            pass

    return hy, hx
def getTBLRBorders(lNodes):
    """
    Compute the borders formed by the given cells.

    The segments of each cell polygon are partitioned into top, right,
    bottom and left segments; cells whose polygon cannot be partitioned are
    ignored.

    return the convex hull of the top, of the right, of the bottom and of
    the left segments.
    NOTE: the order is top, right, bottom, left
    """
    llT, llR, llB, llL = [], [], [], []

    for bordercell in lNodes:
        coord = bordercell.xpath("./a:%s" % ("Coords"),
                                 namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except Exception:
            # best effort: skip this cell, instead of re-adding the segments
            # of the previous cell (or failing on undefined names)
            continue

        llT.extend([[(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lT])
        llR.extend([[(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lR])
        llB.extend([[(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lB])
        llL.extend([[(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lL])

    return (MultiLineString(llT).convex_hull,
            MultiLineString(llR).convex_hull,
            MultiLineString(llB).convex_hull,
            MultiLineString(llL).convex_hull)
def reintegrateTextIntoCells(self, doc, lLTextLines=[]):
    """
    from XMLDSTABLE

    For each page, a copy of each TextLine is appended to the TableCell
    whose bounding box overlaps most the bounding box of the TextLine.
    TextLines overlapping no cell are left untouched.

    doc: the document
    lLTextLines: if not empty, the list of TextLine to consider, per page
        (indexed like the pages); otherwise the TextLine of each page are used

    NOTE(review): the default value is a shared list; it is only read here.
    NOTE(review): the code mixes doc.getroot()/xpath(namespaces=)/get() with
        .parent, .doc, docCopyNode() and getParent(); presumably a partial
        port between two XML libraries - confirm those calls exist on the
        node type actually passed in.
    """
    def overlapX(zone1, zone2):
        # True if the two (min, max) intervals intersect (bounds included)
        [a1, a2] = zone1  #self.getX(),self.getX()+ self.getWidth()
        [b1, b2] = zone2  #zone.getX(),zone.getX()+ zone.getWidth()
        return min(a2, b2) >= max(a1, b1)

    def overlapY(zone1, zone2):
        # same test as overlapX, used for the Y intervals
        [a1, a2] = zone1  #self.getY(),self.getY() + self.getHeight()
        [b1, b2] = zone2  #zone.getY(),zone.getY() + zone.getHeight()
        return min(a2, b2) >= max(a1, b1)

    def signedRatioOverlap(zone1, zone2):
        """
        overlap self and zone
        return surface of self in zone

        zone1, zone2: (x1, y1, x2, y2)
        return the area of the intersection divided by the area of zone1
        (divided by 1.0 if zone1 has a null area), or 0.0 without intersection
        """
        [
            x1, y1, x12, y12
        ] = zone1  #self.getX(),self.getY(),self.getHeight(),self.getWidth()
        [
            x2, y2, x22, y22
        ] = zone2  #zone.getX(),zone.getY(),zone.getHeight(),zone.getWidth()

        w1, h1 = x12 - x1, y12 - y1
        w2, h2 = x22 - x2, y22 - y2
        fOverlap = 0.0

        # print (x1,x12),(x2,x22)
        # print overlapX((x1,x12),(x2,x22))
        # print (y1,y12),(y2,y22)
        # print overlapY((y1,y12),(y2,y22))
        # if overlapX((x1,w1),(x2,w2)) and overlapY((y1,h1),(y2,h2)):
        if overlapX((x1, x12), (x2, x22)) and overlapY((y1, y12), (y2, y22)):
            [x11, y11, x12, y12] = [x1, y1, x1 + w1, y1 + h1]
            [x21, y21, x22, y22] = [x2, y2, x2 + w2, y2 + h2]

            s1 = w1 * h1

            # possible ?
            if s1 == 0:
                s1 = 1.0

            #intersection
            nx1 = max(x11, x21)
            nx2 = min(x12, x22)
            ny1 = max(y11, y21)
            ny2 = min(y12, y22)
            # h holds the extent along X and w the extent along Y; only
            # their product is used
            h = abs(nx2 - nx1)
            w = abs(ny2 - ny1)

            inter = h * w
            if inter > 0:
                fOverlap = inter / s1
            else:
                # if overX and Y this is not possible !
                fOverlap = 0.0

        return fOverlap

    def bestRegionsAssignment(plgtl, lRegions):
        """
        find the best (max overlap for self) region for self

        plgtl: the polygon of the text line
        lRegions: list of (node, polygon)
        return the (node, polygon) with the highest overlap ratio, or None
        if no region overlaps
        NOTE(review): max() raises a ValueError if lRegions is empty
        """
        lOverlap = []
        for _, plg in lRegions:
            lOverlap.append(
                signedRatioOverlap(plgtl.getBoundingBox(),
                                   plg.getBoundingBox()))
        # print plgtl.getBoundingBox(), lOverlap
        if max(lOverlap) == 0:
            return None
        return lRegions[lOverlap.index(max(lOverlap))]

    lPages = PageXml.getChildByName(doc.getroot(), 'Page')
    lRegionsToBeDeleted = []
    for i, page in enumerate(lPages):
        if lLTextLines == []:
            lTextLines = PageXml.getChildByName(page, 'TextLine')
        else:
            lTextLines = lLTextLines[i]

        lCells = PageXml.getChildByName(page, 'TableCell')

        # print len(lCells),len(lTextLines)

        # the cells of the page, with their polygon
        lOCells = []
        for cell in lCells:
            #get Coords
            xpath = "./a:%s" % ("Coords")
            lCoords = cell.xpath(xpath, namespaces={"a": self.xmlns})
            coord = lCoords[0]
            sPoints = coord.get('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            plg = Polygon(lXY)
            lOCells.append((cell, plg))

        # find the best assignment of each text
        for tl in lTextLines:
            #get Coords
            xpath = "./a:%s" % ("Coords")
            lCoords = tl.xpath(xpath, namespaces={"a": self.xmlns})
            coord = lCoords[0]
            sPoints = coord.get('points')
            lsPair = sPoints.split(' ')
            lXY = list()
            for sPair in lsPair:
                (sx, sy) = sPair.split(',')
                lXY.append((int(sx), int(sy)))
            plg = Polygon(lXY)
            cell = bestRegionsAssignment(plg, lOCells)
            if cell:
                c, _ = cell
                # NOTE(review): c is the cell, so this records the parent of
                # the cell, while the comment below talks about the parent
                # TextRegion of the text line - confirm the intent
                lRegionsToBeDeleted.append(c.parent)
                ## what about parent TextRegion  delete at least TextRegion/TextEquiv
                # tl.unlinkNode()
                tlcp = tl.docCopyNode(c.doc, True)
                # tlcp.unlinkNode()
                c.append(tlcp)
                # print c

    # NOTE(review): the same region is recorded once per assigned text line,
    # so it may appear several times in this list - confirm remove() copes
    for region in lRegionsToBeDeleted:
        region.getParent().remove(region)
def mergeBaselineCells(self, coldir, colid, docid):
    """
    Take a file (pxml) with stuff processed on Transkribus
    Take the CVL template tool xml (xml)
    merge them
    regenerate a mpxml

    coldir, docid: the pages are read from the folder <coldir>/<sCOL>/<docid>
        ("*.xml" for the template pages, "*.pxml" for the processed pages)
    colid: not used

    For each page, the TextRegion of the template page are removed. Then
    each TextLine of the processed page gets a rectangular Coords computed
    from its Baseline, and is moved into the TableCell it overlaps most
    (self.signedOverlap), or into a new TextRegion of the page if it
    overlaps no cell.
    The result is written to <coldir>/<sCOL>/<docid>.mpxml

    raise ValueError if a page has no TableRegion
    """
    xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))

    mpxml = xmlpath + ".mpxml"
    # NOTE: the parsed document is replaced a few lines below; the call is
    # kept so that an unreadable existing .mpxml still raises here
    mpxmldoc = etree.parse(mpxml)

    lsXmlFile = glob.glob(os.path.join(xmlpath, "*.xml"))
    pxmldoc = MultiPageXml.makeMultiPageXml(lsXmlFile)

    lsPxmlFile = glob.glob(os.path.join(xmlpath, "*.pxml"))
    mpxmldoc = MultiPageXml.makeMultiPageXml(lsPxmlFile)

    lPXMLPage = PageXml.getChildByName(mpxmldoc.getroot(), 'Page')
    lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

    assert len(lXMLPage) == len(lPXMLPage)

    iHeight = 30  # in pixel

    for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
        ## remove TextRegion from xcvlpage
        for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
            tr.getparent().remove(tr)

        # all the text lines of the processed page
        lTL = []
        for ndTR in PageXml.getChildByName(pxmlpage, 'TextRegion'):
            lTL.extend(PageXml.getChildByName(ndTR, 'TextLine'))

        ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
        if len(ltable) == 0:
            # a real exception: raising a plain string is not valid
            raise ValueError("NO TABLE")
        lCells = PageXml.getChildByName(ltable[0], 'TableCell')
        lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

        for ndTL in lTL:
            ## normalization
            coord = PageXml.getChildByName(ndTL, 'Coords')[0]
            coordB = PageXml.getChildByName(ndTL, 'Baseline')[0]

            plgBaseline = Polygon(PageXml.getPointList(coordB))
            x1, y1, x2, y2 = plgBaseline.getBoundingBox()
            coord.set(
                'points', "%d,%d %d,%d %d,%d %d,%d" %
                (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

            lOverlap = [self.signedOverlap(c, plgBaseline) for c in lC]

            if not lOverlap or max(lOverlap) == 0:
                # no cell at all, or no overlapping cell:
                # the line goes in a new TextRegion of the page
                region = PageXml.createPageXmlNode('TextRegion')
                cvlpage.append(region)
                region.append(ndTL)
            else:
                cell = lCells[lOverlap.index(max(lOverlap))]
                cell.append(ndTL)

    pxmldoc.write(mpxml)
def _iter_GraphNode(self, doc, domNdPage, page):
    """
    Get the DOM, the DOM page node, the page object

    iterator on the DOM, that returns nodes (of class Block)

    For each DOM node selected by self.sxpNode, a Block is created from its
    text and from a rectangle fitted to its polygon (bounding box if the fit
    fails), possibly shrunk by self.BBoxDeltaFun. The rectangle is stored in
    the "DU_points" attribute of the DOM node.
    Nodes without coordinates are skipped.
    """
    #--- XPATH contexts
    assert self.sxpNode, "CONFIG ERROR: need an xpath expression to enumerate elements corresponding to graph nodes"

    #all relevant nodes of the page
    for ndBlock in domNdPage.xpath(self.sxpNode, namespaces=self.dNS):
        domid = ndBlock.get("id")

        sText = self._get_GraphNodeText(doc, domNdPage, ndBlock)
        if sText is None:
            sText = ""
            # warn, but not more than 33 times
            NodeType_PageXml.nbNoTextWarning += 1
            if NodeType_PageXml.nbNoTextWarning < 33:
                traceln("Warning: no text in node %s" % domid)
            elif NodeType_PageXml.nbNoTextWarning == 33:
                traceln(
                    "Warning: no text in node %s  - *** %d repetition : I STOP WARNING ***"
                    % (domid, NodeType_PageXml.nbNoTextWarning))

        #now we need to infer the bounding box of that object
        lXY = PageXml.getPointList(ndBlock)  #the polygon
        if lXY == []:
            continue

        plg = Polygon(lXY)
        try:
            x1, y1, x2, y2 = plg.fitRectangle(
                bPreserveWidth=self.bPreserveWidth)
        except (ZeroDivisionError, ValueError):
            x1, y1, x2, y2 = plg.getBoundingBox()

        #we reduce a bit this rectangle, to avoid overlap
        if self.BBoxDeltaFun is not None:
            bPairOfFun = type(self.BBoxDeltaFun) is tuple and len(
                self.BBoxDeltaFun) == 2
            if bPairOfFun:
                # one function per axis, any of them may be None
                xFun, yFun = self.BBoxDeltaFun
                if xFun is not None:
                    dx = xFun(x2 - x1)
                    x1, x2 = int(round(x1 + dx)), int(round(x2 - dx))
                if yFun is not None:
                    dy = yFun(y2 - y1)
                    y1, y2 = int(round(y1 + dy)), int(round(y2 - dy))
            else:
                # historical code: the same function for both axes
                dx = self.BBoxDeltaFun(x2 - x1)
                dy = self.BBoxDeltaFun(y2 - y1)
                x1 = int(round(x1 + dx))
                y1 = int(round(y1 + dy))
                x2 = int(round(x2 - dx))
                y2 = int(round(y2 - dy))

        # store the rectangle
        lCorner = [(x1, y1), (x2, y1), (x2, y2), (x1, y2)]
        sDUPoints = " ".join("%d,%d" % (int(x), int(y)) for x, y in lCorner)
        ndBlock.set("DU_points", sDUPoints)

        #TODO
        orientation = 0  #no meaning for PageXml
        classIndex = 0   #is computed later on

        #and create a Block
        yield Block(page, (x1, y1, x2 - x1, y2 - y1),
                    sText,
                    orientation,
                    classIndex,
                    self,
                    ndBlock,
                    domid=domid)
def predict(self, sFilename):
    """
    Compute the cuts of the first page of the file, place each TextLine in
    the resulting table, and label the TextLine of each table cell with the
    BIESO scheme (attribute cutDU_row).

    The annotated document is written next to the input file, the last 6
    characters of the name (".mpxml" expected) being replaced by "_cut.mpxml".

    return Y_pred, YGT    (shape=(nb_node,) dtype=int)
    """
    #for the pretty printer to format better...
    assert os.path.isfile(sFilename), sFilename
    doc = etree.parse(sFilename, self.parser)
    root = doc.getroot()

    # Find cuts
    lY, lX = self.add_cut_to_DOM(root,
                                 fMinHorizProjection=self.fMinHorizProjection,
                                 fMinVertiProjection=self.fMinVertiProjection)

    # ################################################################
    # NOTE : we will assumes first and last row/column contain Other
    # ################################################################

    lyy = list(zip(lY[:-1], lY[1:]))  # list of intervals
    lxx = list(zip(lX[:-1], lX[1:]))  # list of intervals

    # dTable[i][j] --> list of TExLine in that cell
    dTable = collections.defaultdict(lambda: collections.defaultdict(list))

    def getTableIndex(v, lvv):
        """
        index in the table row or columns. The 1st border is at 0
        """
        for i, (v1, v2) in enumerate(lvv):
            if v1 <= v <= v2:
                return i+1
        if v < lvv[0][0]:
            return 0
        else:
            return len(lvv)+1

    #place each TextLine in the table rows and columns
    ndPage = MultiPageXml.getChildByName(root, 'Page')[0]

    lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')

    imax, jmax = -1, -1
    for nd in lndTexLine:
        sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
        x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

        i = getTableIndex((y1+y2)/2.0, lyy)
        j = getTableIndex((x1+x2)/2.0, lxx)

        dTable[i][j].append((y1, y2, x1, x2, nd))
        imax = max(i, imax)
        jmax = max(j, jmax)

    def getGT(nd):
        # index of the GT label of the node, 0 when the label is unknown
        try:
            return CutPredictor.dLblIndex[nd.get('DU_row')]
        except KeyError:
            return 0

    def overlap(a, b):
        # horizontal overlap of two (y1, y2, x1, x2, nd): positive if they overlap
        _y1, _y2, ax1, ax2, _nd = a
        _y1, _y2, bx1, bx2, _nd = b
        return min(ax2, bx2) - max(ax1, bx1)

    def label(lt):
        """
        set the attribute cutDU_row to each node for BIESO labelling
        """
        # sort on the coordinates only: sorting the full tuples would
        # compare the DOM nodes in case of identical coordinates
        lt.sort(key=lambda t: t[:4])

        #the 'O'
        newlt = []
        for (y1, y2, x1, x2, nd) in lt:
            bInMargin = x2 < 350 or x1 > 4600
            if bInMargin:
                nd.set('cutDU_row', 'O')
            else:
                newlt.append((y1, y2, x1, x2, nd))

        for i, t in enumerate(newlt):
            nd = t[4]

            #is there someone above?? if yes get the closest node above
            nd_just_above = None
            for j in range(i-1, -1, -1):
                tt = newlt[j]
                if overlap(tt, t) > 0:
                    nd_just_above = tt[4]
                    if nd_just_above.get('cutDU_row') != 'O':
                        break

            if nd_just_above is None:
                #S by default
                nd.set('cutDU_row', 'S')
            else:
                sAbove = nd_just_above.get('cutDU_row')
                if sAbove == 'S':
                    nd_just_above.set('cutDU_row', 'B')
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'E':
                    nd_just_above.set('cutDU_row', 'I')
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'I':
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'B':
                    #bad luck, we do not see the intermediary node
                    nd.set('cutDU_row', 'E')
                elif sAbove == 'O':
                    raise Exception('Internal error')

    #now set the BIESO labels... (only vertically for now)
    liGTLabel = list()
    liLabel = list()
    for i in range(0, imax+1):
        dRow = dTable[i]
        for j in range(0, jmax+1):
            lt = dRow[j]
            if not lt:
                continue
            label(lt)

            liGTLabel.extend([getGT(nd) for y1, y2, x1, x2, nd in lt])
            liLabel.extend([self.dLblIndex[nd.get('cutDU_row')] for y1, y2, x1, x2, nd in lt])

    # builtin int: the np.int alias no longer exists in recent Numpy
    Y_pred = np.array(liLabel, dtype=int)
    YGT = np.array(liGTLabel, dtype=int)

    sOutFilename = sFilename[:-6] + "_cut.mpxml"
    doc.write(sOutFilename, encoding='utf-8', pretty_print=True, xml_declaration=True)
    print('Annotated cut separators in %s'%sOutFilename)

    return Y_pred, YGT
def resizeCell(self, cell, ns):
    """
    replace the cell region by a BB for textlines: better for transcriber

    The 'points' of the first child of the cell are replaced by the bounding
    box of the usable TextLine of the cell, enlarged by self.hpadding
    horizontally and self.vpadding vertically.
    A TextLine is not usable if it has no point or if no rectangle can be
    fitted to its polygon (ZeroDivisionError).

    cell: the cell node
    ns: not used
    return True when there is nothing to do (no TextLine, or no usable one),
    in which case the cell is left untouched; None otherwise
    """
    lTextLines = cell.xpath("./a:%s" % ("TextLine"),
                            namespaces={'a': PageXml.NS_PAGE_XML})
    if lTextLines == []:
        return True

    ## get all the textlines of the cell
    minx, miny, maxx, maxy = 9e9, 9e9, 0, 0
    bUsableLine = False
    for line in lTextLines:
        lXY = PageXml.getPointList(line)  #the polygon
        # in case empty cell
        if lXY == []:
            continue
        plg = Polygon(lXY)
        try:
            # the fit is only used to detect invalid polygons
            plg.fitRectangle()
        except ZeroDivisionError:
            continue
        x1, y1, x2, y2 = plg.getBoundingBox()
        minx = min(minx, x1)
        miny = min(miny, y1)
        maxx = max(maxx, x2)
        maxy = max(maxy, y2)
        bUsableLine = True

    if not bUsableLine:
        # do not store the initial 9e9 values as coordinates
        return True

    # finally: simply use the BB of the textlines + padding
    miny -= self.vpadding  # vertical padding (top)
    maxy += self.vpadding  # vertical padding (bottom)
    minx -= self.hpadding  # horizontal padding
    maxx += self.hpadding  # horizontal padding

    # NOTE(review): assumes the first child of the cell is its Coords - confirm
    corner = cell[0]
    corner.set(
        'points', "%d,%d %d,%d %d,%d %d,%d" %
        (minx, miny, maxx, miny, maxx, maxy, minx, maxy))
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Tag the XML nodes corresponding to those cells
    Modify the XML DOM

    The TextLine of each cell get three attributes:
    - sDUHeader: lLabels_HEADER[1] if the row of the cell is below
      maxRowSpan, lLabels_HEADER[0] otherwise
    - sDURow: from lLabelsBIESO_R, depending on the position of the line
      among the lines of the cell (alone, first, in-between, last)
    - sDUCol: from lLabelsSM_C, depending on how far the line goes beyond
      the right side of the cell
    The TextLine found in the TextRegion of root get the last label of each
    of the three lists.
    """
    dNS = {"a": MultiPageXml.NS_PAGE_XML}

    for cell in lCells:
        lText = MultiPageXml.getChildByName(cell, 'TextLine')

        # HEADER WISE: D CH O
        bHeader = int(cell.get('row')) < maxRowSpan
        for ndText in lText:
            if bHeader:
                ndText.set(sDUHeader, lLabels_HEADER[1])
            else:
                ndText.set(sDUHeader, lLabels_HEADER[0])

        # ROW WISE: B I E S O
        nbText = len(lText)
        if nbText == 1:
            lText[0].set(sDURow, lLabelsBIESO_R[3])
        elif nbText > 1:
            lText[0].set(sDURow, lLabelsBIESO_R[0])
            for ndText in lText[1:-1]:
                ndText.set(sDURow, lLabelsBIESO_R[1])
            lText[-1].set(sDURow, lLabelsBIESO_R[2])

        #COLUM WISE: M S O
        ndCellCoords = cell.xpath("./a:%s" % ("Coords"), namespaces=dNS)[0]
        plgnCell = Polygon.parsePoints(ndCellCoords.get('points'))
        (_cx, _cy, cx2, _cy2) = plgnCell.getBoundingBox()

        for txt in lText:
            ndTxtCoords = txt.xpath("./a:%s" % ("Coords"), namespaces=dNS)[0]
            lXY = list()
            for sPair in ndTxtCoords.get('points').split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s  IGNORED"%txt.get("id"))

            if not lXY:
                # no valid point at all
                txt.set(sDUCol, lLabelsSM_C[-1])
                continue

            ## HOW to define a CM element!!!!
            (x1, _y1, x2, _y2) = Polygon(lXY).getBoundingBox()
            bFarBeyondCell = x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1)
            if bFarBeyondCell:
                txt.set(sDUCol, lLabelsSM_C[0])
            else:
                txt.set(sDUCol, lLabelsSM_C[1])

    # textline outside table
    for region in MultiPageXml.getChildByName(root, 'TextRegion'):
        for ndText in MultiPageXml.getChildByName(region, 'TextLine'):
            ndText.set(sDURow, lLabelsBIESO_R[-1])
            ndText.set(sDUCol, lLabelsSM_C[-1])
            ndText.set(sDUHeader, lLabels_HEADER[-1])

    return
def getCellsSeparators(lCell):
    """
    Compute the separators of a table from the borders of its cells.

    The top (resp. bottom) segments of a cell contribute to the row
    separator of index row (resp. row+rowSpan), its left (resp. right)
    segments to the column separator of index col (resp. col+colSpan).
    Each separator is the line fitted by a weighted linear regression on
    the extremities of its segments. Cells with an empty area are ignored.

    return two dictionaries
    row -> ((x1, y1), (x2, y2))   NOTE: top of row
    col -> ((x1, y1), (x2, y2))   NOTE: left of column
    """
    # let's collect the segments forming the cell borders, by row, by col
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)

    for cell in lCell:
        row = int(cell.get("row"))
        col = int(cell.get("col"))
        rowSpan = int(cell.get("rowSpan"))
        colSpan = int(cell.get("colSpan"))

        coord = cell.xpath("./a:%s" % ("Coords"), namespaces={"a":MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except ZeroDivisionError:
            traceln("ERROR: cell %s row=%d col=%d has empty area and is IGNORED"
                    % (cell.get("id"), row, col))
            continue

        dRowSep_lSgmt[row].extend(lT)
        dRowSep_lSgmt[row+rowSpan].extend(lB)
        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col+colSpan].extend(lR)

    def fitLine(lSegment, bAlongX):
        """
        linear regression on the extremities of the segments
        v = a + b * u, with u the X if bAlongX, else the Y
        return ((umin, v at umin), (umax, v at umax))
        """
        lX = [x for x1, _y1, x2, _y2 in lSegment for x in (x1, x2)]
        lY = [y for _x1, y1, _x2, y2 in lSegment for y in (y1, y2)]

        # one weight per segment...
        lfNorm = [math.sqrt(np.linalg.norm((x2 - x1, y2 - y1))) for x1,y1,x2,y2 in lSegment]
        # ... duplicated, since each segment gives two points
        sumW = sum(lfNorm) * 2
        W = [fN/sumW for fN in lfNorm for _ in (0,1)]

        if bAlongX:
            lU, lV = lX, lY
        else:
            lU, lV = lY, lX
        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=W)

        umin, umax = min(lU), max(lU)
        return ((umin, a + b * umin), (umax, a + b * umax))

    dRowSep = {}
    for row, lSegment in dRowSep_lSgmt.items():
        dRowSep[row] = fitLine(lSegment, True)

    dColSep = {}
    for col, lSegment in dColSep_lSgmt.items():
        (ymin, x1), (ymax, x2) = fitLine(lSegment, False)
        dColSep[col] = ((x1, ymin), (x2, ymax))

    return dRowSep, dColSep
res = cv2.bitwise_and(frame, frame, mask=mask) # res = cv2.flip(res,0) # finds contours im2, contours, heirarchy = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) # Creates two polygon objects for c in range(0, len(contours)): if cv2.contourArea(contours[c]) >= cv2.contourArea(contours[high]): sec = high high = c elif (cv2.contourArea(contours[c]) > cv2.contourArea(contours[sec]) and cv2.contourArea( contours[c]) <= cv2.contourArea(contours[high])): sec = c top_tape = Polygon(frame, mask) bottom_tape = Polygon(frame, mask) # Runs bottom tape when contours is > 1 if len(contours) > 0: bottom_tape.run(contours[high]) # Runs top tape when contour count is > 2 if len(contours) > 1: top_tape.run(contours[sec]) hasRun = True # Finds center is two objects are found if hasRun: top_tape.center = list(top_tape.center) bottom_tape.center = list(bottom_tape.center)
def addSeparator(root, lCells):
    """
    Add separators that correspond to cell boundaries.
    Modifies the XML DOM in place.

    For each row boundary and each column boundary, a straight line is fitted
    through the relevant cell border segments. One SeparatorRegion node (with a
    Coords child and an "orient" attribute holding the fitted coefficients) is
    appended per line to the first TableRegion found under root. A comment with
    slope statistics is appended after each series, and also printed.

    root:   DOM node under which the TableRegion is looked up
    lCells: iterable of cell nodes exposing the integer attributes "row",
            "col", "rowSpan", "colSpan" and a PageXml "Coords" child

    Cells whose polygon cannot be partitioned (ValueError or ZeroDivisionError)
    are reported and ignored. When no border segment is collected at all
    (e.g. lCells is empty), the DOM is left untouched.

    return None
    """
    def endPoints(lSegment):
        """return the x's and the y's of the end points, in segment order"""
        lX, lY = [], []
        for x1, y1, x2, y2 in lSegment:
            lX.extend((x1, x2))
            lY.extend((y1, y2))
        return lX, lY

    def fitLine(lT, lV, lSegment):
        """
        Weighted degree-1 fit of v = a + b * t, return (a, b)
        """
        # polyfit applies w to the UNsquared residuals, so the square root of
        # the segment length makes each segment count in proportion to its
        # length. (Scaling all weights by a constant does not change the fit.)
        lfW = [np.linalg.norm((x2 - x1, y2 - y1)) ** 0.5
               for x1, y1, x2, y2 in lSegment]
        # each segment contributes two points, hence each weight is duplicated
        lW = [fW for fW in lfW for _ in (0, 1)]
        # coefficients are returned by increasing degree: intercept, slope
        a, b = np.polynomial.polynomial.polyfit(lT, lV, 1, w=lW)
        return a, b

    def appendSeparator(ndTR, sOrient, lPoint):
        """append a SeparatorRegion with the given orient and 2 end points"""
        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("orient", sOrient)
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, lPoint)
        ndSep.append(ndCoord)

    def appendStat(ndTR, sFormat, lfSlope):
        """append (and print) the slope statistics, if any slope was computed"""
        if not lfSlope:
            return
        sStat = sFormat % (np.average(lfSlope), np.std(lfSlope),
                           min(lfSlope), max(lfSlope))
        ndTR.append(etree.Comment(sStat))
        print(sStat)

    # let's collect the segments forming the separators
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)
    for cell in lCells:
        row, col, rowSpan, colSpan = [int(cell.get(sProp)) for sProp
                                      in ["row", "col", "rowSpan", "colSpan"]]
        coord = cell.xpath("./a:%s" % ("Coords"),
                           namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except (ValueError, ZeroDivisionError):
            print("WARNING: cell %s row=%d col=%d cannot be partitioned and is IGNORED"
                  % (cell.get("id"), row, col))
            continue
        # the top segments contribute to row separator of index: row
        dRowSep_lSgmt[row].extend(lT)
        # the bottom segments contribute to row separator of index: row+rowSpan
        dRowSep_lSgmt[row + rowSpan].extend(lB)
        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col + colSpan].extend(lR)

    if not dRowSep_lSgmt and not dColSep_lSgmt:
        # nothing to draw
        return

    ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

    # horizontal separators: y = a + b * x
    lfSlope = []
    for _irow, lSegment in dRowSep_lSgmt.items():
        if not lSegment:
            continue
        X, Y = endPoints(lSegment)
        a, b = fitLine(X, Y, lSegment)
        xmin, xmax = min(X), max(X)
        y1 = a + b * xmin
        y2 = a + b * xmax
        lfSlope.append(b * 100)
        appendSeparator(ndTR, "horizontal %.1f %.3f" % (a, b),
                        [(xmin, y1), (xmax, y2)])
    appendStat(ndTR,
               "\tHORIZONTAL: Average=%.1f%% stdev=%.2f%% min=%.1f%% max=%.1f%%",
               lfSlope)

    # vertical separators: x = a + b * y
    lfSlope = []
    for _icol, lSegment in dColSep_lSgmt.items():
        if not lSegment:
            continue
        X, Y = endPoints(lSegment)
        a, b = fitLine(Y, X, lSegment)
        lfSlope.append(b * 100)
        ymin, ymax = min(Y), max(Y)
        x1 = a + b * ymin
        x2 = a + b * ymax
        appendSeparator(ndTR, "vertical %.1f %.3f" % (a, b),
                        [(x1, ymin), (x2, ymax)])
    appendStat(ndTR,
               "\tVERTICAL : Average=%.1f%% stdev=%.2f%% min=%.1f%% max=%.1f%%",
               lfSlope)

    return
def _iter_GraphNode(self, doc, sFilename, page=None): """ Get the json dict iterator on the DOM, that returns nodes (of class Block) """ # --- XPATH contexts lNdBlock = doc['GlynchResults']['Areas'] #page_w, page_h = self.getPageWidthandHeight(sFilename) page = Page(1, 1, 1, 1) for ndBlock in lNdBlock: try: sText = ndBlock['label'] except KeyError: sText = "" traceln("WARNING: no 'label' in : %s" % ndBlock) if sText == None: sText = "" traceln("Warning: no text in node") # raise ValueError, "No text in node: %s"%ndBlock # now we need to infer the bounding box of that object lXY = self.getPointList(ndBlock) # the polygon if lXY == []: continue plg = Polygon(lXY) try: x1, y1, x2, y2 = plg.fitRectangle( bPreserveWidth=self.bPreserveWidth) except ZeroDivisionError: x1, y1, x2, y2 = plg.getBoundingBox() except ValueError: x1, y1, x2, y2 = plg.getBoundingBox() # we reduce a bit this rectangle, to ovoid overlap if not (self.BBoxDeltaFun is None): if type(self.BBoxDeltaFun) is tuple and len( self.BBoxDeltaFun) == 2: xFun, yFun = self.BBoxDeltaFun if xFun is not None: dx = xFun(x2 - x1) x1, x2 = int(round(x1 + dx)), int(round(x2 - dx)) if yFun is not None: dy = yFun(y2 - y1) y1, y2 = int(round(y1 + dy)), int(round(y2 - dy)) else: # historical code w, h = x2 - x1, y2 - y1 dx = self.BBoxDeltaFun(w) dy = self.BBoxDeltaFun(h) x1, y1, x2, y2 = [ int(round(v)) for v in [x1 + dx, y1 + dy, x2 - dx, y2 - dy] ] # TODO orientation = 0 # no meaning for PageXml classIndex = 0 # is computed later on # and create a Block blk = Block(page, (x1, y1, x2 - x1, y2 - y1), sText, orientation, classIndex, self, ndBlock, domid=None) blk.setShape(geom.Polygon(lXY)) yield blk return
def computeSkewing(self):
    """
    compute text skewing in the row

    input: self
    A straight line y = a + b * x is fitted through the top segments of the
    first object of each cell of this row. A XMLDSTABLEROWClass object is then
    created with this row's index, page and parent; it is positioned on that
    line, given a 'points' attribute "xmin,y1,xmax,y2", and tagged (tagMe()).

    Nothing is done when the row has no cell, or when no top segment could be
    collected.

    output: None (NOTE: the fitted line is not returned)
    """
    lCell = self.getCells()
    if not lCell:
        return

    import numpy as np
    from util.Polygon import Polygon

    # top segments of the first object of each cell
    # alternative: compute the real top ones? for wedding more robust!!
    lTopSgmt = []
    for cell in lCell:
        try:
            lTopText = [cell.getObjects()[0]]
        except IndexError:
            lTopText = []
        for text in lTopText:
            # 'points' is a flat comma-separated list: x1,y1,x2,y2,...
            lsCoord = text.getAttribute('points').split(',')
            plgn = Polygon([(float(sx), float(sy))
                            for sx, sy in zip(lsCoord[0::2], lsCoord[1::2])])
            lT, _lR, _lB, _lL = plgn.partitionSegmentTopRightBottomLeft()
            lTopSgmt.extend(lT)

    if not lTopSgmt:
        return

    X, Y = [], []
    for x1, y1, x2, y2 in lTopSgmt:
        X.extend((x1, x2))
        Y.extend((y1, y2))

    # polyfit applies w to the UNsquared residuals, so the square root of the
    # segment length makes each segment count in proportion to its length
    lfW = [np.linalg.norm((x2 - x1, y2 - y1)) ** 0.5
           for x1, y1, x2, y2 in lTopSgmt]
    # each segment contributes two points, hence each weight is duplicated
    W = [fW for fW in lfW for _ in (0, 1)]

    # coefficients are returned by increasing degree: y = a + b * x
    a, b = np.polynomial.polynomial.polyfit(X, Y, 1, w=W)
    xmin, xmax = min(X), max(X)
    y1 = a + b * xmin
    y2 = a + b * xmax

    ro = XMLDSTABLEROWClass(self.getIndex())
    ro.setX(xmin)
    ro.setY(y1)
    ro.setWidth(xmax - xmin)
    ro.setHeight(y2 - y1)
    ro.setPage(self.getPage())
    ro.setParent(self.getParent())
    ro.addAttribute('points',
                    ','.join([str(xmin), str(y1), str(xmax), str(y2)]))
    ro.tagMe()