def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLine nodes of each page based on the vertical cuts.

    root : root node of the DOM; its 'Page' nodes are paired, in order, with
           the items of llX.
    llX  : one list of cut abscissas per page.
           NOTE: each list is modified in place (the page width is appended,
           then the list is sorted).

    A TextLine belongs to the first cut located at or after the horizontal
    centre of its fitted rectangle. The cluster index is stored in the
    "DU_cluster" attribute of the TextLine. A TextLine whose centre lies
    beyond every cut (including the page width) goes into an extra cluster of
    index len(lX), which has no "cut_X" attribute.

    TextLines whose rectangle cannot be computed (ZeroDivisionError or
    ValueError) are left unclustered.
    """
    # hack to use addClusterToDom: the segmenter is given a list of objects
    # holding the XML node in a .node attribute
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    for lX, ndPage in zip(llX, MultiPageXml.getChildByName(root, 'Page')):
        w = int(ndPage.get("imageWidth"))

        # the page width acts as the last cut
        lX.append(w)
        lX.sort()

        # one (possibly empty) cluster per cut
        imax = len(lX)
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline, 'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                continue
            xm = (x1 + x2) / 2.0

            # index of the first cut at or after the centre, imax if none
            iCluster = next((i for i, xi in enumerate(lX) if xm <= xi), imax)

            # the overflow cluster (index imax) is created on demand; indexing
            # dCluster[imax] directly used to raise a KeyError
            dCluster.setdefault(iCluster, []).append(iNd)
            ndTextline.set("DU_cluster", str(iCluster))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster, bMoveContent=False,
                                       sAlgo="cut", pageNode=ndPage)

        # add a cut_X attribute to the clusters
        # (presumably the 'name' of a cluster node is its key in dCluster)
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                ndCluster.set("cut_X", str(lX[i]))
            # else: overflow cluster, there is no cut at its right
def getPolylineAverageXY(cls, ndPolyline):
    """
    Return (Xavg, Yavg), the weighted average X and weighted average Y of a
    polyline, both rounded to integers.

    Each segment contributes its middle X weighted by its vertical extent
    (for how long the polyline stays at this X) and its middle Y weighted by
    its horizontal extent. An average whose total weight is not positive is
    reported as 0.
    """
    lPoint = Polygon.parsePoints(ndPolyline.get('points')).lXY
    lSegment = list(zip(lPoint, lPoint[1:]))

    # per segment: (middle abscissa, vertical extent)
    lMidXW = [((xa + xb) / 2.0, abs(yb - ya))
              for (xa, ya), (xb, yb) in lSegment]
    # per segment: (middle ordinate, horizontal extent)
    lMidYW = [((ya + yb) / 2.0, abs(xb - xa))
              for (xa, ya), (xb, yb) in lSegment]

    fNumX = sum(fMid * fW for fMid, fW in lMidXW)
    fNumY = sum(fMid * fW for fMid, fW in lMidYW)
    fDenX = sum(fW for _fMid, fW in lMidXW)
    fDenY = sum(fW for _fMid, fW in lMidYW)

    if fDenX > 0:
        Xavg = int(round(fNumX / fDenX))
    else:
        Xavg = 0

    if fDenY > 0:
        Yavg = int(round(fNumY / fDenY))
    else:
        Yavg = 0

    return (Xavg, Yavg)
def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
    """
    Collect the y and x of the GT table separators, page by page.

    A separator is horizontal when it is wider than high, vertical otherwise.
    It is kept only if its length exceeds fMinPageCoverage times the page
    width (horizontal) or the page height (vertical).

    return [(y_list, x_list), ...] with one item per Page, where
        y_list holds the (y1, y2) of the kept horizontal separators
        x_list holds the (x1, x2) of the kept vertical separators
    """
    lResult = []
    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        iPageW = int(ndPage.get("imageWidth"))
        iPageH = int(ndPage.get("imageHeight"))

        lHorizY, lVertiX = [], []

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        nTable = len(lndTable)
        if nTable > 1:
            traceln("** warning ** %d TableRegion instead of expected 1" % nTable)
        elif nTable == 0:
            traceln("** warning ** no TableRegion, expected 1")

        for ndTable in lndTable:
            # enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(ndTable, 'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                # a separator must be made of exactly two points
                (xa, ya), (xb, yb) = Polygon.parsePoints(ndCoords.get('points')).lXY
                fSpanX, fSpanY = abs(xb - xa), abs(yb - ya)
                if fSpanX > fSpanY:
                    # horizontal table line
                    if fSpanX > (fMinPageCoverage * iPageW):
                        lHorizY.append((ya, yb))
                elif fSpanY > (fMinPageCoverage * iPageH):
                    # vertical table line
                    lVertiX.append((xa, xb))

        lResult.append((lHorizY, lVertiX))

    return lResult
def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
    """
    Compute the index, in our grid, of the GT table lines, page by page.

    A separator is horizontal when it is wider than high, vertical otherwise.
    It is kept only if its length exceeds fMinPageCoverage times the page
    width (horizontal) or the page height (vertical). Its middle coordinate
    is then snapped to the grid by self.snapToGridIndex, using
    self.iGridVertiStep for horizontal lines and self.iGridHorizStep for
    vertical lines.

    return [(h_list, v_list), ...] with one item per Page
    At most one TableRegion per page is accepted (assertion).
    """
    lResult = []
    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        iPageW = int(ndPage.get("imageWidth"))
        iPageH = int(ndPage.get("imageHeight"))

        lHorizIdx, lVertiIdx = [], []

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        if lndTable:
            assert len(lndTable) == 1, "More than 1 TableRegion??"
            # enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(lndTable[0], 'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                # a separator must be made of exactly two points
                (xa, ya), (xb, yb) = Polygon.parsePoints(ndCoords.get('points')).lXY
                fSpanX, fSpanY = abs(xb - xa), abs(yb - ya)
                if fSpanX > fSpanY:
                    # horizontal table line
                    if fSpanX > (fMinPageCoverage * iPageW):
                        fMidY = (ya + yb) / 2.0
                        lHorizIdx.append(
                            self.snapToGridIndex(fMidY, self.iGridVertiStep))
                elif fSpanY > (fMinPageCoverage * iPageH):
                    # vertical table line
                    fMidX = (xa + xb) / 2.0
                    lVertiIdx.append(
                        self.snapToGridIndex(fMidX, self.iGridHorizStep))

        lResult.append((lHorizIdx, lVertiIdx))

    return lResult
def getHisto(self, lNd, w, _fMinHorizProjection, h, _fMinVertiProjection,
             fRatio=1.0, fMinHLen=None):
    """
    Return two Numpy arrays reflecting the histogram of projections of objects:
    first array along the Y axis (horizontal projection, length h),
    second array along the X axis (vertical projection, length w).

    lNd      : nodes having a 'Coords' child with a 'points' attribute
    w, h     : page width and height, also the length of the histograms
    fRatio   : ratio passed to self.scale to scale the rectangle of each object
    fMinHLen : when given, we do not scale horizontally text shorter than
               fMinHLen
    _fMinHorizProjection, _fMinVertiProjection : unused

    Each object adds its width / w to the rows it covers and its height / h
    to the columns it covers. Objects raising ZeroDivisionError or ValueError
    are ignored.
    """
    # np.float was a plain alias of the builtin float and no longer exists in
    # recent NumPy versions
    hy = np.zeros((h, ), dtype=float)
    hx = np.zeros((w, ), dtype=float)

    for nd in lNd:
        sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
        try:
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

            if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                _x1, _x2 = self.scale(x1, x2, fRatio)
            else:
                # short text: keep its original horizontal extent
                _x1, _x2 = x1, x2
            _y1, _y2 = self.scale(y1, y2, fRatio)

            # the contribution is the unscaled size, relative to the page
            hy[_y1:_y2 + 1] += float(x2 - x1) / w
            hx[_x1:_x2 + 1] += float(y2 - y1) / h
        except (ZeroDivisionError, ValueError):
            pass

    return hy, hx
def getTBLRBorders(lNodes):
    """
    Return the top, right, bottom and left borders, in this order.

    lNodes : nodes having a 'Coords' child (PageXml namespace) with a
             'points' attribute

    For each side, the segments of all nodes are gathered and the border is
    the convex_hull of the resulting MultiLineString.

    A node whose polygon cannot be partitioned (ValueError or
    ZeroDivisionError) is ignored.
    """
    llT, llR, llB, llL = [], [], [], []

    for bordercell in lNodes:
        coord = bordercell.xpath("./a:%s" % ("Coords"),
                                 namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except (ValueError, ZeroDivisionError):
            # skip this node: going on would either fail on undefined names
            # or add the segments of the previous node a second time
            continue

        for llSide, lSegment in ((llT, lT), (llR, lR), (llB, lB), (llL, lL)):
            llSide.extend([(x1, y1), (x2, y2)] for x1, y1, x2, y2 in lSegment)

    return (MultiLineString(llT).convex_hull,
            MultiLineString(llR).convex_hull,
            MultiLineString(llB).convex_hull,
            MultiLineString(llL).convex_hull)
def predict(self, sFilename):
    """
    Predict the BIESO row label of each TextLine of a file, from the cuts.

    The file is parsed, the cuts are computed by self.add_cut_to_DOM, each
    TextLine of the first Page is placed in a cell of the table formed by the
    cuts, and the TextLines of each cell are labelled from top to bottom
    (attribute 'cutDU_row'). The annotated DOM is written next to the input,
    the last 6 characters of its name being replaced by "_cut.mpxml".

    return Y_pred, YGT   (shape=(nb_node,) dtype=int)
    YGT comes from the 'DU_row' attribute, 0 when it is missing or unknown.
    """
    assert os.path.isfile(sFilename), sFilename
    doc = etree.parse(sFilename, self.parser)
    root = doc.getroot()

    # Find cuts
    lY, lX = self.add_cut_to_DOM(
        root,
        fMinHorizProjection=self.fMinHorizProjection,
        fMinVertiProjection=self.fMinVertiProjection)

    # ################################################################
    # NOTE : we will assume first and last row/column contain Other
    # ################################################################

    lyy = list(zip(lY[:-1], lY[1:]))  # list of intervals
    lxx = list(zip(lX[:-1], lX[1:]))  # list of intervals

    # dTable[i][j] --> list of TextLine in that cell
    dTable = collections.defaultdict(lambda: collections.defaultdict(list))

    def getTableIndex(v, lvv):
        """
        index in the table row or columns. The 1st border is at 0
        """
        for i, (v1, v2) in enumerate(lvv):
            if v1 <= v <= v2:
                return i + 1
        if v < lvv[0][0]:
            return 0
        return len(lvv) + 1

    # place each TextLine in the table rows and columns
    # NOTE: only the first Page is considered
    ndPage = MultiPageXml.getChildByName(root, 'Page')[0]
    lndTexLine = MultiPageXml.getChildByName(ndPage, 'TextLine')

    imax, jmax = -1, -1
    for nd in lndTexLine:
        sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
        x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

        i = getTableIndex((y1 + y2) / 2.0, lyy)
        j = getTableIndex((x1 + x2) / 2.0, lxx)
        dTable[i][j].append((y1, y2, x1, x2, nd))
        imax = max(i, imax)
        jmax = max(j, jmax)

    def getGT(nd):
        # a missing DU_row attribute gives the key None
        try:
            return CutPredictor.dLblIndex[nd.get('DU_row')]
        except KeyError:
            return 0

    def overlap(a, b):
        """horizontal overlap of two items, positive when they overlap"""
        _y1, _y2, ax1, ax2, _nd = a
        _y1, _y2, bx1, bx2, _nd = b
        return min(ax2, bx2) - max(ax1, bx1)

    def label(lt):
        """
        set the attribute cutDU_row to each node for BIESO labelling
        lt is sorted in place, from top to bottom
        """
        # sort on the coordinates only: when two items have the same
        # coordinates, comparing the whole tuples would compare XML nodes,
        # which is not supported
        lt.sort(key=lambda t: t[:4])

        # the 'O'
        newlt = []
        for (y1, y2, x1, x2, nd) in lt:
            # hard-coded abscissas of the left and right margins
            bInMargin = x2 < 350 or x1 > 4600
            if bInMargin:
                nd.set('cutDU_row', 'O')
            else:
                newlt.append((y1, y2, x1, x2, nd))

        for i, t in enumerate(newlt):
            nd = t[4]

            # is there someone above?? if yes get the closest node above
            nd_just_above = None
            for j in range(i - 1, -1, -1):
                tt = newlt[j]
                if overlap(tt, t) > 0:
                    nd_just_above = tt[4]
                    if nd_just_above.get('cutDU_row') != 'O':
                        break

            if nd_just_above is None:
                # S by default
                nd.set('cutDU_row', 'S')
                continue

            sLabelAbove = nd_just_above.get('cutDU_row')
            if sLabelAbove == 'S':
                nd_just_above.set('cutDU_row', 'B')
                nd.set('cutDU_row', 'E')
            elif sLabelAbove == 'E':
                nd_just_above.set('cutDU_row', 'I')
                nd.set('cutDU_row', 'E')
            elif sLabelAbove == 'I':
                nd.set('cutDU_row', 'E')
            elif sLabelAbove == 'B':
                # bad luck, we do not see the intermediary node
                nd.set('cutDU_row', 'E')
            elif sLabelAbove == 'O':
                raise Exception('Internal error')

    # now set the BIESO labels... (only vertically for now)
    liGTLabel = list()
    liLabel = list()
    for i in range(0, imax + 1):
        dRow = dTable[i]
        for j in range(0, jmax + 1):
            lt = dRow[j]
            if not lt:
                continue
            label(lt)
            liGTLabel.extend(getGT(t[4]) for t in lt)
            liLabel.extend(self.dLblIndex[t[4].get('cutDU_row')] for t in lt)

    # np.int was a plain alias of the builtin int and no longer exists in
    # recent NumPy versions
    Y_pred = np.array(liLabel, dtype=int)
    YGT = np.array(liGTLabel, dtype=int)

    sOutFilename = sFilename[:-6] + "_cut.mpxml"
    doc.write(sOutFilename, encoding='utf-8', pretty_print=True,
              xml_declaration=True)
    print('Annotated cut separators in %s'%sOutFilename)

    return Y_pred, YGT
def tag_DU_row_col_header(root, lCells, maxRowSpan):
    """
    Label the TextLine nodes of the given cells, and those of the TextRegion
    nodes, with their header, row and column labels.
    Modify the XML DOM
    """
    for ndCell in lCells:
        lndLine = MultiPageXml.getChildByName(ndCell, 'TextLine')

        # HEADER WISE: D CH O
        # cells of the first maxRowSpan rows are header cells
        iHeader = 1 if int(ndCell.get('row')) < maxRowSpan else 0
        for ndLine in lndLine:
            ndLine.set(sDUHeader, lLabels_HEADER[iHeader])

        # ROW WISE: B I E S O
        nLine = len(lndLine)
        if nLine == 1:
            lndLine[0].set(sDURow, lLabelsBIESO_R[3])
        elif nLine > 1:
            lndLine[0].set(sDURow, lLabelsBIESO_R[0])
            for ndLine in lndLine[1:-1]:
                ndLine.set(sDURow, lLabelsBIESO_R[1])
            lndLine[-1].set(sDURow, lLabelsBIESO_R[2])

        # COLUMN WISE: M S O
        ndCellCoords = ndCell.xpath(
            "./a:%s" % ("Coords"),
            namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        oCellPolygon = Polygon.parsePoints(ndCellCoords.get('points'))
        (_cx, _cy, cx2, _cy2) = oCellPolygon.getBoundingBox()

        for ndLine in lndLine:
            ndLineCoords = ndLine.xpath(
                "./a:%s" % ("Coords"),
                namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]

            # parse the points by hand, ignoring the invalid ones
            lXY = []
            for sPair in ndLineCoords.get('points').split(' '):
                try:
                    (sx, sy) = sPair.split(',')
                    lXY.append((int(sx), int(sy)))
                except ValueError:
                    traceln("WARNING: invalid coord in TextLine id=%s IGNORED"%ndLine.get("id"))

            if not lXY:
                ndLine.set(sDUCol, lLabelsSM_C[-1])
                continue

            # the first column label is for a line that overflows the right
            # side of its cell by more than 75% of its part inside the cell
            (x1, _y1, x2, _y2) = Polygon(lXY).getBoundingBox()
            if x2 > cx2 and (x2 - cx2) > 0.75 * (cx2 - x1):
                ndLine.set(sDUCol, lLabelsSM_C[0])
            else:
                ndLine.set(sDUCol, lLabelsSM_C[1])

    # textline outside table: last label of each kind
    for ndRegion in MultiPageXml.getChildByName(root, 'TextRegion'):
        lndLine = MultiPageXml.getChildByName(ndRegion, 'TextLine')
        for ndLine in lndLine:
            ndLine.set(sDURow, lLabelsBIESO_R[-1])
        for ndLine in lndLine:
            ndLine.set(sDUCol, lLabelsSM_C[-1])
        for ndLine in lndLine:
            ndLine.set(sDUHeader, lLabels_HEADER[-1])

    return
def addSeparator(root, lCells):
    """
    Add the separators that correspond to the cell boundaries.
    Modify the XML DOM: one SeparatorRegion per row boundary and per column
    boundary is appended to the first TableRegion, followed by a comment
    giving statistics on the slopes (also printed).

    root   : root node of the DOM, containing at least one TableRegion
    lCells : cell nodes, with row, col, rowSpan and colSpan attributes and a
             'Coords' child (PageXml namespace)

    Each separator is the line fitted, by weighted linear regression, on the
    end points of the cell border segments contributing to it. Cells whose
    polygon cannot be partitioned (ValueError or ZeroDivisionError) are
    ignored.
    """
    # let's collect the segments forming the separators
    dRowSep_lSgmt = collections.defaultdict(list)
    dColSep_lSgmt = collections.defaultdict(list)
    for cell in lCells:
        row, col, rowSpan, colSpan = [int(cell.get(sProp)) for sProp
                                      in ["row", "col", "rowSpan", "colSpan"]]
        coord = cell.xpath("./a:%s" % ("Coords"),
                           namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        plgn = Polygon.parsePoints(coord.get('points'))
        try:
            lT, lR, lB, lL = plgn.partitionSegmentTopRightBottomLeft()
        except (ValueError, ZeroDivisionError):
            continue

        # the top segments contribute to the row separator of index: row
        dRowSep_lSgmt[row].extend(lT)
        # the bottom segments contribute to the one of index: row+rowSpan
        dRowSep_lSgmt[row + rowSpan].extend(lB)
        dColSep_lSgmt[col].extend(lL)
        dColSep_lSgmt[col + colSpan].extend(lR)

    ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

    def fitLine(lSegment, bHorizontal):
        """
        Fit v = a + b * u on the end points of the segments, u being x for a
        horizontal separator and y for a vertical one.
        return a, b, min(u), max(u)
        """
        lX = [x for x1, _y1, x2, _y2 in lSegment for x in (x1, x2)]
        lY = [y for _x1, y1, _x2, y2 in lSegment for y in (y1, y2)]
        lU, lV = (lX, lY) if bHorizontal else (lY, lX)

        # weight of a segment: square root of its length, since polyfit
        # applies the weights to the unsquared residuals.
        # (the norm of the coordinates themselves is not a length)
        lfW = [np.sqrt(np.linalg.norm((x2 - x1, y2 - y1)))
               for x1, y1, x2, y2 in lSegment]
        # duplicate each weight: one per end point
        W = [fW for fW in lfW for _ in (0, 1)]

        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=W)
        return a, b, min(lU), max(lU)

    def addSeparatorNode(sOrient, ptStart, ptEnd):
        """append to the table a SeparatorRegion going from ptStart to ptEnd"""
        ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
        ndSep.set("orient", sOrient)
        ndTR.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, [ptStart, ptEnd])
        ndSep.append(ndCoord)

    def addStat(sFormat, lSlopePct):
        """append to the table, and print, the statistics on the slopes"""
        if not lSlopePct:
            # no separator: min and max are undefined
            return
        sStat = sFormat % (np.average(lSlopePct), np.std(lSlopePct),
                           min(lSlopePct), max(lSlopePct))
        ndTR.append(etree.Comment(sStat))
        print(sStat)

    # horizontal separators: y = a + b * x
    lSlopePct = []
    for lSegment in dRowSep_lSgmt.values():
        if not lSegment:
            continue  # nothing to fit
        a, b, xmin, xmax = fitLine(lSegment, True)
        lSlopePct.append(b * 100)
        addSeparatorNode("horizontal %.1f %.3f" % (a, b),
                         (xmin, a + b * xmin), (xmax, a + b * xmax))
    addStat("\tHORIZONTAL: Average=%.1f%% stdev=%.2f%% min=%.1f%% max=%.1f%%",
            lSlopePct)

    # vertical separators: x = a + b * y
    lSlopePct = []
    for lSegment in dColSep_lSgmt.values():
        if not lSegment:
            continue  # nothing to fit
        a, b, ymin, ymax = fitLine(lSegment, False)
        lSlopePct.append(b * 100)
        addSeparatorNode("vertical %.1f %.3f" % (a, b),
                         (a + b * ymin, ymin), (a + b * ymax, ymax))
    addStat("\tVERTICAL : Average=%.1f%% stdev=%.2f%% min=%.1f%% max=%.1f%%",
            lSlopePct)

    return
def getCellsSeparators(lCell):
    """
    Compute the separators of the rows and columns of the given cells.

    return two dictionaries
        row -> ((x1, y1), (x2, y2))   NOTE: top of row
        col -> ((x1, y1), (x2, y2))   NOTE: left of column

    Each separator is the line fitted, by weighted linear regression, on the
    end points of the cell border segments contributing to it. A cell whose
    polygon partition raises ZeroDivisionError is reported and ignored.
    """
    # the segments forming the cell borders, by row and by column
    dRowSegment = collections.defaultdict(list)
    dColSegment = collections.defaultdict(list)

    for ndCell in lCell:
        iRow, iCol, nRowSpan, nColSpan = (
            int(ndCell.get(sName))
            for sName in ("row", "col", "rowSpan", "colSpan"))

        ndCoords = ndCell.xpath("./a:%s" % ("Coords"),
                                namespaces={"a": MultiPageXml.NS_PAGE_XML})[0]
        oPolygon = Polygon.parsePoints(ndCoords.get('points'))
        try:
            lTop, lRight, lBottom, lLeft = \
                oPolygon.partitionSegmentTopRightBottomLeft()
        except ZeroDivisionError:
            traceln("ERROR: cell %s row=%d col=%d has empty area and is IGNORED"
                    % (ndCell.get("id"), iRow, iCol))
            continue

        # top segments go to the separator of the row, bottom segments to
        # the separator of the row just after the cell; same for columns
        dRowSegment[iRow].extend(lTop)
        dRowSegment[iRow + nRowSpan].extend(lBottom)
        dColSegment[iCol].extend(lLeft)
        dColSegment[iCol + nColSpan].extend(lRight)

    def xOf(lSegment):
        """abscissas of the end points of the segments"""
        return [x for xa, _ya, xb, _yb in lSegment for x in (xa, xb)]

    def yOf(lSegment):
        """ordinates of the end points of the segments"""
        return [y for _xa, ya, _xb, yb in lSegment for y in (ya, yb)]

    def weightOf(lSegment):
        """normalized weight of each end point of the segments"""
        lfRoot = [math.sqrt(np.linalg.norm((xb - xa, yb - ya)))
                  for xa, ya, xb, yb in lSegment]
        fTotal = sum(lfRoot) * 2
        # one weight per end point, so each value appears twice
        return [fRoot / fTotal for fRoot in lfRoot for _ in (0, 1)]

    def fitEnds(lU, lV, lW):
        """
        fit v = a + b * u and return the two points of the fitted line, as
        (u, v) pairs, at the smallest and at the largest u
        """
        a, b = np.polynomial.polynomial.polyfit(lU, lV, 1, w=lW)
        uLow, uHigh = min(lU), max(lU)
        return (uLow, a + b * uLow), (uHigh, a + b * uHigh)

    dRowSep = {}
    for iRow, lSegment in dRowSegment.items():
        lX, lY = xOf(lSegment), yOf(lSegment)
        # y = a + b * x
        dRowSep[iRow] = fitEnds(lX, lY, weightOf(lSegment))

    dColSep = {}
    for iCol, lSegment in dColSegment.items():
        lX, lY = xOf(lSegment), yOf(lSegment)
        # x = a + b * y
        (yLow, xLow), (yHigh, xHigh) = fitEnds(lY, lX, weightOf(lSegment))
        dColSep[iCol] = ((xLow, yLow), (xHigh, yHigh))

    return dRowSep, dColSep