def addEdgeToDoc(self, Y=None):
    """
    To display the graph conveniently, add one "Edge" element per graph edge
    to the first page yielded by self._iter_Page_DocNode(self.doc).

    Each Edge element carries a DU_type attribute (the class name of the edge)
    and a 2-point polyline whose end points depend on the edge class:
    - HorizontalEdge, VerticalEdge: centroid of A to centroid of B
    - Edge_BL: centroid of A to the point of B's shape nearest to A's shape
    - any other class: the points of A's and B's shapes that are nearest to a
      point placed on the top border (y=0), at an x that is spread over the
      page width so that those edges do not overlap each other.

    Y is accepted but not used by this method.
    Modify the XML DOM, return None.
    """
    _pnum, _page, ndPage = next(self._iter_Page_DocNode(self.doc))
    w = int(ndPage.get("imageWidth"))

    # exact-class tests are kept on purpose (no isinstance): a subclass of
    # these edge classes is handled by the generic branch, as before
    tCentroidEdge = (HorizontalEdge, VerticalEdge)
    tKnownEdge = tCentroidEdge + (Edge_BL,)

    # number of slots along the page width for the "other" edges
    nn = 1 + sum(1 for e in self.lEdge if type(e) not in tKnownEdge)
    ii = 0
    for edge in self.lEdge:
        cEdge = type(edge)
        if cEdge in tCentroidEdge:
            A, B = edge.A.shape.centroid, edge.B.shape.centroid
        elif cEdge is Edge_BL:
            A = edge.A.shape.centroid
            # nearest_points(A, edge.B.shape) was judged not readable on display
            _pt, B = shapely.ops.nearest_points(edge.A.shape, edge.B.shape)
        else:
            ii += 1
            x = 1 + ii * (w/nn)
            pt = geom.Point(x, 0)
            A, _ = shapely.ops.nearest_points(edge.A.shape, pt)
            B, _ = shapely.ops.nearest_points(edge.B.shape, pt)

        ndSep = MultiPageXml.createPageXmlNode("Edge")
        ndSep.set("DU_type", cEdge.__name__)
        ndPage.append(ndSep)
        MultiPageXml.setPoints(ndSep, [(A.x, A.y), (B.x, B.y)])
    return
def add_cluster_to_dom(root, llX):
    """
    Cluster the TextLine of each page based on the vertical cuts

    root is the DOM root, llX gives one list of cut abscissas per page (pages
    and lists are paired in order; extra items on either side are ignored).

    For each page:
    - the page width is appended to the page's list, which is then sorted
      (NOTE: the lists of llX are modified in place)
    - each TextLine is assigned to the first cut that is at or right of its
      horizontal middle, and gets a DU_cluster attribute with that index
    - the clusters are added to the DOM by
      GraphBinaryConjugateSegmenter.addClusterToDom, and each returned cluster
      node gets a cut_X attribute with the abscissa of its cut.

    A TextLine whose points cannot be parsed (ValueError or ZeroDivisionError)
    is left unclustered.
    """
    # hack to use addClusterToDom, which expects objects with a .node
    class MyBlock:
        def __init__(self, nd):
            self.node = nd

    lndPage = MultiPageXml.getChildByName(root, 'Page')
    for lX, ndPage in zip(llX, lndPage):
        w, _h = int(ndPage.get("imageWidth")), int(ndPage.get("imageHeight"))

        lX.append(w)
        lX.sort()
        imax = len(lX)
        # one cluster per cut, even if empty
        dCluster = {i: list() for i in range(imax)}

        lndTextline = MultiPageXml.getChildByName(ndPage, 'TextLine')

        o = GraphBinaryConjugateSegmenter()
        o.lNode = [MyBlock(nd) for nd in lndTextline]

        for iNd, ndTextline in enumerate(lndTextline):
            sPoints = MultiPageXml.getChildByName(ndTextline, 'Coords')[0].get('points')
            try:
                x1, _y1, x2, _y2 = Polygon.parsePoints(sPoints).fitRectangle()
            except (ZeroDivisionError, ValueError):
                continue
            xm = (x1 + x2) / 2.0

            for i, xi in enumerate(lX):
                if xm <= xi:
                    break
            else:
                # right of every cut, including the page width: extra cluster.
                # (indexing dCluster[imax] directly raised a KeyError)
                i = imax
            dCluster.setdefault(i, []).append(iNd)
            ndTextline.set("DU_cluster", str(i))

        # add clusters
        lNdCluster = o.addClusterToDom(dCluster, bMoveContent=False,
                                       sAlgo="cut", pageNode=ndPage)

        # add a cut_X attribute to the clusters
        # assumes the cluster 'name' is its key in dCluster -- TODO confirm
        for ndCluster in lNdCluster:
            i = int(ndCluster.get('name'))
            if i < imax:
                # the extra right-most cluster has no cut of its own
                ndCluster.set("cut_X", str(lX[i]))
def convertDoc(self, sFilename):
    """
    Convert the human annotation of the given file, page by page, and write
    the result next to it.

    sFilename must end with self.sXml_HumanAnnotation_Extension
    (AssertionError otherwise). The output filename is the input one with that
    extension replaced by self.sXml_MachineAnnotation_Extension.

    return the name of the written file
    """
    sInExt = self.sXml_HumanAnnotation_Extension
    assert sFilename.endswith(sInExt)

    g = Graph_MultiPageXml()

    # etree.parse takes no 'encoding' keyword (passing one raises TypeError):
    # the encoding is read from the XML declaration of the file.
    doc = etree.parse(sFilename)

    #the Heigh/Ho annotation runs over consecutive pages, so we keep those values accross pages
    self._initSegmentationLabel()
    self.lSeenResoNum = list()

    for pnum, page, domNdPage in g._iter_Page_DocNode(doc):
        self._convertPageAnnotation(pnum, page, domNdPage)

    MultiPageXml.setMetadata(doc, None, self.sMetadata_Creator, self.sMetadata_Comments)

    # explicit end index, so that an empty extension does not empty the name
    # (sFilename[:-0] is the empty string)
    sDUFilename = (sFilename[:len(sFilename) - len(sInExt)]
                   + self.sXml_MachineAnnotation_Extension)
    doc.write(sDUFilename,
              xml_declaration=True,
              encoding="utf-8",
              pretty_print=True)
    return sDUFilename
def main(lsFilename, lsOutFilename):
    """
    Annotate the table rows, columns and headers of each input file and write
    the result in the corresponding output file.

    lsFilename and lsOutFilename are paired in order.
    A file without any TableCell is skipped with an error trace.
    When the separators cannot be added (TableAnnotationException), the file
    is reported as ignored and nothing is written for it.
    """
    #for the pretty printer to format better...
    parser = etree.XMLParser(remove_blank_text=True)

    for sFilename, sOutFilename in zip(lsFilename, lsOutFilename):
        doc = etree.parse(sFilename, parser)
        root = doc.getroot()

        lCells = MultiPageXml.getChildByName(root, 'TableCell')
        if not lCells:
            traceln("ERROR: no TableCell - SKIPPING THIS FILE!!!")
            continue

        # default: O for all cells: all cells must have all tags!
        sDefaultRow = lLabelsBIESO_R[-1]
        sDefaultCol = lLabelsSM_C[-1]
        sDefaultHeader = lLabels_HEADER[-1]
        for cell in lCells:
            for ndText in MultiPageXml.getChildByName(cell, 'TextLine'):
                ndText.set(sDURow, sDefaultRow)
                ndText.set(sDUCol, sDefaultCol)
                ndText.set(sDUHeader, sDefaultHeader)

        # NOTE: Oct' 2018 RV and JL decided that we keep the binding TextLine
        # (if any!). The code that discarded the "binding" cells (those with a
        # large rowSpan) was disabled by an "if False:" and is not kept here.

        # FOR COLUMN HEADER: get max(cell[0,i].span)
        maxRowSpan = computeMaxRowSpan(lCells)
        tag_DU_row_col_header(root, lCells, maxRowSpan)

        try:
            removeSeparator(root)
            addSeparator(root, lCells)
            doc.write(sOutFilename, encoding='utf-8', pretty_print=True,
                      xml_declaration=True)
            traceln('annotation done for %s  --> %s' % (sFilename, sOutFilename))
        except TableAnnotationException:
            traceln("No Table region in file ", sFilename, "  IGNORED!!")
def storeMultiPageXml(self,lListDocs,outputFileName=None):
    """
    Build a MultiPageXml document from the first item of each entry of
    lListDocs and write it to disk.

    When outputFileName is None, the document is written in the "col" folder
    that is a sibling of the folder of self.inputFileName, under the input
    base name minus its last 7 characters, plus "_du.mpxml".
    (presumably the 7 characters are a ".ds.xml" extension -- TODO confirm)
    """
    from xml_formats.PageXml import MultiPageXml

    lDoc = [tDoc[0] for tDoc in lListDocs]
    newDoc = MultiPageXml().makeMultiPageXmlMemory(lDoc)

    if outputFileName is None:
        sDir = os.path.dirname(self.inputFileName)
        sBase = os.path.basename(self.inputFileName)[:-7]
        outputFileName = os.sep.join([sDir, "..", "col", sBase]) + "_du.mpxml"

    newDoc.write(outputFileName,
                 encoding="UTF-8",
                 pretty_print=True,
                 xml_declaration=True)
def remove_cuts_from_dom(self, root):
    """
    Detach every CutSeparator element found under root.
    Modify the XML DOM
    return the number of removed cut lines
    """
    sTag = 'CutSeparator'
    lndCut = MultiPageXml.getChildByName(root, sTag)
    for ndCut in lndCut:
        ndCut.getparent().remove(ndCut)

    # sanity check: nothing must be left
    assert len(MultiPageXml.getChildByName(root, sTag)) == 0
    return len(lndCut)
def add_grid_to_DOM(self, root, ltlHlV=None):
    """
    Add the grid lines to the DOM, as GridSeparator elements
    Tag them if ltlHlV is given: it is a list of (lHi, lVi) per page, i.e. the
    groundtruth info for the horizontal and for the vertical grid lines. When
    it is None, or has no entry for a page, the grid lines of that page get
    no "type" attribute.
    Modify the XML DOM
    return the number of grid lines created
    """
    def addPageXmlSeparator(nd, i, lGTi, x1, y1, x2, y2, domid):
        ndSep = MultiPageXml.createPageXmlNode("GridSeparator")
        if lGTi:
            # propagate the groundtruth info we have
            sLabel = self.getLabel(i, lGTi)
            ndSep.set("type", sLabel)
        if abs(x2 - x1) > abs(y2 - y1):
            ndSep.set("orient", "0")
        else:
            ndSep.set("orient", "90")
        ndSep.set("id", "s_%d" % domid)
        nd.append(ndSep)
        ndCoord = MultiPageXml.createPageXmlNode("Coords")
        MultiPageXml.setPoints(ndCoord, [(x1, y1), (x2, y2)])
        ndSep.append(ndCoord)
        return ndSep

    domid = 0  #to add unique separator id and count them

    for iPage, ndPage in enumerate(MultiPageXml.getChildByName(root, 'Page')):
        # the default None used to raise a TypeError here, since only
        # IndexError was caught
        if ltlHlV is None:
            lHi, lVi = [], []
        else:
            try:
                lHi, lVi = ltlHlV[iPage]
            except IndexError:
                lHi, lVi = [], []

        w, h = int(ndPage.get("imageWidth")), int(ndPage.get("imageHeight"))

        # NOTE(review): the TableRegion is looked up from root, not from
        # ndPage, so the grid of every page goes into the first TableRegion
        # of the document -- confirm this is intended for multi-page files.
        ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]

        #Vertical grid lines
        for i, (x1, y1, x2, y2) in enumerate(self.iterGridVerticalLines(w, h)):
            domid += 1
            addPageXmlSeparator(ndTR, i, lVi, x1, y1, x2, y2, domid)

        #horizontal grid lines
        for i, (x1, y1, x2, y2) in enumerate(self.iterGridHorizontalLines(w, h)):
            domid += 1
            addPageXmlSeparator(ndTR, i, lHi, x1, y1, x2, y2, domid)

    return domid
def test_DS2PageXmlConversion():
    """
    Convert the DS test file into PageXml documents, assemble them into one
    MultiPageXml document and write it in the same test folder.
    """
    sInput = os.path.join(sTESTS_DIR,
                          'testDS2PageXml/RRB_MM_01_033_Jahr_1810.ds.xml')
    sOutput = os.path.join(sTESTS_DIR,
                           "testDS2PageXml/RRB_MM_01_033_Jahr_1810.mpxml")

    conv = DS2PageXMLConvertor()
    conv.inputFileName = sInput
    lPageXmlDocs = conv.run(conv.loadDom(sInput))

    # each item is a pair, of which only the first member (the doc) is used
    lDoc = [pageDoc for pageDoc, _other in lPageXmlDocs]
    newDoc = MultiPageXml().makeMultiPageXmlMemory(lDoc)
    newDoc.write(sOutput,
                 xml_declaration=True,
                 encoding="UTF-8",
                 pretty_print=True)
def run(self):
    """
    Load the DOM, convert its table cells, and set the metadata.

    The metadata are first set with PageXml; if that raises a ValueError,
    they are set with MultiPageXml instead.
    return the DOM
    """
    dom = self.loadDom()
    self.convertTableCells(dom)

    sCreator, sComments = 'NLE', 'TableCell 2 TextRegion'
    try:
        PageXml.setMetadata(dom, None, sCreator, Comments=sComments)
    except ValueError:
        MultiPageXml.setMetadata(dom, None, sCreator, Comments=sComments)
    return dom
def addPageXmlSeparator(cls, ndPage, oCut, domid):
    """
    Create a CutSeparator element for the cut object oCut, and append it to
    ndPage.

    The element has the id "cs_<domid>", orient "0", the DU_* attributes that
    reflect the _du_* information of the cut (the angle is given in degrees),
    and a Coords child set from oCut.coords.
    return the new CutSeparator node
    """
    ndCut = MultiPageXml.createPageXmlNode("CutSeparator")

    # propagate the groundtruth info we have
    lAttr = [
        ("DU_type", oCut._du_label),
        ("orient", "0"),
        ("DU_angle", "%.1f" % math.degrees(oCut._du_angle)),
        ("DU_angle_freq", "%.3f" % oCut._du_angle_freq),
        ("DU_angle_cumul_freq", "%.3f" % oCut._du_angle_cumfreq),
        ("DU_set_support", "%s" % oCut._du_set_support),
        ("id", "cs_%d" % domid),
    ]
    for sName, sValue in lAttr:
        ndCut.set(sName, sValue)
    ndPage.append(ndCut)

    ndPoints = MultiPageXml.createPageXmlNode("Coords")
    MultiPageXml.setPoints(ndPoints, oCut.coords)
    ndCut.append(ndPoints)
    return ndCut
def addPageXmlSeparator(cls, nd, sLabel, x1, y1, x2, y2, domid):
    """
    Create a CutSeparator element for the segment (x1, y1) - (x2, y2), and
    append it to nd.

    The element has the id "s_<domid>", a "type" attribute if sLabel is not
    None, and an "orient" attribute: "0" when the segment is wider than it is
    high, "90" otherwise.
    return the new CutSeparator node
    """
    ndCut = MultiPageXml.createPageXmlNode("CutSeparator")
    if sLabel is not None:
        # propagate the groundtruth info we have
        ndCut.set("type", sLabel)

    bWiderThanHigh = abs(x2 - x1) > abs(y2 - y1)
    ndCut.set("orient", "0" if bWiderThanHigh else "90")
    ndCut.set("id", "s_%d" % domid)
    nd.append(ndCut)

    ndPoints = MultiPageXml.createPageXmlNode("Coords")
    MultiPageXml.setPoints(ndPoints, [(x1, y1), (x2, y2)])
    ndCut.append(ndPoints)
    return ndCut
def addEdgeToDOM(self):
    """
    To display the graph conveniently we add new Edge elements (done by the
    parent class).
    Since we change the Baseline representation, we also show the new one:
    the points of the Baseline of each "row" block are replaced by the
    coordinates of the shape computed by self.shaper_fun.

    Any block must be of type "row" or "sepH" (AssertionError otherwise).
    """
    super().addEdgeToDOM()

    for blk in self.lNode:
        sTypeName = blk.type.name
        assert sTypeName in ["row", "sepH"], sTypeName
        if sTypeName != "row":
            continue
        ndBaseline = blk.node.xpath(".//pc:Baseline", namespaces=self.dNS)[0]
        oShape = self.shaper_fun(ndBaseline)
        MultiPageXml.setPoints(ndBaseline, list(oShape.coords))
    return
def op_gt_recall(lsFilename, bCutAbove, lDegAngle, fMinHorizProjection=0.05, fCutHeight=25):
    """
    Count the labels of the almost-horizontal cuts found on all pages of all
    given files, and trace the count per label.

    lDegAngle gives the angles in degrees; they are passed in radians to the
    SkewedCutAnnotator. fMinHorizProjection is not used here.
    """
    cAll = Counter()

    for sFilename in lsFilename:
        traceln("- loading GT: %s" % sFilename)

        #for the pretty printer to format better...
        parser = etree.XMLParser(remove_blank_text=True)
        root = etree.parse(sFilename, parser).getroot()

        lRadAngle = [math.radians(x) for x in lDegAngle]
        doer = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

        lndPage = MultiPageXml.getChildByName(root, 'Page')
        for pnum, ndPage in enumerate(lndPage, start=1):
            traceln(" --- page %s - constructing separator candidates" % pnum)

            #load the page objects and the GT partition (defined by the table) if any
            loBaseline, dsetTableByRow = doer.loadPage(ndPage)
            traceln(" - found %d objects on page" % (len(loBaseline)))

            # find almost-horizontal cuts and tag them if GT is available
            loHCut = doer.findHCut(ndPage, loBaseline, dsetTableByRow, fCutHeight)
            for oCut in loHCut:
                cAll[oCut._du_label] += 1

    lsCount = ["%s:%d" % (k, cAll[k]) for k in sorted(cAll)]
    traceln("GT: ALL CUT Label count:  ", "  ".join(lsCount))
def addPageXmlSeparator(nd, i, lGTi, x1, y1, x2, y2, domid):
    """
    Create a GridSeparator element for the segment (x1, y1) - (x2, y2), and
    append it to nd.

    If lGTi is not empty, the "type" attribute is set to the label given by
    self.getLabel(i, lGTi) (self comes from the enclosing scope).
    The "orient" attribute is "0" when the segment is wider than it is high,
    "90" otherwise. The id is "s_<domid>".
    return the new GridSeparator node
    """
    ndGrid = MultiPageXml.createPageXmlNode("GridSeparator")
    if lGTi:
        # propagate the groundtruth info we have
        ndGrid.set("type", self.getLabel(i, lGTi))

    bWiderThanHigh = abs(x2 - x1) > abs(y2 - y1)
    ndGrid.set("orient", "0" if bWiderThanHigh else "90")
    ndGrid.set("id", "s_%d" % domid)
    nd.append(ndGrid)

    ndPoints = MultiPageXml.createPageXmlNode("Coords")
    MultiPageXml.setPoints(ndPoints, [(x1, y1), (x2, y2)])
    ndGrid.append(ndPoints)
    return ndGrid
def _atanOfRatio(fNum, fDen):
    """
    return atan(fNum / fDen) in radians, without failing when fDen is 0:
    +/- pi/2 according to the sign of fNum, and 0 when both are 0
    """
    if fDen == 0:
        return math.atan2(fNum, 0)
    return math.atan(fNum / fDen)


def _appendSeparatorRegion(ndTR, sOrient, sIndexName, index, x1, y1, x2, y2):
    """
    append to ndTR a SeparatorRegion with the given orient attribute, the
    row or col index, and the 2 points as Coords child
    """
    ndSep = MultiPageXml.createPageXmlNode("SeparatorRegion")
    ndSep.set("orient", sOrient)
    ndSep.set(sIndexName, "%d" % index)
    ndTR.append(ndSep)
    ndCoord = MultiPageXml.createPageXmlNode("Coords")
    MultiPageXml.setPoints(ndCoord, [(x1, y1), (x2, y2)])
    ndSep.append(ndCoord)


def _appendAngleStat(ndTR, sFormat, lB):
    """
    append to ndTR a comment with the average, stdev, min and max of the
    angles in lB, and trace it. Do nothing if lB is empty.
    """
    if not lB:
        # min() and max() raise a ValueError on an empty list
        return
    sStat = sFormat % (np.average(lB), np.std(lB), min(lB), max(lB))
    ndTR.append(etree.Comment(sStat))
    traceln(sStat)


def addSeparator(root, lCells):
    """
    Add separator that correspond to cell boundaries
    modify the XML DOM

    One SeparatorRegion is appended to the first TableRegion per row and per
    column boundary given by getCellsSeparators(lCells). Its orient attribute
    includes the angle of the boundary, in degrees. A comment with statistics
    on the angles is appended after the rows, and after the columns.

    raise TableAnnotationException if there is no TableRegion
    """
    dRow, dCol = getCellsSeparators(lCells)

    try:
        ndTR = MultiPageXml.getChildByName(root, 'TableRegion')[0]
    except IndexError:
        raise TableAnnotationException("No TableRegion!!! ")

    lB = []
    for row in sorted(dRow.keys()):
        (x1, y1), (x2, y2) = dRow[row]
        # angle with the horizontal axis
        b = math.degrees(_atanOfRatio(y2 - y1, x2 - x1))
        lB.append(b)
        _appendSeparatorRegion(ndTR, "horizontal angle=%.2f" % b,
                               "row", row, x1, y1, x2, y2)
    _appendAngleStat(
        ndTR,
        "\tHORIZONTAL: Average=%.1f°  stdev=%.2f°  min=%.1f° max=%.1f°",
        lB)

    lB = []
    for col in sorted(dCol.keys()):
        (x1, y1), (x2, y2) = dCol[col]
        # 90 degrees minus the angle with the vertical axis
        b = 90 - math.degrees(_atanOfRatio(x2 - x1, y2 - y1))
        lB.append(b)
        _appendSeparatorRegion(ndTR, "vertical %.2f" % b,
                               "col", col, x1, y1, x2, y2)
    _appendAngleStat(
        ndTR,
        "\tVERTICAL  : Average=%.1f°  stdev=%.2f°  min=%.1f° max=%.1f°",
        lB)

    return
def remove_grid_from_dom(self, root):
    """
    clean the DOM from any existing grid (useful to choose at run-time the
    grid increment (called step)
    Modify the XML DOM
    return the number of removed grid lines
    """
    # The GridSeparator are searched from root, so a single pass covers all
    # pages. The previous per-page loop redid the same search for each page:
    # the count was reset to 0 from the second page on, and was undefined for
    # a document without Page.
    lnd = MultiPageXml.getChildByName(root, 'GridSeparator')
    n = len(lnd)
    for nd in lnd:
        nd.getparent().remove(nd)

    #check...
    assert len(MultiPageXml.getChildByName(root, 'GridSeparator')) == 0
    return n
def isBaselineHorizontal(ndText):
    """
    return True if the bounding box of the first Baseline found under ndText
    is at least as wide as it is high.
    Also return True when there is no Baseline, or when the Baseline cannot
    be loaded as a LineString.
    """
    lNdBaseline = MultiPageXml.getChildByName(ndText, 'Baseline')
    if not lNdBaseline:
        return True

    try:
        o = ShapeLoader.node_to_LineString(lNdBaseline[0])
    except Exception:
        # best effort: an unparsable baseline is considered horizontal.
        # (not a bare except, which also traps KeyboardInterrupt/SystemExit)
        return True

    (minx, miny, maxx, maxy) = o.bounds
    return bool(maxx - minx >= maxy - miny)
def get_separator_YX_from_DOM(self, root, fMinPageCoverage):
    """
    get the y and x of the GT table separators, per page

    A SeparatorRegion that is wider than it is high is kept as a horizontal
    line if its width exceeds fMinPageCoverage * page width; the others are
    kept as vertical lines if their height exceeds fMinPageCoverage * page
    height.

    A warning is traced for a page that does not have exactly 1 TableRegion;
    the separators of all its TableRegion are considered.

    return one (y_list, x_list) per page, where y_list contains the (y1, y2)
    of the horizontal separators and x_list the (x1, x2) of the vertical ones
    """
    ltlYlX = []

    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        iPageW = int(ndPage.get("imageWidth"))
        iPageH = int(ndPage.get("imageHeight"))

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        nTable = len(lndTable)
        if nTable != 1:
            if lndTable:
                traceln("** warning ** %d TableRegion instead of expected 1" % nTable)
            else:
                traceln("** warning ** no TableRegion, expected 1")

        lYi, lXi = [], []
        for ndTR in lndTable:
            #enumerate the table separators
            for ndSep in MultiPageXml.getChildByName(ndTR, 'SeparatorRegion'):
                ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
                (x1, y1), (x2, y2) = Polygon.parsePoints(ndCoords.get('points')).lXY

                dx, dy = abs(x2 - x1), abs(y2 - y1)
                if dx > dy:
                    #horizontal table line
                    if dx > (fMinPageCoverage * iPageW):
                        lYi.append((y1, y2))
                elif dy > (fMinPageCoverage * iPageH):
                    #vertical table line
                    lXi.append((x1, x2))

        ltlYlX.append((lYi, lXi))

    return ltlYlX
def get_grid_GT_index_from_DOM(self, root, fMinPageCoverage):
    """
    get the index in our grid of the table lines, per page

    The middle of each long-enough SeparatorRegion of the TableRegion of the
    page is snapped to the grid, by self.snapToGridIndex:
    - a separator wider than it is high counts as horizontal if its width
      exceeds fMinPageCoverage * page width; its middle y is snapped using
      self.iGridVertiStep
    - the others count as vertical if their height exceeds
      fMinPageCoverage * page height; their middle x is snapped using
      self.iGridHorizStep

    A page with more than 1 TableRegion raises an AssertionError.
    return one (h_list, v_list) of grid indices per page
    """
    ltlHlV = []

    for ndPage in MultiPageXml.getChildByName(root, 'Page'):
        iPageW = int(ndPage.get("imageWidth"))
        iPageH = int(ndPage.get("imageHeight"))
        lHi, lVi = [], []

        lndTable = MultiPageXml.getChildByName(ndPage, 'TableRegion')
        if not lndTable:
            # no table on this page
            ltlHlV.append((lHi, lVi))
            continue
        assert len(lndTable) == 1, "More than 1 TableRegion??"

        #enumerate the table separators
        for ndSep in MultiPageXml.getChildByName(lndTable[0], 'SeparatorRegion'):
            ndCoords = MultiPageXml.getChildByName(ndSep, 'Coords')[0]
            (x1, y1), (x2, y2) = Polygon.parsePoints(ndCoords.get('points')).lXY

            dx, dy = abs(x2 - x1), abs(y2 - y1)
            if dx > dy:
                #horizontal table line
                if dx > (fMinPageCoverage * iPageW):
                    ym = (y1 + y2) / 2.0  # 2.0 to support python2
                    lHi.append(self.snapToGridIndex(ym, self.iGridVertiStep))
            elif dy > (fMinPageCoverage * iPageH):
                #vertical table line
                xm = (x1 + x2) / 2.0
                lVi.append(self.snapToGridIndex(xm, self.iGridHorizStep))

        ltlHlV.append((lHi, lVi))

    return ltlHlV
def getDomBaselineXY(cls, domNode):
    """
    find the first Baseline node under domNode and return its "central"
    point, as computed by cls.getPolylineAverageXY

    return a tuple (x, y)
    raise IndexError, after a warning trace, if there is no Baseline
    """
    lndBaseline = MultiPageXml.getChildByName(domNode, 'Baseline')
    try:
        ndBaseline = lndBaseline[0]
    except IndexError:
        traceln("WARNING: No Baseline child in ", domNode.get('id'))
        raise

    # modulo should be done only after the GT assigns labels.
    x, y = cls.getPolylineAverageXY(ndBaseline)
    return (x, y)
def _shapeFromNodePoints(cls, nd, ShapeClass):
    """
    Build a shape of the given class from the points of a node.

    The points are read from the 'points' attribute of the node itself if it
    has one, otherwise from the one of its first Coords child, e.g.
        <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
            <Coords points="3324,392 3638,394"/>
        </SeparatorRegion>

    return the result of cls._shapeFromPoints
    """
    sPoints = nd.get('points')
    if sPoints is not None:
        return cls._shapeFromPoints(sPoints, ShapeClass)

    ndCoords = MultiPageXml.getChildByName(nd, 'Coords')[0]
    return cls._shapeFromPoints(ndCoords.get('points'), ShapeClass)
def evalClusterByRow(self, sFilename):
    """
    Evaluate the quality of the partitioning by table row, by comparing the
    GT table information to the partition done automatically (thanks to the
    separators added to the DOM).

    As of now, this method only loads the file into self.doc and builds, for
    each page, the groundtruth partitions of the blocks by row and by column,
    each sorted by position. Nothing is compared nor returned yet.
    """
    self.doc = etree.parse(sFilename)

    # NOTE: the re-annotation of the cuts by a BaselineCutAnnotator (load the
    # GT separators, remove any existing cut, add the cuts to the DOM) used to
    # be here, as commented-out code.

    lClassicType = [nt for nt in self.getNodeTypeList()
                    if nt in self._lClassicNodeType]

    #load the block nodes per page
    for (pnum, page, domNdPage) in self._iter_Page_DomNode(self.doc):
        #now that we have the page, let's create the node for each type!
        lClassicPageNode = [nd for nodeType in lClassicType
                            for nd in nodeType._iter_GraphNode(self.doc, domNdPage, page)]

        # -- GT ---------------------------------------------
        # partition by columns and rows
        dGTByRow = collections.defaultdict(list)
        dGTByCol = collections.defaultdict(list)
        for blk in lClassicPageNode:
            # NOTE(review): blk is a graph node; the ancestor lookup may need
            # its DOM node (blk.node) instead -- confirm
            cell = MultiPageXml.getAncestorByName(blk, 'TableCell')[0]
            row, col, _rowSpan, _colSpan = [int(cell.get(sProp)) for sProp
                                            in ["row", "col", "rowSpan", "colSpan"]]
            # TODO: deal with span
            dGTByRow[row].append(blk)
            # store the block (the column index used to be appended instead)
            dGTByCol[col].append(blk)

        # .items was not called, which raised a TypeError
        for l in dGTByRow.values():
            l.sort(key=lambda o: (o.x1, o.y1))
        for l in dGTByCol.values():
            l.sort(key=lambda o: (o.y1, o.x1))
def getDocSeparators(sFilename):
    """
    Parse the given file and compute the separators of its table cells.

    return the two dictionaries computed by getCellsSeparators from all the
    TableCell of the file: one for the rows, one for the columns
    raise DocSeparatorException if the file contains no TableCell
    """
    root = etree.parse(sFilename, etree.XMLParser()).getroot()

    lCell = MultiPageXml.getChildByName(root, 'TableCell')
    if not lCell:
        raise DocSeparatorException("No TableCell element in %s" %sFilename)

    dRowSep, dColSep = getCellsSeparators(lCell)
    return dRowSep, dColSep
def storeMPXML(self, lFiles):
    """
    store files in lFiles as mpxml

    A CornerPts child, with content "0 1 2 3" and the namespace of the root
    element, is added to each TableCell before saving.
    The document is saved as <coldir>/col/<docid>.mpxml

    return the document and the name of the saved file
    """
    sColDir = self.coldir + os.sep + 'col'
    sMPXML = os.path.join(sColDir, self.docid) + ".mpxml"

    doc = MultiPageXml.makeMultiPageXml(lFiles)

    ## add cornerNode
    for ndCell in PageXml.getChildByName(doc.getRootElement(), 'TableCell'):
        ndCorner = libxml2.newNode('CornerPts')
        ndCorner.setContent("0 1 2 3")
        ndCorner.setNs(doc.getRootElement().ns())
        ndCell.addChild(ndCorner)

    doc.saveFormatFileEnc(sMPXML, "UTF-8", True)
    return doc, sMPXML
def defineTableBordersFromCells(table):
    """
    Table points are too loose: redefine them with cells regions

    The borders are computed by getTBLRBorders from the cells of the first
    and last column, and of the first and last row of the table.

    return the left border of the first column, the right border of the last
    column, the top border of the first row, the bottom border of the last row
    raise ValueError if the table contains no TableCell
    """
    lCells = MultiPageXml.getChildByName(table, 'TableCell')

    sMaxRow = str(max(int(x.get('row')) for x in lCells))
    sMaxCol = str(max(int(x.get('col')) for x in lCells))

    # real lists, not filter(): in Python 3 filter() returns a one-shot
    # iterator, which is empty for a callee that walks it a second time
    col1 = [x for x in lCells if x.get('col') == "0"]
    colN = [x for x in lCells if x.get('col') == sMaxCol]
    row1 = [x for x in lCells if x.get('row') == "0"]
    rowN = [x for x in lCells if x.get('row') == sMaxRow]

    # getTBLRBorders result is indexed as:  T R B L
    lColSep_1 = getTBLRBorders(col1)[3]
    lColSep_N = getTBLRBorders(colN)[1]
    lRowSep_1 = getTBLRBorders(row1)[0]
    lRowSep_N = getTBLRBorders(rowN)[2]

    return lColSep_1, lColSep_N, lRowSep_1, lRowSep_N
def op_cut(sFilename, sOutFilename, lDegAngle, bCutAbove, fMinHorizProjection=0.05, fCutHeight=25):
    """
    Find the almost-horizontal cuts of each page of the given file, add them
    to the DOM (after removal of any pre-existing cut of the page), and write
    the result in sOutFilename.

    lDegAngle gives the angles in degrees; they are passed in radians to the
    SkewedCutAnnotator. fMinHorizProjection is not used here.
    """
    #for the pretty printer to format better...
    doc = etree.parse(sFilename, etree.XMLParser(remove_blank_text=True))
    root = doc.getroot()

    lRadAngle = [math.radians(x) for x in lDegAngle]
    doer = SkewedCutAnnotator(bCutAbove, lAngle=lRadAngle)

    domid = 0
    for pnum, ndPage in enumerate(MultiPageXml.getChildByName(root, 'Page'), start=1):
        traceln(" --- page %s - constructing separator candidates" % pnum)

        #load the page objects and the GT partition (defined by the table) if any
        loBaseline, dsetTableByRow = doer.loadPage(ndPage)
        traceln(" - found %d objects on page" % (len(loBaseline)))

        # find almost-horizontal cuts and tag them if GT is available
        loHCut = doer.findHCut(ndPage, loBaseline, dsetTableByRow, fCutHeight)

        #create DOM node reflecting the cuts
        #first clean (just in case!)
        nRemoved = doer.remove_cuts_from_dom(ndPage)
        if nRemoved > 0:
            traceln(" - removed %d pre-existing cut lines" % nRemoved)

        # if GT, then we have labelled cut lines in DOM
        domid = doer.add_Hcut_to_Page(ndPage, loHCut, domid)

    doc.write(sOutFilename, encoding='utf-8',
              pretty_print=True, xml_declaration=True)
    print('Annotated cut separators added to %s' % sOutFilename)
def storeMPXML(self, lFiles):
    """
    store files in lFiles as mpxml

    The document is written as <coldir>/col/<docid>.mpxml
    It is not validated against the schema.

    return the document and the name of the written file
    """
    sColDir = self.coldir + os.sep + 'col'
    sMPXML = os.path.join(sColDir, self.docid) + ".mpxml"

    doc = MultiPageXml.makeMultiPageXml(lFiles)
    doc.write(sMPXML,
              encoding="UTF-8",
              pretty_print=True,
              xml_declaration=True)
    return doc, sMPXML
def getHisto(self, lNd, w, _fMinHorizProjection, h, _fMinVertiProjection, fRatio=1.0, fMinHLen=None):
    """
    return two Numpy array reflecting the histogram of projections of objects
    first array along Y axis (horizontal projection), 2nd along X axis
    (vertical projection)

    lNd are the nodes to project; the rectangle fitted on the points of their
    first Coords child is used. w and h are the lengths of the 2nd and of the
    1st array respectively.
    Each rectangle, scaled by self.scale with fRatio, adds its width / w on
    the Y histogram and its height / h on the X histogram.
    when fMinHLen is given , we do not scale horizontally text shorter than
    fMinHLen
    _fMinHorizProjection and _fMinVertiProjection are not used.

    A node whose points raise a ZeroDivisionError or a ValueError is ignored.
    """
    # builtin float: the np.float alias was removed from NumPy
    hy = np.zeros((h,), float)
    hx = np.zeros((w,), float)

    for nd in lNd:
        sPoints = MultiPageXml.getChildByName(nd, 'Coords')[0].get('points')
        try:
            x1, y1, x2, y2 = Polygon.parsePoints(sPoints).fitRectangle()

            if fMinHLen is None or abs(x2 - x1) > fMinHLen:
                _x1, _x2 = self.scale(x1, x2, fRatio)
            else:
                _x1, _x2 = x1, x2
            _y1, _y2 = self.scale(y1, y2, fRatio)
            # assumes the bounds are integers, usable as slice indices
            # -- TODO confirm for self.scale and fitRectangle
            hy[_y1:_y2 + 1] += float(x2 - x1) / w
            hx[_x1:_x2 + 1] += float(y2 - y1) / h
        except ZeroDivisionError:
            pass
        except ValueError:
            pass

    return hy, hx
def processParameters(self):
    """
    what to do with the parameters provided by the command line

    Exit with status 1 if no collection id was given.
    Set self.bFullCol.
    If self.bRegenerateMPXML is set and a document id was given, regenerate
    the MultiPageXml file of the document from its *.pxml files, taken in
    sorted filename order.

    return the regenerated document, or None if nothing was regenerated
    """
    if self.colid is None:
        print('collection id missing!')
        sys.exit(1)

    # NOTE(review): bFullCol is True when a docid IS given, which looks
    # inverted given its name -- confirm against the code that reads it
    self.bFullCol = self.docid is not None

    if not (self.bRegenerateMPXML and self.docid is not None):
        return None

    # glob returns the files in arbitrary order: sort them to get a
    # deterministic page order
    sPattern = os.path.join(self.coldir, sCOL, self.docid, "*.pxml")
    lsPageFile = sorted(glob.glob(sPattern))
    doc = MultiPageXml.makeMultiPageXml(lsPageFile)

    outputFileName = os.path.join(self.coldir, sCOL,
                                  self.docid + TableProcessing.sMPXMLExtension)
    doc.write(outputFileName,
              xml_declaration=True,
              encoding="UTF-8",
              pretty_print=True)
    return doc
def _doShape_getChildByName(cls, node, name, ShapeClass, fun=None):
    """
    do a MultiPageXml.getChildByName from a node to get some nodes,
    e.g. "SeparatorRegion", and construct a shape of the given Shapely class
    from the coordinates of each node, e.g.
        <SeparatorRegion orient="horizontal 373.8 0.006" row="1">
            <Coords points="3324,392 3638,394"/>
        </SeparatorRegion>

    if fun is given, call it for each shape, with arguments:
        shape, current_node

    A node that cannot be loaded is reported on stdout and skipped.

    return the list of shape objects, in the order of getChildByName
    """
    loShape = []
    for ndChild in MultiPageXml.getChildByName(node, name):
        try:
            oShape = cls._shapeFromNodePoints(ndChild, ShapeClass)
        except Exception as e:
            print('ERROR: cannot load this element "%s"' % str(ndChild))
            print('  because "%s"' % e)
            continue

        if fun is not None:
            fun(oShape, ndChild)
        loShape.append(oShape)
    return loShape