def storeMPXML(self, lFiles):
    """
    Merge the page files listed in lFiles into a single MultiPageXml
    document and write it to <coldir>/col/<docid>.mpxml.

    Returns the pair (document, path of the written file).
    """
    sColFolder = self.coldir + os.sep + 'col'
    sMPXML = os.path.join(sColFolder, self.docid) + ".mpxml"

    doc = MultiPageXml.makeMultiPageXml(lFiles)

    # NOTE: the generated document is written as is; it is not validated
    # against the schema here.
    doc.write(
        sMPXML,
        encoding="UTF-8",
        pretty_print=True,
        xml_declaration=True,
    )

    return doc, sMPXML
def storeMPXML(self, lFiles):
    """
    Merge the page files listed in lFiles into a single MultiPageXml
    document, add a CornerPts child to every TableCell, and save the
    result to <coldir>/col/<docid>.mpxml.

    Returns the pair (document, path of the saved file).
    """
    sColFolder = self.coldir + os.sep + 'col'
    sMPXML = os.path.join(sColFolder, self.docid) + ".mpxml"

    doc = MultiPageXml.makeMultiPageXml(lFiles)
    root = doc.getRootElement()

    # every table cell gets a CornerPts child with the fixed content "0 1 2 3",
    # placed in the namespace of the root element
    for cellNode in PageXml.getChildByName(root, 'TableCell'):
        cornerPts = libxml2.newNode('CornerPts')
        cornerPts.setContent("0 1 2 3")
        cornerPts.setNs(root.ns())
        cellNode.addChild(cornerPts)

    doc.saveFormatFileEnc(sMPXML, "UTF-8", True)
    return doc, sMPXML
def processParameters(self):
    """
    Act on the parameters provided on the command line.

    - Exits the process with status 1 when no collection id is set.
    - Sets self.bFullCol.
    - When regeneration is requested (self.bRegenerateMPXML) and a document
      id is set, rebuilds the document's multi-page file from its "*.pxml"
      page files and writes it next to the document folder.

    Returns the regenerated document, or None when nothing was regenerated.
    """
    if self.colid is None:
        print('collection id missing!')
        sys.exit(1)

    # NOTE(review): the flag is True when a document id IS given, which reads
    # oddly against its name -- confirm against the code that consumes it.
    self.bFullCol = self.docid is not None

    if not (self.bRegenerateMPXML and self.docid is not None):
        return None

    # glob returns names in a filesystem-dependent order; sort them so the
    # pages of the regenerated file always come out in the same order.
    lPageFiles = sorted(
        glob.glob(os.path.join(self.coldir, sCOL, self.docid, "*.pxml")))
    doc = MultiPageXml.makeMultiPageXml(lPageFiles)

    outputFileName = os.path.join(
        self.coldir, sCOL, self.docid + TableProcessing.sMPXMLExtension)
    doc.write(outputFileName,
              xml_declaration=True,
              encoding="UTF-8",
              pretty_print=True)
    return doc
def mergeBaselineCells(self, coldir, colid, docid):
    """
    Merge Transkribus text lines into the CVL template tables and
    regenerate the document's .mpxml file.

    Two sets of page files are read from <coldir>/<sCOL>/<docid>/:
      - "*.xml":  pages from the CVL template tool (they carry the tables)
      - "*.pxml": pages processed on Transkribus (they carry the text lines)
    Pages are paired by position. For each pair:
      - the TextRegion elements of the CVL page are removed;
      - the Coords of every Transkribus TextLine are replaced by a box built
        from the bounding box of its Baseline, extended 30 pixels upwards;
      - the TextLine is moved into the TableCell (of the first TableRegion)
        for which self.signedOverlap() with the baseline is the highest, or
        into a new TextRegion of the CVL page when the best value is 0 or
        the table has no cell.
    The merged CVL pages are written to <coldir>/<sCOL>/<docid>.mpxml.
    An existing .mpxml is not read; it is rebuilt from the page files.

    colid is not used by this method.

    Raises:
        AssertionError: the two sets do not have the same number of pages.
        ValueError: a CVL page has no TableRegion.
        IndexError: a TextLine has no Baseline.
    """
    xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
    mpxml = xmlpath + ".mpxml"

    # Pages of the two sets are paired by index below, and glob returns names
    # in a filesystem-dependent order: sort both lists so they line up.
    lxml = sorted(glob.glob(os.path.join(xmlpath, "*.xml")))
    cvldoc = MultiPageXml.makeMultiPageXml(lxml)
    lhtrxml = sorted(glob.glob(os.path.join(xmlpath, "*.pxml")))
    htrdoc = MultiPageXml.makeMultiPageXml(lhtrxml)

    lPXMLPage = PageXml.getChildByName(htrdoc.getroot(), 'Page')
    lXMLPage = PageXml.getChildByName(cvldoc.getroot(), 'Page')
    # explicit raise (not an assert statement) so the check survives python -O
    if len(lXMLPage) != len(lPXMLPage):
        raise AssertionError(
            "page count mismatch: %d xml page(s) vs %d pxml page(s)"
            % (len(lXMLPage), len(lPXMLPage)))

    iHeight = 30  # in pixel

    for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
        ## remove TextRegion from cvlpage
        for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
            tr.getparent().remove(tr)

        ## collect the text lines of the Transkribus page
        lTL = []
        for x in PageXml.getChildByName(pxmlpage, 'TextRegion'):
            lTL.extend(PageXml.getChildByName(x, 'TextLine'))

        ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
        if len(ltable) == 0:
            raise ValueError("NO TABLE")
        lCells = PageXml.getChildByName(ltable[0], 'TableCell')
        lC = [Polygon(PageXml.getPointList(c)) for c in lCells]

        for textline in lTL:
            ## normalization
            lCoordsPoints = PageXml.getChildByName(textline, 'Coords')
            lCoordsB = PageXml.getChildByName(textline, 'Baseline')
            coordB = lCoordsB[0]
            coord = lCoordsPoints[0] if lCoordsPoints else None

            baseline = Polygon(PageXml.getPointList(coordB))
            x1, y1, x2, y2 = baseline.getBoundingBox()
            if coord is not None:
                coord.set(
                    'points', "%d,%d %d,%d %d,%d %d,%d" %
                    (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))

            ## overlap of the baseline with each cell of the table
            lOverlap = [self.signedOverlap(c, baseline) for c in lC]
            # default=0: a table without cells is handled like "no overlap"
            best = max(lOverlap, default=0)
            if best == 0:
                region = PageXml.createPageXmlNode('TextRegion')
                cvlpage.append(region)
                region.append(textline)
            else:
                lCells[lOverlap.index(best)].append(textline)

    # same output options as the other writers of .mpxml files
    cvldoc.write(mpxml,
                 xml_declaration=True,
                 encoding="UTF-8",
                 pretty_print=True)