コード例 #1
0
    def storeMPXML(self, lFiles):
        """
            Build one document from lFiles with
            MultiPageXml.makeMultiPageXml and save it as
            <coldir>/col/<docid>.mpxml (UTF-8, pretty-printed, with an
            XML declaration).

            No schema validation is performed here.

            return the pair (document, path of the written file)
        """
        # The target path is resolved before the document is built.
        sColFolder = self.coldir + os.sep + 'col'
        sOutputFile = os.path.join(sColFolder, self.docid) + ".mpxml"

        mpxmlDoc = MultiPageXml.makeMultiPageXml(lFiles)
        mpxmlDoc.write(
            sOutputFile,
            xml_declaration=True,
            pretty_print=True,
            encoding="UTF-8",
        )

        return mpxmlDoc, sOutputFile
コード例 #2
0
    def storeMPXML(self, lFiles):
        """
            Build one document from lFiles with
            MultiPageXml.makeMultiPageXml, give every 'TableCell' node
            found by PageXml.getChildByName a 'CornerPts' child, and save
            the result as <coldir>/col/<docid>.mpxml (UTF-8, formatted).

            return the pair (document, path of the written file)
        """
        # The target path is resolved before the document is built.
        sColFolder = self.coldir + os.sep + 'col'
        sOutputFile = os.path.join(sColFolder, self.docid) + ".mpxml"

        mpxmlDoc = MultiPageXml.makeMultiPageXml(lFiles)

        # Each cell receives a CornerPts node with the fixed content
        # "0 1 2 3", placed in the namespace of the root element.
        for cellNode in PageXml.getChildByName(mpxmlDoc.getRootElement(),
                                               'TableCell'):
            cornerPts = libxml2.newNode('CornerPts')
            cornerPts.setContent("0 1 2 3")
            cornerPts.setNs(mpxmlDoc.getRootElement().ns())
            cellNode.addChild(cornerPts)

        mpxmlDoc.saveFormatFileEnc(sOutputFile, "UTF-8", True)

        return mpxmlDoc, sOutputFile
コード例 #3
0
    def processParameters(self):
        """
            Act on the parameters provided by the command line.

            - Without a collection id (self.colid is None) a message is
              printed and the process exits with status 1.
            - self.bFullCol is set to whether a document id is available.
            - When self.bRegenerateMPXML is set and a document id is
              available, a document is built from the *.pxml files found
              in <coldir>/<sCOL>/<docid>/ and written to
              <coldir>/<sCOL>/<docid> + TableProcessing.sMPXMLExtension
              (UTF-8, pretty-printed, with an XML declaration).

            return the regenerated document, or None when nothing was
            regenerated
        """
        if self.colid is None:
            print('collection id missing!')
            sys.exit(1)

        # NOTE(review): the flag is True when a document id IS given, which
        # reads like the opposite of "full collection" -- confirm the intent.
        self.bFullCol = self.docid is not None

        if not self.bRegenerateMPXML or self.docid is None:
            return None

        # glob returns files in a filesystem-dependent order; sort them so
        # the input order of the regenerated document is reproducible.
        lsPageFile = sorted(
            glob.glob(os.path.join(self.coldir, sCOL, self.docid, "*.pxml")))
        doc = MultiPageXml.makeMultiPageXml(lsPageFile)

        outputFileName = os.path.join(
            self.coldir, sCOL,
            self.docid + TableProcessing.sMPXMLExtension)
        doc.write(outputFileName,
                  xml_declaration=True,
                  encoding="UTF-8",
                  pretty_print=True)
        return doc
コード例 #4
0
    def mergeBaselineCells(self, coldir, colid, docid):
        """
            Merge the text lines of the Transkribus pages into the table
            cells of the CVL template pages, then write the mpxml file.

            Two sets of page files are read from <coldir>/<sCOL>/<docid>/:
              - *.xml  : pages from the CVL template tool (tables, cells)
              - *.pxml : pages processed on Transkribus (text lines)
            The two sets are paired page by page. On each pair:
              - the TextRegions of the CVL page are removed;
              - the Coords of every Transkribus TextLine are replaced by a
                box derived from the bounding box of its Baseline;
              - the TextLine is moved into the TableCell with the largest
                self.signedOverlap value, or into a new TextRegion
                appended to the CVL page when the largest value is 0 or
                the table has no cell.
            The merged CVL document is written to
            <coldir>/<sCOL>/<docid>.mpxml; any existing file at that path
            is overwritten without being read.

            coldir: collection root folder
            colid:  not used by this method
            docid:  document id (name of the page folder)

            raise AssertionError when the two sets have different numbers
                  of pages
            raise ValueError when a CVL page has no TableRegion
        """
        iHeight = 30  # in pixel: height added above the baseline box

        xmlpath = os.path.abspath(os.path.join(coldir, sCOL, docid))
        mpxml = xmlpath + ".mpxml"

        # glob gives no ordering guarantee, and the pages of the two sets
        # are paired by position below: sort both listings.
        # NOTE(review): pairing assumes matching base names in both sets
        # -- confirm.
        lxml = sorted(glob.glob(os.path.join(xmlpath, "*.xml")))
        pxmldoc = MultiPageXml.makeMultiPageXml(lxml)

        lhtrxml = sorted(glob.glob(os.path.join(xmlpath, "*.pxml")))
        htrdoc = MultiPageXml.makeMultiPageXml(lhtrxml)

        lPXMLPage = PageXml.getChildByName(htrdoc.getroot(), 'Page')
        lXMLPage = PageXml.getChildByName(pxmldoc.getroot(), 'Page')

        # Raised explicitly (same exception type as the former assert) so
        # the check is still performed when Python runs with -O.
        if len(lXMLPage) != len(lPXMLPage):
            raise AssertionError(
                "page count mismatch: %d xml page(s) vs %d pxml page(s)" %
                (len(lXMLPage), len(lPXMLPage)))

        for cvlpage, pxmlpage in zip(lXMLPage, lPXMLPage):
            ## remove TextRegion from the cvl page
            for tr in PageXml.getChildByName(cvlpage, 'TextRegion'):
                tr.getparent().remove(tr)

            ## collect the text lines of the Transkribus page
            lTL = []
            for region in PageXml.getChildByName(pxmlpage, 'TextRegion'):
                lTL.extend(PageXml.getChildByName(region, 'TextLine'))

            ltable = PageXml.getChildByName(cvlpage, 'TableRegion')
            if not ltable:
                # a bare string cannot be raised; use a real exception
                raise ValueError("NO TABLE")
            lCells = PageXml.getChildByName(ltable[0], 'TableCell')

            lC = [Polygon(PageXml.getPointList(c)) for c in lCells]
            lT = [Polygon(PageXml.getPointList(t)) for t in lTL]
            for textline, tl in zip(lTL, lT):
                ## normalization
                lCoordsPoints = PageXml.getChildByName(textline, 'Coords')
                lCoordsB = PageXml.getChildByName(textline, 'Baseline')
                coordB = lCoordsB[0]
                coord = lCoordsPoints[0]
                x1, y1, x2, y2 = Polygon(
                    PageXml.getPointList(coordB)).getBoundingBox()
                if coord is not None:
                    # four corners: the baseline bounding box with its
                    # y1 edge moved by iHeight
                    coord.set(
                        'points', "%d,%d %d,%d %d,%d %d,%d" %
                        (x1, y1 - iHeight, x2, y1 - iHeight, x2, y2, x1, y2))
                    # NOTE(review): the polygon used for the overlap is
                    # built from the Baseline points, not from the new
                    # Coords box -- confirm this is intended.
                    tl = Polygon(PageXml.getPointList(coordB))

                lOverlap = [self.signedOverlap(c, tl) for c in lC]
                if not lOverlap or max(lOverlap) == 0:
                    ## no cell for this line: give it its own region
                    region = PageXml.createPageXmlNode('TextRegion')
                    cvlpage.append(region)
                    region.append(textline)
                else:
                    cell = lCells[lOverlap.index(max(lOverlap))]
                    cell.append(textline)

        # same serialisation options as the other mpxml writers
        pxmldoc.write(mpxml,
                      encoding="UTF-8",
                      pretty_print=True,
                      xml_declaration=True)