def getPayloadContent(self): format = self.__metadata.getField("dc_format") slash = self.__oid.rfind("/") pid = self.__oid[slash+1:] print " *** payload content, format: %s, pid: %s *** " % (format, pid) contentStr = "" if format.startswith("text"): contentStr = "<pre>" payload = self.__storage.getPayload(self.__oid, pid) str = StringWriter() IOUtils.copy(payload.getInputStream(), str) contentStr += str.toString() contentStr += "</pre>" elif format.find("vnd.ms-")>-1 or format.find("vnd.oasis.opendocument.")>-1: #get the html version if exist.... pid = pid[:pid.find(".")] + ".htm" payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader() document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//div[@class='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") return contentStr
class SyncXmlFilesWalker(GenericDirWalkerXML): """ Synchronizes XML files one-by-one with the eXist XML repository """ def __init__(self, input_params = None): """ input_params - the parameters for the callback function """ super(GenericDirWalkerXML, self).__init__() self.main_cfg = input_params["main_config"] self.webdav_cfg = input_params["webdav_config"] self.transformer = Transformer(self.main_cfg) def create_sync_file(self): OutputFormat.createPrettyPrint() self.format = OutputFormat.createCompactFormat() self.document = DocumentFactory.getInstance().createDocument() self.root = self.document.addElement("collection") self.root.addAttribute("name", "synclist") return self.root def add_item_to_repo(self,repo_bound_file): name = self.root.addElement("file") name.addText(repo_bound_file) def close_sync_file(self): self.format = OutputFormat.createPrettyPrint() self.writer = XMLWriter(FileWriter(self.main_cfg.get_temp_files_folder()+"reposync.xml"), self.format) try: self.writer.write(self.document) self.writer.flush() except Exception, ex: LOG.error("Error while writing sync file reposync.xml", ex) finally:
def getPayloadContent(self): mimeType = self.__mimeType print " * single.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % ( contextPath, portalId, self.__oid, ) else: pid = self.__oid[self.__oid.rfind("/") + 1 :] payload = self.__storage.getPayload(self.__oid, pid) print " * single.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif ( mimeType == "application/pdf" or mimeType.find("vnd.ms") > -1 or mimeType.find("vnd.oasis.opendocument.") > -1 ): # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * single.py: pid=%s" % pid # contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") # linkNodes = slideNode.selectNodes("//img") # contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = '<p class="error">No preview available</p>' elif mimeType.startswith("image/"): src = "%s/%s" % (self.__oid, self.__pid) contentStr = ( '<a class="image" href="%(src)s" style="max-width:98%%">' '<img src="%(src)s" style="max-width:100%%" /></a>' % {"src": self.__pid} ) return contentStr
def write_to_disk(self): #print "XXXX write_to_disk ", self.xmlfile #print "XXXX write_to_disk writing dom ", self.xmldoc.asXML() format = OutputFormat.createPrettyPrint() writer = XMLWriter(FileWriter(self.xmlfile), format) try: writer.write(self.xmldoc) writer.flush() except Exception, ex: LOG.error("Error while writing %s to disk" % self.xmlfile, ex)
def getPayloadContent(self): mimeType = self.__mimeType print " * detail.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType == "application/octet-stream": dcFormat = self.__json.get("response/docs/dc_format") if dcFormat is not None: dcFormat = dcFormat[1:-1] print dcFormat, mimeType if dcFormat != mimeType: return "<div><em>(File not found)</em></div>" else: return "<div><em>(Binary file)</em></div>" elif mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \ (contextPath, portalId, self.__oid) else: pid = self.__oid[self.__oid.rfind("/")+1:] payload = self.__storage.getPayload(self.__oid, pid) #print " * detail.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... 
pid = os.path.splitext(self.__pid)[0] + ".htm" print " * detail.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" return contentStr
def write_cache_doc_to_file(self, cache_doc): format = OutputFormat.createPrettyPrint() writer = XMLWriter( FileWriter(self.cache_file), format ) try: writer.write(cache_doc) writer.flush() except Exception, e: print "Error WRITE_CACHE_DOC_TO_FILE ", e
def write_cache_doc_to_file(self, cache_doc): #print "XXXX WRITE CACHE TO FILE ", cache_doc.getRootElement().getDocument().asXML() format = OutputFormat.createPrettyPrint() writer = XMLWriter( FileWriter(self.cache_file), format ) try: writer.write(cache_doc) writer.flush() except Exception, e: print "Error WRITE_CACHE_DOC_TO_FILE ", e
def getPayloadContent(self): mimeType = self.__mimeType print " * single.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \ (portalPath, self.__oid) else: pid = self.__oid[self.__oid.rfind("/") + 1:] payload = self.__storage.getPayload(self.__oid, pid) print " * single.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find( "vnd.ms") > -1 or mimeType.find( "vnd.oasis.opendocument.") > -1: # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * single.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/download/%s/%s"></iframe>' % \ # (portalPath, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode( "//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" elif mimeType.startswith("image/"): src = "%s/%s" % (self.__oid, self.__pid) contentStr = '<a class="image" href="%(src)s" style="max-width:98%%">' \ '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": self.__pid } return contentStr
def __getPayloadContent(self, oid, pid): print " * combined.py: oid='%s' pid='%s'" % (oid, pid) payload = self.__storage.getPayload(oid, pid) if payload is None: return "<div>Error: No content for '%s'</div>" % oid mimeType = payload.contentType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/download/%s"></iframe>' % \ (portalPath, oid) else: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd.ms")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... pid = os.path.splitext(pid)[0] + ".htm" print " * combined.py: pid=%s" % pid payload = self.__storage.getPayload(oid, pid) saxReader = SAXReader(False) try: document = saxReader.read(payload.getInputStream()) slideNode = document.selectSingleNode("//*[local-name()='body']") slideNode.setName("div") out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) format.setExpandEmptyElements(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") except: traceback.print_exc() contentStr = "<p class=\"error\">No preview available</p>" elif mimeType.startswith("image/"): src = "%s/%s" % (oid, pid) contentStr = '<a class="image" href="%(src)s" style="max-width:98%%">' \ '<img src="%(src)s" style="max-width:100%%" /></a>' % { "src": pid } return contentStr
def getPayloadContent(self): mimeType = self.__mimeType print " * detail.py: payload content mimeType=%s" % mimeType contentStr = "" if mimeType.startswith("text/"): if mimeType == "text/html": contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s"></iframe>' % \ (contextPath, portalId, self.__oid) else: pid = self.__oid[self.__oid.rfind("/")+1:] payload = self.__storage.getPayload(self.__oid, pid) print " * detail.py: pid=%s payload=%s" % (pid, payload) if payload is not None: sw = StringWriter() sw.write("<pre>") IOUtils.copy(payload.getInputStream(), sw) sw.write("</pre>") sw.flush() contentStr = sw.toString() elif mimeType == "application/pdf" or mimeType.find("vnd")>-1 or mimeType.find("vnd.oasis.opendocument.")>-1: # get the html version if exist... pid = os.path.splitext(self.__pid)[0] + ".htm" print " * detail.py: pid=%s" % pid #contentStr = '<iframe class="iframe-preview" src="%s/%s/download/%s/%s"></iframe>' % \ # (contextPath, portalId, self.__oid, pid) payload = self.__storage.getPayload(self.__oid, pid) saxReader = SAXReader(Boolean.parseBoolean("false")) try: document = saxReader.read(payload.getInputStream()) except: traceback.print_exc() #slideNode = document.selectSingleNode("//div[@class='body']") slideNode = document.selectSingleNode("//*[local-name()='body']") #linkNodes = slideNode.selectNodes("//img") #contentStr = slideNode.asXML(); # encode character entities correctly out = ByteArrayOutputStream() format = OutputFormat.createPrettyPrint() format.setSuppressDeclaration(True) writer = XMLWriter(out, format) writer.write(slideNode) writer.close() contentStr = out.toString("UTF-8") return contentStr
def save_xml(self, path, pretty=True):
    """
    Saves the xml document to file

    path - destination file; ``self.backup_xml(path)`` is called first.
    pretty - when true, write with dom4j's pretty-print format using the
             indent string and "\\r\\n" line separator set below; otherwise
             use XMLWriter's default format.

    The writer is closed even if writing fails; the error still propagates.
    """
    self.backup_xml(path)
    if pretty:
        outputFormat = OutputFormat.createPrettyPrint()
        outputFormat.setIndent(" ")
        outputFormat.setLineSeparator("\r\n")
        xmlwriter = XMLWriter(FileWriter(path), outputFormat)
    else:
        xmlwriter = XMLWriter(FileWriter(path))
    try:
        xmlwriter.write(self.xmldoc)
    finally:
        # Release the file handle whether or not the write succeeded.
        xmlwriter.close()
def __createEpub(self):
    """Stream an EPUB archive for this manifest to the HTTP response.

    Writes, in order: the fixed ``mimetype``, ``META-INF/container.xml`` and
    ``OEBPS/epub.css`` resources; one zip entry per payload of every item in
    ``self.__orderedItem``; then the generated ``OEBPS/content.opf`` and
    ``OEBPS/toc.ncx``. The zip stream is closed at the end.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed copy of this method - confirm the loop/branch
    structure against the original file.
    """
    title = self.__manifest.get("title")
    response.setHeader("Content-Disposition", "attachment; filename=%s.epub" % urllib.quote(title))
    out = response.getOutputStream("application/epub+zip")
    zipOutputStream = ZipOutputStream(out)

    #save mimetype... and the rest of standard files in epub
    zipOutputStream.putNextEntry(ZipEntry("mimetype"))
    epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
    IOUtils.copy(epubMimetypeStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
    epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
    IOUtils.copy(epubContainerStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
    epubcss = self.__getResourceAsStream("/epub/epub.css")
    IOUtils.copy(epubcss, zipOutputStream)
    zipOutputStream.closeEntry()

    #### Creating toc.ncx ####
    tocXml = ElementTree.Element("ncx", {"version": "2005-1", "xml:lang":"en", "xmlns":"http://www.daisy.org/z3986/2005/ncx/"})

    headNode = ElementTree.Element("head")
    tocXml.append(headNode)

    headNode.append(ElementTree.Element("meta", {"name": "dtb:uid", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:depth", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:totalPageCount", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:maxPageNumber", "content": "1"}))
    headNode.append(ElementTree.Element("meta", {"name": "dtb:generator", "content": "ICE v2"}))

    #docTitle
    docTitle = ElementTree.Element("docTitle")
    textNode = ElementTree.Element("text")
    textNode.text = title
    docTitle.append(textNode)
    tocXml.append(docTitle)

    #docAuthor
    docAuthor = ElementTree.Element("docAuthor")
    textNode = ElementTree.Element("text")
    textNode.text = "ICE v2"
    docAuthor.append(textNode)
    tocXml.append(docAuthor)

    #navMap
    navMap = ElementTree.Element("navMap")
    tocXml.append(navMap)

    #### Creating content.opf ####
    contentXml = ElementTree.Element("package", {"version": "2.0", "xmlns":"http://www.idpf.org/2007/opf", "unique-identifier":"BookId"})

    metadataNode = ElementTree.Element("metadata", {"xmlns:dc": "http://purl.org/dc/elements/1.1/", "xmlns:opf": "http://www.idpf.org/2007/opf"})
    contentXml.append(metadataNode)

    #metadata information
    metadata = ElementTree.Element("dc:title")
    metadata.text = title
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:language")
    metadata.text = "en-AU"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:creator", {"opf:role":"aut"})
    metadata.text = "ICE"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:publisher")
    metadata.text = "University of Southern Queensland"
    metadataNode.append(metadata)

    # The manifest title doubles as the book identifier text.
    metadata = ElementTree.Element("dc:identifier", {"id":"BookId"})
    metadata.text = title
    metadataNode.append(metadata)

    #manifest
    manifest = ElementTree.Element("manifest")
    contentXml.append(manifest)

    spine = ElementTree.Element("spine", {"toc":"ncx"})
    contentXml.append(spine)

    item = ElementTree.Element("item", {"id":"ncx", "href":"toc.ncx", "media-type":"text/xml"})
    manifest.append(item)
    css = ElementTree.Element("item", {"id":"style", "href":"epub.css", "media-type":"text/css"})
    manifest.append(css)

    # count supplies the playOrder of each navPoint.
    count = 1
    for itemHash in self.__orderedItem:
        # From here on `title` is the per-item title; the manifest title
        # read at the top is no longer used. `id` shadows the builtin and is
        # not referenced again in this method.
        id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[itemHash]

        for payloadId in payloadDict:
            payload, payloadType = payloadDict[payloadId]
            if isinstance(payload, Payload):
                # Zip entry names are lower-cased, with spaces replaced by
                # underscores and backslashes by forward slashes.
                payloadId = payloadId.lower()
                zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                if payloadType == "application/xhtml+xml":
                    zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                    ##process the html....
                    saxReader = SAXReader(False)
                    try:
                        saxDoc = saxReader.read(payload.open())
                        payload.close()
                        # ## remove class or style nodes
                        # classOrStyleNodes = saxDoc.selectNodes("//@class | //@style ")
                        # for classOrStyleNode in classOrStyleNodes:
                        #     node = classOrStyleNode
                        #     if classOrStyleNode.getParent():
                        #         node = classOrStyleNode.getParent()
                        #     if node.getQualifiedName() == "img":
                        #         attr = node.attribute(QName("class"))
                        #         attr = node.attribute(QName("class"))
                        #         if attr:
                        #             node.remove(attr)
                        #         attr = node.attribute(QName("style"))
                        #         if attr:
                        #             node.remove(attr)

                        ## remove name in a tags
                        ahrefs = saxDoc.selectNodes("//*[local-name()='a' and @name!='']")
                        for a in ahrefs:
                            attr = a.attribute(QName("name"))
                            if attr:
                                a.remove(attr)

                        ## fix images src name.... replace space with underscore and all lower case
                        imgs = saxDoc.selectNodes("//*[local-name()='img' and contains(@src, '_files')]")
                        for img in imgs:
                            srcAttr = img.attribute(QName("src"))
                            if srcAttr:
                                src = srcAttr.getValue()
                                #hash the sourcename
                                # The file name (without extension) becomes
                                # "node-<md5 hex>"; the directory part is
                                # lower-cased with spaces -> underscores.
                                filepath, filename = os.path.split(src)
                                filename, ext = os.path.splitext(filename)
                                filename = hashlib.md5(filename).hexdigest()
                                src = os.path.join(filepath.lower().replace(" ", "_"), "node-%s%s" % (filename, ext))
                                img.addAttribute(QName("src"), src.replace(" ", "_"))

                        bodyNode = saxDoc.selectSingleNode("//*[local-name()='div' and @class='body']")
                        bodyNode.setName("div")

                        out = ByteArrayOutputStream()
                        format = OutputFormat.createPrettyPrint()
                        format.setSuppressDeclaration(True)
                        writer = XMLWriter(out, format)
                        writer.write(bodyNode)
                        writer.flush()
                        contentStr = out.toString("UTF-8")

                        # Wrap the extracted body in a minimal XHTML page
                        # titled with the item title.
                        htmlString = """<?xml version="1.0" encoding="UTF-8"?> <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title> <link rel="stylesheet" href="epub.css"/> </head><body>%s</body></html>"""
                        htmlString = htmlString % (title, contentStr)
                        self.__copyString(htmlString, zipOutputStream)
                        # includeFile is assigned but never read in this method.
                        includeFile = False
                    except:
                        # Any failure is only logged; the entry opened above
                        # is then left with whatever was written so far.
                        # NOTE(review): payload.close() is skipped when
                        # read() fails, and this branch never calls
                        # closeEntry() - presumably the next putNextEntry()
                        # or close() finishes the entry; confirm.
                        traceback.print_exc()
                else:
                    #images....
                    zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload.open(), zipOutputStream)
                    payload.close()
                    zipOutputStream.closeEntry()
            else:
                # payload is not a Payload here and is handed to
                # IOUtils.copy directly.
                # NOTE(review): zipEntryId is only assigned in the Payload
                # branch above, so this uses the value from an earlier
                # iteration (or is unbound on the first one) - confirm the
                # intended nesting.
                zipOutputStream.putNextEntry(ZipEntry("OEBPS/%s" % zipEntryId))
                IOUtils.copy(payload, zipOutputStream)
                zipOutputStream.closeEntry()

            # Register the payload in the OPF manifest; the item's main
            # html file gets the item hash as its id.
            itemNode = ElementTree.Element("item", {"media-type":payloadType, "href": zipEntryId})
            if payloadId == htmlFileName.lower():
                itemNode.set("id", itemHash)
            else:
                itemNode.set("id", payloadId.replace("/", "_"))
            manifest.append(itemNode)

        # One navPoint per item: its id is the item hash, or the html file
        # name when the item is flagged as an image.
        if not isImage:
            navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % itemHash, "playOrder":"%s" % count})
        else:
            navPoint = ElementTree.Element("navPoint", {"class":"chapter", "id":"%s" % htmlFileName, "playOrder":"%s" % count})
        navMap.append(navPoint)
        navLabel = ElementTree.Element("navLabel")
        navPoint.append(navLabel)
        textNode = ElementTree.Element("text")
        textNode.text = title
        navLabel.append(textNode)
        content = ElementTree.Element("content")
        navPoint.append(content)
        content.set("src", htmlFileName)
        count +=1

        # The spine lists items in self.__orderedItem order.
        itemRefNode = ElementTree.Element("itemref")
        spine.append(itemRefNode)
        itemRefNode.set("idref", itemHash)

    #saving content.opf...
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
    self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
    zipOutputStream.closeEntry()

    #saving toc.ncx
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
    self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.close()
def __createEpub(self):
    """Stream an EPUB archive for this manifest to the HTTP response.

    The response object is obtained through ``self.vc("response")``. The
    method writes, in order: the fixed ``mimetype``,
    ``META-INF/container.xml`` and ``OEBPS/epub.css`` resources; one zip
    entry per payload of every item in ``self.__orderedItem``; then the
    generated ``OEBPS/content.opf`` and ``OEBPS/toc.ncx``. The zip stream is
    closed at the end.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed copy of this method - confirm the loop/branch
    structure against the original file.
    """
    title = self.__manifest.getString(None, "title")

    self.vc("response").setHeader(
        "Content-Disposition",
        "attachment; filename=%s.epub" % urllib.quote(title))
    out = self.vc("response").getOutputStream("application/epub+zip")
    zipOutputStream = ZipOutputStream(out)

    #save mimetype... and the rest of standard files in epub
    zipOutputStream.putNextEntry(ZipEntry("mimetype"))
    epubMimetypeStream = self.__getResourceAsStream("/epub/mimetype")
    IOUtils.copy(epubMimetypeStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("META-INF/container.xml"))
    epubContainerStream = self.__getResourceAsStream("/epub/container.xml")
    IOUtils.copy(epubContainerStream, zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.putNextEntry(ZipEntry("OEBPS/epub.css"))
    epubcss = self.__getResourceAsStream("/epub/epub.css")
    IOUtils.copy(epubcss, zipOutputStream)
    zipOutputStream.closeEntry()

    #### Creating toc.ncx ####
    tocXml = ElementTree.Element(
        "ncx", {
            "version": "2005-1",
            "xml:lang": "en",
            "xmlns": "http://www.daisy.org/z3986/2005/ncx/"
        })

    headNode = ElementTree.Element("head")
    tocXml.append(headNode)

    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:uid",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:depth",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:totalPageCount",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:maxPageNumber",
            "content": "1"
        }))
    headNode.append(
        ElementTree.Element("meta", {
            "name": "dtb:generator",
            "content": "ICE v2"
        }))

    #docTitle
    docTitle = ElementTree.Element("docTitle")
    textNode = ElementTree.Element("text")
    textNode.text = title
    docTitle.append(textNode)
    tocXml.append(docTitle)

    #docAuthor
    docAuthor = ElementTree.Element("docAuthor")
    textNode = ElementTree.Element("text")
    textNode.text = "ICE v2"
    docAuthor.append(textNode)
    tocXml.append(docAuthor)

    #navMap
    navMap = ElementTree.Element("navMap")
    tocXml.append(navMap)

    #### Creating content.opf ####
    contentXml = ElementTree.Element(
        "package", {
            "version": "2.0",
            "xmlns": "http://www.idpf.org/2007/opf",
            "unique-identifier": "BookId"
        })

    metadataNode = ElementTree.Element(
        "metadata", {
            "xmlns:dc": "http://purl.org/dc/elements/1.1/",
            "xmlns:opf": "http://www.idpf.org/2007/opf"
        })
    contentXml.append(metadataNode)

    #metadata information
    metadata = ElementTree.Element("dc:title")
    metadata.text = title
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:language")
    metadata.text = "en-AU"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:creator", {"opf:role": "aut"})
    metadata.text = "ICE"
    metadataNode.append(metadata)

    metadata = ElementTree.Element("dc:publisher")
    metadata.text = "University of Southern Queensland"
    metadataNode.append(metadata)

    # The manifest title doubles as the book identifier text.
    metadata = ElementTree.Element("dc:identifier", {"id": "BookId"})
    metadata.text = title
    metadataNode.append(metadata)

    #manifest
    manifest = ElementTree.Element("manifest")
    contentXml.append(manifest)

    spine = ElementTree.Element("spine", {"toc": "ncx"})
    contentXml.append(spine)

    item = ElementTree.Element("item", {
        "id": "ncx",
        "href": "toc.ncx",
        "media-type": "text/xml"
    })
    manifest.append(item)
    css = ElementTree.Element("item", {
        "id": "style",
        "href": "epub.css",
        "media-type": "text/css"
    })
    manifest.append(css)

    # count supplies the playOrder of each navPoint.
    count = 1
    for itemHash in self.__orderedItem:
        # From here on `title` is the per-item title; the manifest title
        # read at the top is no longer used. `id` shadows the builtin and is
        # not referenced again in this method.
        id, title, htmlFileName, payloadDict, isImage = self.__itemRefDict[
            itemHash]

        for payloadId in payloadDict:
            payload, payloadType = payloadDict[payloadId]
            if isinstance(payload, Payload):
                # Zip entry names are lower-cased, with spaces replaced by
                # underscores and backslashes by forward slashes.
                payloadId = payloadId.lower()
                zipEntryId = payloadId.replace(" ", "_").replace("\\", "/")
                if payloadType == "application/xhtml+xml":
                    zipOutputStream.putNextEntry(
                        ZipEntry("OEBPS/%s" % zipEntryId))
                    ##process the html....
                    saxReader = SAXReader(False)
                    try:
                        saxDoc = saxReader.read(payload.open())
                        payload.close()
                        # ## remove class or style nodes
                        # classOrStyleNodes = saxDoc.selectNodes("//@class | //@style ")
                        # for classOrStyleNode in classOrStyleNodes:
                        #     node = classOrStyleNode
                        #     if classOrStyleNode.getParent():
                        #         node = classOrStyleNode.getParent()
                        #     if node.getQualifiedName() == "img":
                        #         attr = node.attribute(QName("class"))
                        #         attr = node.attribute(QName("class"))
                        #         if attr:
                        #             node.remove(attr)
                        #         attr = node.attribute(QName("style"))
                        #         if attr:
                        #             node.remove(attr)

                        ## remove name in a tags
                        ahrefs = saxDoc.selectNodes(
                            "//*[local-name()='a' and @name!='']")
                        for a in ahrefs:
                            attr = a.attribute(QName("name"))
                            if attr:
                                a.remove(attr)

                        ## fix images src name.... replace space with underscore and all lower case
                        imgs = saxDoc.selectNodes(
                            "//*[local-name()='img' and contains(@src, '_files')]"
                        )
                        for img in imgs:
                            srcAttr = img.attribute(QName("src"))
                            if srcAttr:
                                src = srcAttr.getValue()
                                #hash the sourcename
                                # The file name (without extension) becomes
                                # "node-<md5 hex>"; the directory part is
                                # lower-cased with spaces -> underscores.
                                filepath, filename = os.path.split(src)
                                filename, ext = os.path.splitext(filename)
                                filename = hashlib.md5(
                                    filename).hexdigest()
                                src = os.path.join(
                                    filepath.lower().replace(" ", "_"),
                                    "node-%s%s" % (filename, ext))
                                img.addAttribute(QName("src"),
                                                 src.replace(" ", "_"))

                        bodyNode = saxDoc.selectSingleNode(
                            "//*[local-name()='div' and @class='body']")
                        bodyNode.setName("div")

                        out = ByteArrayOutputStream()
                        format = OutputFormat.createPrettyPrint()
                        format.setSuppressDeclaration(True)
                        writer = XMLWriter(out, format)
                        writer.write(bodyNode)
                        writer.flush()
                        contentStr = out.toString("UTF-8")

                        # Wrap the extracted body in a minimal XHTML page
                        # titled with the item title.
                        htmlString = """<?xml version="1.0" encoding="UTF-8"?> <html xmlns="http://www.w3.org/1999/xhtml"><head><title>%s</title> <link rel="stylesheet" href="epub.css"/> </head><body>%s</body></html>"""
                        htmlString = htmlString % (title, contentStr)
                        self.__copyString(htmlString, zipOutputStream)
                        # includeFile is assigned but never read in this method.
                        includeFile = False
                    except:
                        # Any failure is only logged; the entry opened above
                        # is then left with whatever was written so far.
                        # NOTE(review): payload.close() is skipped when
                        # read() fails, and this branch never calls
                        # closeEntry() - presumably the next putNextEntry()
                        # or close() finishes the entry; confirm.
                        traceback.print_exc()
                else:
                    #images....
                    zipOutputStream.putNextEntry(
                        ZipEntry("OEBPS/%s" % zipEntryId))
                    IOUtils.copy(payload.open(), zipOutputStream)
                    payload.close()
                    zipOutputStream.closeEntry()
            else:
                # payload is not a Payload here and is handed to
                # IOUtils.copy directly.
                # NOTE(review): zipEntryId is only assigned in the Payload
                # branch above, so this uses the value from an earlier
                # iteration (or is unbound on the first one) - confirm the
                # intended nesting.
                zipOutputStream.putNextEntry(
                    ZipEntry("OEBPS/%s" % zipEntryId))
                IOUtils.copy(payload, zipOutputStream)
                zipOutputStream.closeEntry()

            # Register the payload in the OPF manifest; the item's main
            # html file gets the item hash as its id.
            itemNode = ElementTree.Element("item", {
                "media-type": payloadType,
                "href": zipEntryId
            })
            if payloadId == htmlFileName.lower():
                itemNode.set("id", itemHash)
            else:
                itemNode.set("id", payloadId.replace("/", "_"))
            manifest.append(itemNode)

        # One navPoint per item: its id is the item hash, or the html file
        # name when the item is flagged as an image.
        if not isImage:
            navPoint = ElementTree.Element(
                "navPoint", {
                    "class": "chapter",
                    "id": "%s" % itemHash,
                    "playOrder": "%s" % count
                })
        else:
            navPoint = ElementTree.Element(
                "navPoint", {
                    "class": "chapter",
                    "id": "%s" % htmlFileName,
                    "playOrder": "%s" % count
                })
        navMap.append(navPoint)
        navLabel = ElementTree.Element("navLabel")
        navPoint.append(navLabel)
        textNode = ElementTree.Element("text")
        textNode.text = title
        navLabel.append(textNode)
        content = ElementTree.Element("content")
        navPoint.append(content)
        content.set("src", htmlFileName)
        count += 1

        # The spine lists items in self.__orderedItem order.
        itemRefNode = ElementTree.Element("itemref")
        spine.append(itemRefNode)
        itemRefNode.set("idref", itemHash)

    #saving content.opf...
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/content.opf"))
    self.__copyString(ElementTree.tostring(contentXml), zipOutputStream)
    zipOutputStream.closeEntry()

    #saving toc.ncx
    zipOutputStream.putNextEntry(ZipEntry("OEBPS/toc.ncx"))
    self.__copyString(ElementTree.tostring(tocXml), zipOutputStream)
    zipOutputStream.closeEntry()

    zipOutputStream.close()