def run(self, path_to_ebook):
    """Patch the body CSS of a fanficauthors.net epub and set its publisher.

    Detection is deliberately crude: the book counts as a fanficauthors.net
    epub when it contains ``content/toc.ncx`` and that file mentions
    ``fanficauthors.net``.  Anything else is returned untouched.

    Args:
        path_to_ebook: Path of the epub to inspect.

    Returns:
        ``path_to_ebook`` unchanged when the book is not recognised,
        otherwise the name of the temporary file (obtained from
        ``self.temporary_file``) that holds the rewritten epub.
    """
    book_format = 'epub'
    tocfile = "content/toc.ncx"

    # ZipFile.read() returns byte strings, so every pattern matched against
    # archive content is a bytes literal.  On Python 2 these are plain str
    # (unchanged behaviour); on Python 3 they avoid a str/bytes TypeError.
    site_marker = b"fanficauthors.net"
    old_css = b"""body { margin-top: 0px; padding-top: 0px; }"""
    new_css = b"""body { background-color: #FFFFFF; text-align: justify; margin: 2%; adobe-hyphenate: none; }"""

    epub = ZipFile(path_to_ebook, 'r')  # works equally well with a path or a blob
    try:
        names = epub.namelist()

        ## Really crude brute force check to see if it's a
        ## fanficauthors.net epub:
        if not (tocfile in names and site_marker in epub.read(tocfile)):
            # bail without doing anything
            return path_to_ebook

        print("It's a fanficauthors.net epub!")

        tmpfile = self.temporary_file('.' + book_format)

        # The mimetype entry is written first through a ZIP_STORED archive;
        # the archive is then re-opened in append mode with ZIP_DEFLATED
        # for everything else.
        outputepub = ZipFile(tmpfile, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(tmpfile, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3
        try:
            for fname in names:
                if fname.endswith('.html'):
                    outputepub.writestr(
                        fname, epub.read(fname).replace(old_css, new_css))
                elif fname != "mimetype":
                    # mimetype was already written above; copy the rest as-is.
                    outputepub.writestr(fname, epub.read(fname))

            # Must happen before close(): create_system is only emitted
            # when the central directory is written.
            for zf in outputepub.filelist:
                zf.create_system = 0
        finally:
            outputepub.close()
    finally:
        # Release the input archive on every path, including the early bail.
        epub.close()

    ext = os.path.splitext(path_to_ebook)[-1][1:].lower()
    mi = get_metadata(tmpfile, ext)
    mi.publisher = "fanficauthors.net"
    set_metadata(tmpfile, mi, ext)

    return tmpfile.name
def generate_plugin(self): """Generate ZIP file with specified stylesheets.""" self._preprocess() __output = self.out __temp = StringIO.StringIO() __failed = False try: __zip = ZipFile(__temp, "w") __zip.debug = 3 try: # integrator self._run_generation(__zip, self.__generate_integrator, "%s/integrator.xml" % (self.plugin_name)) # plugin self._run_generation(__zip, self.__generate_plugin_file, "%s/plugin.xml" % (self.plugin_name)) # catalog self._run_generation(__zip, self.__generate_catalog, "%s/cfg/catalog.xml" % (self.plugin_name)) # font-mappings # self._run_generation(__zip, self.__generate_font_mappings, # "%s/cfg/fo/font-mappins.xml" % (self.plugin_name)) # custom XSLT self._run_generation(__zip, self.__generate_custom, "%s/cfg/fo/xsl/custom.xsl" % (self.plugin_name)) # custom XSLT attribute sets self._run_generation(__zip, self.__generate_custom_attr, "%s/cfg/fo/attrs/custom.xsl" % (self.plugin_name)) # shell XSLT if self.override_shell: self._run_generation(__zip, self.__generate_shell, "%s/xsl/fo/topic2fo_shell_%s.xsl" % (self.plugin_name, self.formatter)) # if not self.link_pagenumber or self.table_continued: for lang in self.variable_languages: self._run_generation(__zip, lambda: self.__generate_vars(lang), "%s/cfg/common/vars/%s.xml" % (self.plugin_name, lang)) # if self.generate_shell: # # shell XSLT # self._run_generation(__zip, self.__generate_shell, # "%s/xsl/fo/.xsl" % (self.plugin_name)) except: __failed = True raise Exception("Failed to write plugin", sys.exc_info()[1]), None, sys.exc_info()[2] finally: if __zip != None: __zip.close() if not __failed: __output.write(__temp.getvalue()) except: __failed = True raise Exception("Failed to write ZIP file to output", sys.exc_info()[1]), None, sys.exc_info()[2] finally: __temp.close()
def reset_orig_chapters_epub(inputio, outfile):
    """Rewrite an epub so chapter titles revert to their recorded originals.

    Every entry matching ``*/file<N>.xhtml`` is parsed with ``make_soup``.
    When its ``chaptertitle`` meta differs from its ``chapterorigtitle``
    meta, the meta tag, the ``<title>`` and the first ``<h3>`` (the latter
    two only if they currently equal the chapter title) are reset to the
    original title.  The entry's TOC title is then passed to
    ``_replace_tocncx`` for the top-level ``toc.ncx`` and, if present, for
    the matching per-book ``<prefix>/toc.ncx``.

    Args:
        inputio: Path or file-like object of the epub to read.
        outfile: Path (string) or object with ``write()``.  It is written
            to only when at least one entry actually changed.

    Returns:
        True if any chapter entry changed, otherwise False.
    """
    inputepub = ZipFile(inputio, 'r')  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = BytesIO()
    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False
        names = inputepub.namelist()

        ## spin through file contents, saving any unmerge toc.ncx files.
        unmerge_tocncxdoms = {}
        for zf in names:
            if zf.endswith('/toc.ncx'):
                unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

        tocncxdom = parseString(inputepub.read('toc.ncx'))

        ## spin through file contents.
        for zf in names:
            # mimetype is already written; toc.ncx files are written last,
            # after any title updates have been applied to their DOMs.
            if zf in ['mimetype', 'toc.ncx'] or zf.endswith('/toc.ncx'):
                continue

            data = inputepub.read(zf)
            if not re.match(r'.*/file\d+\.xhtml', zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)
                continue

            data = data.decode('utf-8')
            soup = make_soup(data)

            chapterorigtitle = None
            tag = soup.find('meta', {'name': 'chapterorigtitle'})
            if tag:
                chapterorigtitle = tag['content']

            # toctitle is separate for add_chapter_numbers:toconly users.
            # Fall back to the original title only when no dedicated TOC
            # title was recorded.
            tag = soup.find('meta', {'name': 'chaptertoctitle'})
            if tag:
                chaptertoctitle = tag['content']
            else:
                chaptertoctitle = chapterorigtitle

            chaptertitle = None
            chaptertitle_tag = None
            tag = soup.find('meta', {'name': 'chaptertitle'})
            if tag:
                chaptertitle = tag['content']
                chaptertitle_tag = tag

            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                chaptertitle_tag['content'] = chapterorigtitle

                title_tag = soup.find('title')
                if title_tag and title_tag.string == chaptertitle:
                    title_tag.string.replace_with(chapterorigtitle)

                h3_tag = soup.find('h3')
                if h3_tag and h3_tag.string == chaptertitle:
                    h3_tag.string.replace_with(chapterorigtitle)

                data = unicode(soup)

                entrychanged = (origdata != data)
                changed = changed or entrychanged

                if entrychanged:
                    logger.debug("\nentrychanged:%s\n" % zf)
                    _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                    ## Also look for and update individual
                    ## book toc.ncx files for anthology in case
                    ## it's unmerged.
                    mergedprefix = zf[:zf.rfind('/OEBPS/')]
                    zf_toc = mergedprefix + '/toc.ncx'
                    mergedprefix_len = len(mergedprefix) + 1

                    if zf_toc in unmerge_tocncxdoms:
                        _replace_tocncx(unmerge_tocncxdoms[zf_toc],
                                        zf[mergedprefix_len:],
                                        chaptertoctitle)

            outputepub.writestr(zf, data.encode('utf-8'))

        for tocnm, tocdom in unmerge_tocncxdoms.items():
            outputepub.writestr(tocnm, tocdom.toxml(encoding='utf-8'))

        outputepub.writestr('toc.ncx', tocncxdom.toxml(encoding='utf-8'))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # This has to precede close(): create_system is only emitted when
        # the central directory is written.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile, basestring):
                with open(outfile, "wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())

        return changed
    finally:
        # Release the input archive and the buffer on every path.
        inputepub.close()
        zipio.close()
def writeStoryImpl(self, out):
    """Assemble the story as an EPUB in memory and write its bytes to ``out``.

    The archive is built in this order: ``mimetype``,
    ``META-INF/container.xml``, any old cover and images, ``content.opf``,
    ``toc.ncx``, ``OEBPS/stylesheet.css``, then the optional cover, title,
    TOC and log pages, and finally one ``OEBPS/file<NNNN>.xhtml`` per
    chapter that has HTML.

    Args:
        out: Object with a ``write()`` method that receives the finished
            epub as a single byte string.
    """
    ## Python 2.5 ZipFile is rather more primative than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.
    zipio = StringIO.StringIO()

    ## mimetype must be first file and uncompressed.  Python 2.5
    ## ZipFile can't change compression type file-by-file, so we
    ## have to close and re-open
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr('mimetype', 'application/epub+zip')
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version", "1.0")
    containertop.setAttribute("xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom, "rootfile",
                                 {"full-path": "content.opf",
                                  "media-type": "application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml", containerdom.toxml(encoding='utf-8'))
    containerdom.unlink()
    del containerdom

    ## Epub has two metadata files with real data.  We're putting
    ## them in content.opf (pointed to by META-INF/container.xml)
    ## and toc.ncx (pointed to by content.opf)

    ## content.opf contains metadata, a 'manifest' list of all
    ## other included files, and another 'spine' list of the items in the
    ## file

    uniqueid = 'fanficfare-uid:%s-u%s-s%s' % (
        self.getMetadata('site'),
        self.story.getList('authorId')[0],
        self.getMetadata('storyId'))

    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement
    package.setAttribute("version", "2.0")
    package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier", "fanficfare-uid")
    metadata = newTag(contentdom, "metadata",
                      attrs={"xmlns:dc": "http://purl.org/dc/elements/1.1/",
                             "xmlns:opf": "http://www.idpf.org/2007/opf"})
    package.appendChild(metadata)

    metadata.appendChild(newTag(contentdom, "dc:identifier",
                                text=uniqueid,
                                attrs={"id": "fanficfare-uid"}))

    if self.getMetadata('title'):
        metadata.appendChild(newTag(contentdom, "dc:title",
                                    text=self.getMetadata('title')))

    if self.getMetadata('author'):
        if self.story.isList('author'):
            for auth in self.story.getList('author'):
                metadata.appendChild(newTag(contentdom, "dc:creator",
                                            attrs={"opf:role": "aut"},
                                            text=auth))
        else:
            metadata.appendChild(newTag(contentdom, "dc:creator",
                                        attrs={"opf:role": "aut"},
                                        text=self.getMetadata('author')))

    metadata.appendChild(newTag(contentdom, "dc:contributor",
                                text="FanFicFare [https://github.com/JimmXinu/FanFicFare]",
                                attrs={"opf:role": "bkp"}))
    metadata.appendChild(newTag(contentdom, "dc:rights", text=""))
    if self.story.getMetadata('langcode'):
        metadata.appendChild(newTag(contentdom, "dc:language",
                                    text=self.story.getMetadata('langcode')))
    else:
        metadata.appendChild(newTag(contentdom, "dc:language", text='en'))

    #  published, created, updated, calibre
    #  Leave calling self.story.getMetadataRaw directly in case date format changes.
    if self.story.getMetadataRaw('datePublished'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "publication"},
                                    text=self.story.getMetadataRaw('datePublished').strftime("%Y-%m-%d")))

    if self.story.getMetadataRaw('dateCreated'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "creation"},
                                    text=self.story.getMetadataRaw('dateCreated').strftime("%Y-%m-%d")))

    if self.story.getMetadataRaw('dateUpdated'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "modification"},
                                    text=self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%d")))
        metadata.appendChild(newTag(contentdom, "meta",
                                    attrs={"name": "calibre:timestamp",
                                           "content": self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%dT%H:%M:%S")}))

    if self.getMetadata('description'):
        metadata.appendChild(newTag(contentdom, "dc:description",
                                    text=self.getMetadata('description')))

    for subject in self.story.getSubjectTags():
        metadata.appendChild(newTag(contentdom, "dc:subject", text=subject))

    if self.getMetadata('site'):
        metadata.appendChild(newTag(contentdom, "dc:publisher",
                                    text=self.getMetadata('site')))

    if self.getMetadata('storyUrl'):
        metadata.appendChild(newTag(contentdom, "dc:identifier",
                                    attrs={"opf:scheme": "URL"},
                                    text=self.getMetadata('storyUrl')))
        metadata.appendChild(newTag(contentdom, "dc:source",
                                    text=self.getMetadata('storyUrl')))

    ## end of metadata, create manifest.
    items = []     # list of (id, href, type, title) tuples(all strings)
    itemrefs = []  # list of strings -- idrefs from .opfs' spines
    ## we'll generate the toc.ncx file,
    ## but it needs to be in the items manifest.
    items.append(("ncx", "toc.ncx", "application/x-dtbncx+xml", None))

    guide = None
    coverIO = None

    coverimgid = "image0000"
    if not self.story.cover and self.story.oldcover:
        logger.debug("writer_epub: no new cover, has old cover, write image.")
        (oldcoverhtmlhref,
         oldcoverhtmltype,
         oldcoverhtmldata,
         oldcoverimghref,
         oldcoverimgtype,
         oldcoverimgdata) = self.story.oldcover
        outputepub.writestr(oldcoverhtmlhref, oldcoverhtmldata)
        outputepub.writestr(oldcoverimghref, oldcoverimgdata)

        coverimgid = "image0"
        items.append((coverimgid, oldcoverimghref, oldcoverimgtype, None))
        items.append(("cover", oldcoverhtmlhref, oldcoverhtmltype, None))
        itemrefs.append("cover")
        metadata.appendChild(newTag(contentdom, "meta",
                                    {"content": "image0", "name": "cover"}))
        guide = newTag(contentdom, "guide")
        guide.appendChild(newTag(contentdom, "reference",
                                 attrs={"type": "cover",
                                        "title": "Cover",
                                        "href": oldcoverhtmlhref}))

    if self.getConfig('include_images'):
        imgcount = 0
        for imgmap in self.story.getImgUrls():
            imgfile = "OEBPS/" + imgmap['newsrc']
            outputepub.writestr(imgfile, imgmap['data'])
            items.append(("image%04d" % imgcount, imgfile, imgmap['mime'], None))
            imgcount += 1
            if 'cover' in imgfile:
                # make sure coverimgid is set to the cover, not
                # just the first image.
                coverimgid = items[-1][0]

    items.append(("style", "OEBPS/stylesheet.css", "text/css", None))

    if self.story.cover:
        # Note that the id of the cover xhmtl *must* be 'cover'
        # for it to work on Nook.
        items.append(("cover", "OEBPS/cover.xhtml", "application/xhtml+xml", None))
        itemrefs.append("cover")
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(newTag(contentdom, "meta",
                                    {"content": coverimgid, "name": "cover"}))
        # cover stuff for later:
        # at end of <package>:
        # <guide>
        # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
        # </guide>
        guide = newTag(contentdom, "guide")
        guide.appendChild(newTag(contentdom, "reference",
                                 attrs={"type": "cover",
                                        "title": "Cover",
                                        "href": "OEBPS/cover.xhtml"}))

        if self.hasConfig("cover_content"):
            COVER = string.Template(self.getConfig("cover_content"))
        else:
            COVER = self.EPUB_COVER
        coverIO = StringIO.StringIO()
        coverIO.write(COVER.substitute(dict(self.story.getAllMetadata().items()+{'coverimg':self.story.cover}.items())))

    if self.getConfig("include_titlepage"):
        items.append(("title_page", "OEBPS/title_page.xhtml", "application/xhtml+xml", "Title Page"))
        itemrefs.append("title_page")
    if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage") and not self.metaonly:
        items.append(("toc_page", "OEBPS/toc_page.xhtml", "application/xhtml+xml", "Table of Contents"))
        itemrefs.append("toc_page")

    dologpage = ( self.getConfig("include_logpage") == "smart" and \
                  (self.story.logfile or self.story.getMetadataRaw("status") == "In-Progress") ) \
                or self.getConfig("include_logpage") == "true"

    if dologpage:
        items.append(("log_page", "OEBPS/log_page.xhtml", "application/xhtml+xml", "Update Log"))
        itemrefs.append("log_page")

    for index, chap in enumerate(self.story.getChapters(fortoc=True)):
        if chap.html:
            i = index + 1
            items.append(("file%04d" % i,
                          "OEBPS/file%04d.xhtml" % i,
                          "application/xhtml+xml",
                          chap.title))
            itemrefs.append("file%04d" % i)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    for (itemid, href, mediatype, title) in items:
        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': itemid,
                                           'href': href,
                                           'media-type': mediatype}))

    spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
    package.appendChild(spine)
    for itemref in itemrefs:
        spine.appendChild(newTag(contentdom, "itemref",
                                 attrs={"idref": itemref,
                                        "linear": "yes"}))
    # guide only exists if there's a cover.
    if guide:
        package.appendChild(guide)

    # write content.opf to zip.
    contentxml = contentdom.toxml(encoding='utf-8')

    # tweak for brain damaged Nook STR.  Nook insists on name before content.
    contentxml = contentxml.replace('<meta content="%s" name="cover"/>'%coverimgid,
                                    '<meta name="cover" content="%s"/>'%coverimgid)
    outputepub.writestr("content.opf", contentxml)

    contentdom.unlink()
    del contentdom

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version", "2005-1")
    ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:uid", "content": uniqueid}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:depth", "content": "1"}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:totalPageCount", "content": "0"}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:maxPageNumber", "content": "0"}))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(newTag(tocncxdom, "text", text=self.getMetadata('title')))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
    #   <navLabel>
    #     <text><chapter title></text>
    #   </navLabel>
    #   <content src="<chapterfile>"/>
    # </navPoint>
    index = 0
    for (itemid, href, mediatype, title) in items:
        # only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
        if title:
            navPoint = newTag(tocncxdom, "navPoint",
                              attrs={'id': itemid,
                                     'playOrder': unicode(index)})
            tocnavMap.appendChild(navPoint)
            navLabel = newTag(tocncxdom, "navLabel")
            navPoint.appendChild(navLabel)
            ## the xml library will re-escape as needed.
            navLabel.appendChild(newTag(tocncxdom, "text", text=stripHTML(title)))
            navPoint.appendChild(newTag(tocncxdom, "content", attrs={"src": href}))
            index = index + 1

    # write toc.ncx to zip file
    outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding='utf-8'))
    tocncxdom.unlink()
    del tocncxdom

    # write stylesheet.css file.
    outputepub.writestr("OEBPS/stylesheet.css",
                        self.EPUB_CSS.substitute(self.story.getAllMetadata()))

    # write title page.
    if self.getConfig("titlepage_use_table"):
        TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
        TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
        WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
        NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
        TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
    else:
        TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
        TITLE_ENTRY = self.EPUB_TITLE_ENTRY
        WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY  # same, only wide in tables.
        NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
        TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END

    if coverIO:
        outputepub.writestr("OEBPS/cover.xhtml", coverIO.getvalue())
        coverIO.close()

    titlepageIO = StringIO.StringIO()
    self.writeTitlePage(out=titlepageIO,
                        START=TITLE_PAGE_START,
                        ENTRY=TITLE_ENTRY,
                        WIDE_ENTRY=WIDE_TITLE_ENTRY,
                        END=TITLE_PAGE_END,
                        NO_TITLE_ENTRY=NO_TITLE_ENTRY)
    if titlepageIO.getvalue():  # will be false if no title page.
        outputepub.writestr("OEBPS/title_page.xhtml", titlepageIO.getvalue())
    titlepageIO.close()

    # write toc page.
    tocpageIO = StringIO.StringIO()
    self.writeTOCPage(tocpageIO,
                      self.EPUB_TOC_PAGE_START,
                      self.EPUB_TOC_ENTRY,
                      self.EPUB_TOC_PAGE_END)
    if tocpageIO.getvalue():  # will be false if no toc page.
        outputepub.writestr("OEBPS/toc_page.xhtml", tocpageIO.getvalue())
    tocpageIO.close()

    if dologpage:
        # write log page.
        logpageIO = StringIO.StringIO()
        self.writeLogPage(logpageIO)
        outputepub.writestr("OEBPS/log_page.xhtml", logpageIO.getvalue())
        logpageIO.close()

    if self.hasConfig('chapter_start'):
        CHAPTER_START = string.Template(self.getConfig("chapter_start"))
    else:
        CHAPTER_START = self.EPUB_CHAPTER_START

    if self.hasConfig('chapter_end'):
        CHAPTER_END = string.Template(self.getConfig("chapter_end"))
    else:
        CHAPTER_END = self.EPUB_CHAPTER_END

    for index, chap in enumerate(self.story.getChapters()):  # (url,title,html)
        if chap.html:
            vals = {'url': removeEntities(chap.url),
                    'chapter': removeEntities(chap.title),
                    'origchapter': removeEntities(chap.origtitle),
                    'tocchapter': removeEntities(chap.toctitle),
                    'index': "%04d" % (index + 1),
                    'number': index + 1}
            # escape double quotes in all vals.  The replacement must be
            # the entity: swapping '"' for itself escapes nothing.
            for k, v in list(vals.items()):
                if isinstance(v, basestring):
                    vals[k] = v.replace('"', '&quot;')
            fullhtml = CHAPTER_START.substitute(vals) + \
                chap.html + CHAPTER_END.substitute(vals)
            # ffnet(& maybe others) gives the whole chapter text
            # as one line.  This causes problems for nook(at
            # least) when the chapter size starts getting big
            # (200k+)
            fullhtml = re.sub(r'(</p>|<br ?/>)\n*', r'\1\n', fullhtml)
            outputepub.writestr("OEBPS/file%04d.xhtml" % (index + 1),
                                fullhtml.encode('utf-8'))
            del fullhtml

    if self.story.calibrebookmark:
        outputepub.writestr("META-INF/calibre_bookmarks.txt",
                            self.story.calibrebookmark)

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()
    out.write(zipio.getvalue())
    zipio.close()
def doUnMerge(inputio, outdir=None):
    """Split a merged epub back into one epub per preserved original book.

    The manifest of the merged book's .opf is scanned for items whose
    media-type is the marker ``origrootfile/xml``.  For each one a new epub
    is built in memory: ``mimetype``, a fresh ``META-INF/container.xml``
    pointing at the saved root file, the root file itself, and every
    ``item`` whose path lies under the same book directory.

    Args:
        inputio: Path or file-like object of the merged epub.
        outdir: Optional directory.  When given, each epub is written to
            ``<outdir>/<n>.epub`` (n counted from 0).

    Returns:
        A list of written file names when ``outdir`` is given, otherwise a
        list of ``BytesIO`` objects holding the epubs.
    """
    epub = ZipFile(inputio, 'r')  # works equally well with inputio as a path or a blob
    outputios = []
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)
        logger.debug("relpath:%s" % relpath)

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found one that did.
        manifesttag = contentdom.getElementsByTagNameNS("*", "manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*", "item"):
            # look for our fake media-type for original rootfiles.
            if item.getAttribute("media-type") != "origrootfile/xml":
                continue

            # found one, assume the dir containing it is a complete
            # original epub, do initial setup of epub.
            itemhref = normpath(relpath + unquote(item.getAttribute("href")))
            logger.debug("Found origrootfile:%s" % itemhref)
            # Strip trailing path components containing no digit; what is
            # left is used as this book's directory prefix.
            curepubpath = re.sub(r'([^\d/]+/)+$', '', get_path_part(itemhref))
            savehref = itemhref[len(curepubpath):]
            logger.debug("curepubpath:%s" % curepubpath)

            outputio = BytesIO()
            outputepub = ZipFile(outputio, "w", compression=ZIP_STORED,
                                 allowZip64=True)
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
            outputepub.close()

            ## Re-open file for content.
            outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED,
                                 allowZip64=True)
            outputepub.debug = 3
            try:
                ## Create META-INF/container.xml file.  The only thing it does is
                ## point to content.opf
                newcontainerdom = getDOMImplementation().createDocument(
                    None, "container", None)
                containertop = newcontainerdom.documentElement
                containertop.setAttribute("version", "1.0")
                containertop.setAttribute(
                    "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
                rootfiles = newcontainerdom.createElement("rootfiles")
                containertop.appendChild(rootfiles)
                rootfiles.appendChild(
                    newTag(newcontainerdom, "rootfile", {
                        "full-path": savehref,
                        "media-type": "application/oebps-package+xml"
                    }))
                outputepub.writestr(
                    "META-INF/container.xml",
                    newcontainerdom.toprettyxml(indent=' ', encoding='utf-8'))

                outputepub.writestr(savehref, epub.read(itemhref))

                # NOTE(review): this lookup is document-wide and not
                # namespace-aware, unlike the manifest scan above -- confirm
                # whether it should iterate the manifest items instead.
                for item2 in contentdom.getElementsByTagName("item"):
                    item2href = normpath(relpath +
                                         unquote(item2.getAttribute("href")))
                    if item2href.startswith(curepubpath) and item2href != itemhref:
                        save2href = item2href[len(curepubpath):]
                        logger.debug("Found %s -> %s" % (item2href, save2href))
                        outputepub.writestr(save2href, epub.read(item2href))

                # declares all the files created by Windows.  otherwise, when
                # it runs in appengine, windows unzips the files as 000 perms.
                for zf in outputepub.filelist:
                    zf.create_system = 0
            finally:
                outputepub.close()

            outputios.append(outputio)
    finally:
        # The input archive is fully consumed by now; release its handle.
        epub.close()

    if not outdir:
        return outputios

    outfilenames = []
    for count, epubIO in enumerate(outputios):
        filename = "%s/%d.epub" % (outdir, count)
        logger.debug("write %s" % filename)
        # Context manager so the file is closed even if the write fails.
        with open(filename, "wb") as outstream:
            outstream.write(epubIO.getvalue())
        outfilenames.append(filename)
    return outfilenames
def doMerge(outputio, files, authoropts=[], titleopt=None, descopt=None, tags=[], languages=['en'], titlenavpoints=True, originalnavpoints=True, flattentoc=False, printtimes=False, coverjpgpath=None, keepmetadatafiles=False, source=None): ''' outputio = output file name or BytesIO. files = list of input file names or BytesIOs. authoropts = list of authors to use, otherwise add from all input titleopt = title, otherwise '<first title> Anthology' descopt = description, otherwise '<title> by <author>' list for all input tags = dc:subject tags to include, otherwise none. languages = dc:language tags to include titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it originalnavpoints if true, include the original TOCs from each epub flattentoc if true, flatten TOC down to one level only. coverjpgpath, Path to a jpg to use as cover image. ''' printt = partial(cond_print, printtimes) ## Python 2.5 ZipFile is rather more primative than later ## versions. It can operate on a file, or on a BytesIO, but ## not on an open stream. OTOH, I suspect we would have had ## problems with closing and opening again to change the ## compression type anyway. filecount = 0 t = time() ## Write mimetype file, must be first and uncompressed. ## Older versions of python(2.4/5) don't allow you to specify ## compression by individual file. ## Overwrite if existing output file. outputepub = ZipFile(outputio, "w", compression=ZIP_STORED, allowZip64=True) outputepub.debug = 3 outputepub.writestr("mimetype", "application/epub+zip") outputepub.close() ## Re-open file for content. outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED, allowZip64=True) outputepub.debug = 3 ## Create META-INF/container.xml file. 
The only thing it does is ## point to content.opf containerdom = getDOMImplementation().createDocument( None, "container", None) containertop = containerdom.documentElement containertop.setAttribute("version", "1.0") containertop.setAttribute( "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container") rootfiles = containerdom.createElement("rootfiles") containertop.appendChild(rootfiles) rootfiles.appendChild( newTag( containerdom, "rootfile", { "full-path": "content.opf", "media-type": "application/oebps-package+xml" })) outputepub.writestr( "META-INF/container.xml", containerdom.toprettyxml(indent=' ', encoding='utf-8')) ## Process input epubs. items = [ ] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests items.append( ("ncx", "toc.ncx", "application/x-dtbncx+xml")) ## we'll generate the toc.ncx file, ## but it needs to be in the items manifest. itemrefs = [] # list of strings -- idrefs from .opfs' spines navmaps = [ ] # list of navMap DOM elements -- TOC data for each from toc.ncx files is_ffdl_epub = [] # list of t/f itemhrefs = { } # hash of item[id]s to itemref[href]s -- to find true start of book(s). firstitemhrefs = [] booktitles = [] # list of strings -- Each book's title allauthors = [] # list of lists of strings -- Each book's list of authors. filelist = [] printt("prep output:%s" % (time() - t)) t = time() booknum = 1 firstmetadom = None for file in files: if file == None: continue book = "%d" % booknum bookdir = "%d/" % booknum bookid = "a%d" % booknum epub = ZipFile(file, 'r') ## Find the .opf file. container = epub.read("META-INF/container.xml") containerdom = parseString(container) rootfilenodelist = containerdom.getElementsByTagNameNS("*", "rootfile") rootfilename = rootfilenodelist[0].getAttribute("full-path") ## Save the path to the .opf file--hrefs inside it are relative to it. 
relpath = get_path_part(rootfilename) metadom = parseString(epub.read(rootfilename)) # logger.debug("metadom:%s"%epub.read(rootfilename)) if booknum == 1 and not source: try: firstmetadom = metadom.getElementsByTagNameNS("*", "metadata")[0] source = firstmetadom.getElementsByTagName( "dc:source")[0].firstChild.data.encode("utf-8") except: source = "" # if the epub was ever edited with Sigil, it changed the unique-identifier, # but dc:contributor was left. #is_ffdl_epub.append(metadom.documentElement.getAttribute('unique-identifier') == "fanficdownloader-uid") is_ffdl_epub.append(False) for c in metadom.getElementsByTagName("dc:contributor"): # logger.debug("dc:contributor:%s"%getText(c.childNodes)) if c.getAttribute("opf:role") == "bkp" and \ getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]", "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]: is_ffdl_epub[-1] = True # set last. break ## Save indiv book title try: booktitles.append( metadom.getElementsByTagName("dc:title")[0].firstChild.data) except: booktitles.append("(Title Missing)") ## Save authors. authors = [] for creator in metadom.getElementsByTagName("dc:creator"): try: if (creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild != None): authors.append(creator.firstChild.data) except: pass if len(authors) == 0: authors.append("(Author Missing)") allauthors.append(authors) if keepmetadatafiles: itemid = bookid + "rootfile" itemhref = rootfilename href = bookdir + itemhref logger.debug("write rootfile %s to %s" % (itemhref, href)) outputepub.writestr(href, epub.read(itemhref)) items.append((itemid, href, "origrootfile/xml")) # spin through the manifest--only place there are item tags. # Correction--only place there *should* be item tags. But # somebody found one that did. 
manifesttag = metadom.getElementsByTagNameNS("*", "manifest")[0] for item in manifesttag.getElementsByTagNameNS("*", "item"): itemid = bookid + item.getAttribute("id") itemhref = normpath(unquote( item.getAttribute("href"))) # remove %20, etc. href = bookdir + relpath + itemhref if (item.getAttribute("media-type") == "application/x-dtbncx+xml"): # TOC file is only one with this type--as far as I know. # grab the whole navmap, deal with it later. tocdom = parseString( epub.read(normpath(relpath + item.getAttribute("href")))) # update all navpoint ids with bookid for uniqueness. for navpoint in tocdom.getElementsByTagNameNS("*", "navPoint"): navpoint.setAttribute("id", bookid + navpoint.getAttribute("id")) # update all content paths with bookdir for uniqueness. for content in tocdom.getElementsByTagNameNS("*", "content"): content.setAttribute( "src", normpath(bookdir + relpath + content.getAttribute("src"))) navmaps.append(tocdom.getElementsByTagNameNS("*", "navMap")[0]) if keepmetadatafiles: logger.debug("write toc.ncx %s to %s" % (relpath + itemhref, href)) outputepub.writestr( href, epub.read(normpath(relpath + itemhref))) items.append((itemid, href, "origtocncx/xml")) else: #href=href.encode('utf8') logger.debug("item id: %s -> %s:" % (itemid, href)) itemhrefs[itemid] = href if href not in filelist: try: outputepub.writestr( href, epub.read(normpath(relpath + itemhref))) if re.match(r'.*/(file|chapter)\d+\.x?html', href): filecount += 1 items.append( (itemid, href, item.getAttribute("media-type"))) filelist.append(href) except KeyError as ke: # Skip missing files. 
logger.info("Skipping missing file %s (%s)" % (href, relpath + itemhref)) del itemhrefs[itemid] itemreflist = metadom.getElementsByTagNameNS("*", "itemref") # logger.debug("itemhrefs:%s"%itemhrefs) logger.debug("bookid:%s" % bookid) logger.debug("itemreflist[0].getAttribute(idref):%s" % itemreflist[0].getAttribute("idref")) # Looking for the first item in itemreflist that wasn't # discarded due to missing files. for itemref in itemreflist: idref = bookid + itemref.getAttribute("idref") if idref in itemhrefs: firstitemhrefs.append(itemhrefs[idref]) break for itemref in itemreflist: itemrefs.append(bookid + itemref.getAttribute("idref")) # logger.debug("adding to itemrefs:%s"%itemref.toprettyxml()) booknum = booknum + 1 printt("after file loop:%s" % (time() - t)) t = time() ## create content.opf file. uniqueid = "epubmerge-uid-%d" % time() # real sophisticated uid scheme. contentdom = getDOMImplementation().createDocument(None, "package", None) package = contentdom.documentElement package.setAttribute("version", "2.0") package.setAttribute("xmlns", "http://www.idpf.org/2007/opf") package.setAttribute("unique-identifier", "epubmerge-id") metadata = newTag(contentdom, "metadata", attrs={ "xmlns:dc": "http://purl.org/dc/elements/1.1/", "xmlns:opf": "http://www.idpf.org/2007/opf" }) metadata.appendChild( newTag(contentdom, "dc:identifier", text=uniqueid, attrs={"id": "epubmerge-id"})) if (titleopt is None): titleopt = booktitles[0] + " Anthology" metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt)) # If cmdline authors, use those instead of those collected from the epubs # (allauthors kept for TOC & description gen below. 
if (len(authoropts) > 1): useauthors = [authoropts] else: useauthors = allauthors usedauthors = dict() for authorlist in useauthors: for author in authorlist: if (author not in usedauthors): usedauthors[author] = author metadata.appendChild( newTag(contentdom, "dc:creator", attrs={"opf:role": "aut"}, text=author)) metadata.appendChild( newTag(contentdom, "dc:contributor", text="epubmerge", attrs={"opf:role": "bkp"})) metadata.appendChild( newTag(contentdom, "dc:rights", text="Copyrights as per source stories")) for l in languages: metadata.appendChild(newTag(contentdom, "dc:language", text=l)) if not descopt: # created now, but not filled in until TOC generation to save loops. description = newTag(contentdom, "dc:description", text="Anthology containing:\n") else: description = newTag(contentdom, "dc:description", text=descopt) metadata.appendChild(description) if source: metadata.appendChild( newTag(contentdom, "dc:identifier", attrs={"opf:scheme": "URL"}, text=source)) metadata.appendChild(newTag(contentdom, "dc:source", text=source)) for tag in tags: metadata.appendChild(newTag(contentdom, "dc:subject", text=tag)) package.appendChild(metadata) manifest = contentdom.createElement("manifest") package.appendChild(manifest) spine = newTag(contentdom, "spine", attrs={"toc": "ncx"}) package.appendChild(spine) if coverjpgpath: # in case coverjpg isn't a jpg: coverext = 'jpg' covertype = 'image/jpeg' try: coverext = coverjpgpath.split('.')[-1].lower() covertype = imagetypes.get(coverext, covertype) except: pass logger.debug("coverjpgpath:%s coverext:%s covertype:%s" % (coverjpgpath, coverext, covertype)) # <meta name="cover" content="cover.jpg"/> metadata.appendChild( newTag(contentdom, "meta", { "name": "cover", "content": "coverimageid" })) guide = newTag(contentdom, "guide") guide.appendChild( newTag(contentdom, "reference", attrs={ "type": "cover", "title": "Cover", "href": "cover.xhtml" })) package.appendChild(guide) manifest.appendChild( newTag(contentdom, "item", 
attrs={ 'id': "coverimageid", 'href': "cover." + coverext, 'media-type': covertype })) # Note that the id of the cover xhmtl *must* be 'cover' # for it to work on Nook. manifest.appendChild( newTag(contentdom, "item", attrs={ 'id': "cover", 'href': "cover.xhtml", 'media-type': "application/xhtml+xml" })) spine.appendChild( newTag(contentdom, "itemref", attrs={ "idref": "cover", "linear": "yes" })) for item in items: # logger.debug("new item:%s %s %s"%item) (id, href, type) = item manifest.appendChild( newTag(contentdom, "item", attrs={ 'id': id, 'href': href, 'media-type': type })) for itemref in itemrefs: # logger.debug("itemref:%s"%itemref) spine.appendChild( newTag(contentdom, "itemref", attrs={ "idref": itemref, "linear": "yes" })) ## create toc.ncx file tocncxdom = getDOMImplementation().createDocument(None, "ncx", None) ncx = tocncxdom.documentElement ncx.setAttribute("version", "2005-1") ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/") head = tocncxdom.createElement("head") ncx.appendChild(head) head.appendChild( newTag(tocncxdom, "meta", attrs={ "name": "dtb:uid", "content": uniqueid })) depthnode = newTag(tocncxdom, "meta", attrs={ "name": "dtb:depth", "content": "4" }) head.appendChild(depthnode) head.appendChild( newTag(tocncxdom, "meta", attrs={ "name": "dtb:totalPageCount", "content": "0" })) head.appendChild( newTag(tocncxdom, "meta", attrs={ "name": "dtb:maxPageNumber", "content": "0" })) docTitle = tocncxdom.createElement("docTitle") docTitle.appendChild(newTag(tocncxdom, "text", text=titleopt)) ncx.appendChild(docTitle) tocnavMap = tocncxdom.createElement("navMap") ncx.appendChild(tocnavMap) booknum = 0 printt("wrote initial metadata:%s" % (time() - t)) t = time() for navmap in navmaps: depthnavpoints = navmap.getElementsByTagNameNS( "*", "navPoint") # for checking more than one TOC entry # logger.debug( [ x.toprettyxml() for x in navmap.childNodes ] ) ## only gets top level TOC entries. sub entries carried inside. 
navpoints = [ x for x in navmap.childNodes if isinstance(x, Element) and x.tagName == "navPoint" ] # logger.debug("len(navpoints):%s"%len(navpoints)) # logger.debug( [ x.toprettyxml() for x in navpoints ] ) newnav = None if titlenavpoints: newnav = newTag(tocncxdom, "navPoint", {"id": "book%03d" % booknum}) navlabel = newTag(tocncxdom, "navLabel") newnav.appendChild(navlabel) # For purposes of TOC titling & desc, use first book author. Skip adding author if only one. if len(usedauthors) > 1: title = booktitles[booknum] + " by " + allauthors[booknum][0] else: title = booktitles[booknum] navlabel.appendChild(newTag(tocncxdom, "text", text=title)) # Find the first 'spine' item's content for the title navpoint. # Many epubs have the first chapter as first navpoint, so we can't just # copy that anymore. newnav.appendChild( newTag(tocncxdom, "content", {"src": firstitemhrefs[booknum]})) # logger.debug("newnav:%s"%newnav.toprettyxml()) tocnavMap.appendChild(newnav) # logger.debug("tocnavMap:%s"%tocnavMap.toprettyxml()) else: newnav = tocnavMap if not descopt and len(allauthors[booknum]) > 0: description.appendChild( contentdom.createTextNode(booktitles[booknum] + " by " + allauthors[booknum][0] + "\n")) # If only one TOC point(total, not top level), or if not # including title nav point, include sub book TOC entries. if originalnavpoints and (len(depthnavpoints) > 1 or not titlenavpoints): for navpoint in navpoints: # logger.debug("navpoint:%s"%navpoint.toprettyxml()) newnav.appendChild(navpoint) navpoint.is_ffdl_epub = is_ffdl_epub[booknum] booknum = booknum + 1 # end of navmaps loop. maxdepth = 0 contentsrcs = {} removednodes = [] ## Force strict ordering of playOrder, stripping out some. playorder = 0 # logger.debug("tocncxdom:%s"%tocncxdom.toprettyxml()) for navpoint in tocncxdom.getElementsByTagNameNS("*", "navPoint"): # logger.debug("navpoint:%s"%navpoint.toprettyxml()) if navpoint in removednodes: continue # need content[src] to compare for dups. 
epub wants dup srcs to have same playOrder. contentsrc = None for n in navpoint.childNodes: if isinstance(n, Element) and n.tagName == "content": contentsrc = n.getAttribute("src") logger.debug("contentsrc: %s" % contentsrc) break if (contentsrc not in contentsrcs): parent = navpoint.parentNode try: # if the epub was ever edited with Sigil, it changed # the id, but the file name is the same. if navpoint.is_ffdl_epub and \ ( navpoint.getAttribute("id").endswith('log_page') \ or contentsrc.endswith("log_page.xhtml") ): logger.debug("Doing sibs 'filter' 1") sibs = [ x for x in parent.childNodes if isinstance(x, Element) and x.tagName == "navPoint" ] # if only logpage and one chapter, remove them from TOC and just show story. if len(sibs) == 2: parent.removeChild(navpoint) logger.debug("Removing %s:" % sibs[0].getAttribute("playOrder")) parent.removeChild(sibs[1]) removednodes.append(sibs[1]) except: pass # New src, new number. contentsrcs[contentsrc] = navpoint.getAttribute("id") playorder += 1 navpoint.setAttribute("playOrder", "%d" % playorder) logger.debug("playorder:%d:" % playorder) # need to know depth of deepest navpoint for <meta name="dtb:depth" content="2"/> npdepth = 1 dp = navpoint.parentNode while dp and dp.tagName != "navMap": npdepth += 1 dp = dp.parentNode if npdepth > maxdepth: maxdepth = npdepth else: # same content, look for ffdl and title_page and/or single chapter. # easier to just set it now, even if the node gets removed later. navpoint.setAttribute("playOrder", "%d" % playorder) logger.debug("playorder:%d:" % playorder) parent = navpoint.parentNode try: # if the epub was ever edited with Sigil, it changed # the id, but the file name is the same. 
if navpoint.is_ffdl_epub and \ ( navpoint.getAttribute("id").endswith('title_page') \ or contentsrc.endswith("title_page.xhtml") ): parent.removeChild(navpoint) logger.debug("Doing sibs 'filter' 2") sibs = [ x for x in parent.childNodes if isinstance(x, Element) and x.tagName == "navPoint" ] # if only one chapter after removing title_page, remove it too. if len(sibs) == 1: logger.debug("Removing %s:" % sibs[0].getAttribute("playOrder")) parent.removeChild(sibs[0]) removednodes.append(sibs[0]) except: pass if flattentoc: maxdepth = 1 # already have play order and pesky dup/single chapters # removed, just need to flatten. flattocnavMap = tocncxdom.createElement("navMap") for n in tocnavMap.getElementsByTagNameNS("*", "navPoint"): flattocnavMap.appendChild(n) ncx.replaceChild(flattocnavMap, tocnavMap) printt("navmap/toc maddess:%s" % (time() - t)) t = time() depthnode.setAttribute("content", "%d" % maxdepth) ## content.opf written now due to description being filled in ## during TOC generation to save loops. contentxml = contentdom.toprettyxml(indent=' ', encoding='utf-8') # tweak for brain damaged Nook STR. Nook insists on name before content. contentxml = contentxml.replace( ensure_binary('<meta content="coverimageid" name="cover"/>'), ensure_binary('<meta name="cover" content="coverimageid"/>')) outputepub.writestr("content.opf", contentxml) outputepub.writestr("toc.ncx", tocncxdom.toprettyxml(indent=' ', encoding='utf-8')) printt("wrote opf/ncx files:%s" % (time() - t)) t = time() if coverjpgpath: # write, not write string. Pulling from file. outputepub.write(coverjpgpath, "cover." 
+ coverext) outputepub.writestr( "cover.xhtml", ''' <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css"> @page {padding: 0pt; margin:0pt} body { text-align: center; padding:0pt; margin: 0pt; } div { margin: 0pt; padding: 0pt; } </style></head><body><div> <img src="cover.''' + coverext + '''" alt="cover"/> </div></body></html> ''') # declares all the files created by Windows. otherwise, when # it runs in appengine, windows unzips the files as 000 perms. for zf in outputepub.filelist: zf.create_system = 0 outputepub.close() printt("closed outputepub:%s" % (time() - t)) t = time() return (source, filecount)
def reset_orig_chapters_epub(inputio,outfile):
    """
    Put original chapter titles back into an epub's chapter files and TOCs.

    Each entry matching ``*/file<digits>.xhtml`` is parsed and its
    ``chapterorigtitle``, ``chaptertoctitle`` and ``chaptertitle`` <meta>
    tags are read.  When the current title differs from the original one,
    the ``chaptertitle`` meta, and the <title> and first <h3> if they
    contain the current title, are set back to the original title.  The
    top-level toc.ncx, and any per-book ``*/toc.ncx`` kept inside the
    archive, are updated through _replace_tocncx().

    inputio = epub to read; path or file-like object.
    outfile = path (string) or writable file-like object.  Only written
              when at least one chapter file actually changed.

    Returns True if anything changed, False otherwise.
    """
    inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False

        ## spin through file contents, saving any unmerge toc.ncx files.
        unmerge_tocncxdoms = {}
        for zf in inputepub.namelist():
            if zf.endswith('/toc.ncx'):
                unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

        tocncxdom = parseString(inputepub.read('toc.ncx'))

        ## spin through file contents.
        for zf in inputepub.namelist():
            if zf in ['mimetype','toc.ncx'] or zf.endswith('/toc.ncx'):
                # mimetype is already written; the toc.ncx files are
                # written after this loop so title changes are included.
                continue

            data = inputepub.read(zf)
            if not re.match(r'.*/file\d+\.xhtml',zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf,data)
                continue

            data = data.decode('utf-8')
            soup = make_soup(data)

            chapterorigtitle = None
            tag = soup.find('meta',{'name':'chapterorigtitle'})
            if tag:
                chapterorigtitle = tag['content']

            # toctitle is separate for add_chapter_numbers:toconly users.
            # Only fall back to the original title when there is no
            # chaptertoctitle meta; previously the meta value was always
            # overwritten.
            tag = soup.find('meta',{'name':'chaptertoctitle'})
            if tag:
                chaptertoctitle = tag['content']
            else:
                chaptertoctitle = chapterorigtitle

            chaptertitle = None
            chaptertitle_tag = soup.find('meta',{'name':'chaptertitle'})
            if chaptertitle_tag:
                chaptertitle = chaptertitle_tag['content']

            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                chaptertitle_tag['content'] = chapterorigtitle
                title_tag = soup.find('title')
                if title_tag and title_tag.string == chaptertitle:
                    title_tag.string.replace_with(chapterorigtitle)

                h3_tag = soup.find('h3')
                if h3_tag and h3_tag.string == chaptertitle:
                    h3_tag.string.replace_with(chapterorigtitle)

                data = unicode(soup)

                if origdata != data:
                    changed = True
                    logger.debug("\nentrychanged:%s\n"%zf)
                    _replace_tocncx(tocncxdom,zf,chaptertoctitle)
                    ## Also look for and update individual
                    ## book toc.ncx files for anthology in case
                    ## it's unmerged.
                    bookprefix = zf[:zf.rfind('/OEBPS/')]
                    zf_toc = bookprefix+'/toc.ncx'
                    mergedprefix_len = len(bookprefix)+1

                    if zf_toc in unmerge_tocncxdoms:
                        _replace_tocncx(unmerge_tocncxdoms[zf_toc],zf[mergedprefix_len:],chaptertoctitle)

            outputepub.writestr(zf,data.encode('utf-8'))

        for tocnm, tocdom in unmerge_tocncxdoms.items():
            outputepub.writestr(tocnm,tocdom.toxml(encoding='utf-8'))

        outputepub.writestr('toc.ncx',tocncxdom.toxml(encoding='utf-8'))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # Must happen before close(): close() is what writes the
        # central directory that carries create_system.
        for zinfo in outputepub.filelist:
            zinfo.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile,basestring):
                with open(outfile,"wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())
    finally:
        # release the input archive and the buffer even on errors.
        inputepub.close()
        zipio.close()

    return changed
def write_split_epub(self,
                     outputio,
                     linenums,
                     changedtocs={},
                     authoropts=[],
                     titleopt=None,
                     descopt=None,
                     tags=[],
                     languages=['en'],
                     coverjpgpath=None):
    """
    Start a new epub in outputio from the selected split lines.

    outputio = output file name or file-like object handed to ZipFile.
    linenums = passed to self.get_split_files(), which supplies
               (filename, id, type, filedata) tuples to write.
    changedtocs = accepted but not used in the visible part of this
                  function.
    authoropts = authors to use, otherwise self.origauthors.
    titleopt = title, otherwise '<original title> Split'.
    descopt = description, otherwise 'Split from <title> by <authors>.'
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include; 'en' if empty.
    coverjpgpath = if set, cover entries are added to the metadata,
                   manifest, guide and spine.
    """
    files = self.get_split_files(linenums)

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))

    ####
    ## create content.opf file.
    uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement
    package.setAttribute("version","2.0")
    package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier","epubsplit-id")

    metadata=newTag(contentdom,"metadata",
                    attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                           "xmlns:opf":"http://www.idpf.org/2007/opf"})
    metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
    if titleopt is None:
        titleopt = self.origtitle+" Split"
    metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

    if authoropts:
        useauthors=authoropts
    else:
        useauthors=self.origauthors

    # one dc:creator per distinct author, in first-seen order.
    seenauthors=set()
    for author in useauthors:
        if author not in seenauthors:
            seenauthors.add(author)
            metadata.appendChild(newTag(contentdom,"dc:creator",
                                        attrs={"opf:role":"aut"},
                                        text=author))

    metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
    metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))

    if languages:
        for l in languages:
            metadata.appendChild(newTag(contentdom,"dc:language",text=l))
    else:
        metadata.appendChild(newTag(contentdom,"dc:language",text="en"))

    if not descopt:
        description = newTag(contentdom,"dc:description",
                             text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
    else:
        description = newTag(contentdom,"dc:description",text=descopt)
    metadata.appendChild(description)

    for tag in tags:
        metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
    package.appendChild(spine)

    manifest.appendChild(newTag(contentdom,"item",
                                attrs={'id':'ncx',
                                       'href':'toc.ncx',
                                       'media-type':'application/x-dtbncx+xml'}))

    if coverjpgpath:
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
                                                       "content":"coverimageid"}))
        # cover stuff for later:
        # at end of <package>:
        # <guide>
        # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
        # </guide>
        guide = newTag(contentdom,"guide")
        guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                               "title":"Cover",
                                                               "href":"cover.xhtml"}))
        package.appendChild(guide)

        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':"coverimageid",
                                           'href':"cover.jpg",
                                           'media-type':"image/jpeg"}))

        # Note that the id of the cover xhmtl *must* be 'cover'
        # for it to work on Nook.
        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':"cover",
                                           'href':"cover.xhtml",
                                           'media-type':"application/xhtml+xml"}))

        spine.appendChild(newTag(contentdom,"itemref",
                                 attrs={"idref":"cover",
                                        "linear":"yes"}))

    contentcount=0
    for (filename,id,type,filedata) in files:
        # add to manifest and spine
        if coverjpgpath and filename == "cover.xhtml":
            continue # don't dup cover.

        outputepub.writestr(filename,filedata.encode('utf-8'))
        # ids from the source are replaced with sequential ones.
        id = "a%d"%contentcount
        contentcount += 1
        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':id,
                                           'href':filename,
                                           'media-type':type}))
        spine.appendChild(newTag(contentdom,"itemref",
                                 attrs={"idref":id,
                                        "linear":"yes"}))

    for (linked,type) in self.filecache.linkedfiles:
        # add to manifest
        if coverjpgpath and linked == "cover.jpg":
            continue # don't dup cover.

        try:
            outputepub.writestr(linked,self.get_file(linked))
        except Exception as e:
            print("Failed to copy linked file (%s)\nException: %s"%(linked,e))
            # don't list a file in the manifest that isn't in the zip.
            continue

        id = "a%d"%contentcount
        contentcount += 1
        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':id,
                                           'href':linked,
                                           'media-type':type}))

    # NOTE(review): the visible body ends here without serializing
    # contentdom, writing toc.ncx, or closing outputepub -- confirm
    # whether the remainder of this function was lost.
def write_split_epub(self,
                     outputio,
                     linenums,
                     changedtocs={},
                     authoropts=[],
                     titleopt=None,
                     descopt=None,
                     tags=[],
                     languages=['en'],
                     coverjpgpath=None):
    """
    Write a new epub to outputio built from the selected split lines.

    outputio = output file name or file-like object handed to ZipFile.
    linenums = passed to self.get_split_files(), which supplies
               (filename, id, type, filedata) tuples to write.
    changedtocs = dict of line['num'] -> replacement list of TOC
                  titles.  When present, it replaces line['toc'] on the
                  entry in self.split_lines (the entry is modified).
    authoropts = authors to use, otherwise self.origauthors.
    titleopt = title, otherwise '<original title> Split'.
    descopt = description, otherwise 'Split from <title> by <authors>.'
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include; 'en' if empty.
    coverjpgpath = path to an image written as cover.jpg, together
                   with a generated cover.xhtml.
    """
    files = self.get_split_files(linenums)

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(
        None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version", "1.0")
    containertop.setAttribute(
        "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(
        newTag(
            containerdom, "rootfile", {
                "full-path": "content.opf",
                "media-type": "application/oebps-package+xml"
            }))
    outputepub.writestr(
        "META-INF/container.xml",
        containerdom.toprettyxml(indent=' ', encoding='utf-8'))

    ####
    ## create content.opf file.
    uniqueid = "epubsplit-uid-%d" % time()  # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(
        None, "package", None)
    package = contentdom.documentElement
    package.setAttribute("version", "2.0")
    package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier", "epubsplit-id")

    metadata = newTag(contentdom,
                      "metadata",
                      attrs={
                          "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                          "xmlns:opf": "http://www.idpf.org/2007/opf"
                      })
    metadata.appendChild(
        newTag(contentdom,
               "dc:identifier",
               text=uniqueid,
               attrs={"id": "epubsplit-id"}))
    if titleopt is None:
        titleopt = self.origtitle + " Split"
    metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

    if authoropts:
        useauthors = authoropts
    else:
        useauthors = self.origauthors

    # one dc:creator per distinct author, in first-seen order.
    seenauthors = set()
    for author in useauthors:
        if author not in seenauthors:
            seenauthors.add(author)
            metadata.appendChild(
                newTag(contentdom,
                       "dc:creator",
                       attrs={"opf:role": "aut"},
                       text=author))

    metadata.appendChild(
        newTag(contentdom,
               "dc:contributor",
               text="epubsplit",
               attrs={"opf:role": "bkp"}))
    metadata.appendChild(
        newTag(contentdom,
               "dc:rights",
               text="Copyrights as per source stories"))

    if languages:
        for l in languages:
            metadata.appendChild(newTag(contentdom, "dc:language", text=l))
    else:
        metadata.appendChild(newTag(contentdom, "dc:language", text="en"))

    if not descopt:
        description = newTag(contentdom,
                             "dc:description",
                             text="Split from %s by %s." %
                             (self.origtitle, ", ".join(self.origauthors)))
    else:
        description = newTag(contentdom, "dc:description", text=descopt)
    metadata.appendChild(description)

    for tag in tags:
        metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
    package.appendChild(spine)

    manifest.appendChild(
        newTag(contentdom,
               "item",
               attrs={
                   'id': 'ncx',
                   'href': 'toc.ncx',
                   'media-type': 'application/x-dtbncx+xml'
               }))

    if coverjpgpath:
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(
            newTag(contentdom, "meta", {
                "name": "cover",
                "content": "coverimageid"
            }))
        # cover stuff for later:
        # at end of <package>:
        # <guide>
        # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
        # </guide>
        guide = newTag(contentdom, "guide")
        guide.appendChild(
            newTag(contentdom,
                   "reference",
                   attrs={
                       "type": "cover",
                       "title": "Cover",
                       "href": "cover.xhtml"
                   }))
        package.appendChild(guide)

        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': "coverimageid",
                       'href': "cover.jpg",
                       'media-type': "image/jpeg"
                   }))

        # Note that the id of the cover xhmtl *must* be 'cover'
        # for it to work on Nook.
        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': "cover",
                       'href': "cover.xhtml",
                       'media-type': "application/xhtml+xml"
                   }))

        spine.appendChild(
            newTag(contentdom,
                   "itemref",
                   attrs={
                       "idref": "cover",
                       "linear": "yes"
                   }))

    contentcount = 0
    for (filename, id, type, filedata) in files:
        # add to manifest and spine
        if coverjpgpath and filename == "cover.xhtml":
            continue  # don't dup cover.

        outputepub.writestr(filename, filedata.encode('utf-8'))
        # ids from the source are replaced with sequential ones.
        id = "a%d" % contentcount
        contentcount += 1
        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': id,
                       'href': filename,
                       'media-type': type
                   }))
        spine.appendChild(
            newTag(contentdom,
                   "itemref",
                   attrs={
                       "idref": id,
                       "linear": "yes"
                   }))

    for (linked, type) in self.filecache.linkedfiles:
        # add to manifest
        if coverjpgpath and linked == "cover.jpg":
            continue  # don't dup cover.

        try:
            outputepub.writestr(linked, self.get_file(linked))
        except Exception as e:
            print("Skipping linked file (%s)\nException: %s" % (linked, e))
            # actually skip it: a manifest item for a file that is not
            # in the zip makes the package invalid.
            continue

        id = "a%d" % contentcount
        contentcount += 1
        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': id,
                       'href': linked,
                       'media-type': type
                   }))

    contentxml = contentdom.toprettyxml(indent=' ')  # ,encoding='utf-8'
    # tweak for brain damaged Nook STR.  Nook insists on name before content.
    contentxml = contentxml.replace(
        '<meta content="coverimageid" name="cover"/>',
        '<meta name="cover" content="coverimageid"/>')
    outputepub.writestr("content.opf", contentxml)

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version", "2005-1")
    ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:uid",
                   "content": uniqueid
               }))
    depthnode = newTag(tocncxdom,
                       "meta",
                       attrs={
                           "name": "dtb:depth",
                           "content": "1"
                       })
    head.appendChild(depthnode)
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:totalPageCount",
                   "content": "0"
               }))
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:maxPageNumber",
                   "content": "0"
               }))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(
        newTag(tocncxdom, "text", text=stripHTML(titleopt)))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    # come back to lines again for TOC because files only has files(gasp-shock!)
    count = 1
    for line in self.split_lines:
        if 'include' not in line:
            continue
        # if changed, use only changed values.
        if line['num'] in changedtocs:
            line['toc'] = changedtocs[line['num']]
        # can have more than one toc entry.
        for title in line['toc']:
            newnav = newTag(tocncxdom, "navPoint", {
                "id": "a%03d" % count,
                "playOrder": "%d" % count
            })
            count += 1
            tocnavMap.appendChild(newnav)
            navlabel = newTag(tocncxdom, "navLabel")
            newnav.appendChild(navlabel)
            navlabel.appendChild(
                newTag(tocncxdom, "text", text=stripHTML(title)))
            # Point at the anchor's recorded location when the file
            # cache has one for href#anchor, otherwise at the href.
            if line['anchor'] and line['href'] + "#" + line[
                    'anchor'] in self.filecache.anchors:
                src = self.filecache.anchors[line['href'] + "#" +
                                             line['anchor']]
            else:
                src = line['href']
            newnav.appendChild(
                newTag(tocncxdom, "content", {"src": src}))

    outputepub.writestr(
        "toc.ncx", tocncxdom.toprettyxml(indent=' ', encoding='utf-8'))

    if coverjpgpath:
        # write, not write string.  Pulling from file.
        outputepub.write(coverjpgpath, "cover.jpg")

        outputepub.writestr(
            "cover.xhtml",
            ''' <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css"> @page {padding: 0pt; margin:0pt} body { text-align: center; padding:0pt; margin: 0pt; } div { margin: 0pt; padding: 0pt; } </style></head><body><div> <img src="cover.jpg" alt="cover"/> </div></body></html> ''')

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    flattentoc if true, flatten TOC down to one level only.
    printtimes is handed to cond_print for the timing messages.
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's rootfile(.opf)
        and toc.ncx into the output under that book's directory.
    source = if not given, taken from the first input's dc:source
        (empty string if that can't be read).
    '''

    printt = partial(cond_print,printtimes)

    ## Python 2.5 ZipFile is rather more primative than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount=0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))

    ## Process input epubs.

    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = [] # list of t/f

    itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.

    filelist = []

    printt("prep output:%s"%(time()-t))
    t = time()

    booknum=1
    firstmetadom = None
    for file in files:
        if file is None:
            continue

        # Everything from this book is prefixed so that paths and ids
        # from different inputs can't collide.
        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(file, 'r')

        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        metadom = parseString(epub.read(rootfilename))

        if booknum==1 and not source:
            try:
                firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0]
                source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
            except Exception:
                # no usable dc:source in the first book.
                source=""

        # if the epub was ever edited with Sigil, it changed the unique-identifier,
        # but dc:contributor was left.
        is_ffdl_epub.append(False)

        for c in metadom.getElementsByTagName("dc:contributor"):
            if c.getAttribute("opf:role") == "bkp" and \
                    getText(c.childNodes) == "fanficdownloader [http://fanficdownloader.googlecode.com]":
                is_ffdl_epub[-1] = True # set last.
                break

        ## Save indiv book title
        try:
            booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
        except Exception:
            booktitles.append("(Title Missing)")

        ## Save authors.
        authors=[]
        for creator in metadom.getElementsByTagName("dc:creator"):
            try:
                # role is 'aut' or unspecified, and there is text to
                # take.  Parenthesized so the firstChild check covers
                # both cases instead of relying on the except below.
                if ((creator.getAttribute("opf:role") == "aut"
                     or not creator.hasAttribute("opf:role"))
                        and creator.firstChild is not None):
                    authors.append(creator.firstChild.data)
            except Exception:
                pass
        if len(authors) == 0:
            authors.append("(Author Missing)")
        allauthors.append(authors)

        if keepmetadatafiles:
            itemid=bookid+"rootfile"
            itemhref = rootfilename
            href=bookdir+itemhref
            outputepub.writestr(href,
                                epub.read(itemhref))
            items.append((itemid,href,"origrootfile/xml"))

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found one that did.
        manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*","item"):
            itemid=bookid+item.getAttribute("id")
            itemhref = unquote(item.getAttribute("href")) # remove %20, etc.
            href=bookdir+relpath+itemhref
            if item.getAttribute("media-type") == "application/x-dtbncx+xml":
                # TOC file is only one with this type--as far as I know.
                # grab the whole navmap, deal with it later.
                # Ampersands, escaped or bare, are replaced with 'and'
                # before parsing.  '&amp;' is matched as a whole so it
                # doesn't end up as 'andamp;'.
                tocdom = parseString(re.sub(r'&(?:amp;)?', r'and',
                                            epub.read(relpath+item.getAttribute("href"))))

                # update all navpoint ids with bookid for uniqueness.
                for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"):
                    navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))

                # update all content paths with bookdir for uniqueness.
                for content in tocdom.getElementsByTagNameNS("*","content"):
                    content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))

                navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0])

                if keepmetadatafiles:
                    outputepub.writestr(href,
                                        epub.read(relpath+itemhref))
                    items.append((itemid,href,"origtocncx/xml"))
            else:
                href=href.encode('utf8')
                itemhrefs[itemid] = href
                if href not in filelist:
                    try:
                        outputepub.writestr(href,
                                            epub.read(relpath+itemhref))
                        if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                            filecount+=1
                        items.append((itemid,href,item.getAttribute("media-type")))
                        filelist.append(href)
                    except KeyError:
                        pass # Skip missing files.

        itemreflist = metadom.getElementsByTagNameNS("*","itemref")
        # NOTE(review): raises IndexError for an empty spine and
        # KeyError when the first itemref's idref isn't in itemhrefs.
        firstitemhrefs.append(itemhrefs[bookid+itemreflist[0].getAttribute("idref")])
        for itemref in itemreflist:
            itemrefs.append(bookid+itemref.getAttribute("idref"))

        booknum=booknum+1

    # NOTE(review): the visible body ends here; nothing after the input
    # loop writes content.opf/toc.ncx or closes outputepub -- confirm
    # whether the remainder of this function was lost.
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    Start a merged (anthology) epub in outputio and copy the contents
    of every input epub into it, each under its own numbered directory.

    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    flattentoc if true, flatten TOC down to one level only.
    printtimes, handed to cond_print together with each timing message.
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's .opf and toc.ncx
                      into the output (manifest types origrootfile/xml
                      and origtocncx/xml).
    source = source URL; when not given, taken from the first book's
             dc:source, or '' if that can't be read.

    NOTE(review): as visible here the function ends right after the
    input loop--nothing below consumes the collected lists, several
    parameters are unused, and outputepub is left open.  Confirm
    whether the rest of the function was lost.
    '''

    printt = partial(cond_print,printtimes)

    ## Python 2.5 ZipFile is rather more primitive than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount=0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8'))

    ## Process input epubs.

    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = [] # list of t/f

    itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.

    filelist = []

    printt("prep output:%s"%(time()-t))
    t = time()

    booknum=1
    firstmetadom = None
    for file in files:
        if file is None:
            continue

        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(file, 'r')
        # Close every input archive once it has been copied, so merging
        # many books doesn't pile up open file handles.
        try:
            ## Find the .opf file.
            container = epub.read("META-INF/container.xml")
            containerdom = parseString(container)
            rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile")
            rootfilename = rootfilenodelist[0].getAttribute("full-path")

            ## Save the path to the .opf file--hrefs inside it are relative to it.
            relpath = get_path_part(rootfilename)

            metadom = parseString(epub.read(rootfilename))
            #logger.debug("metadom:%s"%epub.read(rootfilename))
            if booknum==1 and not source:
                try:
                    firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0]
                    source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
                except Exception:
                    # missing metadata / dc:source, or an empty dc:source.
                    source=""

            # if the epub was ever edited with Sigil, it changed the unique-identifier,
            # but dc:contributor was left.
            is_ffdl_epub.append(False)

            for c in metadom.getElementsByTagName("dc:contributor"):
                # logger.debug("dc:contributor:%s"%getText(c.childNodes))
                if c.getAttribute("opf:role") == "bkp" and \
                        getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]",
                                                  "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]:
                    is_ffdl_epub[-1] = True # set last.
                    break

            ## Save indiv book title
            try:
                booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
            except Exception:
                # no dc:title, or an empty one.
                booktitles.append("(Title Missing)")

            ## Save authors.
            authors=[]
            for creator in metadom.getElementsByTagName("dc:creator"):
                is_author = ( creator.getAttribute("opf:role") == "aut" or
                              not creator.hasAttribute("opf:role") )
                # An empty <dc:creator/> has no child to take a name from.
                if is_author and creator.firstChild is not None:
                    try:
                        authors.append(creator.firstChild.data)
                    except AttributeError:
                        # first child isn't a text node.
                        pass
            if len(authors) == 0:
                authors.append("(Author Missing)")
            allauthors.append(authors)

            if keepmetadatafiles:
                itemid=bookid+"rootfile"
                itemhref = rootfilename
                href=bookdir+itemhref
                #logger.debug("write rootfile %s to %s"%(itemhref,href))
                outputepub.writestr(href, epub.read(itemhref))
                items.append((itemid,href,"origrootfile/xml"))

            # spin through the manifest--only place there are item tags.
            # Correction--only place there *should* be item tags.  But
            # somebody found one that did.
            manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0]
            for item in manifesttag.getElementsByTagNameNS("*","item"):
                itemid=bookid+item.getAttribute("id")
                itemhref = unquote(item.getAttribute("href")) # remove %20, etc.
                href=bookdir+relpath+itemhref
                if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
                    # TOC file is only one with this type--as far as I know.
                    # grab the whole navmap, deal with it later.
                    tocdom = parseString(epub.read(relpath+item.getAttribute("href")))

                    # update all navpoint ids with bookid for uniqueness.
                    for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"):
                        navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))

                    # update all content paths with bookdir for uniqueness.
                    for content in tocdom.getElementsByTagNameNS("*","content"):
                        content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))

                    navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0])

                    if keepmetadatafiles:
                        #logger.debug("write toc.ncx %s to %s"%(relpath+itemhref,href))
                        outputepub.writestr(href, epub.read(relpath+itemhref))
                        items.append((itemid,href,"origtocncx/xml"))
                else:
                    href=href.encode('utf8')
                    #logger.debug("item id: %s -> %s:"%(itemid,href))
                    itemhrefs[itemid] = href
                    if href not in filelist:
                        try:
                            filedata = epub.read(relpath+itemhref)
                        except KeyError:
                            # Skip files listed in the manifest but
                            # missing from the archive.
                            logger.info("Skipping missing file %s (%s)"%(href,relpath+itemhref))
                            del itemhrefs[itemid]
                            continue
                        outputepub.writestr(href, filedata)
                        if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                            filecount+=1
                        items.append((itemid,href,item.getAttribute("media-type")))
                        filelist.append(href)

            itemreflist = metadom.getElementsByTagNameNS("*","itemref")
            # logger.debug("itemreflist:%s"%itemreflist)
            # logger.debug("itemhrefs:%s"%itemhrefs)
            # logger.debug("bookid:%s"%bookid)

            # Looking for the first item in itemreflist that wasn't
            # discarded due to missing files.
            for itemref in itemreflist:
                idref = bookid+itemref.getAttribute("idref")
                if idref in itemhrefs:
                    firstitemhrefs.append(itemhrefs[idref])
                    break

            for itemref in itemreflist:
                itemrefs.append(bookid+itemref.getAttribute("idref"))
        finally:
            epub.close()

        booknum=booknum+1
def reset_orig_chapters_epub(inputio,outfile):
    '''
    Put the original chapter titles back into an epub's chapter files
    and its toc.ncx.

    inputio = input epub, a path or a blob.
    outfile = output file name, or an object with a write() method.
              Only written when something actually changed.

    A chapter file (*/file<digits>.xhtml) is changed when it has both
    'chaptertitle' and 'chapterorigtitle' meta tags and they differ:
    the chaptertitle meta, <title> and <h3> are set back to the
    original title.  The matching toc.ncx label gets the
    'chaptertoctitle' meta value if present, else the original title.

    Returns True if any chapter was changed, False otherwise.
    '''
    inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False

        tocncxdom = parseString(inputepub.read('toc.ncx'))

        ## spin through file contents.
        for zf in inputepub.namelist():
            # mimetype is already written, toc.ncx is written last.
            if zf in ('mimetype','toc.ncx'):
                continue

            data = inputepub.read(zf)

            if not re.match(r'.*/file\d+\.xhtml',zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf,data)
                continue

            data = data.decode('utf-8')
            soup = bs.BeautifulSoup(data,"html5lib")

            def metacontent(name):
                # content of <meta name="..."> or None if there's no such tag.
                tag = soup.find('meta',{'name':name})
                if tag:
                    return tag['content']
                return None

            chapterorigtitle = metacontent('chapterorigtitle')

            # toctitle is separate for add_chapter_numbers:toconly users.
            chaptertoctitle = metacontent('chaptertoctitle')
            if chaptertoctitle is None and chapterorigtitle:
                chaptertoctitle = chapterorigtitle

            chaptertitle = metacontent('chaptertitle')

            entrychanged = False
            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                data = data.replace(u'<meta name="chaptertitle" content="'+chaptertitle+u'"></meta>',
                                    u'<meta name="chaptertitle" content="'+chapterorigtitle+u'"></meta>')
                data = data.replace(u'<title>'+chaptertitle+u'</title>',u'<title>'+chapterorigtitle+u'</title>')
                data = data.replace(u'<h3>'+chaptertitle+u'</h3>',u'<h3>'+chapterorigtitle+u'</h3>')
                entrychanged = ( origdata != data )
                changed = changed or entrychanged

            if entrychanged:
                ## go after the TOC entry, too.
                # <navPoint id="file0005" playOrder="6">
                #   <navLabel>
                #     <text>5. (new) Chapter 4</text>
                #   </navLabel>
                #   <content src="OEBPS/file0005.xhtml"/>
                # </navPoint>
                for contenttag in tocncxdom.getElementsByTagName("content"):
                    if contenttag.getAttribute('src') == zf:
                        texttag = contenttag.parentNode.getElementsByTagName('navLabel')[0].getElementsByTagName('text')[0]
                        texttag.childNodes[0].replaceWholeText(chaptertoctitle)
                        # logger.debug("text label:%s"%texttag.toxml())

            outputepub.writestr(zf,data.encode('utf-8'))

        outputepub.writestr('toc.ncx',tocncxdom.toxml(encoding='utf-8'))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # Has to happen *before* close()--that's when the zip's
        # directory, which carries this field, gets written.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile,basestring):
                with open(outfile,"wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())

        return changed
    finally:
        # release input and scratch buffer even when something above fails.
        inputepub.close()
        zipio.close()
def write(self, report=False, dirpath=None):
    """Write the story in ``self.S`` to an epub file on disk.

    The file is named ``<title> - <author>.epub`` and holds the
    mimetype entry, META-INF/container.xml, content.opf, toc.ncx, a
    title page (when ``writeTitlePage`` produces one) and one
    OEBPS/fileNNNN.xhtml per chapter.

    report -- when true, post progress through ffnet_notify after
        each chapter and a final notification carrying the file name.
    dirpath -- directory to write into; 'stories/' when not given.

    Returns None.
    """
    n = ffnet_notify().progress_init(int(
        self.S.metadata['numChapters'])).shadow(self.S.storyID)

    file_name = string.Template("${title} - ${author}.epub").substitute(
        self.S.metadata).encode('utf8')
    if dirpath:
        separator = "" if dirpath[-1] == "/" else "/"
        file_name = os.path.normpath(dirpath + separator + file_name)
    else:
        file_name = 'stories/' + file_name
    logger.info("Save directly to file: %s" % file_name)

    try:
        os.makedirs(os.path.dirname(os.path.normpath(file_name)))
    except OSError:
        # Best effort: usually the directory already exists.  If it
        # really can't be created, open() below reports the problem.
        pass

    # "w+b" rather than "wb": re-opening the archive in append mode
    # below makes ZipFile read the stream back to find the directory
    # written by the first pass, so the stream must be readable too.
    outstream = open(file_name, "w+b")
    try:
        ## mimetype must be the first entry and uncompressed.
        outputepub = ZipFile(outstream, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr('mimetype', 'application/epub+zip')
        outputepub.close()

        ## Re-open for the compressed content.
        outputepub = ZipFile(outstream, 'a', compression=ZIP_DEFLATED)
        outputepub.debug = 3

        ## META-INF/container.xml -- only points at content.opf.
        containerdom = getDOMImplementation().createDocument(
            None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute(
            "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(
            newTag(containerdom, "rootfile", {
                "full-path": "content.opf",
                "media-type": "application/oebps-package+xml"
            }))
        outputepub.writestr("META-INF/container.xml",
                            containerdom.toxml(encoding='utf-8'))
        containerdom.unlink()
        del containerdom

        ## content.opf
        # TODO: no unique id (site/author/story) is generated yet, so
        # there is no dc:identifier with id "fanficfare-uid" and no
        # dtb:uid meta in toc.ncx.
        contentdom = getDOMImplementation().createDocument(
            None, "package", None)
        package = contentdom.documentElement
        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "fanficfare-uid")
        metadata = newTag(contentdom, "metadata",
                          attrs={
                              "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                              "xmlns:opf": "http://www.idpf.org/2007/opf"
                          })
        package.appendChild(metadata)

        if self.S.metadata['title']:
            metadata.appendChild(
                newTag(contentdom, "dc:title",
                       text=self.S.metadata['title']))

        if self.S.metadata['author']:
            metadata.appendChild(
                newTag(contentdom, "dc:creator",
                       attrs={"opf:role": "aut"},
                       text=self.S.metadata['author']))

        metadata.appendChild(
            newTag(contentdom, "dc:contributor",
                   text="Automatia",
                   attrs={"opf:role": "bkp"}))
        metadata.appendChild(newTag(contentdom, "dc:rights", text=""))

        if self.S.metadata['langcode']:
            metadata.appendChild(
                newTag(contentdom, "dc:language",
                       text=self.S.metadata['langcode']))
        else:
            metadata.appendChild(
                newTag(contentdom, "dc:language", text='en'))

        # published, updated, calibre timestamp
        if self.S.metadata['datePublished']:
            metadata.appendChild(
                newTag(contentdom, "dc:date",
                       attrs={"opf:event": "publication"},
                       text=self.S.metadata['datePublished'].strftime(
                           "%Y-%m-%d")))

        if 'dateUpdated' in self.S.metadata:
            metadata.appendChild(
                newTag(contentdom, "dc:date",
                       attrs={"opf:event": "modification"},
                       text=self.S.metadata['dateUpdated'].strftime(
                           "%Y-%m-%d")))
            metadata.appendChild(
                newTag(contentdom, "meta",
                       attrs={
                           "name": "calibre:timestamp",
                           "content":
                           self.S.metadata['dateUpdated'].strftime(
                               "%Y-%m-%dT%H:%M:%S")
                       }))

        if self.S.metadata['description']:
            metadata.appendChild(
                newTag(contentdom, "dc:description",
                       text=self.S.metadata['description']))

        # FIXME: dc:subject tags are not written.

        if self.S.metadata['storyUrl']:
            metadata.appendChild(
                newTag(contentdom, "dc:identifier",
                       attrs={"opf:scheme": "URL"},
                       text=self.S.metadata['storyUrl']))
            metadata.appendChild(
                newTag(contentdom, "dc:source",
                       text=self.S.metadata['storyUrl']))

        items = []  # list of (id, href, type, title) tuples(all strings)
        itemrefs = []  # list of strings -- idrefs for the spine
        items.append(("ncx", "toc.ncx", "application/x-dtbncx+xml", None))

        # FIXME/TODO: cover, images and stylesheet.css are not written
        # yet, so guide and coverIO are never set.  The checks on them
        # below are kept for when that is filled in.
        guide = None
        coverIO = None
        coverimgid = "image0000"

        items.append(("title_page", "OEBPS/title_page.xhtml",
                      "application/xhtml+xml", "Title Page"))
        itemrefs.append("title_page")

        # TODO: no toc_page.xhtml is generated.

        # One manifest/spine entry per chapter; chap[0] is used as the
        # chapter title.
        for index, chap in enumerate(self.S.chapterUrls):
            i = index + 1
            items.append(("file%04d" % i, "OEBPS/file%04d.xhtml" % i,
                          "application/xhtml+xml",
                          "%d. %s" % (i, chap[0])))
            itemrefs.append("file%04d" % i)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        for (item_id, href, item_type, title) in items:
            manifest.appendChild(
                newTag(contentdom, "item",
                       attrs={
                           'id': item_id,
                           'href': href,
                           'media-type': item_type
                       }))

        spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
        package.appendChild(spine)
        for itemref in itemrefs:
            spine.appendChild(
                newTag(contentdom, "itemref",
                       attrs={
                           "idref": itemref,
                           "linear": "yes"
                       }))

        # guide only exists if there's a cover.
        if guide:
            package.appendChild(guide)

        # write content.opf to zip.
        contentxml = contentdom.toxml(encoding='utf-8')
        # tweak for brain damaged Nook STR.  Nook insists on name before content.
        contentxml = contentxml.replace(
            '<meta content="%s" name="cover"/>' % coverimgid,
            '<meta name="cover" content="%s"/>' % coverimgid)
        outputepub.writestr("content.opf", contentxml)

        contentdom.unlink()
        del contentdom

        ## create toc.ncx file
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version", "2005-1")
        ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        head.appendChild(
            newTag(tocncxdom, "meta",
                   attrs={"name": "dtb:depth", "content": "1"}))
        head.appendChild(
            newTag(tocncxdom, "meta",
                   attrs={"name": "dtb:totalPageCount", "content": "0"}))
        head.appendChild(
            newTag(tocncxdom, "meta",
                   attrs={"name": "dtb:maxPageNumber", "content": "0"}))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(
            newTag(tocncxdom, "text", text=self.S.metadata['title']))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
        #   <navLabel>
        #     <text><chapter title></text>
        #   </navLabel>
        #   <content src="<chapterfile>"/>
        # </navPoint>
        index = 0
        for (item_id, href, item_type, title) in items:
            # only items to be skipped (cover.xhtml, images, toc.ncx,
            # stylesheet.css) should have no title.
            if title:
                navPoint = newTag(tocncxdom, "navPoint",
                                  attrs={
                                      'id': item_id,
                                      'playOrder': unicode(index)
                                  })
                tocnavMap.appendChild(navPoint)
                navLabel = newTag(tocncxdom, "navLabel")
                navPoint.appendChild(navLabel)
                # the xml library will re-escape as needed.
                navLabel.appendChild(
                    newTag(tocncxdom, "text", text=stripHTML(title)))
                navPoint.appendChild(
                    newTag(tocncxdom, "content", attrs={"src": href}))
                index = index + 1

        # write toc.ncx to zip file
        outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding='utf-8'))
        tocncxdom.unlink()
        del tocncxdom

        TITLE_PAGE = self.EPUB_TITLE_PAGE

        if coverIO:
            outputepub.writestr("OEBPS/cover.xhtml", coverIO.getvalue())
            coverIO.close()

        titlepageIO = StringIO.StringIO()
        self.writeTitlePage(out=titlepageIO, PAGE=TITLE_PAGE)
        if titlepageIO.getvalue():  # will be false if no title page.
            outputepub.writestr("OEBPS/title_page.xhtml",
                                titlepageIO.getvalue())
        titlepageIO.close()

        CHAPTER_START = self.EPUB_CHAPTER_START
        CHAPTER_END = self.EPUB_CHAPTER_END

        for index, chap in enumerate(self.S.chapterUrls):
            chap_data = self.S.getChapterText(index)
            if report:
                n.progress(index + 1).post()

            logger.debug('Writing chapter text for: %s' % chap[0])
            vals = {
                'url': removeEntities(chap[1]),
                'chapter': removeEntities(chap[0]),
                'index': "%04d" % (index + 1),
                'number': index + 1
            }
            # escape double quotes in all vals.
            for k, v in vals.items():
                if isinstance(v, basestring):
                    vals[k] = v.replace('"', '&quot;')
            fullhtml = CHAPTER_START.substitute(
                vals) + chap_data.strip() + CHAPTER_END.substitute(vals)
            # exactly one newline after each </p> and <br/>.
            fullhtml = re.sub(r'(</p>|<br ?/>)\n*', r'\1\n', fullhtml)

            outputepub.writestr("OEBPS/file%04d.xhtml" % (index + 1),
                                fullhtml.encode('utf-8'))
            del fullhtml

        # Set before close(), as that is when these fields are written.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()
    finally:
        # don't leak the file handle if anything above fails.
        outstream.close()

    if report:
        ffnet_notify()\
            .shadow(self.S.storyID)\
            .end(file_name)\
            .post()
def reset_orig_chapters_epub(inputio, outfile):
    """Put the original chapter titles back into an epub.

    Chapter files (``*/file<digits>.xhtml``) whose ``chaptertitle`` and
    ``chapterorigtitle`` meta tags differ get their chaptertitle meta,
    ``<title>`` and ``<h3>`` set back to the original title.  The
    top-level toc.ncx, and any nested ``*/toc.ncx`` kept from merged
    books, are updated through ``_replace_tocncx`` with the
    ``chaptertoctitle`` meta value if present, else the original title.

    inputio -- input epub, a path or a blob.
    outfile -- output file name, or an object with a ``write()``
        method.  Only written when something actually changed.

    Returns True if any chapter was changed, False otherwise.
    """
    inputepub = ZipFile(inputio, "r")  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(zipio, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False

        ## spin through file contents, saving any unmerge toc.ncx files.
        unmerge_tocncxdoms = {}
        for zf in inputepub.namelist():
            if zf.endswith("/toc.ncx"):
                unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

        tocncxdom = parseString(inputepub.read("toc.ncx"))

        ## spin through file contents.
        for zf in inputepub.namelist():
            # mimetype is already written; all toc.ncx files are
            # written after this loop.
            if zf in ("mimetype", "toc.ncx") or zf.endswith("/toc.ncx"):
                continue

            data = inputepub.read(zf)

            if not re.match(r".*/file\d+\.xhtml", zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)
                continue

            data = data.decode("utf-8")
            soup = bs.BeautifulSoup(data, "html5lib")

            def metacontent(name):
                # content of <meta name="..."> or None if there's no such tag.
                tag = soup.find("meta", {"name": name})
                if tag:
                    return tag["content"]
                return None

            chapterorigtitle = metacontent("chapterorigtitle")

            # toctitle is separate for add_chapter_numbers:toconly users.
            chaptertoctitle = metacontent("chaptertoctitle")
            if chaptertoctitle is None and chapterorigtitle:
                chaptertoctitle = chapterorigtitle

            chaptertitle = metacontent("chaptertitle")

            entrychanged = False
            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                data = data.replace(
                    u'<meta name="chaptertitle" content="' + chaptertitle + u'"></meta>',
                    u'<meta name="chaptertitle" content="' + chapterorigtitle + u'"></meta>',
                )
                data = data.replace(
                    u"<title>" + chaptertitle + u"</title>", u"<title>" + chapterorigtitle + u"</title>"
                )
                data = data.replace(u"<h3>" + chaptertitle + u"</h3>", u"<h3>" + chapterorigtitle + u"</h3>")
                entrychanged = origdata != data
                changed = changed or entrychanged

            if entrychanged:
                _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                ## Also look for and update individual
                ## book toc.ncx files for anthology in case
                ## it's unmerged.
                oebps_at = zf.rfind("/OEBPS/")
                # rfind() gives -1 when there is no "/OEBPS/" part;
                # slicing with that would just chop the last character
                # off the name, so only look when it was really found.
                if oebps_at >= 0:
                    zf_toc = zf[:oebps_at] + "/toc.ncx"
                    if zf_toc in unmerge_tocncxdoms:
                        _replace_tocncx(unmerge_tocncxdoms[zf_toc], zf[oebps_at + 1 :], chaptertoctitle)

            outputepub.writestr(zf, data.encode("utf-8"))

        for tocnm, tocdom in unmerge_tocncxdoms.items():
            outputepub.writestr(tocnm, tocdom.toxml(encoding="utf-8"))

        outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding="utf-8"))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # Has to happen *before* close()--that's when the zip's
        # directory, which carries this field, gets written.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile, basestring):
                with open(outfile, "wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())

        return changed
    finally:
        # release input and scratch buffer even when something above fails.
        inputepub.close()
        zipio.close()
def doMerge(outputio, files, authoropts=[], titleopt=None, descopt=None, tags=[], languages=['en'], titlenavpoints=True, originalnavpoints=True, flattentoc=False, printtimes=False, coverjpgpath=None, keepmetadatafiles=False, source=None): ''' outputio = output file name or StringIO. files = list of input file names or StringIOs. authoropts = list of authors to use, otherwise add from all input titleopt = title, otherwise '<first title> Anthology' descopt = description, otherwise '<title> by <author>' list for all input tags = dc:subject tags to include, otherwise none. languages = dc:language tags to include titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it originalnavpoints if true, include the original TOCs from each epub flattentoc if true, flatten TOC down to one level only. coverjpgpath, Path to a jpg to use as cover image. ''' printt = partial(cond_print,printtimes) ## Python 2.5 ZipFile is rather more primative than later ## versions. It can operate on a file, or on a StringIO, but ## not on an open stream. OTOH, I suspect we would have had ## problems with closing and opening again to change the ## compression type anyway. filecount=0 t = time() ## Write mimetype file, must be first and uncompressed. ## Older versions of python(2.4/5) don't allow you to specify ## compression by individual file. ## Overwrite if existing output file. outputepub = ZipFile(outputio, "w", compression=ZIP_STORED, allowZip64=True) outputepub.debug = 3 outputepub.writestr("mimetype", "application/epub+zip") outputepub.close() ## Re-open file for content. outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED, allowZip64=True) outputepub.debug = 3 ## Create META-INF/container.xml file. 
The only thing it does is ## point to content.opf containerdom = getDOMImplementation().createDocument(None, "container", None) containertop = containerdom.documentElement containertop.setAttribute("version","1.0") containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container") rootfiles = containerdom.createElement("rootfiles") containertop.appendChild(rootfiles) rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf", "media-type":"application/oebps-package+xml"})) outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent=' ',encoding='utf-8')) ## Process input epubs. items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file, ## but it needs to be in the items manifest. itemrefs = [] # list of strings -- idrefs from .opfs' spines navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files is_ffdl_epub = [] # list of t/f itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s). firstitemhrefs = [] booktitles = [] # list of strings -- Each book's title allauthors = [] # list of lists of strings -- Each book's list of authors. filelist = [] printt("prep output:%s"%(time()-t)) t = time() booknum=1 firstmetadom = None for file in files: if file == None : continue book = "%d" % booknum bookdir = "%d/" % booknum bookid = "a%d" % booknum epub = ZipFile(file, 'r') ## Find the .opf file. container = epub.read("META-INF/container.xml") containerdom = parseString(container) rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile") rootfilename = rootfilenodelist[0].getAttribute("full-path") ## Save the path to the .opf file--hrefs inside it are relative to it. 
relpath = get_path_part(rootfilename) metadom = parseString(epub.read(rootfilename)) #logger.debug("metadom:%s"%epub.read(rootfilename)) if booknum==1 and not source: try: firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0] source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8") except: source="" # if the epub was ever edited with Sigil, it changed the unique-identifier, # but dc:contributor was left. #is_ffdl_epub.append(metadom.documentElement.getAttribute('unique-identifier') == "fanficdownloader-uid") is_ffdl_epub.append(False) for c in metadom.getElementsByTagName("dc:contributor"): # logger.debug("dc:contributor:%s"%getText(c.childNodes)) if c.getAttribute("opf:role") == "bkp" and \ getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]", "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]: is_ffdl_epub[-1] = True # set last. break; ## Save indiv book title try: booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data) except: booktitles.append("(Title Missing)") ## Save authors. authors=[] for creator in metadom.getElementsByTagName("dc:creator"): try: if( creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild != None): authors.append(creator.firstChild.data) except: pass if len(authors) == 0: authors.append("(Author Missing)") allauthors.append(authors) if keepmetadatafiles: itemid=bookid+"rootfile" itemhref = rootfilename href=bookdir+itemhref #logger.debug("write rootfile %s to %s"%(itemhref,href)) outputepub.writestr(href, epub.read(itemhref)) items.append((itemid,href,"origrootfile/xml")) # spin through the manifest--only place there are item tags. # Correction--only place there *should* be item tags. But # somebody found one that did. 
manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0] for item in manifesttag.getElementsByTagNameNS("*","item"): itemid=bookid+item.getAttribute("id") itemhref = normpath(unquote(item.getAttribute("href"))) # remove %20, etc. href=bookdir+relpath+itemhref if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ): # TOC file is only one with this type--as far as I know. # grab the whole navmap, deal with it later. tocdom = parseString(epub.read(normpath(relpath+item.getAttribute("href")))) # update all navpoint ids with bookid for uniqueness. for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"): navpoint.setAttribute("id",bookid+navpoint.getAttribute("id")) # update all content paths with bookdir for uniqueness. for content in tocdom.getElementsByTagNameNS("*","content"): content.setAttribute("src",normpath(bookdir+relpath+content.getAttribute("src"))) navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0]) if keepmetadatafiles: #logger.debug("write toc.ncx %s to %s"%(relpath+itemhref,href)) outputepub.writestr(href, epub.read(normpath(relpath+itemhref))) items.append((itemid,href,"origtocncx/xml")) else: href=href.encode('utf8') #logger.debug("item id: %s -> %s:"%(itemid,href)) itemhrefs[itemid] = href if href not in filelist: try: outputepub.writestr(href, epub.read(normpath(relpath+itemhref))) if re.match(r'.*/(file|chapter)\d+\.x?html',href): filecount+=1 items.append((itemid,href,item.getAttribute("media-type"))) filelist.append(href) except KeyError as ke: # Skip missing files. logger.info("Skipping missing file %s (%s)"%(href,relpath+itemhref)) del itemhrefs[itemid] itemreflist = metadom.getElementsByTagNameNS("*","itemref") # logger.debug("itemreflist:%s"%itemreflist) # logger.debug("itemhrefs:%s"%itemhrefs) # logger.debug("bookid:%s"%bookid) # logger.debug("itemreflist[0].getAttribute(idref):%s"%itemreflist[0].getAttribute("idref")) # Looking for the first item in itemreflist that wasn't # discarded due to missing files. 
for itemref in itemreflist: idref = bookid+itemref.getAttribute("idref") if idref in itemhrefs: firstitemhrefs.append(itemhrefs[idref]) break for itemref in itemreflist: itemrefs.append(bookid+itemref.getAttribute("idref")) booknum=booknum+1; printt("after file loop:%s"%(time()-t)) t = time() ## create content.opf file. uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme. contentdom = getDOMImplementation().createDocument(None, "package", None) package = contentdom.documentElement package.setAttribute("version","2.0") package.setAttribute("xmlns","http://www.idpf.org/2007/opf") package.setAttribute("unique-identifier","epubmerge-id") metadata=newTag(contentdom,"metadata", attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/", "xmlns:opf":"http://www.idpf.org/2007/opf"}) metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"})) if( titleopt is None ): titleopt = booktitles[0]+" Anthology" metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt)) # If cmdline authors, use those instead of those collected from the epubs # (allauthors kept for TOC & description gen below. if( len(authoropts) > 1 ): useauthors=[authoropts] else: useauthors=allauthors usedauthors=dict() for authorlist in useauthors: for author in authorlist: if( not usedauthors.has_key(author) ): usedauthors[author]=author metadata.appendChild(newTag(contentdom,"dc:creator", attrs={"opf:role":"aut"}, text=author)) metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"})) metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories")) for l in languages: metadata.appendChild(newTag(contentdom,"dc:language",text=l)) if not descopt: # created now, but not filled in until TOC generation to save loops. 
description = newTag(contentdom,"dc:description",text="Anthology containing:\n") else: description = newTag(contentdom,"dc:description",text=descopt) metadata.appendChild(description) if source: metadata.appendChild(newTag(contentdom,"dc:identifier", attrs={"opf:scheme":"URL"}, text=source)) metadata.appendChild(newTag(contentdom,"dc:source", text=source)) for tag in tags: metadata.appendChild(newTag(contentdom,"dc:subject",text=tag)) package.appendChild(metadata) manifest = contentdom.createElement("manifest") package.appendChild(manifest) spine = newTag(contentdom,"spine",attrs={"toc":"ncx"}) package.appendChild(spine) if coverjpgpath: # in case coverjpg isn't a jpg: coverext = 'jpg' covertype = 'image/jpeg' try: coverext = coverjpgpath.split('.')[-1].lower() covertype = imagetypes.get(coverext,covertype) except: pass logger.debug("coverjpgpath:%s coverext:%s covertype:%s"%(coverjpgpath,coverext,covertype)) # <meta name="cover" content="cover.jpg"/> metadata.appendChild(newTag(contentdom,"meta",{"name":"cover", "content":"coverimageid"})) guide = newTag(contentdom,"guide") guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover", "title":"Cover", "href":"cover.xhtml"})) package.appendChild(guide) manifest.appendChild(newTag(contentdom,"item", attrs={'id':"coverimageid", 'href':"cover."+coverext, 'media-type':covertype})) # Note that the id of the cover xhmtl *must* be 'cover' # for it to work on Nook. 
manifest.appendChild(newTag(contentdom,"item", attrs={'id':"cover", 'href':"cover.xhtml", 'media-type':"application/xhtml+xml"})) spine.appendChild(newTag(contentdom,"itemref", attrs={"idref":"cover", "linear":"yes"})) for item in items: (id,href,type)=item manifest.appendChild(newTag(contentdom,"item", attrs={'id':id, 'href':href, 'media-type':type})) for itemref in itemrefs: spine.appendChild(newTag(contentdom,"itemref", attrs={"idref":itemref, "linear":"yes"})) ## create toc.ncx file tocncxdom = getDOMImplementation().createDocument(None, "ncx", None) ncx = tocncxdom.documentElement ncx.setAttribute("version","2005-1") ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/") head = tocncxdom.createElement("head") ncx.appendChild(head) head.appendChild(newTag(tocncxdom,"meta", attrs={"name":"dtb:uid", "content":uniqueid})) depthnode = newTag(tocncxdom,"meta", attrs={"name":"dtb:depth", "content":"4"}) head.appendChild(depthnode) head.appendChild(newTag(tocncxdom,"meta", attrs={"name":"dtb:totalPageCount", "content":"0"})) head.appendChild(newTag(tocncxdom,"meta", attrs={"name":"dtb:maxPageNumber", "content":"0"})) docTitle = tocncxdom.createElement("docTitle") docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt)) ncx.appendChild(docTitle) tocnavMap = tocncxdom.createElement("navMap") ncx.appendChild(tocnavMap) booknum=0 printt("wrote initial metadata:%s"%(time()-t)) t = time() for navmap in navmaps: depthnavpoints = navmap.getElementsByTagNameNS("*","navPoint") # for checking more than one TOC entry ## only gets top level TOC entries. sub entries carried inside. navpoints = filter( lambda x : isinstance(x,Element) and x.tagName=="navPoint", navmap.childNodes) newnav = None if titlenavpoints: newnav = newTag(tocncxdom,"navPoint",{"id":"book%03d"%booknum}) navlabel = newTag(tocncxdom,"navLabel") newnav.appendChild(navlabel) # For purposes of TOC titling & desc, use first book author. Skip adding author if only one. 
if len(usedauthors) > 1: title = booktitles[booknum]+" by "+allauthors[booknum][0] else: title = booktitles[booknum] navlabel.appendChild(newTag(tocncxdom,"text",text=title)) # Find the first 'spine' item's content for the title navpoint. # Many epubs have the first chapter as first navpoint, so we can't just # copy that anymore. newnav.appendChild(newTag(tocncxdom,"content", {"src":firstitemhrefs[booknum]})) #logger.debug("newnav:%s"%newnav.toprettyxml()) tocnavMap.appendChild(newnav) else: newnav = tocnavMap if not descopt and len(allauthors[booknum]) > 0: description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n")) # If only one TOC point(total, not top level), or if not # including title nav point, include sub book TOC entries. if originalnavpoints and (len(depthnavpoints) > 1 or not titlenavpoints): for navpoint in navpoints: newnav.appendChild(navpoint) navpoint.is_ffdl_epub = is_ffdl_epub[booknum] booknum=booknum+1; # end of navmaps loop. maxdepth = 0 contentsrcs = {} removednodes = [] ## Force strict ordering of playOrder, stripping out some. playorder=0 for navpoint in tocncxdom.getElementsByTagNameNS("*","navPoint"): if navpoint in removednodes: continue # need content[src] to compare for dups. epub wants dup srcs to have same playOrder. contentsrc = None for n in navpoint.childNodes: if isinstance(n,Element) and n.tagName == "content": contentsrc = n.getAttribute("src") # logger.debug("contentsrc: %s"%contentsrc) break if( contentsrc not in contentsrcs ): parent = navpoint.parentNode try: # if the epub was ever edited with Sigil, it changed # the id, but the file name is the same. if navpoint.is_ffdl_epub and \ ( navpoint.getAttribute("id").endswith('log_page') \ or contentsrc.endswith("log_page.xhtml") ): sibs = filter( lambda x : isinstance(x,Element) and x.tagName=="navPoint", parent.childNodes ) # if only logpage and one chapter, remove them from TOC and just show story. 
if len(sibs) == 2: parent.removeChild(navpoint) # logger.debug("Removing %s:"% sibs[0].getAttribute("playOrder")) parent.removeChild(sibs[1]) removednodes.append(sibs[1]) except: pass # New src, new number. contentsrcs[contentsrc] = navpoint.getAttribute("id") playorder += 1 navpoint.setAttribute("playOrder","%d" % playorder) # logger.debug("playorder:%d:"%playorder) # need to know depth of deepest navpoint for <meta name="dtb:depth" content="2"/> npdepth = 1 dp = navpoint.parentNode while dp and dp.tagName != "navMap": npdepth += 1 dp = dp.parentNode if npdepth > maxdepth: maxdepth = npdepth else: # same content, look for ffdl and title_page and/or single chapter. # easier to just set it now, even if the node gets removed later. navpoint.setAttribute("playOrder","%d" % playorder) # logger.debug("playorder:%d:"%playorder) parent = navpoint.parentNode try: # if the epub was ever edited with Sigil, it changed # the id, but the file name is the same. if navpoint.is_ffdl_epub and \ ( navpoint.getAttribute("id").endswith('title_page') \ or contentsrc.endswith("title_page.xhtml") ): parent.removeChild(navpoint) sibs = filter( lambda x : isinstance(x,Element) and x.tagName=="navPoint", parent.childNodes ) # if only one chapter after removing title_page, remove it too. if len(sibs) == 1: # logger.debug("Removing %s:"% sibs[0].getAttribute("playOrder")) parent.removeChild(sibs[0]) removednodes.append(sibs[0]) except: pass if flattentoc: maxdepth = 1 # already have play order and pesky dup/single chapters # removed, just need to flatten. flattocnavMap = tocncxdom.createElement("navMap") for n in tocnavMap.getElementsByTagNameNS("*","navPoint"): flattocnavMap.appendChild(n) ncx.replaceChild(flattocnavMap,tocnavMap) printt("navmap/toc maddess:%s"%(time()-t)) t = time() depthnode.setAttribute("content","%d"%maxdepth) ## content.opf written now due to description being filled in ## during TOC generation to save loops. 
contentxml = contentdom.toprettyxml(indent=' ',encoding='utf-8') # tweak for brain damaged Nook STR. Nook insists on name before content. contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>', '<meta name="cover" content="coverimageid"/>') outputepub.writestr("content.opf",contentxml) outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent=' ',encoding='utf-8')) printt("wrote opf/ncx files:%s"%(time()-t)) t = time() if coverjpgpath: # write, not write string. Pulling from file. outputepub.write(coverjpgpath,"cover."+coverext) outputepub.writestr("cover.xhtml",''' <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css"> @page {padding: 0pt; margin:0pt} body { text-align: center; padding:0pt; margin: 0pt; } div { margin: 0pt; padding: 0pt; } </style></head><body><div> <img src="cover.'''+coverext+'''" alt="cover"/> </div></body></html> ''') # declares all the files created by Windows. otherwise, when # it runs in appengine, windows unzips the files as 000 perms. for zf in outputepub.filelist: zf.create_system = 0 outputepub.close() printt("closed outputepub:%s"%(time()-t)) t = time() return (source,filecount)
def write_split_epub(self,
                     outputio,
                     linenums,
                     changedtocs={},
                     authoropts=[],
                     titleopt=None,
                     descopt=None,
                     tags=[],
                     languages=['en'],
                     coverjpgpath=None):
    """Start a new epub in ``outputio`` from the selected split sections.

    Writes the ``mimetype`` entry, ``META-INF/container.xml``, every
    selected content file and every linked file into the zip, and builds
    the OPF DOM (metadata, manifest, spine) describing them.

    outputio   -- path or file-like object handed to ``ZipFile``; any
                  existing content is overwritten.
    linenums   -- passed unchanged to ``self.get_split_files``.
    authoropts -- authors to record; ``self.origauthors`` when empty.
    titleopt   -- title to record; ``self.origtitle + " Split"`` when None.
    descopt    -- description; a "Split from ..." text when falsy.
    tags       -- each entry becomes a ``dc:subject``.
    languages  -- each entry becomes a ``dc:language``; "en" when empty.
    changedtocs, coverjpgpath -- accepted but not read in this method.

    The list/dict defaults are shared between calls; that is safe here
    only because this method never mutates them.

    NOTE(review): nothing in this method serialises ``contentdom`` into
    the archive or closes ``outputepub`` -- confirm whether that is
    handled elsewhere.
    """
    files = self.get_split_files(linenums)

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version", "1.0")
    containertop.setAttribute("xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom, "rootfile",
                                 {"full-path": "content.opf",
                                  "media-type": "application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",
                        containerdom.toprettyxml(indent=' ', encoding='utf-8'))

    ####
    ## create content.opf file.
    uniqueid = "epubsplit-uid-%d" % time()  # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement
    package.setAttribute("version", "2.0")
    package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier", "epubsplit-id")
    metadata = newTag(contentdom, "metadata",
                      attrs={"xmlns:dc": "http://purl.org/dc/elements/1.1/",
                             "xmlns:opf": "http://www.idpf.org/2007/opf"})
    metadata.appendChild(newTag(contentdom, "dc:identifier",
                                text=uniqueid,
                                attrs={"id": "epubsplit-id"}))

    if titleopt is None:
        titleopt = self.origtitle + " Split"
    metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

    # Command-line authors win over the ones read from the source epub.
    if authoropts and len(authoropts) > 0:
        useauthors = authoropts
    else:
        useauthors = self.origauthors

    # Emit each author once, in first-seen order.
    seenauthors = set()
    for author in useauthors:
        if author not in seenauthors:
            seenauthors.add(author)
            metadata.appendChild(newTag(contentdom, "dc:creator",
                                        attrs={"opf:role": "aut"},
                                        text=author))

    metadata.appendChild(newTag(contentdom, "dc:contributor",
                                text="epubsplit",
                                attrs={"opf:role": "bkp"}))
    metadata.appendChild(newTag(contentdom, "dc:rights",
                                text="Copyrights as per source stories"))

    if languages:
        for l in languages:
            metadata.appendChild(newTag(contentdom, "dc:language", text=l))
    else:
        metadata.appendChild(newTag(contentdom, "dc:language", text="en"))

    if not descopt:
        # created now, but not filled in until TOC generation to save loops.
        description = newTag(contentdom, "dc:description",
                             text="Split from %s by %s."
                             % (self.origtitle, ", ".join(self.origauthors)))
    else:
        description = newTag(contentdom, "dc:description", text=descopt)
    metadata.appendChild(description)

    for tag in tags:
        metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    spine = newTag(contentdom, "spine",
                   attrs={"page-progression-direction": "rtl",
                          "toc": "ncx"})
    package.appendChild(spine)

    manifest.appendChild(newTag(contentdom, "item",
                                attrs={'id': 'ncx',
                                       'href': 'toc.ncx',
                                       'media-type': 'application/x-dtbncx+xml'}))
    manifest.appendChild(newTag(contentdom, "item",
                                attrs={'id': "not_purchased",
                                       'href': "not_purchased_sections.xhtml",
                                       'media-type': "application/xhtml+xml"}))

    contentcount = 0
    # The id carried in each tuple is ignored; fresh sequential ids
    # ("a0", "a1", ...) are generated instead.
    for (filename, _origid, mediatype, filedata) in files:
        # add to manifest and spine
        if filename == "not_purchased_sections.xhtml":
            # Already declared in the manifest above; don't list it twice.
            continue

        # NOTE(review): globalindex is not defined in this method --
        # presumably a module-level name; confirm.  The placeholder
        # page is put in the spine just before that file.
        if globalindex == filename:
            spine.appendChild(newTag(contentdom, "itemref",
                                     attrs={"idref": "not_purchased",
                                            "linear": "yes"}))

        outputepub.writestr(filename, filedata.encode('utf-8'))
        itemid = "a%d" % contentcount
        contentcount += 1
        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': itemid,
                                           'href': filename,
                                           'media-type': mediatype}))
        spine.appendChild(newTag(contentdom, "itemref",
                                 attrs={"idref": itemid,
                                        "linear": "yes"}))

    # No index file configured: the placeholder page goes last.
    if globalindex is None:
        spine.appendChild(newTag(contentdom, "itemref",
                                 attrs={"idref": "not_purchased",
                                        "linear": "yes"}))

    for (linked, mediatype) in self.filecache.linkedfiles:
        try:
            outputepub.writestr(linked, self.get_file(linked))
        except Exception as e:
            print("Failed to copy linked file (%s)\nException: %s" % (linked, e))
            # The file is not in the archive, so it must not be listed
            # in the manifest either.
            continue

        itemid = "a%d" % contentcount
        contentcount += 1
        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': itemid,
                                           'href': linked,
                                           'media-type': mediatype}))
def writeStoryImpl(self, out):
    """Build the story as an EPUB 2 archive in memory and write it to ``out``.

    The archive receives, in this order: ``mimetype`` (stored
    uncompressed), ``META-INF/container.xml``, any cover/image files,
    ``content.opf``, ``toc.ncx``, the stylesheet, cover/title/TOC/log
    pages, one ``OEBPS/fileNNNN.xhtml`` per chapter that has html, and
    the calibre bookmark file when the story carries one.

    out -- object with a ``write()`` method; it receives the complete
           zip content in a single call once the archive is closed.
    """
    ## Python 2.5 ZipFile is rather more primitive than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.
    zipio = StringIO.StringIO()

    ## mimetype must be first file and uncompressed.  Python 2.5
    ## ZipFile can't change compression type file-by-file, so we
    ## have to close and re-open
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr('mimetype', 'application/epub+zip')
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version", "1.0")
    containertop.setAttribute("xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom, "rootfile",
                                 {"full-path": "content.opf",
                                  "media-type": "application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",
                        containerdom.toxml(encoding='utf-8'))
    containerdom.unlink()
    del containerdom

    ## Epub has two metadata files with real data.  We're putting
    ## them in content.opf (pointed to by META-INF/container.xml)
    ## and toc.ncx (pointed to by content.opf)

    ## content.opf contains metadata, a 'manifest' list of all
    ## other included files, and another 'spine' list of the items in the
    ## file
    uniqueid = 'fanficfare-uid:%s-u%s-s%s' % (
        self.getMetadata('site'),
        self.story.getList('authorId')[0],
        self.getMetadata('storyId'))

    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement
    package.setAttribute("version", "2.0")
    package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier", "fanficfare-uid")
    metadata = newTag(contentdom, "metadata",
                      attrs={"xmlns:dc": "http://purl.org/dc/elements/1.1/",
                             "xmlns:opf": "http://www.idpf.org/2007/opf"})
    package.appendChild(metadata)

    metadata.appendChild(newTag(contentdom, "dc:identifier",
                                text=uniqueid,
                                attrs={"id": "fanficfare-uid"}))

    if self.getMetadata('title'):
        metadata.appendChild(newTag(contentdom, "dc:title",
                                    text=self.getMetadata('title')))

    if self.getMetadata('author'):
        if self.story.isList('author'):
            for auth in self.story.getList('author'):
                metadata.appendChild(newTag(contentdom, "dc:creator",
                                            attrs={"opf:role": "aut"},
                                            text=auth))
        else:
            metadata.appendChild(newTag(contentdom, "dc:creator",
                                        attrs={"opf:role": "aut"},
                                        text=self.getMetadata('author')))

    metadata.appendChild(newTag(contentdom, "dc:contributor",
                                text="FanFicFare [https://github.com/JimmXinu/FanFicFare]",
                                attrs={"opf:role": "bkp"}))
    metadata.appendChild(newTag(contentdom, "dc:rights", text=""))
    if self.story.getMetadata('langcode'):
        metadata.appendChild(newTag(contentdom, "dc:language",
                                    text=self.story.getMetadata('langcode')))
    else:
        metadata.appendChild(newTag(contentdom, "dc:language", text='en'))

    # published, created, updated, calibre
    # Leave calling self.story.getMetadataRaw directly in case date format changes.
    if self.story.getMetadataRaw('datePublished'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "publication"},
                                    text=self.story.getMetadataRaw('datePublished').strftime("%Y-%m-%d")))

    if self.story.getMetadataRaw('dateCreated'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "creation"},
                                    text=self.story.getMetadataRaw('dateCreated').strftime("%Y-%m-%d")))

    if self.story.getMetadataRaw('dateUpdated'):
        metadata.appendChild(newTag(contentdom, "dc:date",
                                    attrs={"opf:event": "modification"},
                                    text=self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%d")))
        metadata.appendChild(newTag(contentdom, "meta",
                                    attrs={"name": "calibre:timestamp",
                                           "content": self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%dT%H:%M:%S")}))

    series = self.story.getMetadataRaw('series')
    if series and self.getConfig('calibre_series_meta'):
        series_index = "0.0"
        if '[' in series:
            logger.debug(series)
            ## assumed "series [series_index]"
            series_index = series[series.index(' [') + 2:-1]
            series = series[:series.index(' [')]

            ## calibre always outputs a series_index and it's
            ## always a float with 1 or 2 decimals.  FFF usually
            ## has either an integer or no index. (injected
            ## calibre series is the only float at this time)
            series_index = "%.2f" % float(series_index)

        metadata.appendChild(newTag(contentdom, "meta",
                                    attrs={"name": "calibre:series",
                                           "content": series}))
        metadata.appendChild(newTag(contentdom, "meta",
                                    attrs={"name": "calibre:series_index",
                                           "content": series_index}))

    if self.getMetadata('description'):
        metadata.appendChild(newTag(contentdom, "dc:description",
                                    text=self.getMetadata('description')))

    for subject in self.story.getSubjectTags():
        metadata.appendChild(newTag(contentdom, "dc:subject", text=subject))

    if self.getMetadata('site'):
        metadata.appendChild(newTag(contentdom, "dc:publisher",
                                    text=self.getMetadata('site')))

    if self.getMetadata('storyUrl'):
        metadata.appendChild(newTag(contentdom, "dc:identifier",
                                    attrs={"opf:scheme": "URL"},
                                    text=self.getMetadata('storyUrl')))
        metadata.appendChild(newTag(contentdom, "dc:source",
                                    text=self.getMetadata('storyUrl')))

    ## end of metadata, create manifest.
    items = []  # list of (id, href, type, title) tuples(all strings)
    itemrefs = []  # list of strings -- idrefs from .opfs' spines
    items.append(("ncx", "toc.ncx", "application/x-dtbncx+xml", None))
    ## we'll generate the toc.ncx file,
    ## but it needs to be in the items manifest.

    guide = None
    coverIO = None

    coverimgid = "image0000"
    if not self.story.cover and self.story.oldcover:
        logger.debug("writer_epub: no new cover, has old cover, write image.")
        (oldcoverhtmlhref,
         oldcoverhtmltype,
         oldcoverhtmldata,
         oldcoverimghref,
         oldcoverimgtype,
         oldcoverimgdata) = self.story.oldcover
        outputepub.writestr(oldcoverhtmlhref, oldcoverhtmldata)
        outputepub.writestr(oldcoverimghref, oldcoverimgdata)

        coverimgid = "image0"
        items.append((coverimgid, oldcoverimghref, oldcoverimgtype, None))
        items.append(("cover", oldcoverhtmlhref, oldcoverhtmltype, None))
        itemrefs.append("cover")
        metadata.appendChild(newTag(contentdom, "meta",
                                    {"content": "image0",
                                     "name": "cover"}))
        guide = newTag(contentdom, "guide")
        guide.appendChild(newTag(contentdom, "reference",
                                 attrs={"type": "cover",
                                        "title": "Cover",
                                        "href": oldcoverhtmlhref}))

    if self.getConfig('include_images'):
        imgcount = 0
        for imgmap in self.story.getImgUrls():
            imgfile = "OEBPS/" + imgmap['newsrc']
            outputepub.writestr(imgfile, imgmap['data'])
            items.append(("image%04d" % imgcount,
                          imgfile,
                          imgmap['mime'],
                          None))
            imgcount += 1
            if 'cover' in imgfile:
                # make sure coverimgid is set to the cover, not
                # just the first image.
                coverimgid = items[-1][0]

    items.append(("style", "OEBPS/stylesheet.css", "text/css", None))

    if self.story.cover:
        # Note that the id of the cover xhtml *must* be 'cover'
        # for it to work on Nook.
        items.append(("cover", "OEBPS/cover.xhtml", "application/xhtml+xml", None))
        itemrefs.append("cover")
        #
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(newTag(contentdom, "meta",
                                    {"content": coverimgid,
                                     "name": "cover"}))
        # cover stuff for later:
        # at end of <package>:
        # <guide>
        # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
        # </guide>
        guide = newTag(contentdom, "guide")
        guide.appendChild(newTag(contentdom, "reference",
                                 attrs={"type": "cover",
                                        "title": "Cover",
                                        "href": "OEBPS/cover.xhtml"}))

        if self.hasConfig("cover_content"):
            COVER = string.Template(self.getConfig("cover_content"))
        else:
            COVER = self.EPUB_COVER
        # All story metadata plus 'coverimg'.  Built by copy-and-assign
        # rather than items() + items(), which only works while
        # items() returns a list.
        covervals = dict(self.story.getAllMetadata().items())
        covervals['coverimg'] = self.story.cover
        coverIO = StringIO.StringIO()
        coverIO.write(COVER.substitute(covervals))

    if self.getConfig("include_titlepage"):
        items.append(("title_page", "OEBPS/title_page.xhtml",
                      "application/xhtml+xml", "Title Page"))
        itemrefs.append("title_page")
    if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage") and not self.metaonly:
        items.append(("toc_page", "OEBPS/toc_page.xhtml",
                      "application/xhtml+xml", "Table of Contents"))
        itemrefs.append("toc_page")

    ## save where to insert logpage.
    logpage_indices = (len(items), len(itemrefs))

    dologpage = (self.getConfig("include_logpage") == "smart" and
                 (self.story.logfile or self.story.getMetadataRaw("status") == "In-Progress")) \
        or self.getConfig("include_logpage") == "true"

    ## collect chapter urls and file names for internalize_text_links option.
    chapurlmap = {}
    for index, chap in enumerate(self.story.getChapters(fortoc=True)):
        if chap.html:
            i = index + 1
            items.append(("file%04d" % i,
                          "OEBPS/file%04d.xhtml" % i,
                          "application/xhtml+xml",
                          chap.title))
            itemrefs.append("file%04d" % i)
            chapurlmap[chap.url] = "file%04d.xhtml" % i  # url -> relative epub file name.

    if dologpage:
        if self.getConfig("logpage_at_end") == "true":
            ## insert logpage after chapters.
            logpage_indices = (len(items), len(itemrefs))
        items.insert(logpage_indices[0],
                     ("log_page", "OEBPS/log_page.xhtml",
                      "application/xhtml+xml", "Update Log"))
        itemrefs.insert(logpage_indices[1], "log_page")

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)
    for (itemid, href, mediatype, title) in items:
        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': itemid,
                                           'href': href,
                                           'media-type': mediatype}))

    spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
    package.appendChild(spine)
    for itemref in itemrefs:
        spine.appendChild(newTag(contentdom, "itemref",
                                 attrs={"idref": itemref,
                                        "linear": "yes"}))
    # guide only exists if there's a cover.
    if guide:
        package.appendChild(guide)

    # write content.opf to zip.
    contentxml = contentdom.toxml(encoding='utf-8')

    # tweak for brain damaged Nook STR.  Nook insists on name before content.
    contentxml = contentxml.replace('<meta content="%s" name="cover"/>' % coverimgid,
                                    '<meta name="cover" content="%s"/>' % coverimgid)
    outputepub.writestr("content.opf", contentxml)

    contentdom.unlink()
    del contentdom

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version", "2005-1")
    ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:uid", "content": uniqueid}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:depth", "content": "1"}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:totalPageCount", "content": "0"}))
    head.appendChild(newTag(tocncxdom, "meta",
                            attrs={"name": "dtb:maxPageNumber", "content": "0"}))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(newTag(tocncxdom, "text", text=self.getMetadata('title')))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
    #   <navLabel>
    #     <text><chapter title></text>
    #   </navLabel>
    #   <content src="<chapterfile>"/>
    # </navPoint>
    index = 0
    for (itemid, href, mediatype, title) in items:
        # only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
        if title:
            navPoint = newTag(tocncxdom, "navPoint",
                              attrs={'id': itemid,
                                     'playOrder': unicode(index)})
            tocnavMap.appendChild(navPoint)
            navLabel = newTag(tocncxdom, "navLabel")
            navPoint.appendChild(navLabel)
            ## the xml library will re-escape as needed.
            navLabel.appendChild(newTag(tocncxdom, "text", text=stripHTML(title)))
            navPoint.appendChild(newTag(tocncxdom, "content", attrs={"src": href}))
            index += 1

    # write toc.ncx to zip file
    outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding='utf-8'))
    tocncxdom.unlink()
    del tocncxdom

    # write stylesheet.css file.
    outputepub.writestr("OEBPS/stylesheet.css",
                        self.EPUB_CSS.substitute(self.story.getAllMetadata()))

    # write title page.
    if self.getConfig("titlepage_use_table"):
        TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
        TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
        WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
        NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
        TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
    else:
        TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
        TITLE_ENTRY = self.EPUB_TITLE_ENTRY
        WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY  # same, only wide in tables.
        NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
        TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END

    if coverIO:
        outputepub.writestr("OEBPS/cover.xhtml", coverIO.getvalue())
        coverIO.close()

    titlepageIO = StringIO.StringIO()
    self.writeTitlePage(out=titlepageIO,
                        START=TITLE_PAGE_START,
                        ENTRY=TITLE_ENTRY,
                        WIDE_ENTRY=WIDE_TITLE_ENTRY,
                        END=TITLE_PAGE_END,
                        NO_TITLE_ENTRY=NO_TITLE_ENTRY)
    if titlepageIO.getvalue():  # will be false if no title page.
        outputepub.writestr("OEBPS/title_page.xhtml", titlepageIO.getvalue())
    titlepageIO.close()

    # write toc page.
    tocpageIO = StringIO.StringIO()
    self.writeTOCPage(tocpageIO,
                      self.EPUB_TOC_PAGE_START,
                      self.EPUB_TOC_ENTRY,
                      self.EPUB_TOC_PAGE_END)
    if tocpageIO.getvalue():  # will be false if no toc page.
        outputepub.writestr("OEBPS/toc_page.xhtml", tocpageIO.getvalue())
    tocpageIO.close()

    if dologpage:
        # write log page.
        logpageIO = StringIO.StringIO()
        self.writeLogPage(logpageIO)
        outputepub.writestr("OEBPS/log_page.xhtml", logpageIO.getvalue())
        logpageIO.close()

    if self.hasConfig('chapter_start'):
        CHAPTER_START = string.Template(self.getConfig("chapter_start"))
    else:
        CHAPTER_START = self.EPUB_CHAPTER_START

    if self.hasConfig('chapter_end'):
        CHAPTER_END = string.Template(self.getConfig("chapter_end"))
    else:
        CHAPTER_END = self.EPUB_CHAPTER_END

    for index, chap in enumerate(self.story.getChapters()):  # (url,title,html)
        if chap.html:
            chap_data = chap.html
            if self.getConfig('internalize_text_links'):
                soup = bs4.BeautifulSoup(chap.html, 'html5lib')
                changed = False
                for alink in soup.find_all('a'):
                    if alink.has_attr('href') and alink['href'] in chapurlmap:
                        alink['href'] = chapurlmap[alink['href']]
                        changed = True
                if changed:
                    chap_data = unicode(soup)
                    # Don't want html, head or body tags in
                    # chapter html--bs4 insists on adding them.
                    chap_data = re.sub(r"</?(html|head|body)[^>]*>\r?\n?", "", chap_data)

            #logger.debug('Writing chapter text for: %s' % chap.title)
            vals = {'url': removeEntities(chap.url),
                    'chapter': removeEntities(chap.title),
                    'origchapter': removeEntities(chap.origtitle),
                    'tocchapter': removeEntities(chap.toctitle),
                    'index': "%04d" % (index + 1),
                    'number': index + 1}
            # escape double quotes in all vals.  The replacement has to
            # be the entity: replacing '"' with itself escapes nothing.
            for k, v in vals.items():
                if isinstance(v, basestring):
                    vals[k] = v.replace('"', '&quot;')
            fullhtml = CHAPTER_START.substitute(vals) + \
                chap_data.strip() + \
                CHAPTER_END.substitute(vals)
            # strip to avoid ever growing numbers of newlines.
            # ffnet(& maybe others) gives the whole chapter text
            # as one line.  This causes problems for nook(at
            # least) when the chapter size starts getting big
            # (200k+)
            fullhtml = re.sub(r'(</p>|<br ?/>)\n*', r'\1\n', fullhtml)

            outputepub.writestr("OEBPS/file%04d.xhtml" % (index + 1),
                                fullhtml.encode('utf-8'))
            del fullhtml

    if self.story.calibrebookmark:
        outputepub.writestr("META-INF/calibre_bookmarks.txt",
                            self.story.calibrebookmark)

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()
    out.write(zipio.getvalue())
    zipio.close()
def doUnMerge(inputio,outdir=None):
    """Split a merged epub back into the original epubs stored inside it.

    Every manifest item whose media-type is the marker value
    "origrootfile/xml" is taken to be the saved .opf of one original
    epub.  For each one a new epub is built in memory containing a
    mimetype entry, a generated META-INF/container.xml pointing at that
    .opf, the .opf itself, and every other manifest item that lives
    under the same path prefix.

    inputio -- path or file-like object holding the merged epub.
    outdir  -- optional output directory.  When given, each epub is
               written to "<outdir>/<n>.epub" (n counting from 0) and
               the list of file names is returned.  Otherwise the list
               of in-memory buffers is returned.

    Indexing errors propagate if container.xml has no rootfile or the
    .opf has no manifest; errors from reading a missing zip member
    propagate as well.
    """
    epub = ZipFile(inputio, 'r') # works equally well with inputio as a path or a blob
    outputios = []

    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        # Only look at item tags inside the manifest: that is the only
        # place there *should* be item tags, but files with item tags
        # elsewhere have been seen.  The same namespace-agnostic list is
        # used both for finding the original rootfiles and for copying
        # their contents, so the two passes always agree.
        manifesttag = contentdom.getElementsByTagNameNS("*","manifest")[0]
        manifestitems = manifesttag.getElementsByTagNameNS("*","item")

        for item in manifestitems:
            # look for our fake media-type for original rootfiles.
            if item.getAttribute("media-type") != "origrootfile/xml":
                continue

            # found one, assume the dir containing it is a complete
            # original epub, do initial setup of epub.
            itemhref = relpath+unquote(item.getAttribute("href"))
            # Strip trailing directory components that contain no digits;
            # what is left is the prefix shared by this epub's files.
            # NOTE(review): presumably each merged book sits under a
            # numbered directory -- confirm against the merge code.
            curepubpath = re.sub(r'([^\d/]+/)+$','',get_path_part(itemhref))
            savehref = itemhref[len(curepubpath):]

            ## mimetype is written first and uncompressed.
            outputio = StringIO()
            outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
            outputepub.close()

            ## Re-open file for content.
            outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
            outputepub.debug = 3

            ## Create META-INF/container.xml file.  The only thing it
            ## does is point to the saved .opf.
            newcontainerdom = getDOMImplementation().createDocument(None, "container", None)
            containertop = newcontainerdom.documentElement
            containertop.setAttribute("version","1.0")
            containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
            rootfiles = newcontainerdom.createElement("rootfiles")
            containertop.appendChild(rootfiles)
            rootfiles.appendChild(newTag(newcontainerdom,"rootfile",{"full-path":savehref,
                                                                     "media-type":"application/oebps-package+xml"}))
            outputepub.writestr("META-INF/container.xml",
                                newcontainerdom.toprettyxml(indent=' ',encoding='utf-8'))

            outputepub.writestr(savehref,epub.read(itemhref))

            ## Copy every other manifest entry under the same prefix.
            for item2 in manifestitems:
                item2href = relpath+unquote(item2.getAttribute("href"))
                if item2href.startswith(curepubpath) and item2href != itemhref:
                    save2href = item2href[len(curepubpath):]
                    outputepub.writestr(save2href,epub.read(item2href))

            # declares all the files created by Windows.  otherwise, when
            # it runs in appengine, windows unzips the files as 000 perms.
            for zf in outputepub.filelist:
                zf.create_system = 0
            outputepub.close()
            outputios.append(outputio)
    finally:
        # Release the input archive.  ZipFile only closes the underlying
        # file when it opened it itself (i.e. inputio was a path).
        epub.close()

    if not outdir:
        return outputios

    outfilenames = []
    for count,epubIO in enumerate(outputios):
        filename = "%s/%d.epub"%(outdir,count)
        print("write %s"%filename)
        with open(filename,"wb") as outstream:
            outstream.write(epubIO.getvalue())
        outfilenames.append(filename)
    return outfilenames