Beispiel #1
0
    def run(self, path_to_ebook):
        """Patch the body CSS of fanficauthors.net epubs; pass others through.

        A book is recognised by a crude brute-force check: the archive must
        contain ``content/toc.ncx`` and that file must mention
        ``fanficauthors.net``.  A recognised book is copied into a temporary
        epub in which the site's ``body`` CSS rule is replaced in every
        ``.html`` member, and its publisher metadata is set to
        ``fanficauthors.net``.

        :param path_to_ebook: path of the epub to inspect.
        :return: ``path_to_ebook`` unchanged when the book is not recognised,
            otherwise the name of the rewritten temporary file.
        """
        book_format = 'epub'
        tocfile = "content/toc.ncx"

        # ZipFile.read() returns bytes, so the marker and the CSS snippets are
        # byte literals (identical to plain str literals on Python 2).  The
        # whitespace is spelled with escapes because the rules mix a leading
        # tab with space-indented lines, which is invisible in a literal.
        old_css = b"body {\n\tmargin-top: 0px;\n    padding-top: 0px;\n}"
        new_css = (b"body { background-color: #FFFFFF;\n"
                   b"        text-align: justify;\n"
                   b"        margin: 2%;\n"
                   b"\tadobe-hyphenate: none; }")

        epub = ZipFile(path_to_ebook, 'r')
        try:
            names = epub.namelist()
            if not (tocfile in names
                    and b"fanficauthors.net" in epub.read(tocfile)):
                # bail without doing anything
                return path_to_ebook

            print("It's a fanficauthors.net epub!")

            tmpfile = self.temporary_file('.' + book_format)

            # The mimetype entry is written first and uncompressed, in its
            # own pass; the archive is then re-opened in append mode with
            # compression for the rest.
            outputepub = ZipFile(tmpfile, "w", compression=ZIP_STORED)
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
            outputepub.close()

            ## Re-open file for content.
            outputepub = ZipFile(tmpfile, "a", compression=ZIP_DEFLATED)
            outputepub.debug = 3

            for fname in names:
                if fname.endswith('.html'):
                    patched = epub.read(fname).replace(old_css, new_css)
                    outputepub.writestr(fname, patched)
                elif fname != "mimetype":
                    # mimetype was already written above; copy the rest as-is.
                    outputepub.writestr(fname, epub.read(fname))

            # Must happen before close(): the value is only stored when the
            # archive's directory is written out.
            for zf in outputepub.filelist:
                zf.create_system = 0
            outputepub.close()
        finally:
            # Release the source archive on every path, including the early
            # "not our book" return.
            epub.close()

        ext = os.path.splitext(path_to_ebook)[-1][1:].lower()
        mi = get_metadata(tmpfile, ext)
        mi.publisher = "fanficauthors.net"
        set_metadata(tmpfile, mi, ext)

        return tmpfile.name
    def generate_plugin(self):
        """Generate ZIP file with specified stylesheets."""
        self._preprocess()

        __output = self.out

        __temp = StringIO.StringIO()
        __failed = False
        try:
            __zip = ZipFile(__temp, "w")
            __zip.debug = 3
            try:
                # integrator
                self._run_generation(__zip, self.__generate_integrator,
                                    "%s/integrator.xml" % (self.plugin_name))
                # plugin
                self._run_generation(__zip, self.__generate_plugin_file,
                                    "%s/plugin.xml" % (self.plugin_name))
                # catalog
                self._run_generation(__zip, self.__generate_catalog,
                                    "%s/cfg/catalog.xml" % (self.plugin_name))
                # font-mappings
#                self._run_generation(__zip, self.__generate_font_mappings,
#                                    "%s/cfg/fo/font-mappins.xml" % (self.plugin_name))
                # custom XSLT
                self._run_generation(__zip, self.__generate_custom,
                                    "%s/cfg/fo/xsl/custom.xsl" % (self.plugin_name))
                # custom XSLT attribute sets
                self._run_generation(__zip, self.__generate_custom_attr,
                                    "%s/cfg/fo/attrs/custom.xsl" % (self.plugin_name))
                # shell XSLT
                if self.override_shell:
                    self._run_generation(__zip, self.__generate_shell,
                                        "%s/xsl/fo/topic2fo_shell_%s.xsl" % (self.plugin_name, self.formatter))
#                if not self.link_pagenumber or self.table_continued:
                for lang in self.variable_languages:
                    self._run_generation(__zip, lambda: self.__generate_vars(lang),
                                         "%s/cfg/common/vars/%s.xml" % (self.plugin_name, lang))
#                if self.generate_shell:
#                    # shell XSLT
#                    self._run_generation(__zip, self.__generate_shell,
#                                        "%s/xsl/fo/.xsl" % (self.plugin_name))
            except:
                __failed = True
                raise Exception("Failed to write plugin", sys.exc_info()[1]), None, sys.exc_info()[2]
            finally:
                if __zip != None:
                    __zip.close()
            if not __failed:
                __output.write(__temp.getvalue())
        except:
            __failed = True
            raise Exception("Failed to write ZIP file to output", sys.exc_info()[1]), None, sys.exc_info()[2]
        finally:
            __temp.close()
def reset_orig_chapters_epub(inputio, outfile):
    """Reset chapter titles in an epub back to their recorded originals.

    Every ``*/file<N>.xhtml`` member is parsed.  When its ``chaptertitle``
    meta differs from its ``chapterorigtitle`` meta, the ``chaptertitle``
    meta is set to the original title, and ``<title>`` and the first
    ``<h3>`` are reset too if their text equals the current title.  The
    matching entries of the top-level ``toc.ncx`` (and of any nested
    ``*/toc.ncx`` belonging to the same book) are updated through
    ``_replace_tocncx`` using the ``chaptertoctitle`` meta, falling back to
    the original title when that meta is absent.

    The new archive is built in memory, so ``outfile`` may be the same file
    as ``inputio``; it is written only when something changed.

    :param inputio: path or file-like object of the source epub.
    :param outfile: output path (string) or writable binary file object.
    :return: True when at least one chapter file was modified.
    :raises KeyError: if the archive has no top-level ``toc.ncx``.
    """
    inputepub = ZipFile(inputio,
                        'r')  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = BytesIO()
    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False
        names = inputepub.namelist()

        ## spin through file contents, saving any unmerge toc.ncx files.
        unmerge_tocncxdoms = {}
        for zf in names:
            if zf.endswith('/toc.ncx'):
                unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

        tocncxdom = parseString(inputepub.read('toc.ncx'))

        ## spin through file contents.
        for zf in names:
            # mimetype is already written; all toc.ncx files are written
            # from their (possibly updated) DOMs after this loop.
            if zf in ('mimetype', 'toc.ncx') or zf.endswith('/toc.ncx'):
                continue

            data = inputepub.read(zf)
            if not re.match(r'.*/file\d+\.xhtml', zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)
                continue

            data = data.decode('utf-8')
            soup = make_soup(data)

            chapterorigtitle = None
            tag = soup.find('meta', {'name': 'chapterorigtitle'})
            if tag:
                chapterorigtitle = tag['content']

            # toctitle is separate for add_chapter_numbers:toconly users;
            # without its own meta the TOC falls back to the original title.
            tag = soup.find('meta', {'name': 'chaptertoctitle'})
            if tag:
                chaptertoctitle = tag['content']
            else:
                chaptertoctitle = chapterorigtitle

            chaptertitle = None
            chaptertitle_tag = soup.find('meta', {'name': 'chaptertitle'})
            if chaptertitle_tag:
                chaptertitle = chaptertitle_tag['content']

            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                chaptertitle_tag['content'] = chapterorigtitle

                title_tag = soup.find('title')
                if title_tag and title_tag.string == chaptertitle:
                    title_tag.string.replace_with(chapterorigtitle)

                h3_tag = soup.find('h3')
                if h3_tag and h3_tag.string == chaptertitle:
                    h3_tag.string.replace_with(chapterorigtitle)

                data = unicode(soup)

                entrychanged = (origdata != data)
                changed = changed or entrychanged

                if entrychanged:
                    logger.debug("\nentrychanged:%s\n" % zf)
                    _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                    ## Also look for and update individual
                    ## book toc.ncx files for anthology in case
                    ## it's unmerged.
                    bookprefix = zf[:zf.rfind('/OEBPS/')]
                    zf_toc = bookprefix + '/toc.ncx'
                    mergedprefix_len = len(bookprefix) + 1

                    if zf_toc in unmerge_tocncxdoms:
                        _replace_tocncx(unmerge_tocncxdoms[zf_toc],
                                        zf[mergedprefix_len:],
                                        chaptertoctitle)

            outputepub.writestr(zf, data.encode('utf-8'))

        for tocnm, tocdom in unmerge_tocncxdoms.items():
            outputepub.writestr(tocnm, tocdom.toxml(encoding='utf-8'))

        outputepub.writestr('toc.ncx', tocncxdom.toxml(encoding='utf-8'))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # This has to happen before close(): the archive directory, which
        # carries create_system, is only written at close time.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile, basestring):
                with open(outfile, "wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())
    finally:
        # Release the source archive and the buffer even when parsing or
        # writing fails part-way through.
        inputepub.close()
        zipio.close()

    return changed
    def writeStoryImpl(self, out):
        """Build the story as an epub archive and write its bytes to ``out``.

        The archive is assembled in an in-memory buffer in this order:
        ``mimetype`` (stored uncompressed), ``META-INF/container.xml``, any
        old cover files, images, ``content.opf``, ``toc.ncx``, the
        stylesheet, cover/title/TOC/log pages, one ``OEBPS/file<NNNN>.xhtml``
        per chapter that has html, and an optional calibre bookmarks file.

        :param out: object with a ``write()`` method; it receives the
            complete archive in a single call at the very end.
        """

        ## Python 2.5 ZipFile is rather more primitive than later
        ## versions.  It can operate on a file, or on a StringIO, but
        ## not on an open stream.  OTOH, I suspect we would have had
        ## problems with closing and opening again to change the
        ## compression type anyway.
        zipio = StringIO.StringIO()

        ## mimetype must be first file and uncompressed.  Python 2.5
        ## ZipFile can't change compression type file-by-file, so we
        ## have to close and re-open
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug=3
        outputepub.writestr('mimetype','application/epub+zip')
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
        outputepub.debug=3

        ## Create META-INF/container.xml file.  The only thing it does is
        ## point to content.opf
        containerdom = getDOMImplementation().createDocument(None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version","1.0")
        containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                              "media-type":"application/oebps-package+xml"}))
        outputepub.writestr("META-INF/container.xml",containerdom.toxml(encoding='utf-8'))
        containerdom.unlink()
        del containerdom

        ## Epub has two metadata files with real data.  We're putting
        ## them in content.opf (pointed to by META-INF/container.xml)
        ## and toc.ncx (pointed to by content.opf)

        ## content.opf contains metadata, a 'manifest' list of all
        ## other included files, and another 'spine' list of the items in the
        ## file

        ## The same unique id is used for dc:identifier in content.opf and
        ## for dtb:uid in toc.ncx below.
        uniqueid= 'fanficfare-uid:%s-u%s-s%s' % (
            self.getMetadata('site'),
            self.story.getList('authorId')[0],
            self.getMetadata('storyId'))

        contentdom = getDOMImplementation().createDocument(None, "package", None)
        package = contentdom.documentElement
        package.setAttribute("version","2.0")
        package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier","fanficfare-uid")
        metadata=newTag(contentdom,"metadata",
                        attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                               "xmlns:opf":"http://www.idpf.org/2007/opf"})
        package.appendChild(metadata)

        metadata.appendChild(newTag(contentdom,"dc:identifier",
                                    text=uniqueid,
                                    attrs={"id":"fanficfare-uid"}))

        if self.getMetadata('title'):
            metadata.appendChild(newTag(contentdom,"dc:title",text=self.getMetadata('title')))

        ## One dc:creator per author when 'author' is a list entry.
        if self.getMetadata('author'):
            if self.story.isList('author'):
                for auth in self.story.getList('author'):
                    metadata.appendChild(newTag(contentdom,"dc:creator",
                                                attrs={"opf:role":"aut"},
                                                text=auth))
            else:
                metadata.appendChild(newTag(contentdom,"dc:creator",
                                            attrs={"opf:role":"aut"},
                                            text=self.getMetadata('author')))

        metadata.appendChild(newTag(contentdom,"dc:contributor",text="FanFicFare [https://github.com/JimmXinu/FanFicFare]",attrs={"opf:role":"bkp"}))
        metadata.appendChild(newTag(contentdom,"dc:rights",text=""))
        ## Language defaults to 'en' when the story has no langcode.
        if self.story.getMetadata('langcode'):
            metadata.appendChild(newTag(contentdom,"dc:language",text=self.story.getMetadata('langcode')))
        else:
            metadata.appendChild(newTag(contentdom,"dc:language",text='en'))

        #  published, created, updated, calibre
        #  Leave calling self.story.getMetadataRaw directly in case date format changes.
        if self.story.getMetadataRaw('datePublished'):
            metadata.appendChild(newTag(contentdom,"dc:date",
                                        attrs={"opf:event":"publication"},
                                        text=self.story.getMetadataRaw('datePublished').strftime("%Y-%m-%d")))

        if self.story.getMetadataRaw('dateCreated'):
            metadata.appendChild(newTag(contentdom,"dc:date",
                                        attrs={"opf:event":"creation"},
                                        text=self.story.getMetadataRaw('dateCreated').strftime("%Y-%m-%d")))

        # dateUpdated also provides the calibre:timestamp meta.
        if self.story.getMetadataRaw('dateUpdated'):
            metadata.appendChild(newTag(contentdom,"dc:date",
                                        attrs={"opf:event":"modification"},
                                        text=self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%d")))
            metadata.appendChild(newTag(contentdom,"meta",
                                        attrs={"name":"calibre:timestamp",
                                               "content":self.story.getMetadataRaw('dateUpdated').strftime("%Y-%m-%dT%H:%M:%S")}))

        if self.getMetadata('description'):
            metadata.appendChild(newTag(contentdom,"dc:description",text=
                                        self.getMetadata('description')))

        for subject in self.story.getSubjectTags():
            metadata.appendChild(newTag(contentdom,"dc:subject",text=subject))


        if self.getMetadata('site'):
            metadata.appendChild(newTag(contentdom,"dc:publisher",
                                        text=self.getMetadata('site')))

        # The story URL is recorded twice: as a URL-scheme identifier and as dc:source.
        if self.getMetadata('storyUrl'):
            metadata.appendChild(newTag(contentdom,"dc:identifier",
                                        attrs={"opf:scheme":"URL"},
                                        text=self.getMetadata('storyUrl')))
            metadata.appendChild(newTag(contentdom,"dc:source",
                                        text=self.getMetadata('storyUrl')))

        ## end of metadata, create manifest.
        items = [] # list of (id, href, type, title) tuples(all strings)
        itemrefs = [] # list of strings -- idrefs from .opfs' spines
        items.append(("ncx","toc.ncx","application/x-dtbncx+xml",None)) ## we'll generate the toc.ncx file,
                                                                   ## but it needs to be in the items manifest.

        guide = None
        coverIO = None

        coverimgid = "image0000"
        # No new cover but an old one was kept: re-write the old cover page
        # and image unchanged and register them in manifest, spine and guide.
        if not self.story.cover and self.story.oldcover:
            logger.debug("writer_epub: no new cover, has old cover, write image.")
            (oldcoverhtmlhref,
             oldcoverhtmltype,
             oldcoverhtmldata,
             oldcoverimghref,
             oldcoverimgtype,
             oldcoverimgdata) = self.story.oldcover
            outputepub.writestr(oldcoverhtmlhref,oldcoverhtmldata)
            outputepub.writestr(oldcoverimghref,oldcoverimgdata)

            coverimgid = "image0"
            items.append((coverimgid,
                          oldcoverimghref,
                          oldcoverimgtype,
                          None))
            items.append(("cover",oldcoverhtmlhref,oldcoverhtmltype,None))
            itemrefs.append("cover")
            metadata.appendChild(newTag(contentdom,"meta",{"content":"image0",
                                                           "name":"cover"}))
            guide = newTag(contentdom,"guide")
            guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                                   "title":"Cover",
                                                                   "href":oldcoverhtmlhref}))



        # Images are written straight into the archive and listed in the
        # manifest with ids image0000, image0001, ...
        if self.getConfig('include_images'):
            imgcount=0
            for imgmap in self.story.getImgUrls():
                imgfile = "OEBPS/"+imgmap['newsrc']
                outputepub.writestr(imgfile,imgmap['data'])
                items.append(("image%04d"%imgcount,
                              imgfile,
                              imgmap['mime'],
                              None))
                imgcount+=1
                if 'cover' in imgfile:
                    # make sure coverimgid is set to the cover, not
                    # just the first image.
                    coverimgid = items[-1][0]


        items.append(("style","OEBPS/stylesheet.css","text/css",None))

        if self.story.cover:
            # Note that the id of the cover xhtml *must* be 'cover'
            # for it to work on Nook.
            items.append(("cover","OEBPS/cover.xhtml","application/xhtml+xml",None))
            itemrefs.append("cover")
            #
            # <meta name="cover" content="cover.jpg"/>
            metadata.appendChild(newTag(contentdom,"meta",{"content":coverimgid,
                                                           "name":"cover"}))
            # cover stuff for later:
            # at end of <package>:
            # <guide>
            # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
            # </guide>
            guide = newTag(contentdom,"guide")
            guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                       "title":"Cover",
                                                       "href":"OEBPS/cover.xhtml"}))

            # A configured cover_content template overrides the built-in one.
            if self.hasConfig("cover_content"):
                COVER = string.Template(self.getConfig("cover_content"))
            else:
                COVER = self.EPUB_COVER
            # The cover page is rendered now but only written to the archive
            # further down, after toc.ncx and the stylesheet.
            coverIO = StringIO.StringIO()
            coverIO.write(COVER.substitute(dict(self.story.getAllMetadata().items()+{'coverimg':self.story.cover}.items())))

        if self.getConfig("include_titlepage"):
            items.append(("title_page","OEBPS/title_page.xhtml","application/xhtml+xml","Title Page"))
            itemrefs.append("title_page")
        if len(self.story.getChapters()) > 1 and self.getConfig("include_tocpage") and not self.metaonly :
            items.append(("toc_page","OEBPS/toc_page.xhtml","application/xhtml+xml","Table of Contents"))
            itemrefs.append("toc_page")

        # Log page is included when include_logpage is "true", or when it is
        # "smart" and the story has a logfile or its status is In-Progress.
        dologpage = ( self.getConfig("include_logpage") == "smart" and \
                          (self.story.logfile or self.story.getMetadataRaw("status") == "In-Progress") )  \
                     or self.getConfig("include_logpage") == "true"

        if dologpage:
            items.append(("log_page","OEBPS/log_page.xhtml","application/xhtml+xml","Update Log"))
            itemrefs.append("log_page")

        # Chapter file numbers are 1-based; chapters without html are skipped
        # but still consume their number.
        for index, chap in enumerate(self.story.getChapters(fortoc=True)):
            if chap.html:
                i=index+1
                items.append(("file%04d"%i,
                              "OEBPS/file%04d.xhtml"%i,
                              "application/xhtml+xml",
                              chap.title))
                itemrefs.append("file%04d"%i)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        for item in items:
            (id,href,type,title)=item
            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':id,
                                               'href':href,
                                               'media-type':type}))

        spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
        package.appendChild(spine)
        for itemref in itemrefs:
            spine.appendChild(newTag(contentdom,"itemref",
                                     attrs={"idref":itemref,
                                            "linear":"yes"}))
        # guide only exists if there's a cover.
        if guide:
            package.appendChild(guide)

        # write content.opf to zip.
        contentxml = contentdom.toxml(encoding='utf-8')

        # tweak for brain damaged Nook STR.  Nook insists on name before content.
        contentxml = contentxml.replace('<meta content="%s" name="cover"/>'%coverimgid,
                                        '<meta name="cover" content="%s"/>'%coverimgid)
        outputepub.writestr("content.opf",contentxml)

        contentdom.unlink()
        del contentdom

        ## create toc.ncx file
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version","2005-1")
        ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        head.appendChild(newTag(tocncxdom,"meta",
                                attrs={"name":"dtb:uid", "content":uniqueid}))
        head.appendChild(newTag(tocncxdom,"meta",
                                attrs={"name":"dtb:depth", "content":"1"}))
        head.appendChild(newTag(tocncxdom,"meta",
                                attrs={"name":"dtb:totalPageCount", "content":"0"}))
        head.appendChild(newTag(tocncxdom,"meta",
                                attrs={"name":"dtb:maxPageNumber", "content":"0"}))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(newTag(tocncxdom,"text",text=self.getMetadata('title')))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
        #   <navLabel>
        #     <text><chapter title></text>
        #   </navLabel>
        #   <content src="<chapterfile>"/>
        # </navPoint>
        index=0
        for item in items:
            (id,href,type,title)=item
            # only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
            if title :
                navPoint = newTag(tocncxdom,"navPoint",
                                  attrs={'id':id,
                                         'playOrder':unicode(index)})
                tocnavMap.appendChild(navPoint)
                navLabel = newTag(tocncxdom,"navLabel")
                navPoint.appendChild(navLabel)
                ## the xml library will re-escape as needed.
                navLabel.appendChild(newTag(tocncxdom,"text",text=stripHTML(title)))
                navPoint.appendChild(newTag(tocncxdom,"content",attrs={"src":href}))
                index=index+1

        # write toc.ncx to zip file
        outputepub.writestr("toc.ncx",tocncxdom.toxml(encoding='utf-8'))
        tocncxdom.unlink()
        del tocncxdom

        # write stylesheet.css file.
        outputepub.writestr("OEBPS/stylesheet.css",self.EPUB_CSS.substitute(self.story.getAllMetadata()))

        # write title page.
        # Pick the table or non-table set of title page templates.
        if self.getConfig("titlepage_use_table"):
            TITLE_PAGE_START  = self.EPUB_TABLE_TITLE_PAGE_START
            TITLE_ENTRY       = self.EPUB_TABLE_TITLE_ENTRY
            WIDE_TITLE_ENTRY  = self.EPUB_TABLE_TITLE_WIDE_ENTRY
            NO_TITLE_ENTRY    = self.EPUB_TABLE_NO_TITLE_ENTRY
            TITLE_PAGE_END    = self.EPUB_TABLE_TITLE_PAGE_END
        else:
            TITLE_PAGE_START  = self.EPUB_TITLE_PAGE_START
            TITLE_ENTRY       = self.EPUB_TITLE_ENTRY
            WIDE_TITLE_ENTRY  = self.EPUB_TITLE_ENTRY # same, only wide in tables.
            NO_TITLE_ENTRY    = self.EPUB_NO_TITLE_ENTRY
            TITLE_PAGE_END    = self.EPUB_TITLE_PAGE_END

        # coverIO is only set when the story has a new cover (rendered above).
        if coverIO:
            outputepub.writestr("OEBPS/cover.xhtml",coverIO.getvalue())
            coverIO.close()

        titlepageIO = StringIO.StringIO()
        self.writeTitlePage(out=titlepageIO,
                            START=TITLE_PAGE_START,
                            ENTRY=TITLE_ENTRY,
                            WIDE_ENTRY=WIDE_TITLE_ENTRY,
                            END=TITLE_PAGE_END,
                            NO_TITLE_ENTRY=NO_TITLE_ENTRY)
        if titlepageIO.getvalue(): # will be false if no title page.
            outputepub.writestr("OEBPS/title_page.xhtml",titlepageIO.getvalue())
        titlepageIO.close()

        # write toc page.
        tocpageIO = StringIO.StringIO()
        self.writeTOCPage(tocpageIO,
                          self.EPUB_TOC_PAGE_START,
                          self.EPUB_TOC_ENTRY,
                          self.EPUB_TOC_PAGE_END)
        if tocpageIO.getvalue(): # will be false if no toc page.
            outputepub.writestr("OEBPS/toc_page.xhtml",tocpageIO.getvalue())
        tocpageIO.close()

        if dologpage:
            # write log page.
            logpageIO = StringIO.StringIO()
            self.writeLogPage(logpageIO)
            outputepub.writestr("OEBPS/log_page.xhtml",logpageIO.getvalue())
            logpageIO.close()

        # Configured chapter_start / chapter_end templates override the built-in ones.
        if self.hasConfig('chapter_start'):
            CHAPTER_START = string.Template(self.getConfig("chapter_start"))
        else:
            CHAPTER_START = self.EPUB_CHAPTER_START

        if self.hasConfig('chapter_end'):
            CHAPTER_END = string.Template(self.getConfig("chapter_end"))
        else:
            CHAPTER_END = self.EPUB_CHAPTER_END

        for index, chap in enumerate(self.story.getChapters()): # (url,title,html)
            if chap.html:
                #logger.debug('Writing chapter text for: %s' % chap.title)
                vals={'url':removeEntities(chap.url),
                      'chapter':removeEntities(chap.title),
                      'origchapter':removeEntities(chap.origtitle),
                      'tocchapter':removeEntities(chap.toctitle),
                      'index':"%04d"%(index+1),
                      'number':index+1}
                # escape double quotes in all vals.
                for k,v in vals.items():
                    if isinstance(v,basestring): vals[k]=v.replace('"','&quot;')
                fullhtml = CHAPTER_START.substitute(vals) + \
                    chap.html + CHAPTER_END.substitute(vals)
                # ffnet(& maybe others) gives the whole chapter text
                # as one line.  This causes problems for nook(at
                # least) when the chapter size starts getting big
                # (200k+)
                fullhtml = re.sub(r'(</p>|<br ?/>)\n*',r'\1\n',fullhtml)

                outputepub.writestr("OEBPS/file%04d.xhtml"%(index+1),fullhtml.encode('utf-8'))
                del fullhtml

        if self.story.calibrebookmark:
            outputepub.writestr("META-INF/calibre_bookmarks.txt",self.story.calibrebookmark)

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()
        # Hand the finished archive to the caller in one write.
        out.write(zipio.getvalue())
        zipio.close()
Beispiel #5
0
def doUnMerge(inputio, outdir=None):
    """Split a merged epub back into the original epubs it contains.

    The merged book's .opf is located through ``META-INF/container.xml``.
    Each manifest item with the fake media-type ``origrootfile/xml`` marks
    the .opf of one original book; the directory containing it is assumed
    to be a complete original epub.  For each one a new epub is built in
    memory from that .opf plus every item of the merged .opf that lives
    under the same directory prefix.

    :param inputio: path or file-like object of the merged epub.
    :param outdir: optional directory; when given, the epubs are written
        there as ``<outdir>/<n>.epub`` (n counting from 0).
    :return: list of written file names when ``outdir`` is given, otherwise
        a list of ``BytesIO`` objects holding the epubs.
    """
    epub = ZipFile(inputio,
                   'r')  # works equally well with inputio as a path or a blob
    outputios = []
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)
        logger.debug("relpath:%s" % relpath)

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found one that did.
        manifesttag = contentdom.getElementsByTagNameNS("*", "manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*", "item"):
            # look for our fake media-type for original rootfiles.
            if item.getAttribute("media-type") != "origrootfile/xml":
                continue

            # found one, assume the dir containing it is a complete
            # original epub, do initial setup of epub.
            itemhref = normpath(relpath + unquote(item.getAttribute("href")))
            logger.debug("Found origrootfile:%s" % itemhref)
            # Drop trailing directory components that contain no digit;
            # what remains is used as this book's path prefix inside the
            # merged archive.
            curepubpath = re.sub(r'([^\d/]+/)+$', '', get_path_part(itemhref))
            savehref = itemhref[len(curepubpath):]
            logger.debug("curepubpath:%s" % curepubpath)

            ## mimetype goes in first, uncompressed, in its own pass.
            outputio = BytesIO()
            outputepub = ZipFile(outputio,
                                 "w",
                                 compression=ZIP_STORED,
                                 allowZip64=True)
            outputepub.debug = 3
            outputepub.writestr("mimetype", "application/epub+zip")
            outputepub.close()

            ## Re-open file for content.
            outputepub = ZipFile(outputio,
                                 "a",
                                 compression=ZIP_DEFLATED,
                                 allowZip64=True)
            outputepub.debug = 3

            ## Create META-INF/container.xml file.  The only thing it does is
            ## point to content.opf
            newcontainerdom = getDOMImplementation().createDocument(
                None, "container", None)
            containertop = newcontainerdom.documentElement
            containertop.setAttribute("version", "1.0")
            containertop.setAttribute(
                "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
            rootfiles = newcontainerdom.createElement("rootfiles")
            containertop.appendChild(rootfiles)
            rootfiles.appendChild(
                newTag(
                    newcontainerdom, "rootfile", {
                        "full-path": savehref,
                        "media-type": "application/oebps-package+xml"
                    }))
            outputepub.writestr(
                "META-INF/container.xml",
                newcontainerdom.toprettyxml(indent='   ', encoding='utf-8'))

            outputepub.writestr(savehref, epub.read(itemhref))

            # Copy every other item stored under this book's prefix, with
            # the prefix stripped from its name.
            for item2 in contentdom.getElementsByTagName("item"):
                item2href = normpath(relpath +
                                     unquote(item2.getAttribute("href")))
                if item2href.startswith(curepubpath) and item2href != itemhref:
                    save2href = item2href[len(curepubpath):]
                    logger.debug("Found %s -> %s" % (item2href, save2href))
                    outputepub.writestr(save2href, epub.read(item2href))

            # declares all the files created by Windows.  otherwise, when
            # it runs in appengine, windows unzips the files as 000 perms.
            for zf in outputepub.filelist:
                zf.create_system = 0
            outputepub.close()

            outputios.append(outputio)
    finally:
        # Nothing is read from the merged archive past this point.
        epub.close()

    if not outdir:
        return outputios

    outfilenames = []
    for count, epubIO in enumerate(outputios):
        filename = "%s/%d.epub" % (outdir, count)
        logger.debug("write %s" % filename)
        # 'with' closes the file even if the write fails.
        with open(filename, "wb") as outstream:
            outstream.write(epubIO.getvalue())
        outfilenames.append(filename)
    return outfilenames
Beispiel #6
0
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            originalnavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    Merge several epubs into one anthology epub written to outputio.

    Every input book is copied into its own numbered directory
    ("1/", "2/", ...) of the output, with manifest ids and navPoint ids
    prefixed by "a<booknum>" so they stay unique.  A new content.opf
    and toc.ncx are generated for the combined book.

    outputio = output file name or BytesIO.
    files = list of input file names or BytesIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    originalnavpoints if true, include the original TOCs from each epub
    flattentoc if true, flatten TOC down to one level only.
    printtimes, passed to cond_print for the timing messages
        (presumably prints them when true -- cond_print is defined elsewhere).
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's original .opf and
        toc.ncx into the output and list them in the manifest.
    source = URL written as dc:source and as a URL dc:identifier.  When
        not given, the first book's dc:source is used if it has one.

    Returns a (source, filecount) tuple.  filecount is the number of
    copied files whose path matches '.*/(file|chapter)\\d+\\.x?html'.

    The list defaults (authoropts, tags, languages) are only read here,
    never mutated.
    '''

    printt = partial(cond_print, printtimes)

    ## Python 2.5 ZipFile is rather more primitive than later
    ## versions.  It can operate on a file, or on a BytesIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount = 0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio,
                         "w",
                         compression=ZIP_STORED,
                         allowZip64=True)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio,
                         "a",
                         compression=ZIP_DEFLATED,
                         allowZip64=True)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(
        None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version", "1.0")
    containertop.setAttribute(
        "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(
        newTag(
            containerdom, "rootfile", {
                "full-path": "content.opf",
                "media-type": "application/oebps-package+xml"
            }))
    outputepub.writestr(
        "META-INF/container.xml",
        containerdom.toprettyxml(indent='   ', encoding='utf-8'))

    ## Process input epubs.

    items = [
    ]  # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(
        ("ncx", "toc.ncx",
         "application/x-dtbncx+xml"))  ## we'll generate the toc.ncx file,
    ## but it needs to be in the items manifest.
    itemrefs = []  # list of strings -- idrefs from .opfs' spines
    navmaps = [
    ]  # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = []  # list of t/f

    itemhrefs = {
    }  # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = []  # list of strings -- Each book's title
    allauthors = []  # list of lists of strings -- Each book's list of authors.

    filelist = []  # hrefs already written to the output, to avoid duplicates.

    printt("prep output:%s" % (time() - t))
    t = time()

    booknum = 1
    firstmetadom = None
    for file in files:
        if file is None: continue

        book = "%d" % booknum
        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(file, 'r')

        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagNameNS("*", "rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        metadom = parseString(epub.read(rootfilename))
        # logger.debug("metadom:%s"%epub.read(rootfilename))
        if booknum == 1 and not source:
            # No source given: fall back to the first book's dc:source.
            # A missing metadata/dc:source element or an empty one ends
            # up here as an IndexError/AttributeError.
            try:
                firstmetadom = metadom.getElementsByTagNameNS("*",
                                                              "metadata")[0]
                source = firstmetadom.getElementsByTagName(
                    "dc:source")[0].firstChild.data.encode("utf-8")
            except Exception:
                source = ""

        # if the epub was ever edited with Sigil, it changed the unique-identifier,
        # but dc:contributor was left.
        #is_ffdl_epub.append(metadom.documentElement.getAttribute('unique-identifier') == "fanficdownloader-uid")
        is_ffdl_epub.append(False)

        for c in metadom.getElementsByTagName("dc:contributor"):
            # logger.debug("dc:contributor:%s"%getText(c.childNodes))
            if c.getAttribute("opf:role") == "bkp" and \
                    getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]",
                                              "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]:
                is_ffdl_epub[-1] = True  # set last.
                break

        ## Save indiv book title
        try:
            booktitles.append(
                metadom.getElementsByTagName("dc:title")[0].firstChild.data)
        except Exception:
            booktitles.append("(Title Missing)")

        ## Save authors.
        authors = []
        for creator in metadom.getElementsByTagName("dc:creator"):
            try:
                # Parentheses only spell out the existing and/or
                # precedence.  A role="aut" creator with no text child
                # still reaches .data below and is skipped by the except.
                if (creator.getAttribute("opf:role") == "aut"
                        or (not creator.hasAttribute("opf:role")
                            and creator.firstChild is not None)):
                    authors.append(creator.firstChild.data)
            except Exception:
                pass
        if len(authors) == 0:
            authors.append("(Author Missing)")
        allauthors.append(authors)

        if keepmetadatafiles:
            itemid = bookid + "rootfile"
            itemhref = rootfilename
            href = bookdir + itemhref
            logger.debug("write rootfile %s to %s" % (itemhref, href))
            outputepub.writestr(href, epub.read(itemhref))
            items.append((itemid, href, "origrootfile/xml"))

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found an epub that had them elsewhere too.
        manifesttag = metadom.getElementsByTagNameNS("*", "manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*", "item"):
            itemid = bookid + item.getAttribute("id")
            itemhref = normpath(unquote(
                item.getAttribute("href")))  # remove %20, etc.
            href = bookdir + relpath + itemhref
            if (item.getAttribute("media-type") == "application/x-dtbncx+xml"):
                # TOC file is only one with this type--as far as I know.
                # grab the whole navmap, deal with it later.
                tocdom = parseString(
                    epub.read(normpath(relpath + item.getAttribute("href"))))

                # update all navpoint ids with bookid for uniqueness.
                for navpoint in tocdom.getElementsByTagNameNS("*", "navPoint"):
                    navpoint.setAttribute("id",
                                          bookid + navpoint.getAttribute("id"))

                # update all content paths with bookdir for uniqueness.
                for content in tocdom.getElementsByTagNameNS("*", "content"):
                    content.setAttribute(
                        "src",
                        normpath(bookdir + relpath +
                                 content.getAttribute("src")))

                navmaps.append(tocdom.getElementsByTagNameNS("*", "navMap")[0])

                if keepmetadatafiles:
                    logger.debug("write toc.ncx %s to %s" %
                                 (relpath + itemhref, href))
                    outputepub.writestr(
                        href, epub.read(normpath(relpath + itemhref)))
                    items.append((itemid, href, "origtocncx/xml"))
            else:
                #href=href.encode('utf8')
                logger.debug("item id: %s -> %s:" % (itemid, href))
                itemhrefs[itemid] = href
                if href not in filelist:
                    try:
                        outputepub.writestr(
                            href, epub.read(normpath(relpath + itemhref)))
                        if re.match(r'.*/(file|chapter)\d+\.x?html', href):
                            filecount += 1
                        items.append(
                            (itemid, href, item.getAttribute("media-type")))
                        filelist.append(href)
                    except KeyError as ke:  # Skip missing files.
                        logger.info("Skipping missing file %s (%s)" %
                                    (href, relpath + itemhref))
                        del itemhrefs[itemid]

        itemreflist = metadom.getElementsByTagNameNS("*", "itemref")
        # logger.debug("itemhrefs:%s"%itemhrefs)
        logger.debug("bookid:%s" % bookid)
        logger.debug("itemreflist[0].getAttribute(idref):%s" %
                     itemreflist[0].getAttribute("idref"))

        # Looking for the first item in itemreflist that wasn't
        # discarded due to missing files.
        for itemref in itemreflist:
            idref = bookid + itemref.getAttribute("idref")
            if idref in itemhrefs:
                firstitemhrefs.append(itemhrefs[idref])
                break

        for itemref in itemreflist:
            itemrefs.append(bookid + itemref.getAttribute("idref"))
            # logger.debug("adding to itemrefs:%s"%itemref.toprettyxml())

        # Everything needed from this input has been read into memory
        # or copied to the output; release the input archive instead of
        # leaving one open handle per merged book.  ZipFile.close() does
        # not close a file object that was passed in by the caller.
        epub.close()

        booknum = booknum + 1

    printt("after file loop:%s" % (time() - t))
    t = time()

    ## create content.opf file.
    uniqueid = "epubmerge-uid-%d" % time()  # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement

    package.setAttribute("version", "2.0")
    package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier", "epubmerge-id")
    metadata = newTag(contentdom,
                      "metadata",
                      attrs={
                          "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                          "xmlns:opf": "http://www.idpf.org/2007/opf"
                      })
    metadata.appendChild(
        newTag(contentdom,
               "dc:identifier",
               text=uniqueid,
               attrs={"id": "epubmerge-id"}))
    if (titleopt is None):
        titleopt = booktitles[0] + " Anthology"
    metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

    # If cmdline authors, use those instead of those collected from the epubs
    # (allauthors kept for TOC & description gen below.
    # NOTE(review): a single-entry authoropts list is ignored by this
    # test (it requires more than one) -- confirm whether `> 0` was meant.
    if (len(authoropts) > 1):
        useauthors = [authoropts]
    else:
        useauthors = allauthors

    # usedauthors de-duplicates authors across books; its size also
    # decides below whether TOC titles get a " by <author>" suffix.
    usedauthors = dict()
    for authorlist in useauthors:
        for author in authorlist:
            if (author not in usedauthors):
                usedauthors[author] = author
                metadata.appendChild(
                    newTag(contentdom,
                           "dc:creator",
                           attrs={"opf:role": "aut"},
                           text=author))

    metadata.appendChild(
        newTag(contentdom,
               "dc:contributor",
               text="epubmerge",
               attrs={"opf:role": "bkp"}))
    metadata.appendChild(
        newTag(contentdom,
               "dc:rights",
               text="Copyrights as per source stories"))

    for l in languages:
        metadata.appendChild(newTag(contentdom, "dc:language", text=l))

    if not descopt:
        # created now, but not filled in until TOC generation to save loops.
        description = newTag(contentdom,
                             "dc:description",
                             text="Anthology containing:\n")
    else:
        description = newTag(contentdom, "dc:description", text=descopt)
    metadata.appendChild(description)

    if source:
        metadata.appendChild(
            newTag(contentdom,
                   "dc:identifier",
                   attrs={"opf:scheme": "URL"},
                   text=source))
        metadata.appendChild(newTag(contentdom, "dc:source", text=source))

    for tag in tags:
        metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)

    spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
    package.appendChild(spine)

    if coverjpgpath:
        # in case coverjpg isn't a jpg:
        coverext = 'jpg'
        covertype = 'image/jpeg'
        try:
            coverext = coverjpgpath.split('.')[-1].lower()
            covertype = imagetypes.get(coverext, covertype)
        except Exception:
            pass
        logger.debug("coverjpgpath:%s coverext:%s covertype:%s" %
                     (coverjpgpath, coverext, covertype))
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(
            newTag(contentdom, "meta", {
                "name": "cover",
                "content": "coverimageid"
            }))
        guide = newTag(contentdom, "guide")
        guide.appendChild(
            newTag(contentdom,
                   "reference",
                   attrs={
                       "type": "cover",
                       "title": "Cover",
                       "href": "cover.xhtml"
                   }))
        package.appendChild(guide)

        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': "coverimageid",
                       'href': "cover." + coverext,
                       'media-type': covertype
                   }))

        # Note that the id of the cover xhmtl *must* be 'cover'
        # for it to work on Nook.
        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': "cover",
                       'href': "cover.xhtml",
                       'media-type': "application/xhtml+xml"
                   }))

        spine.appendChild(
            newTag(contentdom,
                   "itemref",
                   attrs={
                       "idref": "cover",
                       "linear": "yes"
                   }))

    for item in items:
        # logger.debug("new item:%s %s %s"%item)
        (item_id, item_href, media_type) = item
        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': item_id,
                       'href': item_href,
                       'media-type': media_type
                   }))

    for itemref in itemrefs:
        # logger.debug("itemref:%s"%itemref)
        spine.appendChild(
            newTag(contentdom,
                   "itemref",
                   attrs={
                       "idref": itemref,
                       "linear": "yes"
                   }))

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version", "2005-1")
    ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:uid",
                   "content": uniqueid
               }))
    # Placeholder depth; overwritten with the real maxdepth further down.
    depthnode = newTag(tocncxdom,
                       "meta",
                       attrs={
                           "name": "dtb:depth",
                           "content": "4"
                       })
    head.appendChild(depthnode)
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:totalPageCount",
                   "content": "0"
               }))
    head.appendChild(
        newTag(tocncxdom,
               "meta",
               attrs={
                   "name": "dtb:maxPageNumber",
                   "content": "0"
               }))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(newTag(tocncxdom, "text", text=titleopt))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    # From here on booknum is a 0-based index into the per-book lists.
    booknum = 0

    printt("wrote initial metadata:%s" % (time() - t))
    t = time()

    # NOTE(review): booknum indexes booktitles, allauthors,
    # firstitemhrefs and is_ffdl_epub by position in navmaps.  That
    # assumes every input book contributed exactly one navMap and one
    # firstitemhrefs entry -- TODO confirm for books without a toc.ncx
    # or without any usable spine item.
    for navmap in navmaps:
        depthnavpoints = navmap.getElementsByTagNameNS(
            "*", "navPoint")  # for checking more than one TOC entry

        # logger.debug( [ x.toprettyxml() for x in navmap.childNodes ] )
        ## only gets top level TOC entries.  sub entries carried inside.
        navpoints = [
            x for x in navmap.childNodes
            if isinstance(x, Element) and x.tagName == "navPoint"
        ]
        # logger.debug("len(navpoints):%s"%len(navpoints))
        # logger.debug( [ x.toprettyxml() for x in navpoints ] )
        newnav = None
        if titlenavpoints:
            newnav = newTag(tocncxdom, "navPoint",
                            {"id": "book%03d" % booknum})
            navlabel = newTag(tocncxdom, "navLabel")
            newnav.appendChild(navlabel)
            # For purposes of TOC titling & desc, use first book author.  Skip adding author if only one.
            if len(usedauthors) > 1:
                title = booktitles[booknum] + " by " + allauthors[booknum][0]
            else:
                title = booktitles[booknum]

            navlabel.appendChild(newTag(tocncxdom, "text", text=title))
            # Find the first 'spine' item's content for the title navpoint.
            # Many epubs have the first chapter as first navpoint, so we can't just
            # copy that anymore.
            newnav.appendChild(
                newTag(tocncxdom, "content", {"src": firstitemhrefs[booknum]}))

            # logger.debug("newnav:%s"%newnav.toprettyxml())
            tocnavMap.appendChild(newnav)
            # logger.debug("tocnavMap:%s"%tocnavMap.toprettyxml())
        else:
            newnav = tocnavMap

        if not descopt and len(allauthors[booknum]) > 0:
            description.appendChild(
                contentdom.createTextNode(booktitles[booknum] + " by " +
                                          allauthors[booknum][0] + "\n"))

        # If more than one TOC point(total, not top level), or if not
        # including title nav point, include sub book TOC entries.
        if originalnavpoints and (len(depthnavpoints) > 1
                                  or not titlenavpoints):
            for navpoint in navpoints:
                # logger.debug("navpoint:%s"%navpoint.toprettyxml())
                newnav.appendChild(navpoint)
                # Tag the copied node so the playOrder pass below knows
                # which generator produced it.
                navpoint.is_ffdl_epub = is_ffdl_epub[booknum]

        booknum = booknum + 1
        # end of navmaps loop.

    maxdepth = 0
    contentsrcs = {}
    removednodes = []
    ## Force strict ordering of playOrder, stripping out some.
    playorder = 0
    # logger.debug("tocncxdom:%s"%tocncxdom.toprettyxml())
    for navpoint in tocncxdom.getElementsByTagNameNS("*", "navPoint"):
        # logger.debug("navpoint:%s"%navpoint.toprettyxml())
        if navpoint in removednodes:
            continue
        # need content[src] to compare for dups.  epub wants dup srcs to have same playOrder.
        contentsrc = None
        for n in navpoint.childNodes:
            if isinstance(n, Element) and n.tagName == "content":
                contentsrc = n.getAttribute("src")
                logger.debug("contentsrc: %s" % contentsrc)
                break

        if (contentsrc not in contentsrcs):

            parent = navpoint.parentNode
            # The try is load-bearing: the generated title navPoints
            # never get an is_ffdl_epub attribute, so reading it raises
            # AttributeError for them and they are simply left alone.
            try:
                # if the epub was ever edited with Sigil, it changed
                # the id, but the file name is the same.
                if navpoint.is_ffdl_epub and \
                        ( navpoint.getAttribute("id").endswith('log_page') \
                              or contentsrc.endswith("log_page.xhtml") ):
                    logger.debug("Doing sibs 'filter' 1")
                    sibs = [
                        x for x in parent.childNodes
                        if isinstance(x, Element) and x.tagName == "navPoint"
                    ]
                    # if only logpage and one chapter, remove them from TOC and just show story.
                    if len(sibs) == 2:
                        parent.removeChild(navpoint)
                        # NOTE(review): this logs sibs[0]'s playOrder
                        # although sibs[1] is the node removed next.
                        logger.debug("Removing %s:" %
                                     sibs[0].getAttribute("playOrder"))
                        parent.removeChild(sibs[1])
                        removednodes.append(sibs[1])
            except Exception:
                pass

            # New src, new number.
            contentsrcs[contentsrc] = navpoint.getAttribute("id")
            playorder += 1
            navpoint.setAttribute("playOrder", "%d" % playorder)
            logger.debug("playorder:%d:" % playorder)

            # need to know depth of deepest navpoint for <meta name="dtb:depth" content="2"/>
            npdepth = 1
            dp = navpoint.parentNode
            while dp and dp.tagName != "navMap":
                npdepth += 1
                dp = dp.parentNode

            if npdepth > maxdepth:
                maxdepth = npdepth
        else:
            # same content, look for ffdl and title_page and/or single chapter.

            # easier to just set it now, even if the node gets removed later.
            navpoint.setAttribute("playOrder", "%d" % playorder)
            logger.debug("playorder:%d:" % playorder)

            parent = navpoint.parentNode
            # Same load-bearing try as above: nodes without an
            # is_ffdl_epub attribute fall through untouched.
            try:
                # if the epub was ever edited with Sigil, it changed
                # the id, but the file name is the same.
                if navpoint.is_ffdl_epub and \
                        ( navpoint.getAttribute("id").endswith('title_page') \
                              or contentsrc.endswith("title_page.xhtml") ):
                    parent.removeChild(navpoint)
                    logger.debug("Doing sibs 'filter' 2")
                    sibs = [
                        x for x in parent.childNodes
                        if isinstance(x, Element) and x.tagName == "navPoint"
                    ]
                    # if only one chapter after removing title_page, remove it too.
                    if len(sibs) == 1:
                        logger.debug("Removing %s:" %
                                     sibs[0].getAttribute("playOrder"))
                        parent.removeChild(sibs[0])
                        removednodes.append(sibs[0])
            except Exception:
                pass

    if flattentoc:
        maxdepth = 1
        # already have play order and pesky dup/single chapters
        # removed, just need to flatten.
        flattocnavMap = tocncxdom.createElement("navMap")
        # appendChild moves each navPoint out of its old parent, so
        # nested entries end up as siblings in document order.
        for n in tocnavMap.getElementsByTagNameNS("*", "navPoint"):
            flattocnavMap.appendChild(n)

        ncx.replaceChild(flattocnavMap, tocnavMap)

    printt("navmap/toc maddess:%s" % (time() - t))
    t = time()

    depthnode.setAttribute("content", "%d" % maxdepth)

    ## content.opf written now due to description being filled in
    ## during TOC generation to save loops.
    contentxml = contentdom.toprettyxml(indent='   ', encoding='utf-8')
    # tweak for brain damaged Nook STR.  Nook insists on name before content.
    contentxml = contentxml.replace(
        ensure_binary('<meta content="coverimageid" name="cover"/>'),
        ensure_binary('<meta name="cover" content="coverimageid"/>'))
    outputepub.writestr("content.opf", contentxml)
    outputepub.writestr("toc.ncx",
                        tocncxdom.toprettyxml(indent='   ', encoding='utf-8'))

    printt("wrote opf/ncx files:%s" % (time() - t))
    t = time()

    if coverjpgpath:
        # write, not write string.  Pulling from file.
        outputepub.write(coverjpgpath, "cover." + coverext)

        outputepub.writestr(
            "cover.xhtml", '''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style></head><body><div>
<img src="cover.''' + coverext + '''" alt="cover"/>
</div></body></html>
''')

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    printt("closed outputepub:%s" % (time() - t))
    t = time()

    return (source, filecount)
Beispiel #7
0
def reset_orig_chapters_epub(inputio,outfile):
    '''
    Put chapter titles in an epub back to their recorded original titles.

    inputio = epub to read; handed straight to ZipFile (path or blob).
    outfile = file name, or an object with write(), to receive the
              rebuilt epub.  Only written when something changed.

    Entries named like '<dir>/file<digits>.xhtml' are parsed with
    make_soup.  When the 'chaptertitle' meta tag differs from the
    'chapterorigtitle' meta tag, the chaptertitle meta tag is reset to the
    original title, as are <title> and the first <h3> if they hold the
    current chapter title.  For each file that actually changed, the top
    level toc.ncx and, when present, the matching per-book '<dir>/toc.ncx'
    are handed to _replace_tocncx along with the TOC title.

    Every other entry is copied through unchanged.

    Returns True if any chapter file was changed, otherwise False.
    '''
    inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    changed = False

    unmerge_tocncxdoms = {}
    ## spin through file contents, saving any unmerge toc.ncx files.
    for zf in inputepub.namelist():
        if zf.endswith('/toc.ncx'):
            unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

    tocncxdom = parseString(inputepub.read('toc.ncx'))
    ## spin through file contents.  mimetype is already written and the
    ## toc.ncx files are written from their (possibly updated) DOMs below.
    for zf in inputepub.namelist():
        if zf not in ['mimetype','toc.ncx'] and not zf.endswith('/toc.ncx'):
            entrychanged = False
            data = inputepub.read(zf)
            if re.match(r'.*/file\d+\.xhtml',zf):
                data = data.decode('utf-8')
                soup = make_soup(data)

                chapterorigtitle = None
                tag = soup.find('meta',{'name':'chapterorigtitle'})
                if tag:
                    chapterorigtitle = tag['content']

                # toctitle is separate for add_chapter_numbers:toconly users.
                # Only fall back to the original title when the file has
                # no chaptertoctitle of its own.
                tag = soup.find('meta',{'name':'chaptertoctitle'})
                if tag:
                    chaptertoctitle = tag['content']
                else:
                    chaptertoctitle = chapterorigtitle

                chaptertitle = None
                chaptertitle_tag = soup.find('meta',{'name':'chaptertitle'})
                if chaptertitle_tag:
                    chaptertitle = chaptertitle_tag['content']

                if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                    origdata = data
                    chaptertitle_tag['content'] = chapterorigtitle

                    # <title> and <h3> are only touched when they hold
                    # exactly the current chapter title.
                    title_tag = soup.find('title')
                    if title_tag and title_tag.string == chaptertitle:
                        title_tag.string.replace_with(chapterorigtitle)

                    h3_tag = soup.find('h3')
                    if h3_tag and h3_tag.string == chaptertitle:
                        h3_tag.string.replace_with(chapterorigtitle)

                    data = unicode(soup)

                    entrychanged = ( origdata != data )
                    changed = changed or entrychanged

                    if entrychanged:
                        logger.debug("\nentrychanged:%s\n"%zf)
                        _replace_tocncx(tocncxdom,zf,chaptertoctitle)
                        ## Also look for and update individual
                        ## book toc.ncx files for anthology in case
                        ## it's unmerged.
                        zf_toc = zf[:zf.rfind('/OEBPS/')]+'/toc.ncx'
                        mergedprefix_len = len(zf[:zf.rfind('/OEBPS/')])+1

                        if zf_toc in unmerge_tocncxdoms:
                            _replace_tocncx(unmerge_tocncxdoms[zf_toc],zf[mergedprefix_len:],chaptertoctitle)

                outputepub.writestr(zf,data.encode('utf-8'))
            else:
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf,data)

    for tocnm, tocdom in unmerge_tocncxdoms.items():
        outputepub.writestr(tocnm,tocdom.toxml(encoding='utf-8'))

    outputepub.writestr('toc.ncx',tocncxdom.toxml(encoding='utf-8'))

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    # Must happen before close(): close() is what writes the central
    # directory these values end up in.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    # only *actually* write if changed.
    if changed:
        if isinstance(outfile,basestring):
            with open(outfile,"wb") as outputio:
                outputio.write(zipio.getvalue())
        else:
            outfile.write(zipio.getvalue())

    inputepub.close()
    zipio.close()

    return changed
Beispiel #8
0
    def write_split_epub(self,
                         outputio,
                         linenums,
                         changedtocs={},
                         authoropts=[],
                         titleopt=None,
                         descopt=None,
                         tags=[],
                         languages=['en'],
                         coverjpgpath=None):
        '''
        Start a new epub from the selected split lines: write mimetype,
        container.xml and the content files, and build the content.opf DOM.

        outputio = output file name or file-like object, handed to ZipFile.
        linenums = selection passed on to self.get_split_files().
        changedtocs = accepted, but not read in this part of the method.
        authoropts = list of authors to use, otherwise self.origauthors.
        titleopt = title, otherwise '<original title> Split'.
        descopt = description, otherwise 'Split from <title> by <authors>.'
        tags = dc:subject tags to include, otherwise none.
        languages = dc:language tags to include, 'en' if empty.
        coverjpgpath = if set, cover entries are added to the metadata,
                       guide, manifest and spine, and the source book's own
                       cover.xhtml / cover.jpg are left out.

        NOTE(review): this copy of the method stops after the manifest is
        filled in; content.opf and toc.ncx are not written and outputepub
        is not closed here.
        '''

        files = self.get_split_files(linenums)

        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        ## Create META-INF/container.xml file.  The only thing it does is
        ## point to content.opf
        containerdom = getDOMImplementation().createDocument(None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version","1.0")
        containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                              "media-type":"application/oebps-package+xml"}))
        outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))

        ## create content.opf file.
        uniqueid="epubsplit-uid-%d" % time() # real sophisticated uid scheme.
        contentdom = getDOMImplementation().createDocument(None, "package", None)
        package = contentdom.documentElement

        package.setAttribute("version","2.0")
        package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier","epubsplit-id")
        metadata=newTag(contentdom,"metadata",
                        attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                               "xmlns:opf":"http://www.idpf.org/2007/opf"})
        metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubsplit-id"}))
        if titleopt is None:
            titleopt = self.origtitle+" Split"
        metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

        if authoropts:
            useauthors=authoropts
        else:
            useauthors=self.origauthors

        # one dc:creator per distinct author, in first-seen order.
        seenauthors=set()
        for author in useauthors:
            if author not in seenauthors:
                seenauthors.add(author)
                metadata.appendChild(newTag(contentdom,"dc:creator",
                                            attrs={"opf:role":"aut"},
                                            text=author))

        metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubsplit",attrs={"opf:role":"bkp"}))
        metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))

        if languages:
            for l in languages:
                metadata.appendChild(newTag(contentdom,"dc:language",text=l))
        else:
            metadata.appendChild(newTag(contentdom,"dc:language",text="en"))

        if not descopt:
            description = newTag(contentdom,"dc:description",text="Split from %s by %s."%(self.origtitle,", ".join(self.origauthors)))
        else:
            description = newTag(contentdom,"dc:description",text=descopt)
        metadata.appendChild(description)

        for tag in tags:
            metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))

        package.appendChild(metadata)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
        package.appendChild(spine)

        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':'ncx',
                                           'href':'toc.ncx',
                                           'media-type':'application/x-dtbncx+xml'}))

        if coverjpgpath:
            # <meta name="cover" content="cover.jpg"/>
            metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
                                                           "content":"coverimageid"}))
            # cover stuff for later:
            # at end of <package>:
            # <guide>
            # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
            # </guide>
            guide = newTag(contentdom,"guide")
            guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                       "title":"Cover",
                                                       "href":"cover.xhtml"}))
            package.appendChild(guide)

            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':"coverimageid",
                                               'href':"cover.jpg",
                                               'media-type':"image/jpeg"}))

            # Note that the id of the cover xhmtl *must* be 'cover'
            # for it to work on Nook.
            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':"cover",
                                               'href':"cover.xhtml",
                                               'media-type':"application/xhtml+xml"}))

            spine.appendChild(newTag(contentdom,"itemref",
                                     attrs={"idref":"cover",
                                            "linear":"yes"}))

        contentcount=0
        # the id that comes with each file is ignored; ids are renumbered
        # a0, a1, ... across content files and linked files.
        for (filename,_origid,mediatype,filedata) in files:
            if coverjpgpath and filename == "cover.xhtml":
                continue # don't dup cover.

            # add to manifest and spine
            outputepub.writestr(filename,filedata.encode('utf-8'))
            itemid = "a%d"%contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':itemid,
                                               'href':filename,
                                               'media-type':mediatype}))
            spine.appendChild(newTag(contentdom,"itemref",
                                     attrs={"idref":itemid,
                                            "linear":"yes"}))

        for (linked,mediatype) in self.filecache.linkedfiles:
            if coverjpgpath and linked == "cover.jpg":
                continue # don't dup cover.

            try:
                outputepub.writestr(linked,self.get_file(linked))
            except Exception as e:
                print("Failed to copy linked file (%s)\nException: %s"%(linked,e))
                # nothing was written, so don't list it in the manifest
                # either--a manifest item must point at a file in the epub.
                continue

            # add to manifest
            itemid = "a%d"%contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom,"item",
                                        attrs={'id':itemid,
                                               'href':linked,
                                               'media-type':mediatype}))
Beispiel #9
0
    def write_split_epub(self,
                         outputio,
                         linenums,
                         changedtocs={},
                         authoropts=[],
                         titleopt=None,
                         descopt=None,
                         tags=[],
                         languages=['en'],
                         coverjpgpath=None):

        files = self.get_split_files(linenums)

        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        ## Create META-INF/container.xml file.  The only thing it does is
        ## point to content.opf
        containerdom = getDOMImplementation().createDocument(
            None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute(
            "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(
            newTag(
                containerdom, "rootfile", {
                    "full-path": "content.opf",
                    "media-type": "application/oebps-package+xml"
                }))
        outputepub.writestr(
            "META-INF/container.xml",
            containerdom.toprettyxml(indent='   ', encoding='utf-8'))

        ####    ## create content.opf file.
        uniqueid = "epubsplit-uid-%d" % time(
        )  # real sophisticated uid scheme.
        contentdom = getDOMImplementation().createDocument(
            None, "package", None)
        package = contentdom.documentElement

        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "epubsplit-id")
        metadata = newTag(contentdom,
                          "metadata",
                          attrs={
                              "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                              "xmlns:opf": "http://www.idpf.org/2007/opf"
                          })
        metadata.appendChild(
            newTag(contentdom,
                   "dc:identifier",
                   text=uniqueid,
                   attrs={"id": "epubsplit-id"}))
        if (titleopt is None):
            titleopt = self.origtitle + " Split"
        metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

        if (authoropts and len(authoropts) > 0):
            useauthors = authoropts
        else:
            useauthors = self.origauthors

        usedauthors = dict()
        for author in useauthors:
            if (author not in usedauthors):
                usedauthors[author] = author
                metadata.appendChild(
                    newTag(contentdom,
                           "dc:creator",
                           attrs={"opf:role": "aut"},
                           text=author))

        metadata.appendChild(
            newTag(contentdom,
                   "dc:contributor",
                   text="epubsplit",
                   attrs={"opf:role": "bkp"}))
        metadata.appendChild(
            newTag(contentdom,
                   "dc:rights",
                   text="Copyrights as per source stories"))

        if languages:
            for l in languages:
                metadata.appendChild(newTag(contentdom, "dc:language", text=l))
        else:
            metadata.appendChild(newTag(contentdom, "dc:language", text="en"))

        if not descopt:
            # created now, but not filled in until TOC generation to save loops.
            description = newTag(contentdom,
                                 "dc:description",
                                 text="Split from %s by %s." %
                                 (self.origtitle, ", ".join(self.origauthors)))
        else:
            description = newTag(contentdom, "dc:description", text=descopt)
        metadata.appendChild(description)

        for tag in tags:
            metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

        package.appendChild(metadata)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
        package.appendChild(spine)

        manifest.appendChild(
            newTag(contentdom,
                   "item",
                   attrs={
                       'id': 'ncx',
                       'href': 'toc.ncx',
                       'media-type': 'application/x-dtbncx+xml'
                   }))

        if coverjpgpath:
            # <meta name="cover" content="cover.jpg"/>
            metadata.appendChild(
                newTag(contentdom, "meta", {
                    "name": "cover",
                    "content": "coverimageid"
                }))
            # cover stuff for later:
            # at end of <package>:
            # <guide>
            # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
            # </guide>
            guide = newTag(contentdom, "guide")
            guide.appendChild(
                newTag(contentdom,
                       "reference",
                       attrs={
                           "type": "cover",
                           "title": "Cover",
                           "href": "cover.xhtml"
                       }))
            package.appendChild(guide)

            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': "coverimageid",
                           'href': "cover.jpg",
                           'media-type': "image/jpeg"
                       }))

            # Note that the id of the cover xhmtl *must* be 'cover'
            # for it to work on Nook.
            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': "cover",
                           'href': "cover.xhtml",
                           'media-type': "application/xhtml+xml"
                       }))

            spine.appendChild(
                newTag(contentdom,
                       "itemref",
                       attrs={
                           "idref": "cover",
                           "linear": "yes"
                       }))

        contentcount = 0
        for (filename, id, type, filedata) in files:
            #filename = self.filecache.addHtml(href,filedata)
            #print("writing :%s"%filename)
            # add to manifest and spine

            if coverjpgpath and filename == "cover.xhtml":
                continue  # don't dup cover.

            outputepub.writestr(filename, filedata.encode('utf-8'))
            id = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': id,
                           'href': filename,
                           'media-type': type
                       }))
            spine.appendChild(
                newTag(contentdom,
                       "itemref",
                       attrs={
                           "idref": id,
                           "linear": "yes"
                       }))

        for (linked, type) in self.filecache.linkedfiles:
            # add to manifest
            if coverjpgpath and linked == "cover.jpg":
                continue  # don't dup cover.

            try:
                outputepub.writestr(linked, self.get_file(linked))
            except Exception as e:
                print("Skipping linked file (%s)\nException: %s" % (linked, e))

            id = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': id,
                           'href': linked,
                           'media-type': type
                       }))

        contentxml = contentdom.toprettyxml(indent='   ')  # ,encoding='utf-8'
        # tweak for brain damaged Nook STR.  Nook insists on name before content.
        contentxml = contentxml.replace(
            '<meta content="coverimageid" name="cover"/>',
            '<meta name="cover" content="coverimageid"/>')
        outputepub.writestr("content.opf", contentxml)

        ## create toc.ncx file
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version", "2005-1")
        ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:uid",
                       "content": uniqueid
                   }))
        depthnode = newTag(tocncxdom,
                           "meta",
                           attrs={
                               "name": "dtb:depth",
                               "content": "1"
                           })
        head.appendChild(depthnode)
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:totalPageCount",
                       "content": "0"
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:maxPageNumber",
                       "content": "0"
                   }))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(
            newTag(tocncxdom, "text", text=stripHTML(titleopt)))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # come back to lines again for TOC because files only has files(gasp-shock!)
        count = 1
        for line in self.split_lines:
            if 'include' in line:
                # if changed, use only changed values.
                if line['num'] in changedtocs:
                    line['toc'] = changedtocs[line['num']]
                # can have more than one toc entry.
                for title in line['toc']:
                    newnav = newTag(tocncxdom, "navPoint", {
                        "id": "a%03d" % count,
                        "playOrder": "%d" % count
                    })
                    count += 1
                    tocnavMap.appendChild(newnav)
                    navlabel = newTag(tocncxdom, "navLabel")
                    newnav.appendChild(navlabel)
                    # For purposes of TOC titling & desc, use first book author
                    navlabel.appendChild(
                        newTag(tocncxdom, "text", text=stripHTML(title)))
                    # Find the first 'spine' item's content for the title navpoint.
                    # Many epubs have the first chapter as first navpoint, so we can't just
                    # copy that anymore.
                    if line['anchor'] and line['href'] + "#" + line[
                            'anchor'] in self.filecache.anchors:
                        src = self.filecache.anchors[line['href'] + "#" +
                                                     line['anchor']]
                        #print("toc from anchors(%s#%s)(%s)"%(line['href'],line['anchor'],src))
                    else:
                        #print("toc from href(%s)"%line['href'])
                        src = line['href']
                    newnav.appendChild(
                        newTag(tocncxdom, "content", {"src": src}))

        outputepub.writestr(
            "toc.ncx", tocncxdom.toprettyxml(indent='   ', encoding='utf-8'))

        if coverjpgpath:
            # write, not write string.  Pulling from file.
            outputepub.write(coverjpgpath, "cover.jpg")

            outputepub.writestr(
                "cover.xhtml", '''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style></head><body><div>
<img src="cover.jpg" alt="cover"/>
</div></body></html>
''')

# declares all the files created by Windows.  otherwise, when
# it runs in appengine, windows unzips the files as 000 perms.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    flattentoc if true, flatten TOC down to one level only.
    printtimes, bound as the first argument of cond_print for the timing messages.
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's original .opf and
                      toc.ncx into the output under its book directory.
    source, if empty, taken from the first book's dc:source ('' when that
            can't be read).

    NOTE(review): this copy of the function ends after the input epubs
    have been read; nothing is returned and outputepub is still open.
    '''

    printt = partial(cond_print,printtimes)

    ## Python 2.5 ZipFile is rather more primative than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount=0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))

    ## Process input epubs.

    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = [] # list of t/f

    itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.

    filelist = []
    writtenhrefs = set() # same entries as filelist, for quick 'already written' checks.

    printt("prep output:%s"%(time()-t))
    t = time()

    booknum=1
    firstmetadom = None
    for file in files:
        if file is None:
            continue

        book = "%d" % booknum
        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(file, 'r')

        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        metadom = parseString(epub.read(rootfilename))
        if booknum==1 and not source:
            # best effort: no metadata or no dc:source just means no source.
            try:
                firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0]
                source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
            except (IndexError, AttributeError):
                source=""

        # if the epub was ever edited with Sigil, it changed the unique-identifier,
        # but dc:contributor was left.
        is_ffdl_epub.append(False)

        for c in metadom.getElementsByTagName("dc:contributor"):
            if c.getAttribute("opf:role") == "bkp" and \
                    getText(c.childNodes) == "fanficdownloader [http://fanficdownloader.googlecode.com]":
                is_ffdl_epub[-1] = True # set last.
                break

        ## Save indiv book title
        try:
            booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
        except (IndexError, AttributeError):
            booktitles.append("(Title Missing)")

        ## Save authors.  A creator counts when its role is 'aut' or it
        ## has no role at all; creators without usable text are skipped.
        authors=[]
        for creator in metadom.getElementsByTagName("dc:creator"):
            isauthor = ( creator.getAttribute("opf:role") == "aut"
                         or not creator.hasAttribute("opf:role") )
            if isauthor and creator.firstChild is not None:
                try:
                    authors.append(creator.firstChild.data)
                except AttributeError:
                    pass # first child isn't a text node.
        if len(authors) == 0:
            authors.append("(Author Missing)")
        allauthors.append(authors)

        if keepmetadatafiles:
            itemid=bookid+"rootfile"
            itemhref = rootfilename
            href=bookdir+itemhref
            outputepub.writestr(href,
                                epub.read(itemhref))
            items.append((itemid,href,"origrootfile/xml"))

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found one that did.
        manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*","item"):
            itemid=bookid+item.getAttribute("id")
            itemhref = unquote(item.getAttribute("href")) # remove %20, etc.
            href=bookdir+relpath+itemhref
            if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
                # TOC file is only one with this type--as far as I know.
                # grab the whole navmap, deal with it later.
                tocdom = parseString(re.sub(r'(&amp;|&)', r'and', epub.read(relpath+item.getAttribute("href"))))

                # update all navpoint ids with bookid for uniqueness.
                for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"):
                    navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))

                # update all content paths with bookdir for uniqueness.
                for content in tocdom.getElementsByTagNameNS("*","content"):
                    content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))

                navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0])

                if keepmetadatafiles:
                    outputepub.writestr(href,
                                        epub.read(relpath+itemhref))
                    items.append((itemid,href,"origtocncx/xml"))
            else:
                href=href.encode('utf8')
                itemhrefs[itemid] = href
                if href not in writtenhrefs:
                    try:
                        outputepub.writestr(href,
                                            epub.read(relpath+itemhref))
                        if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                            filecount+=1
                        items.append((itemid,href,item.getAttribute("media-type")))
                        filelist.append(href)
                        writtenhrefs.add(href)
                    except KeyError:
                        pass # Skip missing files.

        itemreflist = metadom.getElementsByTagNameNS("*","itemref")
        # first spine entry is taken as the true start of this book.
        firstitemhrefs.append(itemhrefs[bookid+itemreflist[0].getAttribute("idref")])
        for itemref in itemreflist:
            itemrefs.append(bookid+itemref.getAttribute("idref"))

        booknum=booknum+1
Beispiel #11
0
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    Start a merged epub in outputio and copy the contents of each
    input epub into it, each under its own numbered directory.

    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    flattentoc if true, flatten TOC down to one level only.
    printtimes is handed to cond_print together with each timing message.
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's original .opf
        and toc.ncx into the output (manifest media-types
        'origrootfile/xml' and 'origtocncx/xml').
    source = dc:source value; when not given, the first input's
        dc:source is used if it has one, otherwise ''.
    '''
    # NOTE: the list defaults above are shared between calls; nothing
    # in this block mutates them.

    printt = partial(cond_print,printtimes)

    ## Python 2.5 ZipFile is rather more primitive than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount=0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))

    ## Process input epubs.

    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = [] # list of t/f

    itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.

    filelist = []

    printt("prep output:%s"%(time()-t))
    t = time()

    booknum=1
    firstmetadom = None
    for file in files:
        if file is None:
            continue

        # Every id and path taken from this input is prefixed with a
        # per-book value so that nothing collides in the merged epub.
        book = "%d" % booknum
        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(file, 'r')

        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        metadom = parseString(epub.read(rootfilename))
        #logger.debug("metadom:%s"%epub.read(rootfilename))
        if booknum==1 and not source:
            # Best effort: fall back to '' when the first book has no
            # usable dc:source.
            try:
                firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0]
                source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
            except Exception:
                source=""

        # if the epub was ever edited with Sigil, it changed the unique-identifier,
        # but dc:contributor was left.
        #is_ffdl_epub.append(metadom.documentElement.getAttribute('unique-identifier') == "fanficdownloader-uid")
        is_ffdl_epub.append(False)

        for c in metadom.getElementsByTagName("dc:contributor"):
            # logger.debug("dc:contributor:%s"%getText(c.childNodes))
            if c.getAttribute("opf:role") == "bkp" and \
                    getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]",
                                              "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]:
                is_ffdl_epub[-1] = True # set last.
                break

        ## Save indiv book title
        try:
            booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
        except Exception:
            booktitles.append("(Title Missing)")

        ## Save authors.
        authors=[]
        for creator in metadom.getElementsByTagName("dc:creator"):
            try:
                # A creator counts as an author when its role is 'aut'
                # or it has no role at all -- and it has content.
                is_author = ( creator.getAttribute("opf:role") == "aut"
                              or not creator.hasAttribute("opf:role") )
                if is_author and creator.firstChild is not None:
                    authors.append(creator.firstChild.data)
            except Exception:
                # e.g. first child is not a text node; skip this creator.
                pass
        if len(authors) == 0:
            authors.append("(Author Missing)")
        allauthors.append(authors)

        if keepmetadatafiles:
            itemid=bookid+"rootfile"
            itemhref = rootfilename
            href=bookdir+itemhref
            #logger.debug("write rootfile %s to %s"%(itemhref,href))
            outputepub.writestr(href,
                                epub.read(itemhref))
            items.append((itemid,href,"origrootfile/xml"))

        # spin through the manifest--only place there are item tags.
        # Correction--only place there *should* be item tags.  But
        # somebody found one that did.
        manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0]
        for item in manifesttag.getElementsByTagNameNS("*","item"):
            itemid=bookid+item.getAttribute("id")
            itemhref = unquote(item.getAttribute("href")) # remove %20, etc.
            href=bookdir+relpath+itemhref
            if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
                # TOC file is only one with this type--as far as I know.
                # grab the whole navmap, deal with it later.
                # Read by the unquoted name, same as every other
                # manifest item, so escaped hrefs resolve too.
                tocdom = parseString(epub.read(relpath+itemhref))

                # update all navpoint ids with bookid for uniqueness.
                for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"):
                    navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))

                # update all content paths with bookdir for uniqueness.
                for content in tocdom.getElementsByTagNameNS("*","content"):
                    content.setAttribute("src",bookdir+relpath+content.getAttribute("src"))

                navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0])

                if keepmetadatafiles:
                    #logger.debug("write toc.ncx %s to %s"%(relpath+itemhref,href))
                    outputepub.writestr(href,
                                        epub.read(relpath+itemhref))
                    items.append((itemid,href,"origtocncx/xml"))
            else:
                href=href.encode('utf8')
                #logger.debug("item id: %s -> %s:"%(itemid,href))
                itemhrefs[itemid] = href
                if href not in filelist:
                    try:
                        outputepub.writestr(href,
                                            epub.read(relpath+itemhref))
                        if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                            filecount+=1
                        items.append((itemid,href,item.getAttribute("media-type")))
                        filelist.append(href)
                    except KeyError: # Skip missing files.
                        logger.info("Skipping missing file %s (%s)"%(href,relpath+itemhref))
                        # Forget the id so it can't be picked as the
                        # first item of this book below.
                        del itemhrefs[itemid]

        itemreflist = metadom.getElementsByTagNameNS("*","itemref")
        # logger.debug("itemreflist:%s"%itemreflist)
        # logger.debug("itemhrefs:%s"%itemhrefs)
        # logger.debug("bookid:%s"%bookid)
        # logger.debug("itemreflist[0].getAttribute(idref):%s"%itemreflist[0].getAttribute("idref"))

        # Looking for the first item in itemreflist that wasn't
        # discarded due to missing files.
        for itemref in itemreflist:
            idref = bookid+itemref.getAttribute("idref")
            if idref in itemhrefs:
                firstitemhrefs.append(itemhrefs[idref])
                break

        for itemref in itemreflist:
            itemrefs.append(bookid+itemref.getAttribute("idref"))

        # Done with this input; release its file handle.  (When a
        # file-like object was passed in, ZipFile leaves it open.)
        epub.close()

        booknum += 1

    # NOTE(review): this block ends after the input loop -- the
    # collected lists are not consumed and outputepub is not closed
    # here.
Beispiel #12
0
def reset_orig_chapters_epub(inputio,outfile):
    '''
    Put the original chapter titles back into an epub's chapter files
    and toc.ncx.

    inputio = input epub file name or file-like object.
    outfile = output file name, or an object with write().  Only
              written to when at least one chapter file changed.

    For every */file<digits>.xhtml entry the chaptertitle,
    chapterorigtitle and chaptertoctitle <meta> tags are read.  When
    chaptertitle and chapterorigtitle differ, the chaptertitle meta
    tag, <title> and <h3> are replaced with the original title and
    the matching toc.ncx label is set to chaptertoctitle (or
    chapterorigtitle when there is no chaptertoctitle).

    Returns True if any chapter file was changed, otherwise False.
    '''
    inputepub = ZipFile(inputio, 'r') # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    changed = False

    tocncxdom = parseString(inputepub.read('toc.ncx'))
    ## spin through file contents.
    for zf in inputepub.namelist():
        # mimetype is already written; toc.ncx is written last, after
        # any label changes.
        if zf not in ['mimetype','toc.ncx'] :
            entrychanged = False
            data = inputepub.read(zf)
            # if isinstance(data,unicode):
            #     logger.debug("\n\n\ndata is unicode\n\n\n")
            if re.match(r'.*/file\d+\.xhtml',zf):
                data = data.decode('utf-8')
                soup = bs.BeautifulSoup(data,"html5lib")

                chapterorigtitle = None
                tag = soup.find('meta',{'name':'chapterorigtitle'})
                if tag:
                    chapterorigtitle = tag['content']

                # toctitle is separate for add_chapter_numbers:toconly users.
                chaptertoctitle = None
                tag = soup.find('meta',{'name':'chaptertoctitle'})
                if tag:
                    chaptertoctitle = tag['content']
                elif chapterorigtitle:
                    chaptertoctitle = chapterorigtitle

                chaptertitle = None
                tag = soup.find('meta',{'name':'chaptertitle'})
                if tag:
                    chaptertitle = tag['content']

                if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                    origdata = data
                    # print("\n%s\n%s\n"%(chapterorigtitle,chaptertitle))
                    # Plain text replacement on the raw markup; only
                    # these exact forms of the tags are touched.
                    data = data.replace(u'<meta name="chaptertitle" content="'+chaptertitle+u'"></meta>',
                                        u'<meta name="chaptertitle" content="'+chapterorigtitle+u'"></meta>')
                    data = data.replace(u'<title>'+chaptertitle+u'</title>',u'<title>'+chapterorigtitle+u'</title>')
                    data = data.replace(u'<h3>'+chaptertitle+u'</h3>',u'<h3>'+chapterorigtitle+u'</h3>')

                    entrychanged = ( origdata != data )
                    changed = changed or entrychanged

                    if entrychanged:
                        ## go after the TOC entry, too.
                        # <navPoint id="file0005" playOrder="6">
                        #   <navLabel>
                        #     <text>5. (new) Chapter 4</text>
                        #   </navLabel>
                        #   <content src="OEBPS/file0005.xhtml"/>
                        # </navPoint>
                        # Every navPoint pointing at this file gets
                        # the new label.
                        for contenttag in tocncxdom.getElementsByTagName("content"):
                            if contenttag.getAttribute('src') == zf:
                                texttag = contenttag.parentNode.getElementsByTagName('navLabel')[0].getElementsByTagName('text')[0]
                                texttag.childNodes[0].replaceWholeText(chaptertoctitle)
                                # logger.debug("text label:%s"%texttag.toxml())

                outputepub.writestr(zf,data.encode('utf-8'))
            else:
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf,data)

    outputepub.writestr('toc.ncx',tocncxdom.toxml(encoding='utf-8'))
    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    # This has to happen *before* close(): close() is what writes the
    # central directory that carries create_system.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    # only *actually* write if changed.
    if changed:
        if isinstance(outfile,basestring):
            with open(outfile,"wb") as outputio:
                outputio.write(zipio.getvalue())
        else:
            outfile.write(zipio.getvalue())

    inputepub.close()
    zipio.close()

    return changed
Beispiel #13
0
    def write(self, report=False, dirpath=None):
        """Write the story held in self.S to an epub file on disk.

        report -- if true, post progress through ffnet_notify() after
            each chapter and once more when the file is finished.
        dirpath -- directory to save into; when not given the file is
            written under 'stories/'.

        The file is named '<title> - <author>.epub' using
        self.S.metadata.  Missing directories are created.  Returns
        None.
        """

        n = ffnet_notify().progress_init(int(
            self.S.metadata['numChapters'])).shadow(self.S.storyID)

        file_name = string.Template("${title} - ${author}.epub").substitute(
            self.S.metadata).encode('utf8')
        if dirpath:
            separator = "" if dirpath[-1] == "/" else "/"
            file_name = os.path.normpath(dirpath + separator + file_name)
        else:
            file_name = 'stories/' + file_name
        logger.info("Save directly to file: %s" % file_name)
        try:
            os.makedirs(os.path.dirname(os.path.normpath(file_name)))
        except OSError:
            # Typically the directory already exists.  A real problem
            # shows up when the file is opened just below.
            pass

        # Must be readable as well as writable: re-opening the zip in
        # 'a' mode below has to read back the archive written by the
        # first pass.  With a write-only stream the mimetype entry is
        # dropped from the final central directory.
        outstream = open(file_name, "w+b")

        # mimetype has to be the first entry and stored uncompressed.
        outputepub = ZipFile(outstream, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr('mimetype', 'application/epub+zip')
        outputepub.close()
        # Re-open for the (compressed) content.
        outputepub = ZipFile(outstream, 'a', compression=ZIP_DEFLATED)
        outputepub.debug = 3

        # META-INF/container.xml -- points at content.opf.
        containerdom = getDOMImplementation().createDocument(
            None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute(
            "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(
            newTag(
                containerdom, "rootfile", {
                    "full-path": "content.opf",
                    "media-type": "application/oebps-package+xml"
                }))
        outputepub.writestr("META-INF/container.xml",
                            containerdom.toxml(encoding='utf-8'))
        containerdom.unlink()
        del containerdom

        # TODO change this?
        # uniqueid = 'fanficfare-uid:%s-u%s-s%s' % (
        #     self.S.metadata['site'],
        #     self.S.metadata['authorId'][0],
        #     self.S.metadata['storyId']
        # )

        # content.opf: metadata, manifest and spine.
        contentdom = getDOMImplementation().createDocument(
            None, "package", None)
        package = contentdom.documentElement
        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "fanficfare-uid")
        metadata = newTag(contentdom,
                          "metadata",
                          attrs={
                              "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                              "xmlns:opf": "http://www.idpf.org/2007/opf"
                          })
        package.appendChild(metadata)

        if self.S.metadata['title']:
            metadata.appendChild(
                newTag(contentdom, "dc:title", text=self.S.metadata['title']))

        if self.S.metadata['author']:
            metadata.appendChild(
                newTag(contentdom,
                       "dc:creator",
                       attrs={"opf:role": "aut"},
                       text=self.S.metadata['author']))

        metadata.appendChild(
            newTag(contentdom,
                   "dc:contributor",
                   text="Automatia",
                   attrs={"opf:role": "bkp"}))
        metadata.appendChild(newTag(contentdom, "dc:rights", text=""))
        if self.S.metadata['langcode']:
            metadata.appendChild(
                newTag(contentdom,
                       "dc:language",
                       text=self.S.metadata['langcode']))
        else:
            metadata.appendChild(newTag(contentdom, "dc:language", text='en'))

        # published, created, updated, calibre
        if self.S.metadata['datePublished']:
            metadata.appendChild(
                newTag(contentdom,
                       "dc:date",
                       attrs={"opf:event": "publication"},
                       text=self.S.metadata['datePublished'].strftime(
                           "%Y-%m-%d")))

        if 'dateUpdated' in self.S.metadata:
            metadata.appendChild(
                newTag(
                    contentdom,
                    "dc:date",
                    attrs={"opf:event": "modification"},
                    text=self.S.metadata['dateUpdated'].strftime("%Y-%m-%d")))
            metadata.appendChild(
                newTag(contentdom,
                       "meta",
                       attrs={
                           "name":
                           "calibre:timestamp",
                           "content":
                           self.S.metadata['dateUpdated'].strftime(
                               "%Y-%m-%dT%H:%M:%S")
                       }))

        if self.S.metadata['description']:
            metadata.appendChild(
                newTag(contentdom,
                       "dc:description",
                       text=self.S.metadata['description']))

        # FIXME: dc:subject tags are not written yet.

        if self.S.metadata['storyUrl']:
            metadata.appendChild(
                newTag(contentdom,
                       "dc:identifier",
                       attrs={"opf:scheme": "URL"},
                       text=self.S.metadata['storyUrl']))
            metadata.appendChild(
                newTag(contentdom,
                       "dc:source",
                       text=self.S.metadata['storyUrl']))

        items = []  # list of (id, href, type, title) tuples(all strings)
        itemrefs = []  # list of strings -- idrefs from .opfs' spines
        items.append(("ncx", "toc.ncx", "application/x-dtbncx+xml", None))

        # guide and coverIO stay None until cover support exists; the
        # code further down already checks for them.
        guide = None
        coverIO = None

        coverimgid = "image0000"

        # FIXME cover: re-using an old cover (write the old cover html
        # and image, add both to the manifest, add a
        # <meta name="cover"> and a <guide> reference) is not
        # implemented yet.

        # TODO images: writing story images under OEBPS/ and adding
        # them to the manifest is not implemented yet.  When it is,
        # coverimgid must end up pointing at the cover, not just the
        # first image.

        # TODO stylesheet: OEBPS/stylesheet.css is not written or
        # listed in the manifest yet.

        # TODO new cover: OEBPS/cover.xhtml is not generated yet.
        # Note that the id of the cover xhtml *must* be 'cover' for it
        # to work on Nook.

        items.append(("title_page", "OEBPS/title_page.xhtml",
                      "application/xhtml+xml", "Title Page"))
        itemrefs.append("title_page")

        # TODO toc page: OEBPS/toc_page.xhtml is not listed or written
        # yet.

        # One manifest/spine entry per chapter.  chap[0] is used as
        # the chapter title here.
        for index, chap in enumerate(self.S.chapterUrls):
            i = index + 1
            items.append(("file%04d" % i, "OEBPS/file%04d.xhtml" % i,
                          "application/xhtml+xml", "%d. %s" % (i, chap[0])))
            itemrefs.append("file%04d" % i)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        for item in items:
            (item_id, href, item_type, title) = item
            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': item_id,
                           'href': href,
                           'media-type': item_type
                       }))

        spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
        package.appendChild(spine)
        for itemref in itemrefs:
            spine.appendChild(
                newTag(contentdom,
                       "itemref",
                       attrs={
                           "idref": itemref,
                           "linear": "yes"
                       }))
        # guide only exists if there's a cover.
        if guide:
            package.appendChild(guide)

        # write content.opf to zip.
        contentxml = contentdom.toxml(encoding='utf-8')

        # tweak for Nook STR, which insists on name before content.
        contentxml = contentxml.replace(
            '<meta content="%s" name="cover"/>' % coverimgid,
            '<meta name="cover" content="%s"/>' % coverimgid)
        outputepub.writestr("content.opf", contentxml)

        contentdom.unlink()
        del contentdom

        # create toc.ncx file
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version", "2005-1")
        ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        # TODO: no dtb:uid meta is written (see uniqueid above).
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:depth",
                       "content": "1"
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:totalPageCount",
                       "content": "0"
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:maxPageNumber",
                       "content": "0"
                   }))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(
            newTag(tocncxdom, "text", text=self.S.metadata['title']))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
        #   <navLabel>
        #     <text><chapter title></text>
        #   </navLabel>
        #   <content src="<chapterfile>"/>
        # </navPoint>
        index = 0
        for item in items:
            (item_id, href, item_type, title) = item
            # only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
            if title:
                navPoint = newTag(tocncxdom,
                                  "navPoint",
                                  attrs={
                                      'id': item_id,
                                      'playOrder': unicode(index)
                                  })
                tocnavMap.appendChild(navPoint)
                navLabel = newTag(tocncxdom, "navLabel")
                navPoint.appendChild(navLabel)
                # the xml library will re-escape as needed.
                navLabel.appendChild(
                    newTag(tocncxdom, "text", text=stripHTML(title)))
                navPoint.appendChild(
                    newTag(tocncxdom, "content", attrs={"src": href}))
                index = index + 1

        # write toc.ncx to zip file
        outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding='utf-8'))
        tocncxdom.unlink()
        del tocncxdom

        TITLE_PAGE = self.EPUB_TITLE_PAGE

        if coverIO:
            outputepub.writestr("OEBPS/cover.xhtml", coverIO.getvalue())
            coverIO.close()

        titlepageIO = StringIO.StringIO()
        self.writeTitlePage(out=titlepageIO, PAGE=TITLE_PAGE)
        if titlepageIO.getvalue():  # will be false if no title page.
            outputepub.writestr("OEBPS/title_page.xhtml",
                                titlepageIO.getvalue())
        titlepageIO.close()

        CHAPTER_START = self.EPUB_CHAPTER_START
        CHAPTER_END = self.EPUB_CHAPTER_END

        # chap[0] is used as the chapter title and chap[1] as its url.
        for index, chap in enumerate(self.S.chapterUrls):
            chap_data = self.S.getChapterText(index)
            if report:
                n.progress(index+1)\
                    .post()

            logger.debug('Writing chapter text for: %s' % chap[0])
            vals = {
                'url': removeEntities(chap[1]),
                'chapter': removeEntities(chap[0]),
                'index': "%04d" % (index + 1),
                'number': index + 1
            }
            # escape double quotes in all vals.
            for k, v in vals.items():
                if isinstance(v, basestring):
                    vals[k] = v.replace('"', '&quot;')
            fullhtml = CHAPTER_START.substitute(
                vals) + chap_data.strip() + CHAPTER_END.substitute(vals)
            # exactly one newline after each </p> and <br/>.
            fullhtml = re.sub(r'(</p>|<br ?/>)\n*', r'\1\n', fullhtml)

            outputepub.writestr("OEBPS/file%04d.xhtml" % (index + 1),
                                fullhtml.encode('utf-8'))
            del fullhtml

        # Set before close(), which writes the central directory.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()

        ### STOP WRITE

        outstream.close()

        if report:
            ffnet_notify()\
                .shadow(self.S.storyID)\
                .end(file_name)\
                .post()
Beispiel #14
0
def reset_orig_chapters_epub(inputio, outfile):
    """Restore the original chapter titles inside an epub.

    Every entry matching ``.*/file<digits>.xhtml`` is checked for the
    ``chaptertitle``, ``chapterorigtitle`` and ``chaptertoctitle`` <meta>
    tags.  When the current title differs from the original one, the
    ``chaptertitle`` meta tag, the <title> and the <h3> heading are switched
    back to the original title, and the matching entry in the top level
    toc.ncx (plus the per-book ``*/toc.ncx`` of an anthology, if present) is
    updated through ``_replace_tocncx``.

    inputio = path or file-like object of the epub to read.
    outfile = path or writable file-like object.  It is only written when at
              least one chapter was actually changed, which makes it safe to
              pass the same path as inputio for in-place updates.

    Returns True if anything was changed (and outfile written), else False.
    """
    inputepub = ZipFile(inputio, "r")  # works equally well with a path or a blob

    ## build zip in memory in case updating in place(CLI).
    zipio = StringIO()

    try:
        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(zipio, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        changed = False

        ## spin through file contents, saving any unmerge toc.ncx files.
        unmerge_tocncxdoms = {}
        for zf in inputepub.namelist():
            if zf.endswith("/toc.ncx"):
                unmerge_tocncxdoms[zf] = parseString(inputepub.read(zf))

        tocncxdom = parseString(inputepub.read("toc.ncx"))

        ## spin through file contents.
        for zf in inputepub.namelist():
            # mimetype is already written; the toc.ncx files are written
            # last, after all chapter title changes have been applied.
            if zf in ("mimetype", "toc.ncx") or zf.endswith("/toc.ncx"):
                continue

            data = inputepub.read(zf)
            if not re.match(r".*/file\d+\.xhtml", zf):
                # possibly binary data, thus no .encode().
                outputepub.writestr(zf, data)
                continue

            data = data.decode("utf-8")
            soup = bs.BeautifulSoup(data, "html5lib")

            chapterorigtitle = None
            tag = soup.find("meta", {"name": "chapterorigtitle"})
            if tag:
                chapterorigtitle = tag["content"]

            # toctitle is separate for add_chapter_numbers:toconly users.
            chaptertoctitle = None
            tag = soup.find("meta", {"name": "chaptertoctitle"})
            if tag:
                chaptertoctitle = tag["content"]
            elif chapterorigtitle:
                chaptertoctitle = chapterorigtitle

            chaptertitle = None
            tag = soup.find("meta", {"name": "chaptertitle"})
            if tag:
                chaptertitle = tag["content"]

            if chaptertitle and chapterorigtitle and chapterorigtitle != chaptertitle:
                origdata = data
                # Plain string replacement on the raw markup: only exact
                # matches of the current title are rewritten.
                data = data.replace(
                    u'<meta name="chaptertitle" content="' + chaptertitle + u'"></meta>',
                    u'<meta name="chaptertitle" content="' + chapterorigtitle + u'"></meta>',
                )
                data = data.replace(
                    u"<title>" + chaptertitle + u"</title>",
                    u"<title>" + chapterorigtitle + u"</title>",
                )
                data = data.replace(
                    u"<h3>" + chaptertitle + u"</h3>",
                    u"<h3>" + chapterorigtitle + u"</h3>",
                )

                if origdata != data:
                    changed = True
                    _replace_tocncx(tocncxdom, zf, chaptertoctitle)
                    ## Also look for and update individual
                    ## book toc.ncx files for anthology in case
                    ## it's unmerged.
                    bookprefix = zf[: zf.rfind("/OEBPS/")]
                    zf_toc = bookprefix + "/toc.ncx"
                    if zf_toc in unmerge_tocncxdoms:
                        _replace_tocncx(unmerge_tocncxdoms[zf_toc],
                                        zf[len(bookprefix) + 1:],
                                        chaptertoctitle)

            outputepub.writestr(zf, data.encode("utf-8"))

        for tocnm, tocdom in unmerge_tocncxdoms.items():
            outputepub.writestr(tocnm, tocdom.toxml(encoding="utf-8"))

        outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding="utf-8"))

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        # This has to happen *before* close(): close() is what writes the
        # central directory that carries create_system.
        for zinfo in outputepub.filelist:
            zinfo.create_system = 0
        outputepub.close()

        # only *actually* write if changed.
        if changed:
            if isinstance(outfile, basestring):
                with open(outfile, "wb") as outputio:
                    outputio.write(zipio.getvalue())
            else:
                outfile.write(zipio.getvalue())

        return changed
    finally:
        # Release the input archive and the in-memory buffer even when
        # parsing or writing failed part way through.
        inputepub.close()
        zipio.close()
Beispiel #15
0
def doMerge(outputio,
            files,
            authoropts=[],
            titleopt=None,
            descopt=None,
            tags=[],
            languages=['en'],
            titlenavpoints=True,
            originalnavpoints=True,
            flattentoc=False,
            printtimes=False,
            coverjpgpath=None,
            keepmetadatafiles=False,
            source=None):
    '''
    Merge several epubs into one anthology epub written to outputio.

    Each input epub is copied under its own numbered directory ("1/",
    "2/", ...) and a new content.opf and toc.ncx are generated at the top
    level of the output.

    outputio = output file name or StringIO.
    files = list of input file names or StringIOs.  None entries are skipped.
    authoropts = list of authors to use, otherwise add from all input
    titleopt = title, otherwise '<first title> Anthology'
    descopt = description, otherwise '<title> by <author>' list for all input
    tags = dc:subject tags to include, otherwise none.
    languages = dc:language tags to include
    titlenavpoints if true, put in a new TOC entry for each epub, nesting each epub's chapters under it
    originalnavpoints if true, include the original TOCs from each epub
    flattentoc if true, flatten TOC down to one level only.
    printtimes is handed to cond_print; presumably enables the timing messages.
    coverjpgpath, Path to a jpg to use as cover image.
    keepmetadatafiles if true, also copy each input's original .opf and
        toc.ncx into the output (manifest types "origrootfile/xml" and
        "origtocncx/xml").
    source = value for dc:source and the URL dc:identifier.  When not given,
        the dc:source of the first input epub is used ("" if it has none).

    Returns a (source, filecount) tuple; filecount is the number of copied
    files whose path matches .*/(file|chapter)<digits>.(x)html.

    NOTE: the list defaults are shared between calls; that is harmless
    because they are only read here, never modified.
    '''

    printt = partial(cond_print,printtimes)

    ## Python 2.5 ZipFile is rather more primitive than later
    ## versions.  It can operate on a file, or on a StringIO, but
    ## not on an open stream.  OTOH, I suspect we would have had
    ## problems with closing and opening again to change the
    ## compression type anyway.

    filecount=0
    t = time()

    ## Write mimetype file, must be first and uncompressed.
    ## Older versions of python(2.4/5) don't allow you to specify
    ## compression by individual file.
    ## Overwrite if existing output file.
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED, allowZip64=True)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED, allowZip64=True)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to content.opf
    containerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = containerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = containerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(containerdom,"rootfile",{"full-path":"content.opf",
                                                          "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",containerdom.toprettyxml(indent='   ',encoding='utf-8'))

    ## Process input epubs.

    items = [] # list of (id, href, type) tuples(all strings) -- From .opfs' manifests
    items.append(("ncx","toc.ncx","application/x-dtbncx+xml")) ## we'll generate the toc.ncx file,
                                                               ## but it needs to be in the items manifest.
    itemrefs = [] # list of strings -- idrefs from .opfs' spines
    navmaps = [] # list of navMap DOM elements -- TOC data for each from toc.ncx files
    is_ffdl_epub = [] # list of t/f

    itemhrefs = {} # hash of item[id]s to itemref[href]s -- to find true start of book(s).
    firstitemhrefs = []

    booktitles = [] # list of strings -- Each book's title
    allauthors = [] # list of lists of strings -- Each book's list of authors.

    # hrefs already written to the output; only used for membership tests.
    filelist = set()

    printt("prep output:%s"%(time()-t))
    t = time()

    # NOTE(review): navmaps, firstitemhrefs, booktitles, allauthors and
    # is_ffdl_epub are later indexed by the same book position; that assumes
    # every input epub contributes exactly one entry to each -- confirm.
    booknum=1
    firstmetadom = None
    for infile in files:
        if infile is None : continue

        bookdir = "%d/" % booknum
        bookid = "a%d" % booknum

        epub = ZipFile(infile, 'r')
        try:
            ## Find the .opf file.
            container = epub.read("META-INF/container.xml")
            containerdom = parseString(container)
            rootfilenodelist = containerdom.getElementsByTagNameNS("*","rootfile")
            rootfilename = rootfilenodelist[0].getAttribute("full-path")

            ## Save the path to the .opf file--hrefs inside it are relative to it.
            relpath = get_path_part(rootfilename)

            metadom = parseString(epub.read(rootfilename))
            #logger.debug("metadom:%s"%epub.read(rootfilename))
            if booknum==1 and not source:
                # Best effort: a first book without dc:source simply
                # leaves source empty.
                try:
                    firstmetadom = metadom.getElementsByTagNameNS("*","metadata")[0]
                    source=firstmetadom.getElementsByTagName("dc:source")[0].firstChild.data.encode("utf-8")
                except Exception:
                    source=""

            # if the epub was ever edited with Sigil, it changed the unique-identifier,
            # but dc:contributor was left.
            #is_ffdl_epub.append(metadom.documentElement.getAttribute('unique-identifier') == "fanficdownloader-uid")
            is_ffdl_epub.append(False)

            for c in metadom.getElementsByTagName("dc:contributor"):
                # logger.debug("dc:contributor:%s"%getText(c.childNodes))
                if c.getAttribute("opf:role") == "bkp" and \
                        getText(c.childNodes) in ["fanficdownloader [http://fanficdownloader.googlecode.com]",
                                                  "FanFicFare [https://github.com/JimmXinu/FanFicFare]"]:
                    is_ffdl_epub[-1] = True # set last.
                    break

            ## Save indiv book title
            try:
                booktitles.append(metadom.getElementsByTagName("dc:title")[0].firstChild.data)
            except Exception:
                booktitles.append("(Title Missing)")

            ## Save authors.
            authors=[]
            for creator in metadom.getElementsByTagName("dc:creator"):
                try:
                    # Precedence is: role=="aut" OR (no role AND has text).
                    # An empty role=="aut" creator raises below and is skipped.
                    if( creator.getAttribute("opf:role") == "aut" or not creator.hasAttribute("opf:role") and creator.firstChild is not None):
                        authors.append(creator.firstChild.data)
                except Exception:
                    pass
            if not authors:
                authors.append("(Author Missing)")
            allauthors.append(authors)

            if keepmetadatafiles:
                itemid=bookid+"rootfile"
                itemhref = rootfilename
                href=bookdir+itemhref
                #logger.debug("write rootfile %s to %s"%(itemhref,href))
                outputepub.writestr(href,
                                    epub.read(itemhref))
                items.append((itemid,href,"origrootfile/xml"))

            # spin through the manifest--only place there are item tags.
            # Correction--only place there *should* be item tags.  But
            # somebody found an epub that had them elsewhere, too.
            manifesttag=metadom.getElementsByTagNameNS("*","manifest")[0]
            for item in manifesttag.getElementsByTagNameNS("*","item"):
                itemid=bookid+item.getAttribute("id")
                itemhref = normpath(unquote(item.getAttribute("href"))) # remove %20, etc.
                href=bookdir+relpath+itemhref
                if( item.getAttribute("media-type") == "application/x-dtbncx+xml" ):
                    # TOC file is only one with this type--as far as I know.
                    # grab the whole navmap, deal with it later.
                    tocdom = parseString(epub.read(normpath(relpath+item.getAttribute("href"))))

                    # update all navpoint ids with bookid for uniqueness.
                    for navpoint in tocdom.getElementsByTagNameNS("*","navPoint"):
                        navpoint.setAttribute("id",bookid+navpoint.getAttribute("id"))

                    # update all content paths with bookdir for uniqueness.
                    for content in tocdom.getElementsByTagNameNS("*","content"):
                        content.setAttribute("src",normpath(bookdir+relpath+content.getAttribute("src")))

                    navmaps.append(tocdom.getElementsByTagNameNS("*","navMap")[0])

                    if keepmetadatafiles:
                        #logger.debug("write toc.ncx %s to %s"%(relpath+itemhref,href))
                        outputepub.writestr(href,
                                            epub.read(normpath(relpath+itemhref)))
                        items.append((itemid,href,"origtocncx/xml"))
                else:
                    href=href.encode('utf8')
                    #logger.debug("item id: %s -> %s:"%(itemid,href))
                    itemhrefs[itemid] = href
                    if href not in filelist:
                        try:
                            outputepub.writestr(href,
                                                epub.read(normpath(relpath+itemhref)))
                            if re.match(r'.*/(file|chapter)\d+\.x?html',href):
                                filecount+=1
                            items.append((itemid,href,item.getAttribute("media-type")))
                            filelist.add(href)
                        except KeyError: # Skip missing files.
                            logger.info("Skipping missing file %s (%s)"%(href,relpath+itemhref))
                            del itemhrefs[itemid]
        finally:
            # Everything needed from this input has been read (or reading
            # failed); release the archive handle instead of leaking it.
            epub.close()

        itemreflist = metadom.getElementsByTagNameNS("*","itemref")
        # logger.debug("itemreflist:%s"%itemreflist)
        # logger.debug("itemhrefs:%s"%itemhrefs)
        # logger.debug("bookid:%s"%bookid)
        # logger.debug("itemreflist[0].getAttribute(idref):%s"%itemreflist[0].getAttribute("idref"))

        # Looking for the first item in itemreflist that wasn't
        # discarded due to missing files.
        for itemref in itemreflist:
            idref = bookid+itemref.getAttribute("idref")
            if idref in itemhrefs:
                firstitemhrefs.append(itemhrefs[idref])
                break

        for itemref in itemreflist:
            itemrefs.append(bookid+itemref.getAttribute("idref"))

        booknum=booknum+1

    printt("after file loop:%s"%(time()-t))
    t = time()

    ## create content.opf file.
    uniqueid="epubmerge-uid-%d" % time() # real sophisticated uid scheme.
    contentdom = getDOMImplementation().createDocument(None, "package", None)
    package = contentdom.documentElement

    package.setAttribute("version","2.0")
    package.setAttribute("xmlns","http://www.idpf.org/2007/opf")
    package.setAttribute("unique-identifier","epubmerge-id")
    metadata=newTag(contentdom,"metadata",
                    attrs={"xmlns:dc":"http://purl.org/dc/elements/1.1/",
                           "xmlns:opf":"http://www.idpf.org/2007/opf"})
    metadata.appendChild(newTag(contentdom,"dc:identifier",text=uniqueid,attrs={"id":"epubmerge-id"}))
    if( titleopt is None ):
        titleopt = booktitles[0]+" Anthology"
    metadata.appendChild(newTag(contentdom,"dc:title",text=titleopt))

    # If cmdline authors, use those instead of those collected from the epubs
    # (allauthors kept for TOC & description gen below.)
    # NOTE(review): '> 1' means a single explicit author is ignored in favor
    # of the collected ones -- looks unintended, confirm before changing.
    if( len(authoropts) > 1  ):
        useauthors=[authoropts]
    else:
        useauthors=allauthors

    usedauthors=dict()
    for authorlist in useauthors:
        for author in authorlist:
            if author not in usedauthors:
                usedauthors[author]=author
                metadata.appendChild(newTag(contentdom,"dc:creator",
                                            attrs={"opf:role":"aut"},
                                            text=author))

    metadata.appendChild(newTag(contentdom,"dc:contributor",text="epubmerge",attrs={"opf:role":"bkp"}))
    metadata.appendChild(newTag(contentdom,"dc:rights",text="Copyrights as per source stories"))

    for l in languages:
        metadata.appendChild(newTag(contentdom,"dc:language",text=l))

    if not descopt:
        # created now, but not filled in until TOC generation to save loops.
        description = newTag(contentdom,"dc:description",text="Anthology containing:\n")
    else:
        description = newTag(contentdom,"dc:description",text=descopt)
    metadata.appendChild(description)

    if source:
        metadata.appendChild(newTag(contentdom,"dc:identifier",
                                    attrs={"opf:scheme":"URL"},
                                    text=source))
        metadata.appendChild(newTag(contentdom,"dc:source",
                                    text=source))

    for tag in tags:
        metadata.appendChild(newTag(contentdom,"dc:subject",text=tag))

    package.appendChild(metadata)

    manifest = contentdom.createElement("manifest")
    package.appendChild(manifest)

    spine = newTag(contentdom,"spine",attrs={"toc":"ncx"})
    package.appendChild(spine)

    if coverjpgpath:
        # in case coverjpg isn't a jpg:
        coverext = 'jpg'
        covertype = 'image/jpeg'
        try:
            coverext = coverjpgpath.split('.')[-1].lower()
            covertype = imagetypes.get(coverext,covertype)
        except Exception:
            pass
        logger.debug("coverjpgpath:%s coverext:%s covertype:%s"%(coverjpgpath,coverext,covertype))
        # <meta name="cover" content="cover.jpg"/>
        metadata.appendChild(newTag(contentdom,"meta",{"name":"cover",
                                                       "content":"coverimageid"}))
        guide = newTag(contentdom,"guide")
        guide.appendChild(newTag(contentdom,"reference",attrs={"type":"cover",
                                                   "title":"Cover",
                                                   "href":"cover.xhtml"}))
        package.appendChild(guide)

        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':"coverimageid",
                                           'href':"cover."+coverext,
                                           'media-type':covertype}))

        # Note that the id of the cover xhmtl *must* be 'cover'
        # for it to work on Nook.
        manifest.appendChild(newTag(contentdom,"item",
                                    attrs={'id':"cover",
                                           'href':"cover.xhtml",
                                           'media-type':"application/xhtml+xml"}))

        spine.appendChild(newTag(contentdom,"itemref",
                                 attrs={"idref":"cover",
                                        "linear":"yes"}))

    for (manifest_id,manifest_href,manifest_type) in items:
        manifest.appendChild(newTag(contentdom,"item",
                                       attrs={'id':manifest_id,
                                              'href':manifest_href,
                                              'media-type':manifest_type}))

    for itemref in itemrefs:
        spine.appendChild(newTag(contentdom,"itemref",
                                    attrs={"idref":itemref,
                                           "linear":"yes"}))

    ## create toc.ncx file
    tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
    ncx = tocncxdom.documentElement
    ncx.setAttribute("version","2005-1")
    ncx.setAttribute("xmlns","http://www.daisy.org/z3986/2005/ncx/")
    head = tocncxdom.createElement("head")
    ncx.appendChild(head)
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:uid", "content":uniqueid}))
    # placeholder depth; replaced with the real maximum depth further down.
    depthnode = newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:depth", "content":"4"})
    head.appendChild(depthnode)
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:totalPageCount", "content":"0"}))
    head.appendChild(newTag(tocncxdom,"meta",
                            attrs={"name":"dtb:maxPageNumber", "content":"0"}))

    docTitle = tocncxdom.createElement("docTitle")
    docTitle.appendChild(newTag(tocncxdom,"text",text=titleopt))
    ncx.appendChild(docTitle)

    tocnavMap = tocncxdom.createElement("navMap")
    ncx.appendChild(tocnavMap)

    booknum=0

    printt("wrote initial metadata:%s"%(time()-t))
    t = time()

    for navmap in navmaps:
        depthnavpoints = navmap.getElementsByTagNameNS("*","navPoint") # for checking more than one TOC entry

        ## only gets top level TOC entries.  sub entries carried inside.
        ## A real list (not a lazy filter) because the nodes are moved out
        ## of navmap.childNodes while it is being walked below.
        navpoints = [ x for x in navmap.childNodes
                      if isinstance(x,Element) and x.tagName=="navPoint" ]
        newnav = None
        if titlenavpoints:
            newnav = newTag(tocncxdom,"navPoint",{"id":"book%03d"%booknum})
            navlabel = newTag(tocncxdom,"navLabel")
            newnav.appendChild(navlabel)
            # For purposes of TOC titling & desc, use first book author.  Skip adding author if only one.
            if len(usedauthors) > 1:
                title = booktitles[booknum]+" by "+allauthors[booknum][0]
            else:
                title = booktitles[booknum]

            navlabel.appendChild(newTag(tocncxdom,"text",text=title))
            # Find the first 'spine' item's content for the title navpoint.
            # Many epubs have the first chapter as first navpoint, so we can't just
            # copy that anymore.
            newnav.appendChild(newTag(tocncxdom,"content",
                                      {"src":firstitemhrefs[booknum]}))

            #logger.debug("newnav:%s"%newnav.toprettyxml())
            tocnavMap.appendChild(newnav)
        else:
            newnav = tocnavMap

        if not descopt and len(allauthors[booknum]) > 0:
            description.appendChild(contentdom.createTextNode(booktitles[booknum]+" by "+allauthors[booknum][0]+"\n"))

        # If only one TOC point(total, not top level), or if not
        # including title nav point, include sub book TOC entries.
        if originalnavpoints and (len(depthnavpoints) > 1 or not titlenavpoints):
            for navpoint in navpoints:
                newnav.appendChild(navpoint)
                navpoint.is_ffdl_epub = is_ffdl_epub[booknum]

        booknum=booknum+1
        # end of navmaps loop.


    maxdepth = 0
    contentsrcs = {}
    removednodes = []
    ## Force strict ordering of playOrder, stripping out some.
    playorder=0
    for navpoint in tocncxdom.getElementsByTagNameNS("*","navPoint"):
        if navpoint in removednodes:
            continue
        # need content[src] to compare for dups.  epub wants dup srcs to have same playOrder.
        contentsrc = None
        for n in navpoint.childNodes:
            if isinstance(n,Element) and n.tagName == "content":
                contentsrc = n.getAttribute("src")
                # logger.debug("contentsrc: %s"%contentsrc)
                break

        if( contentsrc not in contentsrcs ):

            parent = navpoint.parentNode
            # Best effort: the generated title navpoints have no
            # is_ffdl_epub attribute and contentsrc may be None; either
            # just skips the cleanup.
            try:
                # if the epub was ever edited with Sigil, it changed
                # the id, but the file name is the same.
                if navpoint.is_ffdl_epub and \
                        ( navpoint.getAttribute("id").endswith('log_page') \
                              or contentsrc.endswith("log_page.xhtml") ):
                    sibs = [ x for x in parent.childNodes
                             if isinstance(x,Element) and x.tagName=="navPoint" ]
                    # if only logpage and one chapter, remove them from TOC and just show story.
                    if len(sibs) == 2:
                        parent.removeChild(navpoint)
                        # logger.debug("Removing %s:"% sibs[0].getAttribute("playOrder"))
                        parent.removeChild(sibs[1])
                        removednodes.append(sibs[1])
            except Exception:
                pass

            # New src, new number.
            contentsrcs[contentsrc] = navpoint.getAttribute("id")
            playorder += 1
            navpoint.setAttribute("playOrder","%d" % playorder)
            # logger.debug("playorder:%d:"%playorder)

            # need to know depth of deepest navpoint for <meta name="dtb:depth" content="2"/>
            npdepth = 1
            dp = navpoint.parentNode
            while dp and dp.tagName != "navMap":
                npdepth += 1
                dp = dp.parentNode

            if npdepth > maxdepth:
                maxdepth = npdepth
        else:
            # same content, look for ffdl and title_page and/or single chapter.

            # easier to just set it now, even if the node gets removed later.
            navpoint.setAttribute("playOrder","%d" % playorder)
            # logger.debug("playorder:%d:"%playorder)

            parent = navpoint.parentNode
            try:
                # if the epub was ever edited with Sigil, it changed
                # the id, but the file name is the same.
                if navpoint.is_ffdl_epub and \
                        ( navpoint.getAttribute("id").endswith('title_page') \
                              or contentsrc.endswith("title_page.xhtml") ):
                    parent.removeChild(navpoint)
                    sibs = [ x for x in parent.childNodes
                             if isinstance(x,Element) and x.tagName=="navPoint" ]
                    # if only one chapter after removing title_page, remove it too.
                    if len(sibs) == 1:
                        # logger.debug("Removing %s:"% sibs[0].getAttribute("playOrder"))
                        parent.removeChild(sibs[0])
                        removednodes.append(sibs[0])
            except Exception:
                pass


    if flattentoc:
        maxdepth = 1
        # already have play order and pesky dup/single chapters
        # removed, just need to flatten.
        flattocnavMap = tocncxdom.createElement("navMap")
        for n in tocnavMap.getElementsByTagNameNS("*","navPoint"):
            flattocnavMap.appendChild(n)

        ncx.replaceChild(flattocnavMap,tocnavMap)

    printt("navmap/toc maddess:%s"%(time()-t))
    t = time()

    depthnode.setAttribute("content","%d"%maxdepth)

    ## content.opf written now due to description being filled in
    ## during TOC generation to save loops.
    contentxml = contentdom.toprettyxml(indent='   ',encoding='utf-8')
    # tweak for brain damaged Nook STR.  Nook insists on name before content.
    contentxml = contentxml.replace('<meta content="coverimageid" name="cover"/>',
                                    '<meta name="cover" content="coverimageid"/>')
    outputepub.writestr("content.opf",contentxml)
    outputepub.writestr("toc.ncx",tocncxdom.toprettyxml(indent='   ',encoding='utf-8'))

    printt("wrote opf/ncx files:%s"%(time()-t))
    t = time()

    if coverjpgpath:
        # write, not write string.  Pulling from file.
        outputepub.write(coverjpgpath,"cover."+coverext)

        outputepub.writestr("cover.xhtml",'''
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en"><head><title>Cover</title><style type="text/css" title="override_css">
@page {padding: 0pt; margin:0pt}
body { text-align: center; padding:0pt; margin: 0pt; }
div { margin: 0pt; padding: 0pt; }
</style></head><body><div>
<img src="cover.'''+coverext+'''" alt="cover"/>
</div></body></html>
''')

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    # Must be done before close(), which writes the central directory.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    printt("closed outputepub:%s"%(time()-t))
    t = time()

    return (source,filecount)
Beispiel #16
0
    def write_split_epub(self,
                         outputio,
                         linenums,
                         changedtocs=None,
                         authoropts=None,
                         titleopt=None,
                         descopt=None,
                         tags=None,
                         languages=None,
                         coverjpgpath=None):
        """Write the selected split sections into a new epub zip.

        Opens ``outputio`` as a zip archive, writes the ``mimetype`` entry
        and ``META-INF/container.xml``, builds the content.opf DOM
        (metadata, manifest and spine) in memory, and copies the files
        returned by ``self.get_split_files(linenums)`` plus the entries of
        ``self.filecache.linkedfiles`` into the archive.

        NOTE(review): within this block the content.opf DOM is never
        serialized and the zip is never closed, and ``changedtocs`` and
        ``coverjpgpath`` are accepted but never read.  Presumably the
        method originally continued past this point -- confirm.

        Args:
            outputio: Passed straight to ``ZipFile``; opened first in 'w'
                mode (overwriting) and then re-opened in 'a' mode.
            linenums: Passed to ``self.get_split_files`` to select content.
            changedtocs: Not read in this block.  Defaults to an empty dict.
            authoropts: Author names for ``dc:creator``.  When empty,
                ``self.origauthors`` is used instead.
            titleopt: Title for ``dc:title``.  When None,
                ``self.origtitle + " Split"`` is used.
            descopt: Text for ``dc:description``.  When falsy, a
                "Split from <title> by <authors>." description is generated.
            tags: Values written as ``dc:subject`` elements.
            languages: Values written as ``dc:language`` elements.  When
                empty, a single "en" entry is written.
            coverjpgpath: Not read in this block.
        """
        # None sentinels replace the former mutable default arguments; the
        # effective defaults are the same values as before.
        if changedtocs is None:
            changedtocs = {}
        if authoropts is None:
            authoropts = []
        if tags is None:
            tags = []
        if languages is None:
            languages = ['en']

        files = self.get_split_files(linenums)

        ## Write mimetype file, must be first and uncompressed.
        ## Older versions of python(2.4/5) don't allow you to specify
        ## compression by individual file.
        ## Overwrite if existing output file.
        outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr("mimetype", "application/epub+zip")
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
        outputepub.debug = 3

        ## Create META-INF/container.xml file.  The only thing it does is
        ## point to content.opf
        containerdom = getDOMImplementation().createDocument(None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute("xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(newTag(containerdom, "rootfile", {"full-path": "content.opf",
                                                                "media-type": "application/oebps-package+xml"}))
        outputepub.writestr("META-INF/container.xml", containerdom.toprettyxml(indent='   ', encoding='utf-8'))

        ## Create the content.opf DOM.
        uniqueid = "epubsplit-uid-%d" % time()  # real sophisticated uid scheme.
        contentdom = getDOMImplementation().createDocument(None, "package", None)
        package = contentdom.documentElement

        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "epubsplit-id")
        metadata = newTag(contentdom, "metadata",
                          attrs={"xmlns:dc": "http://purl.org/dc/elements/1.1/",
                                 "xmlns:opf": "http://www.idpf.org/2007/opf"})
        metadata.appendChild(newTag(contentdom, "dc:identifier", text=uniqueid, attrs={"id": "epubsplit-id"}))
        if titleopt is None:
            titleopt = self.origtitle + " Split"
        metadata.appendChild(newTag(contentdom, "dc:title", text=titleopt))

        if authoropts:
            useauthors = authoropts
        else:
            useauthors = self.origauthors

        # Emit each author once, keeping first-seen order.
        seenauthors = set()
        for author in useauthors:
            if author not in seenauthors:
                seenauthors.add(author)
                metadata.appendChild(newTag(contentdom, "dc:creator",
                                            attrs={"opf:role": "aut"},
                                            text=author))

        metadata.appendChild(newTag(contentdom, "dc:contributor", text="epubsplit", attrs={"opf:role": "bkp"}))
        metadata.appendChild(newTag(contentdom, "dc:rights", text="Copyrights as per source stories"))

        if languages:
            for lang in languages:
                metadata.appendChild(newTag(contentdom, "dc:language", text=lang))
        else:
            metadata.appendChild(newTag(contentdom, "dc:language", text="en"))

        if not descopt:
            # No description supplied: generate one naming the original
            # title and authors.
            description = newTag(contentdom, "dc:description",
                                 text="Split from %s by %s." % (self.origtitle, ", ".join(self.origauthors)))
        else:
            description = newTag(contentdom, "dc:description", text=descopt)
        metadata.appendChild(description)

        for tag in tags:
            metadata.appendChild(newTag(contentdom, "dc:subject", text=tag))

        package.appendChild(metadata)

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        # NOTE(review): page progression is hard-coded to right-to-left
        # here -- confirm that is intended for every split book.
        spine = newTag(contentdom, "spine", attrs={"page-progression-direction": "rtl",
                                                   "toc": "ncx"
                                                   })
        package.appendChild(spine)

        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': 'ncx',
                                           'href': 'toc.ncx',
                                           'media-type': 'application/x-dtbncx+xml'}))

        manifest.appendChild(newTag(contentdom, "item",
                                    attrs={'id': "not_purchased",
                                           'href': "not_purchased_sections.xhtml",
                                           'media-type': "application/xhtml+xml"}))

        contentcount = 0
        # The id supplied with each file is ignored; manifest ids are
        # regenerated as a0, a1, ... in output order.
        for (filename, _origid, mediatype, filedata) in files:
            if filename == "not_purchased_sections.xhtml":
                # Already declared in the manifest above; skip so it is
                # not declared twice.  NOTE(review): this also means its
                # data is not written to the zip by this loop.
                continue

            # NOTE(review): globalindex is not assigned anywhere in this
            # method, so it must resolve from an enclosing/module scope --
            # confirm.  When it names this file, the "not_purchased" page
            # is placed in the spine immediately before it.
            if globalindex == filename:
                spine.appendChild(newTag(contentdom, "itemref",
                                         attrs={"idref": "not_purchased",
                                                "linear": "yes"}))

            outputepub.writestr(filename, filedata.encode('utf-8'))
            itemid = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': itemid,
                                               'href': filename,
                                               'media-type': mediatype}))
            spine.appendChild(newTag(contentdom, "itemref",
                                     attrs={"idref": itemid,
                                            "linear": "yes"}))

        # Without a globalindex the "not_purchased" page goes last.
        if globalindex is None:
            spine.appendChild(newTag(contentdom, "itemref",
                                     attrs={"idref": "not_purchased",
                                            "linear": "yes"}))

        for (linked, mediatype) in self.filecache.linkedfiles:
            # Best effort: a linked file that cannot be copied is reported
            # and otherwise ignored.
            try:
                outputepub.writestr(linked, self.get_file(linked))
            except Exception as e:
                print("Failed to copy linked file (%s)\nException: %s" % (linked, e))

            # NOTE(review): the manifest entry is added even when the copy
            # above failed, so the manifest can reference a file that is
            # missing from the zip -- confirm whether it should be skipped.
            itemid = "a%d" % contentcount
            contentcount += 1
            manifest.appendChild(newTag(contentdom, "item",
                                        attrs={'id': itemid,
                                               'href': linked,
                                               'media-type': mediatype}))
Beispiel #17
0
    def writeStoryImpl(self, out):
        """Build the epub for ``self.story`` in memory and write it to ``out``.

        The zip is assembled in a StringIO buffer in this order: mimetype
        (stored uncompressed), META-INF/container.xml, any old cover
        files, images, content.opf, toc.ncx, the stylesheet, cover page,
        title page, TOC page, log page, chapter files and, if present,
        the calibre bookmarks file.  The finished buffer is then passed
        to ``out.write()`` in a single call.

        Args:
            out: Object whose ``write()`` method receives the complete
                zip data.
        """

        ## Python 2.5 ZipFile is rather more primitive than later
        ## versions.  It can operate on a file, or on a StringIO, but
        ## not on an open stream.  OTOH, I suspect we would have had
        ## problems with closing and opening again to change the
        ## compression type anyway.
        zipio = StringIO.StringIO()

        ## mimetype must be first file and uncompressed.  Python 2.5
        ## ZipFile can't change compression type file-by-file, so we
        ## have to close and re-open
        outputepub = ZipFile(zipio, 'w', compression=ZIP_STORED)
        outputepub.debug = 3
        outputepub.writestr('mimetype', 'application/epub+zip')
        outputepub.close()

        ## Re-open file for content.
        outputepub = ZipFile(zipio, 'a', compression=ZIP_DEFLATED)
        outputepub.debug = 3

        ## Create META-INF/container.xml file.  The only thing it does is
        ## point to content.opf
        containerdom = getDOMImplementation().createDocument(
            None, "container", None)
        containertop = containerdom.documentElement
        containertop.setAttribute("version", "1.0")
        containertop.setAttribute(
            "xmlns", "urn:oasis:names:tc:opendocument:xmlns:container")
        rootfiles = containerdom.createElement("rootfiles")
        containertop.appendChild(rootfiles)
        rootfiles.appendChild(
            newTag(
                containerdom, "rootfile", {
                    "full-path": "content.opf",
                    "media-type": "application/oebps-package+xml"
                }))
        outputepub.writestr("META-INF/container.xml",
                            containerdom.toxml(encoding='utf-8'))
        containerdom.unlink()
        del containerdom

        ## Epub has two metadata files with real data.  We're putting
        ## them in content.opf (pointed to by META-INF/container.xml)
        ## and toc.ncx (pointed to by content.opf)

        ## content.opf contains metadata, a 'manifest' list of all
        ## other included files, and another 'spine' list of the items in the
        ## file

        uniqueid = 'fanficfare-uid:%s-u%s-s%s' % (
            self.getMetadata('site'), self.story.getList('authorId')[0],
            self.getMetadata('storyId'))

        contentdom = getDOMImplementation().createDocument(
            None, "package", None)
        package = contentdom.documentElement
        package.setAttribute("version", "2.0")
        package.setAttribute("xmlns", "http://www.idpf.org/2007/opf")
        package.setAttribute("unique-identifier", "fanficfare-uid")
        metadata = newTag(contentdom,
                          "metadata",
                          attrs={
                              "xmlns:dc": "http://purl.org/dc/elements/1.1/",
                              "xmlns:opf": "http://www.idpf.org/2007/opf"
                          })
        package.appendChild(metadata)

        metadata.appendChild(
            newTag(contentdom,
                   "dc:identifier",
                   text=uniqueid,
                   attrs={"id": "fanficfare-uid"}))

        if self.getMetadata('title'):
            metadata.appendChild(
                newTag(contentdom, "dc:title", text=self.getMetadata('title')))

        # One dc:creator per author when the story holds a list of
        # authors, otherwise a single dc:creator.
        if self.getMetadata('author'):
            if self.story.isList('author'):
                for auth in self.story.getList('author'):
                    metadata.appendChild(
                        newTag(contentdom,
                               "dc:creator",
                               attrs={"opf:role": "aut"},
                               text=auth))
            else:
                metadata.appendChild(
                    newTag(contentdom,
                           "dc:creator",
                           attrs={"opf:role": "aut"},
                           text=self.getMetadata('author')))

        metadata.appendChild(
            newTag(contentdom,
                   "dc:contributor",
                   text="FanFicFare [https://github.com/JimmXinu/FanFicFare]",
                   attrs={"opf:role": "bkp"}))
        metadata.appendChild(newTag(contentdom, "dc:rights", text=""))
        if self.story.getMetadata('langcode'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:language",
                       text=self.story.getMetadata('langcode')))
        else:
            metadata.appendChild(newTag(contentdom, "dc:language", text='en'))

        #  published, created, updated, calibre
        #  Leave calling self.story.getMetadataRaw directly in case date format changes.
        if self.story.getMetadataRaw('datePublished'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:date",
                       attrs={"opf:event": "publication"},
                       text=self.story.getMetadataRaw(
                           'datePublished').strftime("%Y-%m-%d")))

        if self.story.getMetadataRaw('dateCreated'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:date",
                       attrs={"opf:event": "creation"},
                       text=self.story.getMetadataRaw('dateCreated').strftime(
                           "%Y-%m-%d")))

        if self.story.getMetadataRaw('dateUpdated'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:date",
                       attrs={"opf:event": "modification"},
                       text=self.story.getMetadataRaw('dateUpdated').strftime(
                           "%Y-%m-%d")))
            # The update date is also recorded as the calibre timestamp.
            metadata.appendChild(
                newTag(contentdom,
                       "meta",
                       attrs={
                           "name":
                           "calibre:timestamp",
                           "content":
                           self.story.getMetadataRaw('dateUpdated').strftime(
                               "%Y-%m-%dT%H:%M:%S")
                       }))

        series = self.story.getMetadataRaw('series')
        if series and self.getConfig('calibre_series_meta'):
            series_index = "0.0"
            # Test for the same ' [' separator that is split on below; a
            # bare '[' without the leading space is part of the name.
            if ' [' in series:
                logger.debug(series)
                ## assumed "series [series_index]".  Split on the LAST
                ## ' [' so series names that themselves contain ' ['
                ## still yield the trailing index.
                (seriesname, _sep, bracketed) = series.rpartition(' [')
                try:
                    ## calibre always outputs a series_index and it's
                    ## always a float with 1 or 2 decimals.  FFF usually
                    ## has either an integer or no index. (injected
                    ## calibre series is the only float at this time)
                    ## [:-1] drops the closing ']'.
                    series_index = "%.2f" % float(bracketed[:-1])
                    series = seriesname
                except ValueError:
                    # Bracket didn't hold a number: keep the whole string
                    # as the series name with the default index.
                    pass

            metadata.appendChild(
                newTag(contentdom,
                       "meta",
                       attrs={
                           "name": "calibre:series",
                           "content": series
                       }))
            metadata.appendChild(
                newTag(contentdom,
                       "meta",
                       attrs={
                           "name": "calibre:series_index",
                           "content": series_index
                       }))

        if self.getMetadata('description'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:description",
                       text=self.getMetadata('description')))

        for subject in self.story.getSubjectTags():
            metadata.appendChild(newTag(contentdom, "dc:subject",
                                        text=subject))

        if self.getMetadata('site'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:publisher",
                       text=self.getMetadata('site')))

        if self.getMetadata('storyUrl'):
            metadata.appendChild(
                newTag(contentdom,
                       "dc:identifier",
                       attrs={"opf:scheme": "URL"},
                       text=self.getMetadata('storyUrl')))
            metadata.appendChild(
                newTag(contentdom,
                       "dc:source",
                       text=self.getMetadata('storyUrl')))

        ## end of metadata, create manifest.
        items = []  # list of (id, href, type, title) tuples(all strings)
        itemrefs = []  # list of strings -- idrefs from .opfs' spines
        items.append(("ncx", "toc.ncx", "application/x-dtbncx+xml",
                      None))  ## we'll generate the toc.ncx file,
        ## but it needs to be in the items manifest.

        guide = None
        coverIO = None

        coverimgid = "image0000"
        if not self.story.cover and self.story.oldcover:
            logger.debug(
                "writer_epub: no new cover, has old cover, write image.")
            (oldcoverhtmlhref, oldcoverhtmltype, oldcoverhtmldata,
             oldcoverimghref, oldcoverimgtype,
             oldcoverimgdata) = self.story.oldcover
            outputepub.writestr(oldcoverhtmlhref, oldcoverhtmldata)
            outputepub.writestr(oldcoverimghref, oldcoverimgdata)

            coverimgid = "image0"
            items.append((coverimgid, oldcoverimghref, oldcoverimgtype, None))
            items.append(("cover", oldcoverhtmlhref, oldcoverhtmltype, None))
            itemrefs.append("cover")
            metadata.appendChild(
                newTag(contentdom, "meta", {
                    "content": "image0",
                    "name": "cover"
                }))
            guide = newTag(contentdom, "guide")
            guide.appendChild(
                newTag(contentdom,
                       "reference",
                       attrs={
                           "type": "cover",
                           "title": "Cover",
                           "href": oldcoverhtmlhref
                       }))

        if self.getConfig('include_images'):
            imgcount = 0
            for imgmap in self.story.getImgUrls():
                imgfile = "OEBPS/" + imgmap['newsrc']
                outputepub.writestr(imgfile, imgmap['data'])
                items.append(
                    ("image%04d" % imgcount, imgfile, imgmap['mime'], None))
                imgcount += 1
                if 'cover' in imgfile:
                    # make sure coverimgid is set to the cover, not
                    # just the first image.
                    coverimgid = items[-1][0]

        items.append(("style", "OEBPS/stylesheet.css", "text/css", None))

        if self.story.cover:
            # Note that the id of the cover xhtml *must* be 'cover'
            # for it to work on Nook.
            items.append(
                ("cover", "OEBPS/cover.xhtml", "application/xhtml+xml", None))
            itemrefs.append("cover")
            #
            # <meta name="cover" content="cover.jpg"/>
            metadata.appendChild(
                newTag(contentdom, "meta", {
                    "content": coverimgid,
                    "name": "cover"
                }))
            # cover stuff for later:
            # at end of <package>:
            # <guide>
            # <reference type="cover" title="Cover" href="Text/cover.xhtml"/>
            # </guide>
            guide = newTag(contentdom, "guide")
            guide.appendChild(
                newTag(contentdom,
                       "reference",
                       attrs={
                           "type": "cover",
                           "title": "Cover",
                           "href": "OEBPS/cover.xhtml"
                       }))

            if self.hasConfig("cover_content"):
                COVER = string.Template(self.getConfig("cover_content"))
            else:
                COVER = self.EPUB_COVER
            coverIO = StringIO.StringIO()
            # All story metadata plus 'coverimg' feed the cover template.
            # Built by copy-and-assign rather than concatenating two
            # .items() results, which only works when items() is a list.
            covervals = dict(self.story.getAllMetadata().items())
            covervals['coverimg'] = self.story.cover
            coverIO.write(COVER.substitute(covervals))

        if self.getConfig("include_titlepage"):
            items.append(("title_page", "OEBPS/title_page.xhtml",
                          "application/xhtml+xml", "Title Page"))
            itemrefs.append("title_page")
        if len(self.story.getChapters()) > 1 and self.getConfig(
                "include_tocpage") and not self.metaonly:
            items.append(("toc_page", "OEBPS/toc_page.xhtml",
                          "application/xhtml+xml", "Table of Contents"))
            itemrefs.append("toc_page")

        ## save where to insert logpage.
        logpage_indices = (len(items), len(itemrefs))

        # Log page is written when include_logpage is "true", or when it
        # is "smart" and there is a log file or the story is In-Progress.
        dologpage = ( self.getConfig("include_logpage") == "smart" and \
                          (self.story.logfile or self.story.getMetadataRaw("status") == "In-Progress") )  \
                     or self.getConfig("include_logpage") == "true"

        ## collect chapter urls and file names for internalize_text_links option.
        chapurlmap = {}
        for index, chap in enumerate(self.story.getChapters(fortoc=True)):
            if chap.html:
                i = index + 1
                items.append(("file%04d" % i, "OEBPS/file%04d.xhtml" % i,
                              "application/xhtml+xml", chap.title))
                itemrefs.append("file%04d" % i)
                # url -> relative epub file name.
                chapurlmap[chap.url] = "file%04d.xhtml" % i

        if dologpage:
            if self.getConfig("logpage_at_end") == "true":
                ## insert logpage after chapters.
                logpage_indices = (len(items), len(itemrefs))
            items.insert(logpage_indices[0],
                         ("log_page", "OEBPS/log_page.xhtml",
                          "application/xhtml+xml", "Update Log"))
            itemrefs.insert(logpage_indices[1], "log_page")

        manifest = contentdom.createElement("manifest")
        package.appendChild(manifest)
        for item in items:
            (id, href, type, title) = item
            manifest.appendChild(
                newTag(contentdom,
                       "item",
                       attrs={
                           'id': id,
                           'href': href,
                           'media-type': type
                       }))

        spine = newTag(contentdom, "spine", attrs={"toc": "ncx"})
        package.appendChild(spine)
        for itemref in itemrefs:
            spine.appendChild(
                newTag(contentdom,
                       "itemref",
                       attrs={
                           "idref": itemref,
                           "linear": "yes"
                       }))
        # guide only exists if there's a cover.
        if guide:
            package.appendChild(guide)

        # write content.opf to zip.
        contentxml = contentdom.toxml(encoding='utf-8')

        # tweak for brain damaged Nook STR.  Nook insists on name before content.
        contentxml = contentxml.replace(
            '<meta content="%s" name="cover"/>' % coverimgid,
            '<meta name="cover" content="%s"/>' % coverimgid)
        outputepub.writestr("content.opf", contentxml)

        contentdom.unlink()
        del contentdom

        ## create toc.ncx file
        tocncxdom = getDOMImplementation().createDocument(None, "ncx", None)
        ncx = tocncxdom.documentElement
        ncx.setAttribute("version", "2005-1")
        ncx.setAttribute("xmlns", "http://www.daisy.org/z3986/2005/ncx/")
        head = tocncxdom.createElement("head")
        ncx.appendChild(head)
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:uid",
                       "content": uniqueid
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:depth",
                       "content": "1"
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:totalPageCount",
                       "content": "0"
                   }))
        head.appendChild(
            newTag(tocncxdom,
                   "meta",
                   attrs={
                       "name": "dtb:maxPageNumber",
                       "content": "0"
                   }))

        docTitle = tocncxdom.createElement("docTitle")
        docTitle.appendChild(
            newTag(tocncxdom, "text", text=self.getMetadata('title')))
        ncx.appendChild(docTitle)

        tocnavMap = tocncxdom.createElement("navMap")
        ncx.appendChild(tocnavMap)

        # <navPoint id="<id>" playOrder="<risingnumberfrom0>">
        #   <navLabel>
        #     <text><chapter title></text>
        #   </navLabel>
        #   <content src="<chapterfile>"/>
        # </navPoint>
        index = 0
        for item in items:
            (id, href, type, title) = item
            # only items to be skipped, cover.xhtml, images, toc.ncx, stylesheet.css, should have no title.
            if title:
                navPoint = newTag(tocncxdom,
                                  "navPoint",
                                  attrs={
                                      'id': id,
                                      'playOrder': unicode(index)
                                  })
                tocnavMap.appendChild(navPoint)
                navLabel = newTag(tocncxdom, "navLabel")
                navPoint.appendChild(navLabel)
                ## the xml library will re-escape as needed.
                navLabel.appendChild(
                    newTag(tocncxdom, "text", text=stripHTML(title)))
                navPoint.appendChild(
                    newTag(tocncxdom, "content", attrs={"src": href}))
                index = index + 1

        # write toc.ncx to zip file
        outputepub.writestr("toc.ncx", tocncxdom.toxml(encoding='utf-8'))
        tocncxdom.unlink()
        del tocncxdom

        # write stylesheet.css file.
        outputepub.writestr(
            "OEBPS/stylesheet.css",
            self.EPUB_CSS.substitute(self.story.getAllMetadata()))

        # choose title page templates: table layout or plain.
        if self.getConfig("titlepage_use_table"):
            TITLE_PAGE_START = self.EPUB_TABLE_TITLE_PAGE_START
            TITLE_ENTRY = self.EPUB_TABLE_TITLE_ENTRY
            WIDE_TITLE_ENTRY = self.EPUB_TABLE_TITLE_WIDE_ENTRY
            NO_TITLE_ENTRY = self.EPUB_TABLE_NO_TITLE_ENTRY
            TITLE_PAGE_END = self.EPUB_TABLE_TITLE_PAGE_END
        else:
            TITLE_PAGE_START = self.EPUB_TITLE_PAGE_START
            TITLE_ENTRY = self.EPUB_TITLE_ENTRY
            WIDE_TITLE_ENTRY = self.EPUB_TITLE_ENTRY  # same, only wide in tables.
            NO_TITLE_ENTRY = self.EPUB_NO_TITLE_ENTRY
            TITLE_PAGE_END = self.EPUB_TITLE_PAGE_END

        # write cover page, if one was rendered above.
        if coverIO:
            outputepub.writestr("OEBPS/cover.xhtml", coverIO.getvalue())
            coverIO.close()

        # write title page.
        titlepageIO = StringIO.StringIO()
        self.writeTitlePage(out=titlepageIO,
                            START=TITLE_PAGE_START,
                            ENTRY=TITLE_ENTRY,
                            WIDE_ENTRY=WIDE_TITLE_ENTRY,
                            END=TITLE_PAGE_END,
                            NO_TITLE_ENTRY=NO_TITLE_ENTRY)
        if titlepageIO.getvalue():  # will be false if no title page.
            outputepub.writestr("OEBPS/title_page.xhtml",
                                titlepageIO.getvalue())
        titlepageIO.close()

        # write toc page.
        tocpageIO = StringIO.StringIO()
        self.writeTOCPage(tocpageIO, self.EPUB_TOC_PAGE_START,
                          self.EPUB_TOC_ENTRY, self.EPUB_TOC_PAGE_END)
        if tocpageIO.getvalue():  # will be false if no toc page.
            outputepub.writestr("OEBPS/toc_page.xhtml", tocpageIO.getvalue())
        tocpageIO.close()

        if dologpage:
            # write log page.
            logpageIO = StringIO.StringIO()
            self.writeLogPage(logpageIO)
            outputepub.writestr("OEBPS/log_page.xhtml", logpageIO.getvalue())
            logpageIO.close()

        # chapter wrappers: config overrides win over class templates.
        if self.hasConfig('chapter_start'):
            CHAPTER_START = string.Template(self.getConfig("chapter_start"))
        else:
            CHAPTER_START = self.EPUB_CHAPTER_START

        if self.hasConfig('chapter_end'):
            CHAPTER_END = string.Template(self.getConfig("chapter_end"))
        else:
            CHAPTER_END = self.EPUB_CHAPTER_END

        for index, chap in enumerate(
                self.story.getChapters()):  # (url,title,html)
            if chap.html:
                chap_data = chap.html
                if self.getConfig('internalize_text_links'):
                    # Point links to other chapters' source urls at the
                    # matching file inside the epub instead.
                    soup = bs4.BeautifulSoup(chap.html, 'html5lib')
                    changed = False
                    for alink in soup.find_all('a'):
                        if alink.has_attr(
                                'href') and alink['href'] in chapurlmap:
                            alink['href'] = chapurlmap[alink['href']]
                            changed = True
                    if changed:
                        chap_data = unicode(soup)
                        # Don't want html, head or body tags in
                        # chapter html--bs4 insists on adding them.
                        chap_data = re.sub(r"</?(html|head|body)[^>]*>\r?\n?",
                                           "", chap_data)

                #logger.debug('Writing chapter text for: %s' % chap.title)
                vals = {
                    'url': removeEntities(chap.url),
                    'chapter': removeEntities(chap.title),
                    'origchapter': removeEntities(chap.origtitle),
                    'tocchapter': removeEntities(chap.toctitle),
                    'index': "%04d" % (index + 1),
                    'number': index + 1
                }
                # escape double quotes in all vals.
                for k, v in vals.items():
                    if isinstance(v, basestring):
                        vals[k] = v.replace('"', '&quot;')
                fullhtml = CHAPTER_START.substitute(vals) + \
                    chap_data.strip() + \
                    CHAPTER_END.substitute(vals)
                # strip to avoid ever growing numbers of newlines.
                # ffnet(& maybe others) gives the whole chapter text
                # as one line.  This causes problems for nook(at
                # least) when the chapter size starts getting big
                # (200k+)
                fullhtml = re.sub(r'(</p>|<br ?/>)\n*', r'\1\n', fullhtml)

                outputepub.writestr("OEBPS/file%04d.xhtml" % (index + 1),
                                    fullhtml.encode('utf-8'))
                del fullhtml

        if self.story.calibrebookmark:
            outputepub.writestr("META-INF/calibre_bookmarks.txt",
                                self.story.calibrebookmark)

        # declares all the files created by Windows.  otherwise, when
        # it runs in appengine, windows unzips the files as 000 perms.
        for zf in outputepub.filelist:
            zf.create_system = 0
        outputepub.close()
        out.write(zipio.getvalue())
        zipio.close()
def doUnMerge(inputio,outdir=None):
    """Split a merged epub back into the original epubs stored inside it.

    The merged epub's manifest is scanned for items whose media-type is
    the fake value "origrootfile/xml".  Each such item is treated as the
    .opf of one original epub, and a new epub is rebuilt around it.

    inputio -- path or file-like object holding the merged epub.
    outdir  -- optional directory.  When given (truthy), each epub is
               written to "<outdir>/<n>.epub", numbered from 0 in
               manifest order.

    Returns a list of the file names written when outdir is given,
    otherwise a list of in-memory buffers, one per recovered epub.  The
    list is empty when no "origrootfile/xml" items are found.

    Raises IndexError if container.xml has no rootfile element or the
    .opf has no manifest element, and KeyError if a referenced member is
    missing from the archive.
    """
    epub = ZipFile(inputio, 'r') # works equally well with inputio as a path or a blob
    try:
        ## Find the .opf file.
        container = epub.read("META-INF/container.xml")
        containerdom = parseString(container)
        rootfilenodelist = containerdom.getElementsByTagName("rootfile")
        rootfilename = rootfilenodelist[0].getAttribute("full-path")

        contentdom = parseString(epub.read(rootfilename))

        ## Save the path to the .opf file--hrefs inside it are relative to it.
        relpath = get_path_part(rootfilename)

        # Only use item tags from the manifest: it is the only place there
        # *should* be item tags, but files with item tags elsewhere have
        # been seen.  The namespace-aware lookup also matches prefixed
        # tags such as <opf:item>.
        manifesttag = contentdom.getElementsByTagNameNS("*","manifest")[0]
        manifestitems = manifesttag.getElementsByTagNameNS("*","item")

        # look for our fake media-type for original rootfiles; each one
        # found is assumed to mark a complete original epub.
        outputios = [_extract_original_epub(epub, manifestitems, relpath, item)
                     for item in manifestitems
                     if item.getAttribute("media-type") == "origrootfile/xml"]
    finally:
        # Releases the handle when inputio was a path; a file object
        # passed in by the caller is left open by ZipFile.close().
        epub.close()

    if not outdir:
        return outputios

    outfilenames = []
    for count, epubIO in enumerate(outputios):
        filename = "%s/%d.epub"%(outdir,count)
        print("write %s"%filename)
        # 'with' guarantees the file is closed even if the write fails.
        with open(filename,"wb") as outstream:
            outstream.write(epubIO.getvalue())
        outfilenames.append(filename)
    return outfilenames


def _extract_original_epub(epub, manifestitems, relpath, item):
    """Rebuild, in memory, the one original epub whose .opf is `item`.

    epub          -- the merged epub, open for reading.
    manifestitems -- all item elements from the merged epub's manifest.
    relpath       -- directory part of the merged .opf; manifest hrefs
                     are relative to it.
    item          -- the manifest item marking the original .opf.

    Returns the buffer holding the finished epub.
    """
    itemhref = relpath+unquote(item.getAttribute("href"))
    # Drop trailing directory components that contain no digits; what is
    # left is taken to be the root of this original epub inside the
    # merged one.
    # NOTE(review): presumably each original epub lives under a numbered
    # directory -- confirm against the merge code.
    curepubpath = re.sub(r'([^\d/]+/)+$','',get_path_part(itemhref))
    savehref = itemhref[len(curepubpath):]

    ## mimetype is written first, uncompressed, in its own pass.
    outputio = StringIO()
    outputepub = ZipFile(outputio, "w", compression=ZIP_STORED)
    outputepub.debug = 3
    outputepub.writestr("mimetype", "application/epub+zip")
    outputepub.close()

    ## Re-open file for content.
    outputepub = ZipFile(outputio, "a", compression=ZIP_DEFLATED)
    outputepub.debug = 3

    ## Create META-INF/container.xml file.  The only thing it does is
    ## point to the original .opf.
    newcontainerdom = getDOMImplementation().createDocument(None, "container", None)
    containertop = newcontainerdom.documentElement
    containertop.setAttribute("version","1.0")
    containertop.setAttribute("xmlns","urn:oasis:names:tc:opendocument:xmlns:container")
    rootfiles = newcontainerdom.createElement("rootfiles")
    containertop.appendChild(rootfiles)
    rootfiles.appendChild(newTag(newcontainerdom,"rootfile",{"full-path":savehref,
                                                             "media-type":"application/oebps-package+xml"}))
    outputepub.writestr("META-INF/container.xml",
                        newcontainerdom.toprettyxml(indent='   ',encoding='utf-8'))

    outputepub.writestr(savehref,epub.read(itemhref))

    # Copy every other manifest entry that lives under this epub's root,
    # re-rooting its path so the new epub starts at curepubpath.
    for item2 in manifestitems:
        item2href = relpath+unquote(item2.getAttribute("href"))
        if item2href.startswith(curepubpath) and item2href != itemhref:
            outputepub.writestr(item2href[len(curepubpath):],epub.read(item2href))

    # declares all the files created by Windows.  otherwise, when
    # it runs in appengine, windows unzips the files as 000 perms.
    for zf in outputepub.filelist:
        zf.create_system = 0
    outputepub.close()

    return outputio