def processWebpage(self, webpage, dump_xhtml=False):
    """Serialize one web page to XHTML and add it to the epub container.

    When *dump_xhtml* is true the serialized markup is returned and the
    intermediate ``tree``/``xml`` attributes are left on *webpage*;
    otherwise they are dropped again to free memory.
    """
    from copy import copy
    # Build the parse tree lazily -- a previous pass may have set it already.
    if not hasattr(webpage, 'tree'):
        webpage.tree = webpage._get_parse_tree()
    self.remapLinks(webpage)
    self.tree_processor = TreeProcessor()
    # disabled: self.tree_processor.getMetaInfo(webpage)
    self.tree_processor.annotateNodes(webpage)
    self.tree_processor.clean(webpage)
    # Serialize a copy so the cleaned tree stays attached to the page.
    webpage.xml = self.serializeArticle(copy(webpage.tree))
    self.container.addArticle(webpage)
    if dump_xhtml:
        return webpage.xml
    del webpage.tree
    del webpage.xml
class EpubWriter(object):
    """Render a collection of web pages and chapters into an EPUB file.

    Drives an EpubContainer: adds an optional cover page and title page,
    converts each outline item to XHTML, appends meta-info chapters and
    finally closes the container.
    """

    def __init__(self, output, coll, status_callback=None, cover_img=None):
        """
        output -- path of the .epub file to create
        coll -- collection object (provides outline, title, url2webpage, ...)
        status_callback -- optional callable invoked with progress=<percent>
        cover_img -- optional path to a cover image file
        """
        self.output = output
        self.target_dir = os.path.dirname(output)
        self.coll = coll
        self.scaled_images = {}
        self.status_callback = status_callback
        self.cover_img = cover_img

    def initContainer(self):
        """Create the target directory if needed and open the container."""
        if not os.path.exists(self.target_dir):
            # parenthesized print: identical output, also valid in py3
            print('created dir')
            os.makedirs(self.target_dir)
        self.container = EpubContainer(self.output, self.coll)
        self.container.link_file(
            os.path.join(os.path.dirname(__file__), 'wp.css'),
            'OPS/wp.css')

    def closeContainer(self):
        """Finalize and close the epub container."""
        self.container.close()

    def renderColl(self, dump_xhtml=False):
        """Render the whole collection.

        When dump_xhtml is true, return the XHTML of the last rendered
        web page (debug/dump mode).
        """
        xhtml = None
        self.initContainer()
        self.processCoverImage()
        self.processTitlePage()
        # FIX: an empty outline used to raise ZeroDivisionError here.
        num_items = len(self.coll.outline.items)
        progress_inc = 100.0 / num_items if num_items else 0.0
        for n, (lvl, webpage) in enumerate(self.coll.outline.walk()):
            if isinstance(webpage, collection.WebPage):
                xhtml = self.processWebpage(webpage, dump_xhtml=dump_xhtml)
            elif isinstance(webpage, collection.Chapter):
                self.processChapter(webpage)
            if self.status_callback:
                self.status_callback(progress=n * progress_inc)
        self.processMetaInfo()
        self.closeContainer()
        if dump_xhtml:
            return xhtml

    def processCoverImage(self):
        """Add a full-page cover article if a cover image was supplied."""
        if not self.cover_img:
            return
        cover_src = 'images/cover' + os.path.splitext(self.cover_img)[1]
        content = [
            E.div(dict(style='width:100%;height:100%;'),
                  E.img(dict(
                      src=cover_src,
                      alt='',
                      style='max-height:100%;max-width:100%;margin:auto;',
                  ))),
        ]
        xml = misc.xhtml_page(title='cover', body_content=content,
                              flatten=True)
        self.container.addCover(xml, os.path.abspath(self.cover_img))

    def processTitlePage(self):
        """Add a title page unless title, subtitle and editor are all empty."""
        if not any(txt != '' for txt in
                   [self.coll.title, self.coll.subtitle, self.coll.editor]):
            return
        titlepage = collection.Chapter(self.coll.title)
        titlepage.id = 'titlepage'
        body_content = [
            E.h1(self.coll.title,
                 style="margin-top:20%;font-size:200%;text-align:center;"),
            E.h2(self.coll.subtitle,
                 style="margin-top:1em;font-size:150%;text-align:center;"),
            E.h3(self.coll.editor,
                 style="margin-top:1em;font-size:100%;text-align:center;"),
        ]
        # Show the Wikipedia logo when any source URL is from wikipedia.org.
        if any('wikipedia.org' in url for url in self.coll.url2webpage):
            img_src = 'wikipedia_logo.jpg'
            titlepage.images = {
                img_src: os.path.join(os.path.dirname(__file__), img_src)}
            body_content.append(
                E.div(E.img(src='images/' + img_src, width='50%', alt=''),
                      style='text-align:center;margin-top:4em;'))
        tree = misc.xhtml_page(title=self.coll.title,
                               body_content=body_content, flatten=False)
        titlepage.tree = tree
        titlepage.xml = misc.flatten_tree(tree)
        self.container.addArticle(titlepage)

    def processMetaInfo(self):
        """Append the article-source and image-source chapters."""
        from mwlib.epub import metainfo
        chapter = collection.Chapter(_('Article Sources and Contributors'))
        chapter.id = '_articlesources'
        chapter.xml = metainfo.getArticleMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)

        chapter = collection.Chapter(
            _('Image Sources, Licenses and Contributors'))
        chapter.id = '_imagesources'
        chapter.xml = metainfo.getImageMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)

    def processChapter(self, chapter):
        """Add a chapter separator page with a centered heading."""
        self.num_chapters = getattr(self, 'num_chapters', 0) + 1
        chapter.id = 'chapter_%02d' % self.num_chapters
        title = xmlescape(chapter.title)
        chapter.xml = misc.xhtml_page(
            title=title,
            body_content=[E.h1(
                {'style': 'margin-top:15%;font-size:200%;text-align:center;'},
                title)])
        self.container.addArticle(chapter)

    def processWebpage(self, webpage, dump_xhtml=False):
        """Serialize one web page to XHTML and add it to the container.

        Returns the XHTML string when dump_xhtml is true; otherwise the
        (potentially large) intermediate attributes are dropped again.
        """
        from copy import copy
        if not hasattr(webpage, 'tree'):
            webpage.tree = webpage._get_parse_tree()
        self.remapLinks(webpage)
        self.tree_processor = TreeProcessor()
        #self.tree_processor.getMetaInfo(webpage)
        self.tree_processor.annotateNodes(webpage)
        self.tree_processor.clean(webpage)
        webpage.xml = self.serializeArticle(copy(webpage.tree))
        self.container.addArticle(webpage)
        if dump_xhtml:
            return webpage.xml
        del webpage.tree
        del webpage.xml

    def remapLinks(self, webpage):
        """Rewrite image sources and anchors to container-relative targets."""
        for img in webpage.tree.findall('.//img'):
            # FIX: use .get() -- an <img> without a src attribute used to
            # raise KeyError here.
            img_fn = webpage.images.get(img.get('src'))
            if img_fn:
                img.attrib['src'] = os.path.join(
                    config.img_rel_path, os.path.basename(img_fn))
            else:
                # image was not downloaded: drop the tag entirely
                remove_node(img)
        target_ids = [safe_xml_id(_id)
                      for _id in webpage.tree.xpath('.//@id')]
        for a in webpage.tree.findall('.//a'):
            href = a.get('href')
            if not href:
                # this link is probably just an anchor
                continue
            if href.startswith('#'):
                target_id = safe_xml_id(href)[1:]
                if target_id not in target_ids:
                    # dangling fragment: make the link its own target
                    a.set('id', target_id)
                    target_ids.append(target_id)
                a.set('href', '#' + target_id)
            else:
                url = clean_url(urlparse.urljoin(webpage.url, href))
                linked_wp = webpage.coll.url2webpage.get(url)
                if linked_wp:
                    a.set('href', linked_wp.id + '.xhtml')
                else:
                    a.set('href', url)

    def serializeArticle(self, node):
        """Wrap *node*'s children in a fresh XHTML document and flatten it."""
        assert not node.find('.//body'), 'error: node contains BODY tag'
        html = E.html(
            {'xmlns': "http://www.w3.org/1999/xhtml"},
            E.head(E.meta({
                'http-equiv': "Content-Type",
                'content': "application/xhtml+xml; charset=utf-8"})),
        )
        head = html.find('.//head')
        node_head = node.find('.//head')
        # FIX: tolerate articles without a <head> element (used to raise
        # AttributeError on None).
        if node_head is not None:
            for head_content in node_head.iterchildren():
                head.append(head_content)
            node_head.getparent().remove(node_head)
        body = E.body()
        html.append(body)
        body.extend(node)
        return misc.flatten_tree(html)
class EpubWriter(object):
    # NOTE(review): this is a second, apparently identical definition of
    # EpubWriter in this module; a later class statement shadows any earlier
    # one. Looks like a copy/paste or merge artifact -- confirm and remove
    # one of the copies.
    """Write a collection of web pages and chapters into an EPUB container."""

    def __init__(self, output, coll, status_callback=None, cover_img=None):
        # output: path of the target .epub file
        # coll: collection object (provides outline, title, url2webpage, ...)
        # status_callback: optional callable, called with progress=<percent>
        # cover_img: optional path to a cover image file
        self.output = output
        self.target_dir = os.path.dirname(output)
        self.coll = coll
        self.scaled_images = {}
        self.status_callback = status_callback
        self.cover_img = cover_img

    def initContainer(self):
        # Create the output directory on demand, open the container and
        # link the stylesheet into it.
        if not os.path.exists(self.target_dir):
            print 'created dir'
            os.makedirs(self.target_dir)
        self.container = EpubContainer(self.output, self.coll)
        self.container.link_file(
            os.path.join(os.path.dirname(__file__), 'wp.css'),
            'OPS/wp.css')

    def closeContainer(self):
        # Finalize the epub archive.
        self.container.close()

    def renderColl(self, dump_xhtml=False):
        """Render the whole collection.

        When dump_xhtml is true, return the XHTML of the last rendered
        web page (debug/dump mode).
        """
        xhtml = None
        self.initContainer()
        self.processCoverImage()
        self.processTitlePage()
        # NOTE(review): raises ZeroDivisionError when outline.items is
        # empty -- presumably outlines are never empty here; confirm.
        progress_inc = 100.0 / len(self.coll.outline.items)
        for n, (lvl, webpage) in enumerate(self.coll.outline.walk()):
            if isinstance(webpage, collection.WebPage):
                xhtml = self.processWebpage(webpage, dump_xhtml=dump_xhtml)
            elif isinstance(webpage, collection.Chapter):
                self.processChapter(webpage)
            if self.status_callback:
                self.status_callback(progress=n * progress_inc)
        self.processMetaInfo()
        self.closeContainer()
        if dump_xhtml:
            return xhtml

    def processCoverImage(self):
        # Add a full-page cover article when a cover image was supplied.
        if not self.cover_img:
            return
        content = [
            E.div(
                dict(style='width:100%;height:100%;'),
                E.img(
                    dict(
                        src='images/' + 'cover' +
                        os.path.splitext(self.cover_img)[1],
                        alt='',
                        #width='100%', height='100%',
                        style='max-height:100%;max-width:100%;margin:auto;',
                    ))),
        ]
        xml = misc.xhtml_page(title='cover', body_content=content,
                              flatten=True)
        self.container.addCover(xml, os.path.abspath(self.cover_img))

    def processTitlePage(self):
        # Skip the title page when title, subtitle and editor are all empty.
        if not any(txt != '' for txt in
                   [self.coll.title, self.coll.subtitle, self.coll.editor]):
            return
        titlepage = collection.Chapter(self.coll.title)
        titlepage.id = 'titlepage'
        body_content = [
            E.h1(self.coll.title,
                 style="margin-top:20%;font-size:200%;text-align:center;"),
            E.h2(self.coll.subtitle,
                 style="margin-top:1em;font-size:150%;text-align:center;"),
            E.h3(self.coll.editor,
                 style="margin-top:1em;font-size:100%;text-align:center;"),
        ]
        # Show the Wikipedia logo when any source URL is from wikipedia.org.
        if any('wikipedia.org' in url for url in self.coll.url2webpage):
            img_src = 'wikipedia_logo.jpg'
            titlepage.images = {
                img_src: os.path.join(os.path.dirname(__file__), img_src)
            }
            body_content.append(
                E.div(E.img(
                    src='images/' + img_src,
                    width='50%',
                    alt='',
                ),
                      style='text-align:center;margin-top:4em;'))
        tree = misc.xhtml_page(title=self.coll.title,
                               body_content=body_content,
                               flatten=False)
        titlepage.tree = tree
        titlepage.xml = misc.flatten_tree(tree)
        self.container.addArticle(titlepage)

    def processMetaInfo(self):
        # Append the article-source and image-source chapters.
        from mwlib.epub import metainfo
        chapter = collection.Chapter(_('Article Sources and Contributors'))
        chapter.id = '_articlesources'
        chapter.xml = metainfo.getArticleMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)
        chapter = collection.Chapter(
            _('Image Sources, Licenses and Contributors'))
        chapter.id = '_imagesources'
        chapter.xml = metainfo.getImageMetainfo(chapter, self.coll)
        self.container.addArticle(chapter)

    def processChapter(self, chapter):
        # Add a chapter separator page with a centered heading.
        self.num_chapters = getattr(self, 'num_chapters', 0) + 1
        chapter.id = 'chapter_%02d' % self.num_chapters
        title = xmlescape(chapter.title)
        chapter.xml = misc.xhtml_page(
            title=title,
            body_content=[
                E.h1(
                    {
                        'style':
                        'margin-top:15%;font-size:200%;text-align:center;'
                    }, title)
            ])
        self.container.addArticle(chapter)

    def processWebpage(self, webpage, dump_xhtml=False):
        """Serialize one web page to XHTML and add it to the container.

        Returns the XHTML string when dump_xhtml is true; otherwise the
        (potentially large) intermediate attributes are dropped again.
        """
        if not hasattr(webpage, 'tree'):
            webpage.tree = webpage._get_parse_tree()
        from copy import copy
        self.remapLinks(webpage)
        self.tree_processor = TreeProcessor()
        #self.tree_processor.getMetaInfo(webpage)
        self.tree_processor.annotateNodes(webpage)
        self.tree_processor.clean(webpage)
        # Serialize a copy so the cleaned tree stays attached to the page.
        webpage.xml = self.serializeArticle(copy(webpage.tree))
        self.container.addArticle(webpage)
        if dump_xhtml:
            return webpage.xml
        del webpage.tree
        del webpage.xml

    def remapLinks(self, webpage):
        # Rewrite image sources and anchors to container-relative targets.
        for img in webpage.tree.findall('.//img'):
            # NOTE(review): img.attrib['src'] raises KeyError for an <img>
            # without a src attribute -- confirm upstream guarantees one.
            img_fn = webpage.images.get(img.attrib['src'])
            if img_fn:
                zip_rel_path = os.path.join(config.img_rel_path,
                                            os.path.basename(img_fn))
                img.attrib['src'] = zip_rel_path
            else:
                # image was not downloaded: drop the tag entirely
                remove_node(img)
        target_ids = [safe_xml_id(_id)
                      for _id in webpage.tree.xpath('.//@id')]
        for a in webpage.tree.findall('.//a'):
            href = a.get('href')
            if not href:
                # this link is probably just an anchor
                continue
            if href.startswith('#'):
                target_id = safe_xml_id(href)[1:]
                if target_id not in target_ids:
                    # dangling fragment: make the link its own target
                    a.set('id', target_id)
                    target_ids.append(target_id)
                a.set('href', '#' + target_id)
            else:
                url = clean_url(urlparse.urljoin(webpage.url, href))
                linked_wp = webpage.coll.url2webpage.get(url)
                if linked_wp:
                    a.set('href', linked_wp.id + '.xhtml')
                else:
                    a.set('href', url)

    def serializeArticle(self, node):
        # Wrap node's children in a fresh XHTML document, moving any <head>
        # children of the article into the new document head, and flatten.
        assert not node.find('.//body'), 'error: node contains BODY tag'
        html = E.html(
            {'xmlns': "http://www.w3.org/1999/xhtml"},
            E.head(
                E.meta({
                    'http-equiv': "Content-Type",
                    'content': "application/xhtml+xml; charset=utf-8"
                })),
        )
        head = html.find('.//head')
        # NOTE(review): assumes the article tree contains a <head> element;
        # node.find() returning None would raise AttributeError below.
        node_head = node.find('.//head')
        for head_content in node_head.iterchildren():
            head.append(head_content)
        node_head.getparent().remove(node_head)
        body = E.body()
        html.append(body)
        body.extend(node)
        return misc.flatten_tree(html)