def _make_coverpage_link (self):
    """ Insert a <link rel="coverpage"> in the html head.

    First we determine the coverpage url.  In HTML we find the
    coverpage by applying these rules:

      1. the image specified in <link rel='coverpage'>,
      2. the image with an id of 'coverpage' or
      3. the image with an url containing 'cover'
      4. the image with an url containing 'title'

    If one rule returns images we take the first one in document
    order, else we proceed with the next rule.
    """

    coverpages = xpath (self.xhtml, "//xhtml:link[@rel='coverpage']")
    for coverpage in coverpages:
        url = coverpage.get ('src')
        debug ("Found link to coverpage %s." % url)
        return # already provided by user

    # look for a suitable candidate
    coverpages = xpath (self.xhtml, "//xhtml:img[@id='coverpage']")
    if not coverpages:
        coverpages = xpath (self.xhtml, "//xhtml:img[contains (@src, 'cover')]")
    if not coverpages:
        coverpages = xpath (self.xhtml, "//xhtml:img[contains (@src, 'title')]")

    for coverpage in coverpages:
        for head in xpath (self.xhtml, "/xhtml:html/xhtml:head"):
            url = coverpage.get ('src')
            head.append (parsers.em.link (rel = 'coverpage', href = url))
            debug ("Inserted link to coverpage %s." % url)
        # per the docstring we use only the first candidate in document
        # order; previously this fell through and inserted one <link>
        # per matching image
        return
def strip_links(xhtml, manifest):
    """ Strip all links to urls not in manifest.

    Handles <a href>, <link href> and <img src>.
    Links and urls are assumed to be absolute already.
    """
    for anchor in xpath(xhtml, '//xhtml:a[@href]'):
        target = urllib.parse.urldefrag(anchor.get('href'))[0]
        if target in manifest:
            continue
        debug("strip_links: Deleting <a> to %s not in manifest." % target)
        del anchor.attrib['href']

    for csslink in xpath(xhtml, '//xhtml:link[@href]'):
        target = csslink.get('href')
        if target in manifest:
            continue
        debug("strip_links: Deleting <link> to %s not in manifest." % target)
        csslink.drop_tree()

    for img in xpath(xhtml, '//xhtml:img[@src]'):
        target = img.get('src')
        if target in manifest:
            continue
        debug(
            "strip_links: Deleting <img> with src %s not in manifest." % target)
        # keep the alt text in the flow so no information is lost
        img.tail = img.get('alt', '') + (img.tail or '')
        img.drop_tree()
def reflow_pre(xhtml):
    """ Make <pre> reflowable.

    This helps a lot with readers like Sony's that cannot
    scroll horizontally.

    Each <pre> becomes a <div class="pgmonospaced"> with explicit
    <br/> line breaks; runs of spaces are turned into non-breaking
    spaces so the layout survives reflowing.
    """

    def nbsp(matchobj):
        """ Replace a run of N spaces with N-1 NBSPs plus one space. """
        # the trailing normal space leaves the renderer a wrap opportunity
        return ('\u00a0' * (len(matchobj.group(0)) - 1)) + ' '

    for pre in xpath(xhtml, "//xhtml:pre"):
        # white-space: pre-wrap would do fine
        # but it is not supported by OEB
        try:
            pre.tag = NS.xhtml.div
            writers.HTMLishWriter.add_class(pre, 'pgmonospaced')
            # pre.text is None when the element starts with a child
            # element; guard against passing None to the regex
            m = parsers.RE_GUTENBERG.search(pre.text or '')
            if m:
                writers.HTMLishWriter.add_class(pre, 'pgheader')

            tail = pre.tail
            s = etree.tostring(pre, encoding=six.text_type, with_tail=False)
            s = s.replace('>\n', '>')  # eliminate that empty first line
            s = s.replace('\n', '<br/>')
            s = re.sub(' +', nbsp, s)
            div = etree.fromstring(s)
            div.tail = tail

            pre.getparent().replace(pre, div)

        except etree.XMLSyntaxError as what:
            exception("%s\n%s" % (s, what))
            raise
def strip_rst_dropcaps(xhtml):
    """ Replace <img class='dropcap'> with <span class='dropcap'>. """
    for img in xpath(xhtml, "//xhtml:img[@class ='dropcap']"):
        # retag in place and substitute the alt text for the image
        img.tag = NS.xhtml.span
        img.text = img.get('alt', '')
def strip_ins(xhtml):
    """ Strip all <ins> tags.

    There's a bug in the epub validator that trips on class
    and title attributes in <ins> elements.
    """
    for element in xpath(xhtml, '//xhtml:ins'):
        # drop_tag removes the element but keeps its children and text
        element.drop_tag()
def strip_noepub(xhtml):
    """ Strip all <* class='x-ebookmaker-drop'> tags.

    As a way to tailor your html towards epub.
    """
    for element in xpath(xhtml, "//xhtml:*[contains (@class, 'x-ebookmaker-drop')]"):
        # remove the element together with its whole subtree
        element.drop_tree()
def _make_coverpage_link(self, coverpage_url=None):
    """ Insert a <link rel="coverpage"> in the html head.

    Uses the image specified by the --cover command-line option.
    Does nothing when no coverpage url was given.
    """
    if not coverpage_url:
        return
    for head in xpath(self.xhtml, "/xhtml:html/xhtml:head"):
        link = parsers.em.link(rel='coverpage', href=coverpage_url)
        head.append(link)
        debug("Inserted link to coverpage %s." % coverpage_url)
def remove_coverpage(self, xhtml, url):
    """ Remove coverpage from flow.

    EPUB readers will display the coverpage from the manifest and
    if we don't remove it from flow it will be displayed twice.
    """
    matches = xpath(xhtml, '//xhtml:img[@src = $url]', url=url)
    for image in matches:
        debug("remove_coverpage: dropping <img> %s from flow" % url)
        image.drop_tree()
        return  # only the first one though
def add_dublincore(self, job, tree):
    """ Add dublin core metadata to <head>. """

    source = gg.archive2files(options.ebook, job.url)

    # rewrite the local file path into a public PG url if configured
    if hasattr(options.config, 'FILESDIR'):
        job.dc.source = source.replace(
            options.config.FILESDIR, options.config.PGURL)

    for head in xpath(tree, '//xhtml:head'):
        for element in job.dc.to_html():
            element.tail = '\n'
            head.append(element)
def fix_style_elements(xhtml):
    """ Fix CSS style elements.  Make sure they are utf-8. """

    for style in xpath(xhtml, "//xhtml:style"):
        css_parser = parsers.CSSParser.Parser()
        css_parser.parse_string(style.text)
        try:
            # pylint: disable=E1103
            # cssText is bytes; re-embed it as utf-8 text
            style.text = css_parser.sheet.cssText.decode('utf-8')
        except (ValueError, UnicodeError):
            debug("CSS:\n%s" % css_parser.sheet.cssText)
            raise
def manifest_item(self, url, mediatype, id_=None):
    """ Add item to manifest.

    Generates a fresh id when none was given or the given one is
    already taken; returns the id actually used.
    """
    # generate a unique id when necessary (short-circuit keeps the
    # xpath from ever seeing a None id)
    if id_ is None or xpath(self.manifest, "//*[@id = '%s']" % id_):
        self.item_id += 1
        id_ = 'item%d' % self.item_id

    attribs = {'href': url, 'id': id_, 'media-type': mediatype}
    self.manifest.append(self.opf.item(**attribs))
    return id_
def iterlinks(self): """ Return all links in document. """ # To keep an image even in non-image build specify # class="x-ebookmaker-important" keeps = xpath(self.xhtml, "//img[contains (@class, 'x-ebookmaker-important')]") for keep in keeps: keep.set('rel', 'important') # iterate links for (elem, dummy_attribute, url, dummy_pos) in self.xhtml.iterlinks(): yield url, elem
def strip_links(xhtml, manifest):
    """ Strip all links to images.

    This does not strip inline images, only standalone images that
    are targets of links. EPUB does not allow that.

    :param xhtml: document tree, fixed in place
    :param manifest: mapping of absolute url -> mediatype
    """
    for link in xpath(xhtml, '//xhtml:a[@href]'):
        href = urllib.parse.urldefrag(link.get('href'))[0]
        # use .get (): an url missing from the manifest must not raise
        # KeyError — it is simply not an OPS content document either
        if manifest.get(href) not in OPS_CONTENT_DOCUMENTS:
            debug(
                "strip_links: Deleting <a> to non-ops-document-type: %s"
                % href)
            del link.attrib['href']
def add_coverpage(self, ocf, url):
    """ Add a coverpage for ADE and Kindle.

    The recommended cover size is 600x800 pixels (500 pixels on
    the smaller side is an absolute minimum). The cover page
    should be a color picture in JPEG format.

    :param ocf: the epub container to add the image to
    :param url: url of the coverpage image
    """
    id_ = None

    # look for a manifest item with the right url
    for item in xpath(
            self.manifest,
            # cannot xpath for default namespace
            "//*[local-name () = 'item' and (starts-with (@media-type, 'image/jpeg') or starts-with(@media-type, 'image/png')) and @href = $url]",
            url=url):
        id_ = item.get('id')
        break

    # else use default cover page image
    if id_ is None:
        ext = url.split('.')[-1]
        try:
            mediatype = getattr(mt, ext)
        except AttributeError:
            mediatype = mt.jpeg
        try:
            # open in *binary* mode: image data is not text; text mode
            # raised UnicodeDecodeError, which 'except IOError' did not
            # catch
            with open(url, 'rb') as f:
                ocf.add_bytes(Writer.url2filename(url), f.read(), mediatype)
        except IOError:
            # fall back to the packaged default cover image
            url = 'cover.jpg'
            ocf.add_bytes(url, resource_string('ebookmaker.writers', url),
                          mediatype)
        id_ = self.manifest_item(url, mediatype)

    debug("Adding coverpage id: %s url: %s" % (id_, url))

    # register mobipocket style
    self.meta_item('cover', id_)

    # register ADE style
    href = ocf.add_image_wrapper(Writer.url2filename(url), 'Cover')
    self.spine_item(href, mt.xhtml, 'coverpage-wrapper', True, True)
    self.guide_item(href, 'cover', 'Cover')
def insert_root_div(xhtml):
    """ Insert a div immediately below body and move body contents into it.

    Rationale: We routinely turn page numbers into <a> elements. <a>
    elements are illegal as children of body, but are legal as children
    of <div>. See: `strip_page_numbers ()`
    """
    em = ElementMaker(namespace=str(NS.xhtml), nsmap={None: str(NS.xhtml)})

    # '//' because <body> is a child of <html>, not the document root:
    # the absolute path '/xhtml:body' never matched on a full document
    for body in xpath(xhtml, "//xhtml:body"):
        div = em.div
        div.set('id', 'pgepub-root-div')
        # iterate a snapshot: div.append() *moves* the child out of
        # body, and mutating body while iterating it skips elements
        for child in list(body):
            div.append(child)
        body.append(div)
def fix_html_image_dimensions(xhtml):
    """ Remove all width and height that is not specified in '%'. """

    for img in xpath(xhtml, '//xhtml:img'):
        attribs = img.attrib
        # a missing attribute defaults to '%', i.e. counts as ok
        width_ok = '%' in attribs.get('width', '%')
        height_ok = '%' in attribs.get('height', '%')
        if width_ok and height_ok:
            continue
        # drop both absolute dimensions so the aspect ratio survives
        if 'width' in attribs:
            del attribs['width']
        if 'height' in attribs:
            del attribs['height']
def get_classes_that_float(xhtml):
    """ Get a list of all classes that use float or position. """

    floating = set()
    class_re = re.compile(r"\.(\w+)", re.ASCII)

    for style in xpath(xhtml, "//xhtml:style"):
        css_parser = parsers.CSSParser.Parser()
        css_parser.parse_string(style.text)

        for rule in css_parser.sheet:
            if rule.type != rule.STYLE_RULE:
                continue
            for prop in rule.style:
                if prop.name in ('float', 'position'):
                    # collect every class name mentioned in the selector
                    floating.update(
                        class_re.findall(rule.selectorList.selectorText))
                    break

    return floating
def parse(self):
    """ Parse the plain text.

    Try to find semantic units in the character soup.

    Splits the text into paragraphs separated by blank lines,
    recording how many blank lines come before and after each
    paragraph, then analyzes the paragraphs and ships each one out
    into a freshly built xhtml tree stored in self.xhtml.
    """
    debug("GutenbergTextParser.parse () ...")

    if self.xhtml is not None:
        return  # already parsed

    text = self.unicode_content()
    text = parsers.RE_RESTRICTED.sub('', text)  # strip restricted chars
    text = gg.xmlspecialchars(text)             # escape &, <, >

    lines = [line.rstrip() for line in text.splitlines()]
    lines.append("")  # sentinel blank line flushes the last paragraph
    del text

    blanks = 0
    par = Par()

    for line in lines:
        if len(line) == 0:
            blanks += 1
        else:
            if blanks and par.lines:  # don't append empty pars
                # flush the finished paragraph and start a new one
                par.after = blanks
                self.pars.append(par)
                if self.body == 1:
                    # track the largest blank run seen in the body
                    self.max_blanks = max(blanks, self.max_blanks)
                par = Par()
                par.before = blanks
            blanks = 0
            par.lines.append(line)

    # flush the trailing paragraph, if any
    par.after = blanks
    if par.lines:
        self.pars.append(par)

    lines = None

    self.analyze()

    # build xhtml tree
    em = parsers.em
    self.xhtml = em.html(
        em.head(
            em.title(' '),
            # pylint: disable=W0142
            em.meta(**{
                'http-equiv': 'Content-Style-Type',
                'content': 'text/css'
            }),
            em.meta(
                **{
                    'http-equiv': 'Content-Type',
                    'content': mt.xhtml + '; charset=utf-8'
                })),
        em.body())

    # ship each paragraph out as an xhtml fragment and append it
    for body in xpath(self.xhtml, '//xhtml:body'):
        xhtmlparser = lxml.html.XHTMLParser()
        for par in self.pars:
            p = etree.fromstring(self.ship_out(par), xhtmlparser)
            p.tail = '\n\n'
            body.append(p)

    self.pars = []
def _to_xhtml11(self):
    """ Make vanilla xhtml more conform to xhtml 1.1.

    Drops scripts and forms, wraps naked blockquote text, inserts
    tbody, moves lang to xml:lang, strips deprecated attributes and
    empty class attributes, and demotes some bogus headers.
    """

    # Change content-type meta to application/xhtml+xml.
    for meta in xpath(self.xhtml,
                      "/xhtml:html/xhtml:head/xhtml:meta[@http-equiv]"):
        if meta.get('http-equiv').lower() == 'content-type':
            meta.set('content', mt.xhtml + '; charset=utf-8')

    # drop javascript
    for script in xpath(self.xhtml, "//xhtml:script"):
        script.drop_tree()

    # drop form
    for form in xpath(self.xhtml, "//xhtml:form"):
        form.drop_tree()

    # blockquotes
    for bq in xpath(self.xhtml, "//xhtml:blockquote"):
        # no naked text allowed in <blockquote>
        div = etree.Element(NS.xhtml.div)
        # iterate a snapshot: div.append() moves the child out of bq,
        # and mutating bq while iterating it skips every other child
        for child in list(bq):
            div.append(child)
        div.text = bq.text
        bq.text = None
        bq.append(div)

    # lxml.html.defs.block_tags

    # insert tbody
    for table in xpath(self.xhtml, "//xhtml:table[xhtml:tr]"):
        # no naked <tr> allowed in <table>
        tbody = etree.Element(NS.xhtml.tbody)
        # snapshot for the same reason: tbody.append() reparents tr
        for tr in list(table):
            if tr.tag == NS.xhtml.tr:
                tbody.append(tr)
        table.append(tbody)

    # move lang to xml:lang
    for elem in xpath(self.xhtml, "//xhtml:*[@lang]"):
        # bug in lxml 2.2.2: sometimes deletes wrong element
        # so we delete both and reset the right one
        lang = elem.get('lang')
        try:
            del elem.attrib[NS.xml.lang]
        except KeyError:
            pass
        del elem.attrib['lang']
        elem.set(NS.xml.lang, lang)

    # strip deprecated attributes
    for a, t in DEPRECATED.items():
        for tag in t.split():
            for elem in xpath(self.xhtml, "//xhtml:%s[@%s]" % (tag, a)):
                del elem.attrib[a]

    # strip empty class attributes
    for elem in xpath(
            self.xhtml,
            "//xhtml:*[@class and normalize-space (@class) = '']"):
        del elem.attrib['class']

    # strip bogus header markup by Joe L.
    for elem in xpath(self.xhtml, "//xhtml:h1"):
        if elem.text and elem.text.startswith(
                "The Project Gutenberg eBook"):
            elem.tag = NS.xhtml.p
    for elem in xpath(self.xhtml, "//xhtml:h3"):
        if elem.text and elem.text.startswith("E-text prepared by"):
            elem.tag = NS.xhtml.p
def make_toc(self, xhtml):
    """ Build a TOC from HTML headers.

    Return a list of tuples (url, text, depth).

    Page numbers are also inserted because DTBook NCX needs the
    play_order to be sequential.

    Side effect: sets generated ids on headers that have none.
    """

    def id_generator(i=0):
        """ Generate an id for the TOC to link to. """
        while True:
            yield 'pgepubid%05d' % i
            i += 1

    idg = id_generator()

    def get_id(elem):
        """ Get the id of the element or generate and set one. """
        if not elem.get('id'):
            elem.set('id', six.next(idg))
        return elem.get('id')

    toc = []
    last_depth = 0

    for header in xpath(
            xhtml,
            '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
            # DP page number
            '//xhtml:*[contains (@class, "pageno")]|'
            # DocUtils contents header
            '//xhtml:p[contains (@class, "topic-title")]'):

        # the visible text of the header, unless a title attribute
        # overrides it
        text = gg.normalize(
            etree.tostring(header,
                           method="text",
                           encoding=six.text_type,
                           with_tail=False))
        text = header.get('title', text).strip()

        if not text:
            # so <h2 title=""> may be used to suppress TOC entry
            continue

        if header.get('class', '').find('pageno') > -1:
            # page numbers get depth -1 so NCX play_order stays sequential
            toc.append(
                ["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
        else:
            # header
            if text.lower().startswith('by '):
                # common error in PG: <h2>by Lewis Carroll</h2> should
                # yield no TOC entry
                continue

            # depth from the tag name: h1 -> 1 ... h4 -> 4
            try:
                depth = int(header.tag[-1:])
            except ValueError:
                depth = 2  # avoid top level

            # fix bogus header numberings
            if depth > last_depth + 1:
                depth = last_depth + 1

            last_depth = depth

            # if <h*> is first element of a <div> use <div> instead
            parent = header.getparent()
            if (parent.tag == NS.xhtml.div and parent[0] == header
                    and parent.text and parent.text.strip() == ''):
                header = parent

            toc.append([
                "%s#%s" % (self.attribs.url, get_id(header)), text, depth
            ])

    return toc
def strip_pagenumbers(xhtml, strip_classes): """ Strip dp page numbers. Rationale: DP implements page numbers either with float or with absolute positioning. Float is not supported by Kindle. Absolute positioning is not allowed in epub. If we'd leave these in, they would show up as numbers in the middle of the text. To still keep links working, we replace all page number contraptions we can find with empty <a>'s. """ # look for elements with a class that is in strip_classes for class_ in strip_classes: xp = "//xhtml:*[@class and contains(concat(' ', normalize-space(@class), ' '), ' %s ')]" % class_ count = 0 for elem in xpath(xhtml, xp): # save textual content text = gg.normalize( etree.tostring(elem, method="text", encoding=six.text_type, with_tail=False)) if len(text) > 10: # safeguard against removing things that are not pagenumbers continue if not text: text = elem.get('title') # look for id anywhere inside element id_ = elem.xpath(".//@id") # transmogrify element into empty <a> tail = elem.tail elem.clear() elem.tag = NS.xhtml.a if id_: # some blockheaded PPers include more than # one page number in one span. take the last id # because the others represent empty pages. elem.set('id', id_[-1]) if class_ in DP_PAGENUMBER_CLASSES: # mark element as rewritten pagenumber. we # actually don't use this class for styling # because it is on an empty element elem.set('class', 'x-ebookmaker-pageno') if text: elem.set('title', text) elem.tail = tail count += 1 # The OPS Spec 2.0 is very clear: "Reading Systems # must be XML processors as defined in XML 1.1." # Nevertheless many browser-plugin ebook readers use # the HTML parsers of the browser. But HTML parsers # don't grok the minimized form of empty elements. # # This will force lxml to output the non-minimized form # of the element. elem.text = '' if count: warning("%d elements having class %s have been rewritten." % (count, class_))
def _fix_anchors(self): """ Move name to id and fix hrefs and ids. """ # move anchor name to id # 'id' values are more strict than 'name' values # try to fix ill-formed ids seen_ids = set() for anchor in (xpath(self.xhtml, "//xhtml:a[@name]") + xpath(self.xhtml, "//xhtml:*[@id]")): id_ = anchor.get('id') or anchor.get('name') if 'name' in anchor.attrib: del anchor.attrib['name'] if 'id' in anchor.attrib: del anchor.attrib['id'] if NS.xml.id in anchor.attrib: del anchor.attrib[NS.xml.id] id_ = self._fix_id(id_) if not parsers.RE_XML_NAME.match(id_): error("Dropping ill-formed id '%s' in %s" % (id_, self.attribs.url)) continue # well-formed id if id_ in seen_ids: error("Dropping duplicate id '%s' in %s" % (id_, self.attribs.url)) continue seen_ids.add(id_) anchor.set('id', id_) # try to fix bogus fragment ids # 1. fragments point to xml:id, so must be well-formed ids # 2. the ids they point to must exist for link in xpath(self.xhtml, "//xhtml:*[@href]"): href = link.get('href') hre, frag = urllib.parse.urldefrag(href) if frag: frag = self._fix_internal_frag(frag) if not frag: # non-recoverable ill-formed frag del link.attrib['href'] self.add_class(link, 'pgkilled') error('Dropping ill-formed frag in %s' % href) continue # well-formed frag if hre: # we have url + frag link.set( 'href', "%s#%s" % (hre, urllib.parse.quote(frag.encode('utf-8')))) self.add_class(link, 'pgexternal') elif frag in seen_ids: # we have only frag link.set('href', "#%s" % urllib.parse.quote(frag.encode('utf-8'))) self.add_class(link, 'pginternal') else: del link.attrib['href'] self.add_class(link, 'pgkilled') error("Dropping frag to non-existing id in %s" % href)
def build(self, job):
    """ Build epub.

    Pipeline: resize images, clean up every OPS content document,
    split documents into chunks, rewrite internal links, then ship
    everything out into the epub container.
    """

    ncx = TocNCX(job.dc)
    parsers = []  # NOTE: shadows the module-level 'parsers' inside build
    css_count = 0

    # add CSS parser
    self.add_external_css(job.spider, None, PRIVATE_CSS, 'pgepub.css')

    try:
        chunker = HTMLChunker.HTMLChunker()
        coverpage_url = None

        # do images early as we need the new dimensions later
        for p in job.spider.parsers:
            if hasattr(p, 'resize_image'):
                if 'coverpage' in p.attribs.rel:
                    # kindle needs a jpeg cover with its own size limits
                    if job.maintype == 'kindle':
                        np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                            MAX_COVER_DIMEN_KINDLE, 'jpeg')
                    else:
                        np = p.resize_image(MAX_IMAGE_SIZE, MAX_COVER_DIMEN)
                    np.id = p.attribs.get('id', 'coverpage')
                    coverpage_url = p.attribs.url
                else:
                    if job.maintype == 'kindle':
                        np = p.resize_image(MAX_IMAGE_SIZE_KINDLE,
                                            MAX_IMAGE_DIMEN_KINDLE)
                    else:
                        np = p.resize_image(MAX_IMAGE_SIZE, MAX_IMAGE_DIMEN)
                    np.id = p.attribs.get('id')
                parsers.append(np)

        for p in job.spider.parsers:
            if p.mediatype() in OPS_CONTENT_DOCUMENTS:
                debug("URL: %s" % p.attribs.url)

                if hasattr(p, 'rst2epub2'):
                    xhtml = p.rst2epub2(job)

                    if options.verbose >= 2:
                        # write html to disk for debugging
                        debugfilename = os.path.join(job.outputdir,
                                                     job.outputfile)
                        debugfilename = os.path.splitext(debugfilename)[0] + '.' + \
                            job.maintype + '.debug.html'
                        with open(debugfilename, 'wb') as fp:
                            fp.write(
                                etree.tostring(xhtml, encoding='utf-8'))
                else:
                    # make a copy so we can mess around
                    p.parse()
                    xhtml = copy.deepcopy(p.xhtml)

                # only strip floating pagenumber classes we know about
                strip_classes = self.get_classes_that_float(xhtml)
                strip_classes = strip_classes.intersection(STRIP_CLASSES)
                if strip_classes:
                    self.strip_pagenumbers(xhtml, strip_classes)

                # build up TOC
                # has side effects on xhtml
                ncx.toc += p.make_toc(xhtml)

                self.insert_root_div(xhtml)
                self.fix_charset(xhtml)
                self.fix_style_elements(xhtml)
                self.reflow_pre(xhtml)

                # strip all links to items not in manifest
                p.strip_links(xhtml, job.spider.dict_urls_mediatypes())
                self.strip_links(xhtml, job.spider.dict_urls_mediatypes())

                self.strip_noepub(xhtml)
                # self.strip_rst_dropcaps (xhtml)

                self.fix_html_image_dimensions(xhtml)
                if coverpage_url:
                    # the manifest cover would otherwise show up twice
                    self.remove_coverpage(xhtml, coverpage_url)

                # externalize and fix CSS
                for style in xpath(xhtml, '//xhtml:style'):
                    self.add_external_css(job.spider, xhtml, style.text,
                                          "%d.css" % css_count)
                    css_count += 1
                    style.drop_tree()

                self.add_external_css(job.spider, xhtml, None, 'pgepub.css')

                self.add_meta_generator(xhtml)

                debug("Splitting %s ..." % p.attribs.url)
                chunker.next_id = 0
                chunker.split(xhtml, p.attribs)

        for p in job.spider.parsers:
            if hasattr(p, 'sheet'):
                self.fix_incompatible_css(p.sheet)
                p.rewrite_links(self.url2filename)
                parsers.append(p)

        # after splitting html into chunks we have to rewrite all
        # internal links in HTML
        chunker.rewrite_internal_links()
        # also in the TOC
        if not ncx.toc:
            # make sure there is at least one TOC entry
            ncx.toc.append([job.spider.parsers[0].attribs.url, 'Start', 1])
        chunker.rewrite_internal_links_toc(ncx.toc)

        # make absolute links zip-filename-compatible
        chunker.rewrite_links(self.url2filename)
        ncx.rewrite_links(self.url2filename)

        # Do away with the chunker and copy all chunks into new parsers.
        # These are fake parsers that never actually parsed anything,
        # we just use them to just hold our data.
        for chunk, attribs in chunker.chunks:
            p = ParserFactory.ParserFactory.get(attribs)
            p.xhtml = chunk
            parsers.append(p)

        self.shipout(job, parsers, ncx)

    except Exception as what:
        exception("Error building Epub: %s" % what)
        raise