def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    '''
    Make sure every file in ``names`` links to every stylesheet in ``sheets``.

    :param container: Book container; this function uses its ``parsed``,
        ``dirty``, ``href_to_name``, ``name_to_href``, ``insert_into_xml``
        and ``remove_from_xml`` methods.
    :param names: Names of the markup files to process.
    :param sheets: Stylesheet names to link; missing ``<link>`` tags are
        appended to ``<head>`` in this order.
    :param remove: If true, first remove every existing ``<link href>`` whose
        type is ``mtype`` (a missing or empty type counts as ``mtype``).
    :param mtype: MIME type matched against, and written on, ``<link>`` tags.
    :return: The set of names whose markup was changed.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML
    changed_names = set()
    snames = set(sheets)
    lp = XPath('//h:link[@href]')
    hp = XPath('//h:head')

    def is_sheet_link(link):
        # A <link> without a (non-empty) type attribute is treated as mtype
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)
        if remove:
            for link in lp(root):
                if is_sheet_link(link):
                    container.remove_from_xml(link)
                    changed_names.add(name)
                    container.dirty(name)
        existing = {container.href_to_name(l.get('href'), name)
                    for l in lp(root) if is_sheet_link(l)}
        extra = snames - existing
        if extra:
            changed_names.add(name)
            try:
                parent = hp(root)[0]
            except (TypeError, IndexError):
                # No <head> present: build a real element. XHTML('head') on
                # its own is only the qualified tag name (a string), which can
                # neither be inserted into the tree nor asked to makeelement().
                parent = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, parent, index=0)
            for sheet in sheets:
                if sheet in extra:
                    container.insert_into_xml(
                        parent, parent.makeelement(
                            XHTML('link'), rel='stylesheet', type=mtype,
                            href=container.name_to_href(sheet, name)))
            container.dirty(name)
    return changed_names
def add_pagenum_toc(root, toc, opts, page_number_display_map):
    '''
    Append a table of contents with page numbers to the last tag of ``root``.

    A ``<style>`` element, a heading and a two column table (title, page
    number) are appended to the element returned by ``last_tag(root)``, which
    also gets the class ``calibre-pdf-toc``.

    :param toc: Object whose ``iterdescendants(level=0)`` yields
        ``(level, node)`` pairs; ``node.title`` and ``node.pdf_loc.pagenum``
        are read.
    :param opts: Options object; ``extra_css`` and ``toc_title`` are read.
    :param page_number_display_map: Mapping from page number to the value to
        display; numbers not in the map are shown as is.
    '''
    body = last_tag(root)
    # (level, indent in em) pairs for levels 1..6, flattened for the template
    fmt_args = [value for depth in range(1, 7) for value in (depth, 1.4 * depth)]
    css = ''' .calibre-pdf-toc table { width: 100%% } .calibre-pdf-toc table tr td:last-of-type { text-align: right } .calibre-pdf-toc .level-0 { font-size: larger; } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } .calibre-pdf-toc .level-%d td:first-of-type { padding-left: %.1gem } ''' % tuple(fmt_args) + (opts.extra_css or '')
    style = body.makeelement(XHTML('style'), type='text/css')
    style.text = css
    body.append(style)
    body.set('class', 'calibre-pdf-toc')

    def new_element(tag, cls=None, text=None, tail=None, parent=None, **attrs):
        # Create an XHTML element and optionally attach it to ``parent``
        node = body.makeelement(XHTML(tag), **attrs)
        node.text = text
        node.tail = tail
        if cls is not None:
            node.set('class', cls)
        if parent is not None:
            parent.append(node)
        return node

    heading = opts.toc_title or _('Table of Contents')
    new_element('h2', text=heading, parent=body)
    table = new_element('table', parent=body)
    for level, node in toc.iterdescendants(level=0):
        row = new_element('tr', cls='level-%d' % level, parent=table)
        new_element('td', text=node.title or _('Unknown'), parent=row)
        pagenum = node.pdf_loc.pagenum
        shown = page_number_display_map.get(pagenum, pagenum)
        new_element('td', text=f'{shown}', parent=row)
def get_cover_page(self):
    '''
    Return markup for the cover image and for the title page.

    The cover becomes an ``<IMG>`` tag when the guide has a ``cover`` entry
    whose href has a truthy entry in ``self.name_map``. The title page is
    dumped via ``self.dump_text`` only when its manifest item is not part of
    the spine (``spine_position is None``).
    '''
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    guide = self.oeb_book.guide
    output = u''
    if 'cover' in guide:
        cover_name = self.name_map.get(guide['cover'].href, None)
        if cover_name:
            output += '<IMG SRC="%s">' % cover_name
    if 'titlepage' not in guide:
        return output

    self.log.debug('Generating cover page...')
    item = self.oeb_book.manifest.hrefs[guide['titlepage'].href]
    if item.spine_position is None:
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
                            self.opts.output_profile)
        body = item.data.find(XHTML('body'))
        output += ''.join(self.dump_text(body, stylizer, item))
    return output
def flatten_spine(self):
    '''
    Flatten the styles of every spine item into one generated stylesheet.

    Each item's ``<body>`` is passed to ``self.flatten_node``, which fills
    ``styles`` (a map of CSS text to class name). The collected rules are
    written, sorted by class name, through ``self.replace_css`` and every
    item's head is then updated with ``self.flatten_head``.
    '''
    names = defaultdict(int)
    styles = {}
    for item in self.oeb.spine:
        stylizer = self.stylizers[item]
        if self.specializer is not None:
            self.specializer(item, stylizer)
        body = item.data.find(XHTML('body'))
        fsize = self.context.dest.fbase
        self.flatten_node(body, stylizer, names, styles, fsize, item.id)
    # styles maps css -> class name; emit the rules ordered by class name
    items = sorted((key, val) for (val, key) in styles.items())
    css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)
    href = self.replace_css(css)
    global_css = self.collect_global_css()
    for item in self.oeb.spine:
        self.flatten_head(item, href, global_css[item])
class RecodeCallbackDiv(RecodeCallbackBase):
    '''Recode callback for XHTML ``<div>`` elements.'''

    tag = XHTML('div')

    def get_begin(self, element):
        '''Return the opening markup built from the element's classes.'''
        classes = self.get_classes(element)
        opened = list(self.get_class_layout(classes))
        opened += self.get_class_style(classes)
        # remember how many functions were opened so get_end() can close
        # exactly that many
        self.push(len(opened))
        return "".join(opened)

    def get_end(self, element):
        '''Return one closing brace per opened function, plus a newline.'''
        closers = "}" * self.pop()
        # div is a block tag, so terminate it with a new line
        return closers + "\n"
def mlize_spine(self, oeb_book):
    '''
    Convert every spine item of ``oeb_book`` and return one HTML document.

    Ids and links of each item are rewritten first; the stylesheet is either
    linked (``htmlz_class_style == 'external'``) or embedded in ``<head>``.
    '''
    body_parts = []
    for item in oeb_book.spine:
        self.log.debug('Converting %s to HTML...' % item.href)
        self.rewrite_ids(item.data, item)
        rewrite_links(item.data, partial(self.rewrite_link, page=item))
        stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
        body_parts.extend(
            self.dump_text(item.data.find(XHTML('body')), stylizer, item))
        body_parts.append('\n\n')

    if self.opts.htmlz_class_style == 'external':
        css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
    else:
        css = u'<style type="text/css">' + self.get_css(oeb_book) + u'</style>'
    title = u'<title>%s</title>' % prepare_string_for_xml(self.book_title)

    document = [
        u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />',
        css,
        title,
        u'</head><body>',
    ]
    document.extend(body_parts)
    document.append(u'</body></html>')
    return ''.join(document)
def get_cover_page(self):
    '''
    Return the markup for the cover image and the title page.

    A guide ``cover`` entry emits a ``\\m="cover.png"`` line and registers
    the cover href as ``cover.png`` in ``self.image_hrefs``. The title page
    is dumped via ``self.dump_text`` only when its manifest item is not in
    the spine (``spine_position is None``).
    '''
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    guide = self.oeb_book.guide
    output = ''
    if 'cover' in guide:
        output += '\\m="cover.png"\n'
        self.image_hrefs[guide['cover'].href] = 'cover.png'
    if 'titlepage' not in guide:
        return output

    self.log.debug('Generating title page...')
    item = self.oeb_book.manifest.hrefs[guide['titlepage'].href]
    if item.spine_position is None:
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
                            self.opts.output_profile)
        body = item.data.find(XHTML('body'))
        output += ''.join(self.dump_text(body, stylizer, item))
    return output
def merge_css(container, names, master):
    '''
    Merge the stylesheets in ``names`` into the stylesheet ``master``.

    Every rule except charset rules is added to the master sheet and the
    merged file is removed from the container. Afterwards each document that
    linked to a merged sheet has those links removed and, if it does not
    already link to ``master``, gets a link to it appended to its ``<head>``.

    :param container: Book container holding the files.
    :param names: Names of the stylesheets to merge (``master`` is skipped).
    :param master: Name of the stylesheet that receives the rules.
    '''
    p = container.parsed
    msheet = p(master)
    master_base = os.path.dirname(master)
    merged = set()

    for name in names:
        if name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        sheet = p(name)

        # Remove charset rules. Collect them first so the rule list is not
        # modified while it is being scanned.
        charset_rules = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
        for rule in charset_rules:
            sheet.deleteRule(sheet.cssRules.index(rule))
        for rule in sheet.cssRules:
            msheet.add(rule)

        container.remove_item(name)
        merged.add(name)

    # Remove links to merged stylesheets in the html files, replacing with a
    # link to the master sheet
    link_xpath = XPath('//h:link[@href]')
    # .items() instead of .iteritems() so this also runs on Python 3
    for name, mt in container.mime_map.items():
        if mt not in OEB_DOCS:
            continue
        removed = False
        root = p(name)
        for link in link_xpath(root):
            q = container.href_to_name(link.get('href'), name)
            if q in merged:
                container.remove_from_xml(link)
                removed = True
        if not removed:
            continue
        container.dirty(name)
        if master not in set(all_stylesheets(container, name)):
            head = root.find('h:head', namespaces=XPNSMAP)
            if head is not None:
                link = head.makeelement(
                    XHTML('link'), type='text/css', rel='stylesheet',
                    href=container.name_to_href(master, name))
                container.insert_into_xml(head, link)
def extract_css_into_flows(self):
    '''
    Move all CSS into ``self.flows`` and point the markup at those flows.

    Every manifest stylesheet is appended to ``self.flows`` as text. In each
    spine item, ``<link>`` tags that reference such a sheet are rewritten to
    ``kindle:flow:`` URLs. Non-empty ``<style>`` tags are replaced by
    ``<link>`` tags pointing at a flow holding their text (identical style
    text shares one flow); empty ``<style>`` tags are removed.
    '''
    inlines = defaultdict(list)  # Ensure identical <style>s not repeated
    sheets = {}
    link_xpath = XPath('//h:link[@href]')
    style_xpath = XPath('//h:style')

    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            if not self.opts.expand_css and hasattr(item.data, 'cssText'):
                condense_sheet(self.data(item))
            data = self.data(item).cssText
            sheets[item.href] = len(self.flows)
            self.flows.append(force_unicode(data, 'utf-8'))

    for item in self.oeb.spine:
        root = self.data(item)

        for link in link_xpath(root):
            href = item.abshref(link.get('href'))
            idx = sheets.get(href, None)
            if idx is not None:
                idx = to_ref(idx)
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        for tag in style_xpath(root):
            p = tag.getparent()
            idx = p.index(tag)
            raw = tag.text
            if not raw or not raw.strip():
                extract(tag)
                continue
            repl = etree.Element(XHTML('link'), type='text/css',
                                 rel='stylesheet')
            repl.tail = '\n'
            p.insert(idx, repl)
            extract(tag)
            inlines[raw].append(repl)

    # .items() instead of .iteritems() so this also runs on Python 3
    for raw, elems in inlines.items():
        idx = to_ref(len(self.flows))
        self.flows.append(raw)
        for link in elems:
            link.set('href', 'kindle:flow:%s?mime=text/css' % idx)
def get_length(root):
    '''
    Return a size estimate for the document ``root``.

    The estimate is the number of non-whitespace characters in the text and
    tail of every ``<body>`` and its descendant elements, plus 2000 for every
    ``img`` or ``svg`` element. Text inside ``script`` and ``style`` elements
    is ignored (their tails are still counted).
    '''
    strip_space = re.compile(r'\s+')
    # Real sets: ``x in 'script style'`` is a substring test and matches
    # unrelated tag names such as 'p', 'i' or 's'
    ignore_tags = frozenset(('script', 'style'))
    img_tags = frozenset(('img', 'svg'))
    ans = 0

    def count(elem):
        num = 0
        tname = elem.tag.rpartition('}')[-1].lower()
        # Pattern.sub() takes (replacement, string) in that order
        if elem.text and tname not in ignore_tags:
            num += len(strip_space.sub('', elem.text))
        if elem.tail:
            num += len(strip_space.sub('', elem.tail))
        if tname in img_tags:
            num += 2000
        return num

    for body in root.iterdescendants(XHTML('body')):
        ans += count(body)
        for elem in body.iterdescendants('*'):
            ans += count(elem)
    return ans
def mlize_spine(self):
    '''
    Convert the whole spine to text and return it as one string.

    The output starts with ``self.get_toc()``. Each item is serialized,
    passed through ``self.remove_newlines``, re-parsed and dumped with
    ``self.dump_text``, followed by six newlines. Finally trailing whitespace
    is stripped from every line and ``self.cleanup_text`` is applied.
    '''
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer

    pieces = [u'', self.get_toc()]
    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to TXT...' % item.href)
        markup = unicode(etree.tostring(item.data, encoding=unicode))
        tree = etree.fromstring(self.remove_newlines(markup))
        stylizer = Stylizer(tree, item.href, self.oeb_book, self.opts,
                            self.opts.output_profile)
        pieces.extend(self.dump_text(tree.find(XHTML('body')), stylizer, item))
        pieces.append('\n\n\n\n\n\n')

    text = u''.join(pieces)
    text = u'\n'.join(line.rstrip() for line in text.splitlines())
    return self.cleanup_text(text)
def epubify_markup(self, root, log):
    '''
    Apply a set of in-place markup fixes to ``root`` for EPUB output.

    :param root: Root element of the document to fix.
    :param log: Unused by this method.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML
    # Fix empty title tags
    for t in XPath('//h:title')(root):
        if not t.text:
            t.text = u' '
    # Fix <p><div> constructs as the asinine epubchecker complains
    # about them
    pdiv = XPath('//h:p/h:div')
    for div in pdiv(root):
        div.getparent().tag = XHTML('div')

    # Remove the position:relative as it causes problems with some epub
    # renderers. Remove display: block on an image inside a div as it is
    # redundant and prevents text-align:center from working in ADE
    # Also ensure that the img is contained in its containing div
    imgpath = XPath('//h:div/h:img[@style]')
    for img in imgpath(root):
        div = img.getparent()
        if len(div) == 1:
            style = div.attrib.get('style', '')
            if style and not style.endswith(';'):
                style = style + ';'
            # Ensures position of containing div is static
            style += 'position:static'
            # Ensure that the img is always contained in its frame
            div.attrib['style'] = style
            img.attrib['style'] = 'max-width: 100%; max-height: 100%'

    # A div/div/img construct causes text-align:center to not work in ADE
    # so set the display of the second div to inline. This should have no
    # effect (apart from minor vspace issues) in a compliant HTML renderer
    # but it fixes the centering of the image via a text-align:center on
    # the first div in ADE
    imgpath = XPath('descendant::h:div/h:div/h:img')
    for img in imgpath(root):
        div2 = img.getparent()
        div1 = div2.getparent()
        if len(div1) == len(div2) == 1:
            # The inner div is not guaranteed to have a style attribute (the
            # loop above only touches parents of styled images), so do not
            # index attrib directly
            style = div2.attrib.get('style', '')
            div2.attrib['style'] = 'display:inline;' + style
def flatten_spine(self):
    '''
    Flatten the styles of every item into one generated stylesheet.

    For each item the root element is flattened without recursion and then
    its ``<body>`` is flattened. ``self.flatten_node`` fills ``styles``
    (CSS text to class name) and ``pseudo_styles`` (pseudo-class to such a
    map). The rules are written through ``self.replace_css`` and every item's
    head is updated with ``self.flatten_head``.
    '''
    names = defaultdict(int)
    styles, pseudo_styles = {}, defaultdict(dict)
    for item in self.items:
        html = item.data
        stylizer = self.stylizers[item]
        if self.specializer is not None:
            self.specializer(item, stylizer)
        fsize = self.context.dest.fbase
        self.flatten_node(html, stylizer, names, styles, pseudo_styles,
                          fsize, item.id, recurse=False)
        self.flatten_node(html.find(XHTML('body')), stylizer, names, styles,
                          pseudo_styles, fsize, item.id)
    items = sorted(((key, val) for (val, key) in iteritems(styles)),
                   key=lambda x: numeric_sort_key(x[0]))
    # :hover must come after link and :active must come after :hover
    psels = sorted(pseudo_styles,
                   key=lambda x: {'hover': 1, 'active': 2}.get(x, 0))
    for psel in psels:
        # Separate name so the main ``styles`` map is not shadowed
        psel_styles = pseudo_styles[psel]
        if not psel_styles:
            continue
        items.extend(sorted((k + ':' + psel, v)
                            for v, k in iteritems(psel_styles)))
    css = ''.join(".%s {\n%s;\n}\n\n" % (key, val) for key, val in items)

    href = self.replace_css(css)
    global_css = self.collect_global_css()
    for item in self.items:
        self.flatten_head(item, href, global_css[item])
def rasterize_external(self, elem, style, item, svgitem):
    '''
    Replace ``elem`` with an ``<img>`` that shows a PNG rendering of
    ``svgitem``.

    The target size is ``style['width']`` x ``style['height']`` scaled by
    ``self.profile.dpi / 72``, keeping the SVG's aspect ratio. Rendered
    images are cached in ``self.images`` keyed by (svg href, width, height),
    so the same SVG at the same size is added to the manifest only once.

    :param elem: Element to turn into the ``<img>``; attributes not in
        ``KEEP_ATTRS`` and all children are removed, and any text becomes
        the ``alt`` attribute.
    :param style: Mapping providing ``width`` and ``height``.
    :param item: Item containing ``elem``; used to compute the relative
        ``src``.
    :param svgitem: Manifest item of the SVG to rasterize.
    '''
    width = (style['width'] / 72) * self.profile.dpi
    height = (style['height'] / 72) * self.profile.dpi
    data = QByteArray(str(svgitem))
    svg = QSvgRenderer(data)
    size = svg.defaultSize()
    size.scale(width, height, Qt.KeepAspectRatio)
    key = (svgitem.href, size.width(), size.height())
    if key in self.images:
        href = self.images[key]
    else:
        logger = self.oeb.logger
        logger.info('Rasterizing %r to %dx%d'
                    % (svgitem.href, size.width(), size.height()))
        image = QImage(size, QImage.Format_ARGB32_Premultiplied)
        image.fill(QColor("white").rgb())
        painter = QPainter(image)
        svg.render(painter)
        painter.end()
        array = QByteArray()
        png_buffer = QBuffer(array)
        png_buffer.open(QIODevice.WriteOnly)
        image.save(png_buffer, 'PNG')
        data = str(array)
        manifest = self.oeb.manifest
        href = os.path.splitext(svgitem.href)[0] + '.png'
        item_id, href = manifest.generate(svgitem.id, href)
        manifest.add(item_id, href, PNG_MIME, data=data)
        self.images[key] = href
    elem.tag = XHTML('img')
    # Iterate over snapshots: attributes and children are removed inside the
    # loops, and the live containers must not be mutated while being walked
    for attr in list(elem.attrib):
        if attr not in KEEP_ATTRS:
            del elem.attrib[attr]
    elem.attrib['src'] = item.relhref(href)
    if elem.text:
        elem.attrib['alt'] = elem.text
        elem.text = None
    for child in list(elem):
        elem.remove(child)
def split_on_page_breaks(self, orig_tree):
    '''
    Split ``orig_tree`` at every page break and store the result in
    ``self.trees``.

    Page breaks are applied in the document order of their ids. Trees for
    which ``self.is_page_empty`` is true are dropped; the ids they contained
    (except those starting with ``calibre_``) are re-created as zero height
    ``<div>`` elements at the start of the body of the next kept tree, unless
    that id already exists there.
    '''
    known_ids = frozenset(self.page_break_ids)
    pending = OrderedDict()
    for elem_id in orig_tree.xpath('//*/@id'):
        if elem_id in known_ids:
            position = self.page_break_ids.index(elem_id)
            pending[elem_id] = self.page_breaks[position]

    self.trees = [orig_tree]
    while pending:
        # take the page breaks in insertion (document) order
        _pb_id, (pattern, before) = pending.popitem(last=False)
        for i in reversed(range(len(self.trees))):
            tree = self.trees[i]
            elem = pattern(tree)
            if not elem:
                continue
            self.log.debug('\t\tSplitting on page-break at id=%s'%
                           elem[0].get('id'))
            before_tree, after_tree = self.do_split(tree, elem[0], before)
            self.trees[i:i+1] = [before_tree, after_tree]
            break

    kept, orphaned_ids = [], set()
    for tree in self.trees:
        root = tree.getroot()
        if self.is_page_empty(root):
            # remember the anchors of the discarded page
            for node in root.xpath('//*[@id]'):
                node_id = node.get('id')
                if not node_id.startswith('calibre_'):
                    orphaned_ids.add(node_id)
            continue
        if orphaned_ids:
            body = self.get_body(root)
            if body is not None:
                present = frozenset(body.xpath('//*/@id'))
                for node_id in orphaned_ids - present:
                    body.insert(0, body.makeelement(
                        XHTML('div'), id=node_id, style='height:0pt'))
        orphaned_ids = set()
        kept.append(tree)
    self.trees = kept
def get_length(root):
    '''
    Return a size estimate for the document ``root``.

    The estimate is the number of non-whitespace characters in the text and
    tail of every ``<body>`` and its descendant elements, plus 2000 for every
    ``img`` or ``svg`` element. Text inside ``script``, ``style``, ``title``
    and ``noscript`` elements is ignored (their tails are still counted).
    '''
    strip_space = re.compile(r'\s+')
    ans = 0
    ignore_tags = frozenset('script style title noscript'.split())
    # A real set: ``tname in 'img svg'`` is a substring test and would also
    # match tags such as 'i', 's' or 'g'
    img_tags = frozenset(('img', 'svg'))

    def count(elem):
        num = 0
        tname = elem.tag.rpartition('}')[-1].lower()
        if elem.text and tname not in ignore_tags:
            num += len(strip_space.sub('', elem.text))
        if elem.tail:
            num += len(strip_space.sub('', elem.tail))
        if tname in img_tags:
            num += 2000
        return num

    for body in root.iterdescendants(XHTML('body')):
        ans += count(body)
        for elem in body.iterdescendants('*'):
            ans += count(elem)
    return ans
def get_page_sheet(self):
    '''
    Return the page stylesheet manifest item, creating it on first use.

    On creation an empty sheet named ``page_styles.css`` is added to the
    manifest and a ``<link>`` to it is appended to the first ``<head>`` found
    through ``self.current_item.xpath``; a warning is logged if there is no
    ``<head>``.
    '''
    if self.page_sheet is not None:
        return self.page_sheet

    manifest = self.oeb.manifest
    id_, href = manifest.generate('page_css', 'page_styles.css')
    empty_sheet = self.parser.parseString('', validate=False)
    self.page_sheet = manifest.add(id_, href, CSS_MIME, data=empty_sheet)
    heads = self.current_item.xpath('//*[local-name()="head"][1]')
    if not heads:
        self.log.warn('No <head> cannot embed font rules')
        return self.page_sheet

    link = etree.SubElement(heads[0], XHTML('link'), rel='stylesheet',
                            type=CSS_MIME,
                            href=self.current_item.relhref(href))
    link.tail = '\n'
    return self.page_sheet
def transform_css(self):
    '''
    Run the module level ``transform_css`` on this container and then turn
    every non-empty ``text/css`` ``<style>`` element into an external
    stylesheet referenced by a ``<link>`` in ``<head>``.
    '''
    transform_css(self, transform_sheet=transform_sheet,
                  transform_style=transform_declaration)
    # Firefox flakes out sometimes when dynamically creating <style> tags,
    # so convert them to external stylesheets to ensure they never fail
    find_styles = XPath('//h:style')
    # snapshot of the map, since add_file() is called inside the loop
    for name, mime in tuple(iteritems(self.mime_map)):
        if mime.lower() not in OEB_DOCS:
            continue
        head = ensure_head(self.parsed(name))
        for style in find_styles(self.parsed(name)):
            if not style.text:
                continue
            if (style.get('type') or 'text/css').lower() != 'text/css':
                continue
            if not has_ancestor(style, head):
                # move stray <style> tags into <head>
                extract(style)
                head.append(style)
            css = style.text
            style.clear()
            style.tag = XHTML('link')
            style.set('type', 'text/css')
            style.set('rel', 'stylesheet')
            sheet_name = self.add_file(name + '.css', css.encode('utf-8'),
                                       modify_name_if_needed=True)
            style.set('href', self.name_to_href(sheet_name, name))
def smallcaps_elem(self, elem, attr):
    '''
    Rewrite the text or tail of ``elem`` in place for small caps.

    The string in ``elem.<attr>`` is split with ``self.split_text``. Pieces
    for which ``str.isupper()`` is true are kept as plain text; every other
    piece is upper-cased and wrapped in a ``<span class="calibre_lowercase">``.

    :param elem: The element whose text or tail is processed.
    :param attr: ``'text'`` or ``'tail'``, selecting which string to process.
    '''
    texts = self.split_text(getattr(elem, attr))
    setattr(elem, attr, None)
    # ``last`` is the node after which new content goes: for the tail that is
    # elem itself, for the text there is no such node yet
    last = elem if attr == 'tail' else None
    attrib = {'class': 'calibre_lowercase'}
    for text in texts:
        if text.isupper():
            # Plain text: becomes elem.text, or the tail of the previous node
            if last is None:
                elem.text = text
            else:
                last.tail = text
        else:
            child = elem.makeelement(XHTML('span'), attrib=attrib)
            child.text = text.upper()
            if last is None:
                elem.insert(0, child)
            else:
                # addnext() moves the tail for some reason
                tail = last.tail
                last.addnext(child)
                last.tail = tail
                child.tail = None
            last = child
def add_anchors_markup(root, uuid, anchors):
    '''
    Append a ``<div id=uuid>`` to the last tag of ``root`` containing one
    ``<a href="#anchor">`` per entry of ``anchors``, followed by a link to
    ``uuid`` itself.
    '''
    body = last_tag(root)
    container_style = 'display:block !important; page-break-before: always !important; break-before: always !important; white-space: pre-wrap !important'
    link_style = 'min-width: 10px !important; min-height: 10px !important; border: solid 1px !important;'
    div = body.makeelement(XHTML('div'), id=uuid, style=container_style)
    div.text = '\n\n'
    body.append(div)

    def append_link(target):
        link = div.makeelement(XHTML('a'), href='#' + target, style=link_style)
        link.text = ' '
        link.tail = ' '
        div.append(link)

    for anchor in anchors:
        append_link(anchor)
    append_link(uuid)
def serialize_item(self, item):
    '''
    Serialize an individual item from the spine of the input document.

    The buffer offset at which the item starts is stored in
    ``self.id_offsets`` under the normalized href of the item. For
    non-linear items the preceding offset is also recorded in
    ``self.breaks``.
    '''
    out = self.buf
    if not item.linear:
        self.breaks.append(out.tell() - 1)
    self.id_offsets[urlnormalize(item.href)] = out.tell()

    # opening markers
    if item.is_section_start:
        out.write(b'<a ></a> ')
    if item.is_article_start:
        out.write(b'<a ></a> <a ></a>')

    body = item.data.find(XHTML('body'))
    for child in body:
        self.serialize_elem(child, item)
    if self.write_page_breaks_after_item:
        out.write(b'<mbp:pagebreak/>')

    # closing markers
    if item.is_article_end:
        # Kindle periodical article end marker
        out.write(b'<a ></a> <a ></a>')
    if item.is_section_end:
        out.write(b' <a ></a>')
    self.anchor_offset = None
def get_text(self):
    '''
    Return the ``<body>`` markup for the whole spine as one string.

    A ``<section>`` is opened per item when sectionizing by files or when
    ``self.toc`` has a ``None`` entry for the item's href. With
    ``sectionize == 'nothing'`` one enclosing section is opened up front.
    Any section still open at the end is closed.
    '''
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer

    parts = ['<body>']
    # Create main section if there are no others to create
    if self.opts.sectionize == 'nothing':
        parts.append('<section>')
        self.section_level += 1

    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to FictionBook2 XML' % item.href)
        stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts,
                            self.opts.output_profile)

        # Start a <section> if we must sectionize each file or if the TOC
        # references this page
        opened_section = (self.opts.sectionize == 'files'
                          or None in self.toc.get(item.href, ()))
        if opened_section:
            parts.append('<section>')
            self.section_level += 1

        parts.extend(
            self.dump_text(item.data.find(XHTML('body')), stylizer, item))

        if opened_section:
            parts.append('</section>')
            self.section_level -= 1

    # Close any open sections
    while self.section_level > 0:
        parts.append('</section>')
        self.section_level -= 1

    parts.append('</body>')
    return ''.join(parts)
def extract_svg_into_flows(self):
    '''
    Move all SVG content into ``self.flows``.

    Manifest items of type ``SVG_MIME`` are serialized into flows. In every
    spine item, inline ``<svg>`` elements are serialized into their own flow
    and replaced by an ``<img>`` pointing at it, and ``<img src>`` references
    to SVG manifest items are rewritten to ``kindle:flow:`` URLs.
    '''
    flow_src = 'kindle:flow:%s?mime=image/svg+xml'
    find_svg = XPath('//svg:svg')
    find_img = XPath('//h:img[@src]')

    svg_flow_index = {}
    for item in self.oeb.manifest:
        if item.media_type != SVG_MIME:
            continue
        data = self.data(item)
        svg_flow_index[item.href] = len(self.flows)
        self.flows.append(etree.tostring(
            data, encoding='UTF-8', with_tail=True, xml_declaration=True))

    for item in self.oeb.spine:
        root = self.data(item)

        for svg in find_svg(root):
            raw = etree.tostring(svg, encoding=unicode_type, with_tail=False)
            flow_idx = len(self.flows)
            self.flows.append(raw)
            parent = svg.getparent()
            position = parent.index(svg)
            replacement = etree.Element(XHTML('img'),
                                        src=flow_src % to_ref(flow_idx))
            parent.insert(position, replacement)
            extract(svg)

        for img in find_img(root):
            target = item.abshref(img.get('src'))
            flow_idx = svg_flow_index.get(target, None)
            if flow_idx is not None:
                img.set('src', flow_src % to_ref(flow_idx))
def get_length(root):
    '''
    Return a size estimate for the document ``root``.

    Each ``<body>`` child of ``root`` and all of its descendants are
    measured. When ``speedup.get_element_char_length`` exists it does the
    measuring; otherwise a pure Python fallback counts non-whitespace
    characters in text and tail (text of script/style/title/noscript is
    skipped) and adds 1000 for each ``img`` or ``svg``. Nodes whose ``tag``
    is missing or callable contribute only their tail.
    '''
    fast = getattr(speedup, 'get_element_char_length', None)

    if fast is not None:
        def count(elem):
            tag = getattr(elem, 'tag', count)
            if callable(tag):
                return fast('', None, getattr(elem, 'tail', None))
            return fast(tag, elem.text, elem.tail)
    else:
        ignore_tags = frozenset('script style title noscript'.split())
        img_tags = ('img', 'svg')
        strip_space = re.compile(r'\s+')

        def visible_len(text):
            return len(strip_space.sub('', text))

        def count(elem):
            tag = getattr(elem, 'tag', count)
            if callable(tag):
                return visible_len(getattr(elem, 'tail', None) or '')
            tname = tag.rpartition('}')[-1].lower()
            total = 0
            if elem.text and tname not in ignore_tags:
                total += visible_len(elem.text)
            if elem.tail:
                total += visible_len(elem.tail)
            if tname in img_tags:
                total += 1000
            return total

    ans = 0
    for body in root.iterchildren(XHTML('body')):
        ans += count(body)
        for node in body.iterdescendants():
            ans += count(node)
    return ans
def mlize_spine(self):
    '''
    Convert the whole spine to text and return it as one string.

    The output starts with ``self.get_toc()``. In every item ``--`` inside
    comments is replaced by ``__`` before the item is serialized, passed
    through ``self.remove_newlines``, re-parsed and dumped with
    ``self.dump_text``, followed by six newlines. Finally trailing whitespace
    is stripped from every line and ``self.cleanup_text`` is applied.
    '''
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring

    pieces = [u'', self.get_toc()]
    for item in self.oeb_book.spine:
        self.log.debug('Converting %s to TXT...' % item.href)
        for comment in item.data.iterdescendants(etree.Comment):
            if comment.text and '--' in comment.text:
                comment.text = comment.text.replace('--', '__')
        markup = etree.tostring(item.data, encoding='unicode')
        tree = safe_xml_fromstring(self.remove_newlines(markup))
        stylizer = Stylizer(tree, item.href, self.oeb_book, self.opts,
                            self.opts.output_profile)
        pieces.extend(self.dump_text(tree.find(XHTML('body')), stylizer, item))
        pieces.append('\n\n\n\n\n\n')

    text = ''.join(pieces)
    text = '\n'.join(line.rstrip() for line in text.splitlines())
    return self.cleanup_text(text)
def split_on_page_breaks(self, orig_tree):
    '''
    Split ``orig_tree`` at every page break and store the result in
    ``self.trees``.

    Page breaks are applied in the document order of their ids, each one to
    the remainder left by the previous split. Trees for which
    ``self.is_page_empty`` is true are dropped; the ids they contained
    (except those starting with ``calibre_``) are re-created as zero height
    ``<div>`` elements at the start of the body of the next kept tree.
    '''
    element_ids = (e.get('id') for e in orig_tree.xpath('//*[@id]'))
    page_breaks = [
        self.page_breaks[self.page_break_ids.index(elem_id)]
        for elem_id in element_ids if elem_id in self.page_break_ids
    ]

    self.trees = []
    remainder = orig_tree
    for pattern, before in page_breaks:
        match = pattern(remainder)
        if not match:
            continue
        self.log.debug('\t\tSplitting on page-break')
        head_tree, remainder = self.do_split(remainder, match[0], before)
        self.trees.append(head_tree)
    self.trees.append(remainder)

    kept, orphaned_ids = [], set([])
    for tree in self.trees:
        root = tree.getroot()
        if self.is_page_empty(root):
            # remember the anchors of the discarded page
            for node in root.xpath('//*[@id]'):
                node_id = node.get('id')
                if not node_id.startswith('calibre_'):
                    orphaned_ids.add(node_id)
            continue
        if orphaned_ids:
            body = self.get_body(root)
            if body is not None:
                for node_id in orphaned_ids:
                    body.insert(0, body.makeelement(
                        XHTML('div'), id=node_id, style='height:0pt'))
        orphaned_ids = set([])
        kept.append(tree)
    self.trees = kept
def stylize_spine(self):
    '''
    Build a ``Stylizer`` for every spine item and store it in
    ``self.stylizers``.

    Before stylizing, any ``style`` attribute on the root element is moved in
    front of the ``<body>`` style, and declarations for margins, padding,
    page break, justification and font family are appended to the body style
    according to ``self.context`` and this object's settings.
    '''
    self.stylizers = {}
    context = self.context
    profile = context.source
    css = ''
    for item in self.oeb.spine:
        html = item.data
        body = html.find(XHTML('body'))
        if 'style' in html.attrib:
            body_style = body.attrib.get('style', '')
            body.set('style', html.get('style') + ';' + body_style)
            del html.attrib['style']

        decls = body.get('style', '').split(';')
        decls += ['margin-top: 0pt', 'margin-bottom: 0pt']
        margin_left = float(context.margin_left)
        if margin_left >= 0:
            decls.append('margin-left : %gpt' % margin_left)
        margin_right = float(context.margin_right)
        if margin_right >= 0:
            decls.append('margin-right : %gpt' % margin_right)
        decls += ['padding-left: 0pt', 'padding-right: 0pt']
        if self.page_break_on_body:
            decls.append('page-break-before: always')
        if context.change_justification != 'original':
            decls.append('text-align: ' + context.change_justification)
        if self.body_font_family:
            decls.append(u'font-family: ' + self.body_font_family)
        body.set('style', '; '.join(decls))

        self.stylizers[item] = Stylizer(
            html, item.href, self.oeb, context, profile,
            user_css=context.extra_css, extra_css=css)
def merge_html(container, names, master):
    '''
    Merge the HTML files in ``names`` into the file ``master``.

    For every file other than ``master``: its stylesheets are linked from the
    master's ``<head>``, its ids are made unique with respect to the ids
    already seen, the children of its ``<body>`` elements are deep-copied to
    the end of the master's last ``<body>``, and the file is removed from the
    container. Finally links throughout the container are passed through
    ``MergeLinkReplacer``.

    :param container: Book container holding the files.
    :param names: Names of the files to merge (``master`` itself is skipped).
    :param master: Name of the file that receives the content.
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per merged file: map of old anchor -> new anchor in master. The ''
    # key holds the anchor of the file's first element.
    anchor_map = {n: {} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(
                    sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Flat list of leading body text (str) and child elements of all
        # <body> tags in this file
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(
                body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first real element; first_child stays a string if there
        # is none
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, string_or_bytes):
                break
        if isinstance(first_child, string_or_bytes):
            # body contained only text, no tags
            # NOTE(review): relies on ``body`` and ``children[0]`` from the
            # loop above, so it assumes at least one <body> was found
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        # Rename ids that clash with ids already present in the merged result
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        # The first element gets an id so the start of the merged file can be
        # linked to
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, string_or_bytes):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in iteritems(container.mime_map):
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
    '''
    Replace the styling of ``node`` and all of its descendants by generated
    CSS classes.

    Presentational attributes (``align``, ``valign``, ``size``, ``face``,
    ``color``, ``bgcolor``) are converted to CSS properties and removed,
    ``<font>`` tags become ``<div>`` or ``<span>``, font sizes are rescaled,
    and the resulting declarations are registered in ``styles`` /
    ``pseudo_styles``. The node's ``class`` attribute is set to the generated
    class name(s) and its ``style`` attribute is removed.

    :param node: Element to process; nodes that are not XHTML elements are
        skipped.
    :param stylizer: Provides the computed style via ``stylizer.style(node)``.
    :param names: Counter of how often each base class name has been used.
    :param styles: Map of CSS text to generated class name; updated in place.
    :param pseudo_styles: Map of pseudo selector to a map like ``styles``;
        updated in place.
    :param psize: Font size of the parent, used as the base for ``em`` values.
    :param item_id: Id of the containing item; ``'calibre_jacket'`` is
        treated specially when paragraph spacing is changed.
    '''
    if not isinstance(node.tag, string_or_bytes) \
            or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    # Convert the align attribute to CSS
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            # On images, align maps to vertical-align or float
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    if node.tag == XHTML('font'):
        # <font> becomes <div> if it contains block level tags, else <span>
        tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    # Relative size: offset from 3, clamped to 1..7
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt'%font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A left floated, childless node with its own font size that holds a
    # single character (or two, the first in U+2000..U+206F) is a drop cap
    is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
        len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    # Font rescaling: express the size in em relative to the parent size.
    # Drop caps keep their original size.
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem'%(fsize/psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt'%fsize
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        if not is_drop_cap and style['line-height'] < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        # Drop filtered properties; a font-family whose first entry matches
        # the configured body font family is put back
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
                and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
                and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
                and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s'%(prop, edge)] = '0pt'
        if self.context.insert_blank_line:
            cssdict['margin-top'] = cssdict['margin-bottom'] = \
                '%fem'%self.context.insert_blank_line_size
        indent_size = self.context.remove_paragraph_spacing_indent_size
        # A negative indent size means existing indents are left alone
        keep_indents = indent_size < 0.0
        if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
            cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Identical declaration blocks share one generated class
            items = sorted(iteritems(cssdict))
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub('', classes.split()[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)
        node.attrib['class'] = ' '.join(keep_classes)

    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    # Recurse with the (possibly updated) font size as the new parent size
    for child in node:
        self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
def extract_css_into_flows(self):
    """Move all CSS into ``self.flows`` and re-point documents at the flows.

    Steps, in order:

    1. Every manifest item whose media type is in ``OEB_STYLES`` is appended
       to ``self.flows`` (condensed first unless passthrough/expand_css is
       requested) and its position is remembered by href.
    2. In every spine document, ``<link href>`` elements that resolve to one
       of those sheets are rewritten to a ``kindle:flow:...`` href, and every
       non-empty ``<style>`` element is replaced by a ``<link>`` element.
       Identical ``<style>`` texts share a single flow.
    3. ``@import`` rules inside the manifest sheets are rewritten the same
       way.
    4. Any flow that still exposes ``cssText`` is serialised to text.

    The method mutates ``self.flows`` and the parsed documents in place and
    returns ``None``.
    """
    inlines = defaultdict(list)  # Ensure identical <style>s not repeated
    sheets = {}  # stylesheet href -> index of that sheet in self.flows
    passthrough = getattr(self.opts, 'mobi_passthrough', False)

    def flow_href(index):
        # Single place that builds the href used to reference a flow.
        return 'kindle:flow:%s?mime=text/css' % to_ref(index)

    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            sheet = self.data(item)
            if not passthrough and not self.opts.expand_css and hasattr(
                    item.data, 'cssText'):
                condense_sheet(sheet)
            sheets[item.href] = len(self.flows)
            self.flows.append(sheet)

    def fix_import_rules(sheet, owner):
        # Rewrite @import rules of ``sheet`` that point at a known
        # stylesheet. ``owner`` is the manifest/spine item the sheet belongs
        # to; it is passed explicitly (instead of being picked up from the
        # enclosing loop variable) so relative hrefs are always resolved
        # against the right item. Returns True if any rule was changed.
        changed = False
        for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
            if rule.href:
                idx = sheets.get(owner.abshref(rule.href), None)
                if idx is not None:
                    rule.href = flow_href(idx)
                    changed = True
        return changed

    for item in self.oeb.spine:
        root = self.data(item)

        for link in XPath('//h:link[@href]')(root):
            idx = sheets.get(item.abshref(link.get('href')), None)
            if idx is not None:
                link.set('href', flow_href(idx))

        for tag in XPath('//h:style')(root):
            p = tag.getparent()
            idx = p.index(tag)
            raw = tag.text
            if not raw or not raw.strip():
                # Empty <style>: nothing worth keeping, just drop it.
                extract(tag)
                continue
            sheet = cssutils.parseString(raw, validate=False)
            if fix_import_rules(sheet, item):
                # Only re-serialise when something actually changed, so
                # untouched styles keep their original text.
                raw = force_unicode(sheet.cssText, 'utf-8')

            repl = etree.Element(XHTML('link'), type='text/css',
                                 rel='stylesheet')
            repl.tail = '\n'
            p.insert(idx, repl)
            extract(tag)
            # The href is filled in below, once the flow index is known.
            inlines[raw].append(repl)

    # ``.items()`` rather than the Python 2-only ``.iteritems()``, which
    # raises AttributeError on Python 3.
    for raw, elems in inlines.items():
        href = flow_href(len(self.flows))
        self.flows.append(raw)
        for link in elems:
            link.set('href', href)

    # Process @import rules of the stylesheets themselves
    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            sheet = self.data(item)
            if hasattr(sheet, 'cssRules'):
                fix_import_rules(sheet, item)

    # Serialise parsed sheets last so the rewritten @import rules above are
    # included in the stored text.
    for i, sheet in enumerate(tuple(self.flows)):
        if hasattr(sheet, 'cssText'):
            self.flows[i] = force_unicode(sheet.cssText, 'utf-8')
def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
    """Recursively convert ``elem`` and its descendants into MOBI markup.

    :param elem: element to convert. Elements whose tag is not a string or
        is not in the XHTML namespace are ignored.
    :param stylizer: object whose ``style(elem)`` returns the computed style
        for ``elem``.
    :param bstate: block-level state (current paragraph, pending vertical
        margin/padding, pending page break, ...). Mutated in place.
    :param istates: stack (list) of inline format states. A copy of the top
        entry is pushed for ``elem`` and popped again before returning.
    :param ignore_valign: when true, do not wrap vertically shifted inline
        content in ``<sup>``/``<sub>``; used by the recursive call that
        renders that content.
    :return: ``None``. Output is produced through ``mobimlize_content`` and
        direct tree manipulation.
    """
    if not isinstance(elem.tag, basestring) \
       or namespace(elem.tag) != XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
       or style['visibility'] == 'hidden':
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = XHTML('a')
        else:
            return
    tag = barename(elem.tag)

    # Every element gets its own format state, derived from its parent's.
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start']) - 1
        except Exception:
            # Non-numeric start attribute: keep numbering from the default.
            pass
    istates.append(istate)

    left = 0
    # Table display values are mapped to plain block/inline for the
    # block-vs-inline decision below.
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and
               style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        istate.indent = style['text-indent']
        if style['margin-left'] == 'auto' \
           and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            # Padding separates margins: fold the pending margin into the
            # padding total and start a fresh margin.
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        # Inline element outside a link: emulate horizontal margin/padding
        # with non-breaking spaces.
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = (u'\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + (u'\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + (u'\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True

    # Character-level formatting.
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = style['font-style'] == 'italic'
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    decoration = style.effective_text_decoration
    istate.strikethrough = decoration == 'line-through'
    istate.underline = decoration == 'underline'
    ff = style['font-family'].lower() if style['font-family'] else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'

    # Anchors and link target.
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']

    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(
                            round(float(value) / (72. / self.profile.dpi)))
                    except Exception:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            # At least one dimension is missing: derive it from the image
            # data itself.
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[urlnormalize(href)]
            except Exception:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify_data(item.data)[:2]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        # Only one dimension known: scale the other one to
                        # keep the aspect ratio. If the known dimension is
                        # not a plain integer (e.g. '100%'), fall back to
                        # the natural size.
                        ar = float(width) / float(height)
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height']) * ar
                            except Exception:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width']) / ar
                            except Exception:
                                pass
                            istate.attrib['height'] = str(int(height))
                item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0:
        prop = style['width'] / self.profile.width
        istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    # NOTE(review): ``display`` was normalised to 'inline'/'block' above for
    # every value starting with 'table', so the three branches below look
    # unreachable. They are kept as-is; confirm the intent before making
    # them reachable, since that would change the generated markup.
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'

    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'
    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        # Surround quotations with typographic double quotes.
        t = elem.text
        if not t:
            t = ''
        elem.text = u'\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = u'\u201d' + t

    text = None
    if elem.text:
        if istate.preserve:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and
              hasattr(elem[0].tag, 'rpartition') and
              elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            # Whitespace-only text directly before a non-inline first child
            # is dropped.
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)

    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top',
                              'text-bottom', 'top', 'bottom') or (
        isinstance(valign, (float, int)) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, (float, int)) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS \
            and not isblock:
        # Render the element into a scratch tree, then move the result
        # inside <sup>/<sub><small> in the real output.
        nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
        vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        # Drop the state pushed for this element above, but never leave the
        # stack empty.
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, XHTML(vtag))
            vtag = etree.SubElement(vtag, XHTML('small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
            return

    # Margins are always honoured inside <blockquote>; the option is
    # restored in the ``finally`` below so a failure while rendering the
    # children cannot leave ``self.opts`` permanently modified.
    is_blockquote = tag == 'blockquote'
    if is_blockquote:
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False
    try:
        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should
                # still be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
                len(elem) == 0)):
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)
    finally:
        if is_blockquote:
            self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == u'\xa0' and len(para) < 1:
            # Paragraph holding nothing but a single non-breaking space.
            if style.height > 2:
                para.getparent().replace(para, etree.Element(XHTML('br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    istates.pop()