def flatten_head(self, item, href, global_href):
    """Swap the stylesheet references of ``item`` for fresh ``<link>`` nodes.

    Every ``link``/``style`` node found in ``item.data`` whose type is
    listed in ``base.OEB_STYLES`` (and, for links, whose ``rel`` is
    ``stylesheet``) is detached from its parent. A stylesheet link to
    ``href`` is then appended to the document head, followed by one to
    ``global_href`` when that argument is truthy. Both targets are made
    relative through ``item.relhref``.
    """
    document = item.data
    head = document.find(base.tag('xhtml', 'head'))
    link_tag = base.tag('xhtml', 'link')
    style_tag = base.tag('xhtml', 'style')

    def safe_lower(value):
        # Attribute values that cannot be lower-cased are returned untouched.
        try:
            return value.lower()
        except Exception:
            return value

    for node in document.xpath(
            '//*[local-name()="style" or local-name()="link"]'):
        if node.tag == link_tag:
            removable = (
                safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet'
                and safe_lower(node.get('type', base.CSS_MIME))
                in base.OEB_STYLES)
        elif node.tag == style_tag:
            removable = node.get('type', base.CSS_MIME) in base.OEB_STYLES
        else:
            removable = False
        if removable:
            node.getparent().remove(node)

    targets = [href]
    if global_href:
        targets.append(global_href)
    for target in targets:
        link = etree.SubElement(head, link_tag, rel='stylesheet',
                                type=base.CSS_MIME,
                                href=item.relhref(target))
        link.tail = '\n'
def _clean_opf(self, opf):
    """Return a new OPF 2 ``package`` element built from ``opf``.

    Un-namespaced and OPF 1 elements are moved into the OPF 2 namespace,
    Dublin Core children of the metadata section are re-tagged into
    ``const.DC11_NS`` with lower-cased names, and the metadata, manifest,
    spine, tours and guide nodes are re-parented under the new root. The
    nodes are moved, not copied, so ``opf`` is modified in the process.
    """
    def opf2(name):
        return base.tag('opf', name)

    def dublin_core(name):
        return '{%s}%s' % (const.DC11_NS, name.lower())

    # Namespace declarations are gathered before any tag is rewritten.
    namespaces = {}
    for node in opf.iter(tag=etree.Element):
        namespaces.update(node.nsmap)

    for node in opf.iter(tag=etree.Element):
        if parse_utils.namespace(node.tag) not in ('', const.OPF1_NS):
            continue
        local_name = parse_utils.barename(node.tag)
        if ':' in local_name:
            # A literal prefix in the name is handled (or ignored) later.
            continue
        node.tag = opf2(local_name)

    namespaces.update(const.OPF2_NSMAP)
    package = etree.Element(opf2('package'), nsmap={None: const.OPF2_NS},
                            attrib=dict(opf.attrib))
    metadata = etree.SubElement(package, opf2('metadata'), nsmap=namespaces)

    # Legacy wrapper elements are dropped; their children are still visited
    # because the XPath below selects all descendants.
    skipped = (opf2('dc-metadata'), opf2('x-metadata'))
    for node in base.xpath(opf, 'o2:metadata//*'):
        if node.tag in skipped:
            continue
        if parse_utils.namespace(node.tag) in const.DC_NSES:
            node.tag = dublin_core(parse_utils.barename(node.tag))
        if node.tag.startswith('dc:'):
            node.tag = dublin_core(node.tag.partition(':')[-1])
        metadata.append(node)

    for node in base.xpath(opf, 'o2:metadata//o2:meta'):
        metadata.append(node)

    for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
        for node in base.xpath(opf, section):
            package.append(node)
    return package
def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
    """Append an ``<li>`` for ``toc`` to ``html_parent`` and recurse.

    The entry becomes ``<li><a href=...>title</a></li>``; when ``toc`` has
    children they are rendered into a nested ``<ul class="levelN">``.
    ``level`` and ``indent`` only drive the whitespace written into the
    ``text``/``tail`` of the generated nodes. ``container`` and
    ``toc_name`` come from the enclosing scope.
    """
    item = html_parent.makeelement(base.tag('xhtml', 'li'))
    item.tail = '\n' + indent * level
    html_parent.append(item)

    dest, fragment = toc.dest, toc.frag
    if dest:
        target = container.name_to_href(dest, toc_name)
        if fragment:
            target += '#' + fragment
    else:
        target = '#'

    anchor = item.makeelement(base.tag('xhtml', 'a'), href=target)
    anchor.text = toc.title
    item.append(anchor)

    if len(toc) == 0:
        return

    sublist = item.makeelement(base.tag('xhtml', 'ul'))
    sublist.set('class', 'level%d' % (style_level))
    item.append(sublist)
    anchor.tail = '\n\n' + indent * (level + 2)
    sublist.text = '\n' + indent * (level + 3)
    sublist.tail = '\n\n' + indent * (level + 1)
    for child in toc:
        process_node(sublist, child, level + 3, style_level=style_level + 1)
    # The last child closes the list, so it gets the shallower indent.
    sublist[-1].tail = '\n' + indent * (level + 2)
def get_nav_landmarks(container):
    """Yield the landmark entries of the container's navigation document.

    The document is located with ``find_existing_nav_toc``. For every
    ``<li>`` inside a ``<nav epub:type="landmarks">`` only the first
    ``<a>`` descendant is examined; it is reported when it has an ``href``
    that resolves to a name present in the container.

    Yields:
        dict with the keys ``dest`` (container name), ``frag`` (fragment
        part of the href, possibly empty), ``title`` (text content of the
        anchor) and ``type`` (the anchor's ``epub:type`` or ``''``).
    """
    nav = find_existing_nav_toc(container)
    if not (nav and container.has_name(nav)):
        return
    root = container.parsed(nav)
    # Qualified name of the epub:type attribute. Built with base.tag() like
    # every other qualified name here; `base` itself is not called.
    et = base.tag('epub', 'type')
    for elem in root.iterdescendants(base.tag('xhtml', 'nav')):
        if elem.get(et) != 'landmarks':
            continue
        for li in elem.iterdescendants(base.tag('xhtml', 'li')):
            for a in li.iterdescendants(base.tag('xhtml', 'a')):
                href, rtype = a.get('href'), a.get(et)
                if href:
                    title = etree.tostring(a, method='text',
                                           encoding='unicode',
                                           with_tail=False).strip()
                    href, frag = href.partition('#')[::2]
                    name = container.href_to_name(href, nav)
                    if container.has_name(name):
                        yield {
                            'dest': name,
                            'frag': frag,
                            'title': title or '',
                            'type': rtype or ''
                        }
                # Only the first anchor of each list item is considered.
                break
def set_authors(root, prefixes, refines, authors):
    """Replace the author ``dc:creator`` entries of the metadata section.

    Existing creators are removed unless they carry a role other than
    ``aut`` (either through the ``opf:role`` attribute or through a refining
    ``role`` property). For every author with a non-empty ``name`` a new
    ``dc:creator`` is appended together with a ``role`` refinement and,
    when ``author.sort`` is set, a ``file-as`` refinement.
    """
    ensure_prefix(root, prefixes, 'marc')

    for creator in XPath('./opf:metadata/dc:creator')(root):
        props = properties_for_id_with_scheme(creator.get('id'), prefixes,
                                              refines)
        legacy_role = creator.get(oeb_base.tag('opf', 'role'))
        if legacy_role and legacy_role.lower() != 'aut':
            continue
        if props.get('role') and not is_relators_role(props, 'aut'):
            continue
        remove_element(creator, refines)

    metadata = XPath('./opf:metadata')(root)[0]

    def add_refinement(attributes, text):
        meta = metadata.makeelement(oeb_base.tag('opf', 'meta'),
                                    attrib=attributes)
        meta.text = text
        metadata.append(meta)

    for author in authors:
        if not author.name:
            continue
        creator = metadata.makeelement(oeb_base.tag('dc', 'creator'))
        creator_id = ensure_id(creator)
        creator.text = author.name
        metadata.append(creator)
        add_refinement({'refines': '#' + creator_id,
                        'property': 'role',
                        'scheme': 'marc:relators'}, 'aut')
        if author.sort:
            add_refinement({'refines': '#' + creator_id,
                            'property': 'file-as'}, author.sort)
def set_guide_item(container, item_type, title, name, frag=None):
    """Create, update or remove the guide reference of type ``item_type``.

    When ``name`` is given, every ``opf:guide`` ends up with a reference of
    that type pointing at ``name`` (plus ``#frag``), creating the guide
    and/or the reference when missing. When ``name`` is falsy, existing
    references of that type are removed instead. Types are compared
    case-insensitively. The OPF is marked dirty in every case.
    """
    opf_ns = {'opf': const.OPF2_NS}
    ref_tag = base.tag('opf', 'reference')

    if name:
        href = container.name_to_href(name, container.opf_name)
        if frag:
            href += '#' + frag
    else:
        href = None

    guides = container.opf_xpath('//opf:guide')
    if href and not guides:
        new_guide = container.opf.makeelement(base.tag('opf', 'guide'),
                                              nsmap=opf_ns)
        container.insert_into_xml(container.opf, new_guide)
        guides = [new_guide]

    wanted = item_type.lower()
    for guide in guides:
        references = [
            child for child in guide.iterchildren(etree.Element)
            if child.tag == ref_tag and child.get('type', '').lower() == wanted
        ]
        if href and not references:
            created = guide.makeelement(ref_tag, type=item_type, nsmap=opf_ns)
            container.insert_into_xml(guide, created)
            references.append(created)
        for reference in references:
            if not href:
                container.remove_from_xml(reference)
                continue
            reference.set('title', title)
            reference.set('href', href)
            reference.set('type', item_type)

    container.dirty(container.opf_name)
def map_resources(self, oeb_book):
    """Assign output names to images and link ids to spine documents.

    Each manifest image not yet in ``self.images`` is given a name made of
    the current image count plus the original extension, zero-padded to ten
    characters. Each spine item is registered through ``self.get_link_id``,
    and so is every link found in its body that points at an href listed
    in ``self.base_hrefs``.
    """
    # Loop-invariant, so it is built once instead of once per spine item.
    link_attrs = set(html.defs.link_attrs)
    link_attrs.add(base.tag('xlink', 'href'))

    for item in oeb_book.manifest:
        if (item.media_type in base.OEB_IMAGES
                and item.href not in self.images):
            ext = os.path.splitext(item.href)[1]
            fname = '%s%s' % (len(self.images), ext)
            self.images[item.href] = fname.zfill(10)

        if item not in oeb_book.spine:
            continue
        self.get_link_id(item.href)
        root = item.data.find(base.tag('xhtml', 'body'))
        if root is None:
            # No <body> to scan; the document itself is already registered.
            continue
        for el in root.iter():
            attribs = el.attrib
            try:
                # Skip nodes whose tag is not a plain name.
                if not isinstance(el.tag, (str, bytes)):
                    continue
            except Exception:
                # Best effort: nodes that cannot be inspected are skipped,
                # without swallowing KeyboardInterrupt/SystemExit.
                continue
            for attr in attribs:
                if attr not in link_attrs:
                    continue
                href = item.abshref(attribs[attr])
                href, fragment = urllib.parse.urldefrag(href)
                if href in self.base_hrefs:
                    self.get_link_id(href, fragment)
def upshift_markup(self):  # {{{
    """Upgrade markup to comply with XHTML 1.1 where possible.

    For every spine document: copy ``lang`` to ``xml:lang`` when the latter
    is unset, rename ``<u>`` elements to ``span`` and drop ``id``/``name``
    attributes whose value was already seen earlier in the same document.
    Documents without a usable ``<body>`` get only the language fix.
    """
    xml_lang = base.tag('xml', 'lang')
    for spine_item in self.oeb.spine:
        root = spine_item.data
        if not root.get(xml_lang) and root.get('lang'):
            root.set(xml_lang, root.get('lang'))

        bodies = base.XPath('//h:body')(root)
        body = bodies[0] if bodies else bodies
        if not hasattr(body, 'xpath'):
            continue

        for underline in base.XPath('//h:u')(root):
            underline.tag = 'span'

        # Values seen so far, tracked separately for each attribute.
        seen = {'id': set(), 'name': set()}
        for node in base.XPath('//*[@id or @name]')(root):
            for attr in ('id', 'name'):
                value = node.get(attr, None)
                if not value:
                    continue
                if value in seen[attr]:
                    del node.attrib[attr]
                else:
                    seen[attr].add(value)
def linearize_jacket(oeb):
    """Turn the table markup of the jacket page into ``div``/``span``.

    Only the first four spine items are inspected. In the first one that
    matches ``JACKET_XPATH``, ``table``/``tr``/``th`` elements become
    ``div`` and ``td`` elements become ``span``; the search then stops.
    """
    replacements = (
        ('//h:table|//h:tr|//h:th', 'div'),
        ('//h:td', 'span'),
    )
    for spine_item in oeb.spine[:4]:
        if not XPath(JACKET_XPATH)(spine_item.data):
            continue
        for expression, new_name in replacements:
            for node in XPath(expression)(spine_item.data):
                node.tag = base.tag('xhtml', new_name)
        break
def add_toc_level(self, elem, toc):
    """Render ``toc`` recursively as nested ``div`` blocks under ``elem``.

    Each entry becomes a ``<div class="calibre_toc_block">`` holding an
    ``<a class="calibre_toc_line">`` with the entry's href and title; the
    entry's children are rendered inside that same block.
    """
    block_tag = base.tag('xhtml', 'div')
    line_tag = base.tag('xhtml', 'a')
    for entry in toc:
        block = base.element(elem, block_tag,
                             attrib={'class': 'calibre_toc_block'})
        link_attributes = {'href': entry.href, 'class': 'calibre_toc_line'}
        link = base.element(block, line_tag, attrib=link_attributes)
        link.text = entry.title
        self.add_toc_level(block, entry)
def create_li(ol, entry):
    """Append ``<li><a href=...></a></li>`` to ``ol`` and return the anchor.

    The href is derived from ``entry['dest']`` (relative to ``tocname``,
    taken from the enclosing scope) with ``entry['frag']`` appended as a
    fragment when it is truthy. The anchor text is left for the caller.
    """
    list_item = ol.makeelement(base.tag('xhtml', 'li'))
    ol.append(list_item)
    anchor = list_item.makeelement(base.tag('xhtml', 'a'))
    list_item.append(anchor)

    target = container.name_to_href(entry['dest'], tocname)
    fragment = entry['frag']
    if fragment:
        target = target + '#' + fragment
    anchor.set('href', target)
    return anchor
def mobimlize_spine(self):
    """Iterate over the spine and convert it to MOBIML.

    Each spine item's ``data`` is replaced by a freshly built
    ``html``/``body`` tree that ``self.mobimlize_elem`` fills from the
    original body.
    """
    html_tag = base.tag('xhtml', 'html')
    body_tag = base.tag('xhtml', 'body')
    for spine_item in self.oeb.spine:
        source = spine_item.data
        stylizer = Stylizer(source, spine_item.href, self.oeb, self.opts,
                            self.profile)
        source_body = source.find(body_tag)
        new_root = etree.Element(html_tag, nsmap=MOBI_NSMAP)
        new_body = etree.SubElement(new_root, body_tag)
        # Recorded before converting; set on self for the conversion step.
        self.current_spine_item = spine_item
        self.mobimlize_elem(source_body, stylizer, BlockState(new_body),
                            [FormatState()])
        spine_item.data = new_root
def detect_chapters(self):
    """Find chapter elements in the spine and mark them in the markup.

    ``self.detected_chapters`` is reset and, when ``self.opts.chapter`` is
    set, filled with ``(spine item, element)`` pairs matching that
    expression. Depending on ``self.opts.chapter_mark`` a rule, a
    page-break ``div`` or a page-breaking rule is inserted before each
    detected element; ``'none'`` inserts nothing.
    """
    self.detected_chapters = []
    self.chapter_title_attribute = None

    def find_matches(expr, doc):
        # An unusable expression is reported and treated as "no matches".
        try:
            found = XPath(expr)(doc)
            len(found)
        except Exception:
            self.log.warn('Invalid chapter expression, ignoring: %s' % expr)
            return []
        return found

    if not self.opts.chapter:
        return

    chapter_path, title_attribute = (
        self.get_toc_parts_for_xpath(self.opts.chapter))
    self.chapter_title_attribute = title_attribute
    for spine_item in self.oeb.spine:
        for match in find_matches(chapter_path, spine_item.data):
            self.detected_chapters.append((spine_item, match))

    chapter_mark = self.opts.chapter_mark
    page_break_before = 'display: block; page-break-before: always'
    page_break_after = 'display: block; page-break-after: always'

    def build_mark(elem, seen_in_item):
        """Return the marker element for ``elem``, or None to skip it."""
        if chapter_mark == 'none':
            return None
        if chapter_mark == 'rule':
            return elem.makeelement(base.tag('xhtml', 'hr'))
        if chapter_mark == 'pagebreak':
            if seen_in_item < 3 and at_start(elem):
                # For the first two elements in this item, check if
                # they are at the start of the file, in which case
                # inserting a page break is unnecessary and can lead
                # to extra blank pages in the PDF Output plugin. We
                # need to use two as feedbooks epubs match both a
                # heading tag and its containing div with the default
                # chapter expression.
                return None
            return elem.makeelement(base.tag('xhtml', 'div'),
                                    style=page_break_after)
        # chapter_mark == 'both'
        return elem.makeelement(base.tag('xhtml', 'hr'),
                                style=page_break_before)

    per_item = collections.Counter()
    for spine_item, elem in self.detected_chapters:
        per_item[spine_item] += 1
        text = re.sub(r'\s+', ' ', base.xml2text(elem).strip())
        self.log('\tDetected chapter:', text[:50])
        mark = build_mark(elem, per_item[spine_item])
        if mark is None:
            continue
        try:
            elem.addprevious(mark)
        except TypeError:
            self.log.exception('Failed to mark chapter')
def create_manifest_item(root, href_template, id_template, media_type=None):
    """Append a new ``opf:item`` to the manifest of ``root`` and return it.

    The href and id are made unique against every ``href``/``id`` attribute
    found in the document via ``ensure_unique``. When ``media_type`` is
    falsy it is guessed from ``href_template``. Returns ``None`` when
    ``root`` has no direct ``opf:manifest`` child.
    """
    known_ids = frozenset(root.xpath('//*/@id'))
    known_hrefs = frozenset(root.xpath('//*/@href'))
    unique_href = ensure_unique(href_template, known_hrefs)
    unique_id = ensure_unique(id_template, known_ids)

    manifest = root.find(base.tag('opf', 'manifest'))
    if manifest is None:
        return None

    entry = manifest.makeelement(base.tag('opf', 'item'))
    entry.set('href', unique_href)
    entry.set('id', unique_id)
    entry.set('media-type', media_type or guess_type(href_template))
    manifest.append(entry)
    return entry
def mangle_spine(self):
    """Add the case-mangling stylesheet and mangle every spine document.

    A ``manglecase.css`` manifest entry holding ``CASE_MANGLER_CSS`` is
    created, linked from the head of each spine document, and each
    document body is then passed to ``self.mangle_elem``.
    """
    manifest = self.oeb.manifest
    css_id, css_href = manifest.generate('manglecase', 'manglecase.css')
    manifest.add(css_id, css_href, base.CSS_MIME, data=CASE_MANGLER_CSS)

    for spine_item in self.oeb.spine:
        document = spine_item.data
        relative_css = spine_item.relhref(css_href)
        head = document.find(base.tag('xhtml', 'head'))
        etree.SubElement(head, base.tag('xhtml', 'link'), rel='stylesheet',
                         href=relative_css, type=base.CSS_MIME)
        stylizer = Stylizer(document, spine_item.href, self.oeb, self.opts,
                            self.profile)
        body = document.find(base.tag('xhtml', 'body'))
        self.mangle_elem(body, stylizer)
def add_from_li(container, li, parent, nav_name):
    """Add a TOC entry to ``parent`` from the first ``a``/``span`` of ``li``.

    The title is the text content of that child, falling back to the joined
    ``title`` attributes found on it and its descendants. The destination
    and fragment are taken from its ``href`` when present; a pure fragment
    href points at ``nav_name`` itself. Empty values are passed to
    ``parent.add`` as ``None``, and its result is returned.
    """
    dest = frag = text = None
    candidates = li.iterchildren(base.tag('xhtml', 'a'),
                                 base.tag('xhtml', 'span'))
    for node in candidates:
        text = etree.tostring(node, method='text', encoding='unicode',
                              with_tail=False).strip()
        if not text:
            titles = node.xpath('descendant-or-self::*/@title')
            text = ' '.join(titles).strip()
        href = node.get('href')
        if href:
            if href.startswith('#'):
                dest = nav_name
            else:
                dest = container.href_to_name(href, base=nav_name)
            frag = urllib.parse.urlparse(href).fragment or None
        # Only the first matching child is used.
        break
    return parent.add(text or None, dest or None, frag or None)
def set_identifiers(root, prefixes, refines, new_identifiers,
                    force_identifiers=False):
    """Write ``new_identifiers`` into the metadata as ``dc:identifier``.

    The package identifier (the element whose ``id`` equals the root's
    ``unique-identifier``) is never touched. Other existing identifiers are
    removed when they are empty, cannot be parsed into scheme and value,
    use a scheme present in ``new_identifiers``, or when
    ``force_identifiers`` is true. Each new identifier is written as
    ``scheme:value`` and inserted before the package identifier, or
    appended to the metadata when there is none.

    Args:
        root: OPF package element.
        prefixes: Unused here; kept for the shared setter signature.
        refines: Refinement map passed to ``parse_identifier`` and
            ``remove_element``.
        new_identifiers: Mapping of scheme to value.
        force_identifiers: Remove every non-package identifier first.
    """
    uid = root.get('unique-identifier')
    package_identifier = None
    for ident in XPath('./opf:metadata/dc:identifier')(root):
        if uid is not None and uid == ident.get('id'):
            package_identifier = ident
            continue
        val = (ident.text or '').strip()
        if not val:
            ident.getparent().remove(ident)
            continue
        scheme, val = parse_identifier(ident, val, refines)
        if (not scheme or not val or force_identifiers
                or scheme in new_identifiers):
            remove_element(ident, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    for scheme, val in new_identifiers.items():
        # Must be dc:identifier so the XPath above finds it again on the
        # next read or update.
        ident = metadata.makeelement(oeb_base.tag('dc', 'identifier'))
        ident.text = '%s:%s' % (scheme, val)
        if package_identifier is None:
            metadata.append(ident)
        else:
            p = package_identifier.getparent()
            p.insert(p.index(package_identifier), ident)
def create_rating(root, prefixes, val):
    """Append a ``calibre:rating`` meta element with text ``val``.

    The ``calibre`` prefix is registered on ``root`` first; the element is
    added to the first ``opf:metadata`` child of ``root``.
    """
    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    metadata = XPath('./opf:metadata')(root)[0]
    rating = metadata.makeelement(oeb_base.tag('opf', 'meta'),
                                  attrib={'property': 'calibre:rating'})
    rating.text = val
    metadata.append(rating)
def read_authors(root, prefixes, refines):
    """Return the de-duplicated authors listed in the metadata section.

    Creators explicitly marked with the ``aut`` role (through a refining
    ``role`` property or the ``opf:role`` attribute) are preferred. When
    there are none, the creators that carry no role at all are returned
    instead. Creators with empty text are ignored.
    """
    roled_authors, unroled_authors = [], []

    def author(item, props, val):
        """Build an ``Author`` from a creator, its refinements and name."""
        file_as = props.get('file-as')
        if file_as:
            aus = file_as[0][-1]
        else:
            # Legacy attribute form, spelled with a hyphen like the
            # refining property above.
            aus = item.get(oeb_base.tag('opf', 'file-as')) or None
        return Author(normalize_whitespace(val), normalize_whitespace(aus))

    for item in XPath('./opf:metadata/dc:creator')(root):
        val = (item.text or '').strip()
        if not val:
            continue
        props = properties_for_id_with_scheme(item.get('id'), prefixes,
                                              refines)
        role = props.get('role')
        opf_role = item.get(oeb_base.tag('opf', 'role'))
        if role:
            # A refining role takes precedence over the opf:role attribute.
            if is_relators_role(props, 'aut'):
                roled_authors.append(author(item, props, val))
        elif opf_role:
            if opf_role.lower() == 'aut':
                roled_authors.append(author(item, props, val))
        else:
            unroled_authors.append(author(item, props, val))
    return uniq(roled_authors or unroled_authors)
def author(item, props, val):
    """Build an ``Author`` from a creator element, its refinements and name.

    The sort string comes from the last field of the first ``file-as``
    refinement in ``props`` when there is one, otherwise from the element's
    ``opf:file-as`` attribute. Both values are whitespace-normalized.
    """
    file_as = props.get('file-as')
    if file_as:
        aus = file_as[0][-1]
    else:
        # Legacy attribute form, spelled with a hyphen like the refining
        # property above.
        aus = item.get(oeb_base.tag('opf', 'file-as')) or None
    return Author(normalize_whitespace(val), normalize_whitespace(aus))
def create_series(root, refines, series, series_index):
    """Append a ``belongs-to-collection`` meta element for ``series``.

    The element is refined with ``collection-type`` set to ``series`` and
    ``group-position`` set to ``series_index``.
    """
    metadata = XPath('./opf:metadata')(root)[0]
    collection = metadata.makeelement(
        oeb_base.tag('opf', 'meta'),
        attrib={'property': 'belongs-to-collection'})
    collection.text = series
    metadata.append(collection)
    refinements = (
        refdef('collection-type', 'series'),
        refdef('group-position', series_index),
    )
    set_refines(collection, refines, *refinements)
def set_pubdate(root, prefixes, refines, val):
    """Replace every ``dc:date`` of the metadata with ``val``.

    Existing dates are always removed. A new one, formatted with
    ``isoformat``, is added unless ``is_date_undefined(val)`` is true.
    """
    for existing in XPath('./opf:metadata/dc:date')(root):
        remove_element(existing, refines)

    if is_date_undefined(val):
        return

    formatted = isoformat(val)
    metadata = XPath('./opf:metadata')(root)[0]
    date = metadata.makeelement(oeb_base.tag('dc', 'date'))
    date.text = formatted
    metadata.append(date)
def process_node(xml_parent, toc_parent):
    """Write the children of ``toc_parent`` as NCX ``navPoint`` elements.

    Each child gets a ``navPoint`` numbered from the shared ``play_order``
    counter, a ``navLabel``/``text`` with its title (runs matched by
    ``spat`` replaced with a space) and, when it has a destination, a
    ``content`` element. Children are processed recursively.
    ``play_order``, ``spat`` and ``to_href`` come from the enclosing scope.
    """
    for entry in toc_parent:
        play_order['c'] += 1
        sequence = play_order['c']
        nav_point = etree.SubElement(xml_parent,
                                     base.tag('ncx', 'navPoint'),
                                     id='num_%d' % sequence,
                                     playOrder=str(sequence))
        nav_label = etree.SubElement(nav_point, base.tag('ncx', 'navLabel'))

        caption = entry.title
        if caption:
            caption = spat.sub(' ', caption)
        text_node = etree.SubElement(nav_label, base.tag('ncx', 'text'))
        text_node.text = caption

        if entry.dest:
            source = to_href(entry.dest)
            if entry.frag:
                source += '#' + entry.frag
            etree.SubElement(nav_point, base.tag('ncx', 'content'),
                             src=source)

        process_node(nav_point, entry)
def set_comments(root, prefixes, refines, val):
    """Replace every ``dc:description`` of the metadata with ``val``.

    Existing descriptions are always removed. A new one is added only when
    ``val`` is non-empty after stripping surrounding whitespace.

    Args:
        root: OPF package element.
        prefixes: Unused here; kept for the shared setter signature.
        refines: Refinement map passed to ``remove_element``.
        val: New description text, or a falsy value to clear it.
    """
    for dc in XPath('./opf:metadata/dc:description')(root):
        remove_element(dc, refines)
    m = XPath('./opf:metadata')(root)[0]
    if val:
        val = val.strip()
        if val:
            # Must be dc:description so the XPath above finds it again on
            # the next read or update.
            c = m.makeelement(oeb_base.tag('dc', 'description'))
            c.text = val
            m.append(c)
def convert_metadata(self, oeb):
    """Populate ``self.mi`` from the metadata of ``oeb``.

    The metadata is serialized into a temporary OPF 2.0 ``package`` element
    and parsed back through ``opf_meta.OPF``, without populating the spine
    or guessing a cover.
    """
    package = etree.Element(base.tag('opf', 'package'),
                            attrib={'version': '2.0'},
                            nsmap={None: const.OPF2_NS})
    oeb.metadata.to_opf2(package)
    serialized = etree.tostring(package, encoding='utf-8')
    reader = opf_meta.OPF(io.BytesIO(serialized), populate_spine=False,
                          try_to_guess_cover=False)
    self.mi = reader.to_book_metadata()
def set_publisher(root, prefixes, refines, val):
    """Replace every ``dc:publisher`` of the metadata with ``val``.

    Existing publishers are always removed. A new one, with its whitespace
    normalized, is added only when ``val`` is non-empty after stripping.
    """
    for existing in XPath('./opf:metadata/dc:publisher')(root):
        remove_element(existing, refines)

    metadata = XPath('./opf:metadata')(root)[0]
    cleaned = val.strip() if val else val
    if not cleaned:
        return

    publisher = metadata.makeelement(oeb_base.tag('dc', 'publisher'))
    publisher.text = normalize_whitespace(cleaned)
    metadata.append(publisher)
def process_node(xml_parent, toc_parent):
    """Write the children of ``toc_parent`` as ``<li>`` items.

    An entry with a destination becomes ``<a href=...>``; one without
    becomes ``<span>``. The title has runs matched by ``spat`` replaced
    with a space and is stripped. Entries with children get a nested
    ``<ol>`` that is filled recursively. ``spat`` and ``to_href`` come from
    the enclosing scope.
    """
    for entry in toc_parent:
        list_item = xml_parent.makeelement(base.tag('xhtml', 'li'))
        xml_parent.append(list_item)

        caption = spat.sub(' ', entry.title or '').strip()
        label_name = 'a' if entry.dest else 'span'
        label = list_item.makeelement(base.tag('xhtml', label_name))
        label.text = caption
        list_item.append(label)

        if entry.dest:
            target = to_href(entry.dest)
            if entry.frag:
                target += '#' + entry.frag
            label.set('href', target)

        if len(entry):
            sublist = list_item.makeelement(base.tag('xhtml', 'ol'))
            list_item.append(sublist)
            process_node(sublist, entry)
def create_timestamp(root, prefixes, m, val):
    """Append a ``calibre:timestamp`` meta element to ``m``.

    Nothing is done when ``is_date_undefined(val)`` is true. Otherwise the
    ``calibre`` and ``dcterms`` prefixes are registered on ``root`` and the
    value is written in the format produced by ``w3cdtf``.
    """
    if is_date_undefined(val):
        return

    ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX)
    ensure_prefix(root, prefixes, 'dcterms')
    formatted = w3cdtf(val)
    attributes = {'property': 'calibre:timestamp',
                  'scheme': 'dcterms:W3CDTF'}
    timestamp = m.makeelement(oeb_base.tag('opf', 'meta'), attrib=attributes)
    timestamp.text = formatted
    m.append(timestamp)
def ensure_single_nav_of_type(root, ntype='toc'):
    """Return the single, emptied ``<nav>`` of ``epub:type`` ``ntype``.

    When several such navs exist, all but the first are extracted from the
    tree. The surviving nav is cleared of its content while keeping its
    attributes and tail. When none exists, a new nav is appended to the
    element returned by ``first_child(root, <xhtml body tag>)``.

    Args:
        root: Root element of the navigation document.
        ntype: Value of the ``epub:type`` attribute to look for / set.

    Returns:
        The nav element, with ``epub:type`` set to ``ntype``.
    """
    # Qualified name of the epub:type attribute. Built with base.tag() like
    # every other qualified name here; `base` itself is not called.
    et = base.tag('epub', 'type')
    navs = [
        n for n in root.iterdescendants(base.tag('xhtml', 'nav'))
        if n.get(et) == ntype
    ]
    for x in navs[1:]:
        extract(x)
    if navs:
        nav = navs[0]
        # clear() also drops attributes and tail, so they are restored.
        tail = nav.tail
        attrib = dict(nav.attrib)
        nav.clear()
        nav.attrib.update(attrib)
        nav.tail = tail
    else:
        nav = root.makeelement(base.tag('xhtml', 'nav'))
        first_child(root, base.tag('xhtml', 'body')).append(nav)
    nav.set(et, ntype)
    return nav
def dataize_svg(self, item, svg=None):
    """Point the ``xlink:href`` links of an SVG at temporary files.

    For every SVG element whose link resolves to a manifest entry, the
    entry's ``bytes_representation`` is written to a persistent temporary
    file (recorded in ``self.temp_files``) and the link is rewritten to
    that file's name. Links with an empty path or an unknown target are
    left alone. ``svg`` defaults to ``item.data`` and is returned.
    """
    href_attr = base.tag('xlink', 'href')
    svg = item.data if svg is None else svg
    manifest_hrefs = self.oeb.manifest.hrefs

    for node in xpath(svg, '//svg:*[@xl:href]'):
        link = urlnormalize(node.attrib[href_attr])
        target = urllib.parse.urldefrag(link)[0]
        if not target:
            continue
        resolved = item.abshref(target)
        if resolved not in manifest_hrefs:
            continue

        payload = manifest_hrefs[resolved].bytes_representation
        # Fall back to 'jpg' when the image type is not recognized.
        extension = what(None, payload) or 'jpg'
        with PersistentTemporaryFile(suffix='.' + extension) as tmp:
            tmp.write(payload)
            self.temp_files.append(tmp.name)
        node.attrib[href_attr] = tmp.name
    return svg