def get(x, single=True):
    ans = m[x]
    if single:
        ans = clean_xml_chars(ans[0]) if ans else ''
    else:
        ans = [clean_xml_chars(y) for y in ans]
    return ans

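# Every snippet in this section leans on calibre's clean_xml_chars. As a point
# of reference, here is a minimal, hypothetical sketch of the behaviour assumed
# throughout (the real calibre.utils.cleantext implementation has a C speedup
# and may differ in detail): drop every code point that is not legal in an
# XML 1.0 document.
import re

_XML_ILLEGAL = re.compile(
    '[^\x09\x0a\x0d\x20-\ud7ff\ue000-\ufffd\U00010000-\U0010ffff]')

def clean_xml_chars_sketch(unicode_string):
    return _XML_ILLEGAL.sub('', unicode_string)
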
def navpoint(parent, np):
    text = np.text
    if not text:
        text = ''
    c[1] += 1
    item_id = 'num_%d' % c[1]
    text = clean_xml_chars(text)
    elem = E.navPoint(
        E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
        E.content(src=unicode_type(np.href) + (('#' + unicode_type(np.fragment)) if np.fragment else '')),
        id=item_id,
        playOrder=str(np.play_order)
    )
    au = getattr(np, 'author', None)
    if au:
        au = re.sub(r'\s+', ' ', au)
        elem.append(C.meta(au, name='author'))
    desc = getattr(np, 'description', None)
    if desc:
        desc = re.sub(r'\s+', ' ', desc)
        try:
            elem.append(C.meta(desc, name='description'))
        except ValueError:
            elem.append(C.meta(clean_xml_chars(desc), name='description'))
    idx = getattr(np, 'toc_thumbnail', None)
    if idx:
        elem.append(C.meta(idx, name='toc_thumbnail'))
    parent.append(elem)
    for np2 in np:
        navpoint(elem, np2)

def navpoint(parent, np):
    text = np.text
    if not text:
        text = ''
    c[1] += 1
    item_id = 'num_%d' % c[1]
    text = clean_xml_chars(text)
    elem = E.navPoint(
        E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
        E.content(src=unicode_type(np.href) + (
            ('#' + unicode_type(np.fragment)) if np.fragment else '')),
        id=item_id,
        playOrder=unicode_type(np.play_order))
    au = getattr(np, 'author', None)
    if au:
        au = re.sub(r'\s+', ' ', au)
        elem.append(C.meta(au, name='author'))
    desc = getattr(np, 'description', None)
    if desc:
        desc = re.sub(r'\s+', ' ', desc)
        try:
            elem.append(C.meta(desc, name='description'))
        except ValueError:
            elem.append(
                C.meta(clean_xml_chars(desc), name='description'))
    idx = getattr(np, 'toc_thumbnail', None)
    if idx:
        elem.append(C.meta(idx, name='toc_thumbnail'))
    parent.append(elem)
    for np2 in np:
        navpoint(elem, np2)

def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        self.mi.authors.append(au)
        if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
            self.mi.author_sort = au.strip()
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([x.strip() for x in
            clean_xml_chars(self.decode(content)).split(';')])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None

def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
    from calibre.utils.cleantext import clean_xml_chars
    feed = feeds[f]
    head = HEAD(TITLE(feed.title))
    if style:
        head.append(STYLE(style, type='text/css'))
    if extra_css:
        head.append(STYLE(extra_css, type='text/css'))
    body = BODY()
    body.append(self.get_navbar(f, feeds))
    div = DIV(
        H2(feed.title, CLASS('calibre_feed_title', 'calibre_rescale_160')),
        CLASS('calibre_rescale_100')
    )
    body.append(div)
    if getattr(feed, 'image', None):
        div.append(DIV(IMG(
            alt=feed.image_alt if feed.image_alt else '',
            src=feed.image_url
        ), CLASS('calibre_feed_image')))
    if getattr(feed, 'description', None):
        d = DIV(clean_xml_chars(feed.description),
                CLASS('calibre_feed_description', 'calibre_rescale_80'))
        d.append(BR())
        div.append(d)
    ul = UL(CLASS('calibre_article_list'))
    for i, article in enumerate(feed.articles):
        if not getattr(article, 'downloaded', False):
            continue
        li = LI(
            A(article.title, CLASS('article calibre_rescale_120',
                                   href=article.url)),
            SPAN(article.formatted_date, CLASS('article_date')),
            CLASS('calibre_rescale_100', id='article_%d' % i,
                  style='padding-bottom:0.5em')
        )
        if article.summary:
            li.append(DIV(clean_xml_chars(cutoff(article.text_summary)),
                          CLASS('article_description', 'calibre_rescale_70')))
        ul.append(li)
    div.append(ul)
    div.append(self.get_navbar(f, feeds, top=False))
    self.root = HTML(head, body)
    if self.html_lang:
        self.root.set('lang', self.html_lang)

def navpoint(parent, np):
    text = np.text
    if not text:
        text = ""
    c[1] += 1
    item_id = "num_%d" % c[1]
    text = clean_xml_chars(text)
    elem = E.navPoint(
        E.navLabel(E.text(re.sub(r"\s+", " ", text))),
        E.content(src=unicode(np.href) + (("#" + unicode(np.fragment)) if np.fragment else "")),
        id=item_id,
        playOrder=str(np.play_order),
    )
    au = getattr(np, "author", None)
    if au:
        au = re.sub(r"\s+", " ", au)
        elem.append(C.meta(au, name="author"))
    desc = getattr(np, "description", None)
    if desc:
        desc = re.sub(r"\s+", " ", desc)
        elem.append(C.meta(desc, name="description"))
    idx = getattr(np, "toc_thumbnail", None)
    if idx:
        elem.append(C.meta(idx, name="toc_thumbnail"))
    parent.append(elem)
    for np2 in np:
        navpoint(elem, np2)

def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            # Walk direct children only; recursing while iterating all
            # descendants would add every nested entry more than once
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    page = child.get('page', '1')
                    toc.add(child.text, 'index.html', page)
                    count[0] += 1

        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False,
                                       encoding='utf-8', xml_declaration=True))

def _read_opf(self):
    data = self.oeb.container.read(None)
    data = self.oeb.decode(data)
    data = XMLDECL_RE.sub('', data)
    data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                  OPF1_NS, data)
    try:
        opf = etree.fromstring(data)
    except etree.XMLSyntaxError:
        data = xml_replace_entities(clean_xml_chars(data), encoding=None)
        try:
            opf = etree.fromstring(data)
            self.logger.warn('OPF contains invalid HTML named entities')
        except etree.XMLSyntaxError:
            data = re.sub(r'(?is)<tours>.+</tours>', '', data)
            data = data.replace('<dc-metadata>',
                '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid tours section')
            except etree.XMLSyntaxError:
                from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                opf = etree.fromstring(data, parser=RECOVER_PARSER)
                self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
    ns = namespace(opf.tag)
    if ns not in ('', OPF1_NS, OPF2_NS):
        raise OEBError('Invalid namespace %r for OPF document' % ns)
    opf = self._clean_opf(opf)
    return opf

def html(self):
    raw = original_html = self.toHtml()
    check = self.toPlainText().strip()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    raw = self.comments_pat.sub('', raw)
    if not check and '<img' not in raw.lower():
        return ''
    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        root = parse(clean_xml_chars(raw), maybe_xhtml=False, sanitize_names=True)
    if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
        # Bypass cleanup if special meta tag exists
        return original_html
    try:
        cleanup_qt_markup(root)
    except Exception:
        import traceback
        traceback.print_exc()
    elems = []
    for body in root.xpath('//body'):
        if body.text:
            elems.append(body.text)
        elems += [html.tostring(x, encoding='unicode') for x in body
                  if x.tag not in ('script', 'style')]
    if len(elems) > 1:
        ans = '<div>%s</div>' % (''.join(elems))
    else:
        ans = ''.join(elems)
    if not ans.startswith('<'):
        ans = '<p>%s</p>' % ans
    return xml_replace_entities(ans)

def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
                line_numbers=True, linenumber_attribute=None,
                replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{{{}}}{}'.format(XHTML_NS, 'html') or root.prefix)):
        raise ValueError(
            'Failed to parse correctly, root has tag: {} and prefix: {}'.
            format(root.tag, root.prefix))
    return root

def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(
        xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            for child in node.iterchildren('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                elif child.text:
                    page = child.get('page', '1')
                    toc.add(child.text, 'index.html', 'p' + page)
                    count[0] += 1

        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(
                    etree.tostring(root, pretty_print=True, with_tail=False,
                                   encoding='utf-8', xml_declaration=True))

def append_text(el, attr):
    try:
        setattr(el, attr, (getattr(el, attr) or '') + data)
    except ValueError:
        text = data.replace('\u000c', ' ')
        try:
            setattr(el, attr, (getattr(el, attr) or '') + text)
        except ValueError:
            setattr(el, attr, (getattr(el, attr) or '') + clean_xml_chars(text))

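# A self-contained illustration (assuming stock lxml behaviour) of the failure
# mode append_text guards against: lxml raises ValueError when a string that
# contains an XML-incompatible control character, such as form feed (U+000C),
# is assigned to .text or .tail.
from lxml import etree

el = etree.Element('p')
try:
    el.text = 'bad\x0cchar'
except ValueError:
    el.text = 'bad char'  # retry with the offending character removed
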
def build_node(current_node, parent=None):
    if parent is None:
        parent = etree.Element('ul')
    elif len(current_node.nodes):
        parent = element(parent, 'ul')
    for node in current_node.nodes:
        point = element(parent, 'li')
        href = relpath(abspath(unquote(node.href)), dirname(ref_url))
        if isinstance(href, bytes):
            href = href.decode('utf-8')
        link = element(point, 'a', href=clean_xml_chars(href))
        title = node.title
        if isinstance(title, bytes):
            title = title.decode('utf-8')
        if title:
            title = re.sub(r'\s+', ' ', title)
        link.text = clean_xml_chars(title)
        build_node(node, point)
    return parent

def parse_html(markup):
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True,
                                resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)

def __init__(self, id, title, url, author, summary, published, content):
    from lxml import html
    self.downloaded = False
    self.id = id
    if not title or not isinstance(title, string_or_bytes):
        title = _('Unknown')
    title = force_unicode(title, 'utf-8')
    self._title = clean_xml_chars(title).strip()
    try:
        self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
    except:
        pass
    self._title = clean_ascii_chars(self._title)
    self.url = url
    self.author = author
    self.toc_thumbnail = None
    self.internal_toc_entries = ()
    if author and not isinstance(author, str):
        author = author.decode('utf-8', 'replace')
    if summary and not isinstance(summary, str):
        summary = summary.decode('utf-8', 'replace')
    summary = clean_xml_chars(summary) if summary else summary
    self.summary = summary
    if summary and '<' in summary:
        try:
            s = html.fragment_fromstring(summary, create_parent=True)
            summary = html.tostring(s, method='text', encoding='unicode')
        except:
            print('Failed to process article summary, deleting:')
            print(summary.encode('utf-8'))
            traceback.print_exc()
            summary = ''
    self.text_summary = clean_ascii_chars(summary)
    self.author = author
    self.content = content
    self.date = published
    self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
    self.localtime = self.utctime.astimezone(local_tz)
    self._formatted_date = None

def __init__(self, id, title, url, author, summary, published, content):
    from lxml import html
    self.downloaded = False
    self.id = id
    if not title or not isinstance(title, string_or_bytes):
        title = _('Unknown')
    title = force_unicode(title, 'utf-8')
    self._title = clean_xml_chars(title).strip()
    try:
        self._title = re.sub(r'&(\S+?);', entity_to_unicode, self._title)
    except:
        pass
    self._title = clean_ascii_chars(self._title)
    self.url = url
    self.author = author
    self.toc_thumbnail = None
    if author and not isinstance(author, unicode_type):
        author = author.decode('utf-8', 'replace')
    if summary and not isinstance(summary, unicode_type):
        summary = summary.decode('utf-8', 'replace')
    summary = clean_xml_chars(summary) if summary else summary
    self.summary = summary
    if summary and '<' in summary:
        try:
            s = html.fragment_fromstring(summary, create_parent=True)
            summary = html.tostring(s, method='text', encoding=unicode_type)
        except:
            print('Failed to process article summary, deleting:')
            print(summary.encode('utf-8'))
            traceback.print_exc()
            summary = u''
    self.text_summary = clean_ascii_chars(summary)
    self.author = author
    self.content = content
    self.date = published
    self.utctime = dt_factory(self.date, assume_utc=True, as_utc=True)
    self.localtime = self.utctime.astimezone(local_tz)
    self._formatted_date = None

def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
                line_numbers=True, linenumber_attribute=None,
                replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (
            root.tag, root.prefix))
    return root

def parse_opf(stream_or_path):
    stream = stream_or_path
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
    raw = stream.read()
    if not raw:
        raise ValueError('Empty file: ' + getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
                                   resolve_entities=True, assume_utf8=True)
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root

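# Hypothetical usage sketch (assumes a calibre checkout is importable):
# XML-invalid control bytes inside an OPF are stripped before lxml parses it,
# so the stray form feed below does not abort the parse.
import io
root = parse_opf(io.BytesIO(b'<?xml version="1.0"?><package>\x0c</package>'))
print(root.tag)  # -> package
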
def html5_parse(data, max_nesting_depth=100):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)
    return data

def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    import html5lib
    from calibre.utils.cleantext import clean_xml_chars
    html = strip_encoding_declarations(browser.html)
    if isinstance(html, unicode):
        html = clean_xml_chars(html)
    root = html5lib.parse(html, treebuilder='lxml',
                          namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
        from lxml.html import tostring
        f.write(tostring(root, include_meta_content_type=True,
                         encoding='utf-8', pretty_print=True))
    return f.name

def parse_html_toc(data):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True,
                              resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()
        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt

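# Hypothetical usage sketch (assumes a calibre checkout is importable):
# parse_html_toc is a generator yielding (href, fragment, link_text) triples.
for href, frag, text in parse_html_toc(b'<a href="ch1.html#s2">One</a>'):
    print(href, frag, text)  # -> ch1.html s2 One
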
def index_to_soup(self, url_or_raw, raw=False):
    '''
    Convenience method that takes a URL to the index page and returns
    a parsed lxml tree representation of it. See http://lxml.de/tutorial.html

    `url_or_raw`: Either a URL or the downloaded index page as a string
    '''
    if re.match(r'\w+://', url_or_raw):
        self.jsbrowser.start_load(url_or_raw)
        html = self.jsbrowser.html
    else:
        html = url_or_raw
        if isinstance(html, bytes):
            html = xml_to_unicode(html)[0]
        html = strip_encoding_declarations(html)
    if raw:
        return html
    import html5lib
    root = html5lib.parse(clean_xml_chars(html), treebuilder='lxml',
                          namespaceHTMLElements=False).getroot()
    return root

def sanitize(s):
    return unicodedata.normalize(
        'NFC', clean_xml_chars(clean_ascii_chars(force_unicode(s or ''))))

def clean(x):
    if isinstance(x, string_or_bytes):
        x = clean_xml_chars(x)
    return x

def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html)
    self.processed_html = self.processed_html.replace('\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html)
    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html)
    image_name_map = self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        # These trip up the html5 parser causing all content to be placed
        # under the <guide> tag
        self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
        self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
        try:
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
        head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root, image_name_map)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(lopen(self.created_opf_path, 'wb'), ncx,
               ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                   ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
                prefix, have_kobo_browser=False):
    logo = DIV(IMG(src=prefix + "/static/calibre.png", alt=__appname__), id="logo")
    search_box = build_search_box(num, search, sort, order, prefix)
    navigation = build_navigation(start, num, total, prefix + url_base)
    navigation2 = build_navigation(start, num, total, prefix + url_base)
    bookt = TABLE(id="listing")

    body = BODY(logo, search_box, navigation, HR(CLASS("spacer")), bookt,
                HR(CLASS("spacer")), navigation2)

    # Book list {{{
    for book in books:
        thumbnail = TD(
            IMG(type="image/jpeg", border="0",
                src=prefix + "/get/thumb/%s" % book["id"]),
            CLASS("thumbnail")
        )

        data = TD()
        for fmt in book["formats"].split(","):
            if not fmt or fmt.lower().startswith("original_"):
                continue
            file_extension = "kepub.epub" if have_kobo_browser and fmt.lower() == "kepub" else fmt
            a = quote(ascii_filename(book["authors"]))
            t = quote(ascii_filename(book["title"]))
            s = SPAN(
                A(fmt.lower(),
                  href=prefix + "/get/%s/%s-%s_%d.%s" % (fmt, a, t, book["id"],
                                                         file_extension.lower())),
                CLASS("button"),
            )
            s.tail = u""
            data.append(s)

        div = DIV(CLASS("data-container"))
        data.append(div)

        series = u"[%s - %s]" % (book["series"], book["series_index"]) if book["series"] else ""
        tags = u"Tags=[%s]" % book["tags"] if book["tags"] else ""

        ctext = ""
        for key in CKEYS:
            val = book.get(key, None)
            if val:
                ctext += "%s=[%s] " % tuple(val.split(":#:"))

        first = SPAN(
            u"\u202f%s %s by %s" % (clean_xml_chars(book["title"]),
                                    clean_xml_chars(series),
                                    clean_xml_chars(book["authors"])),
            CLASS("first-line"),
        )
        div.append(first)
        second = SPAN(u"%s - %s %s %s" % (book["size"], book["timestamp"],
                                          tags, ctext), CLASS("second-line"))
        div.append(second)

        bookt.append(TR(thumbnail, data))
    # }}}

    body.append(
        DIV(
            A(
                _("Switch to the full interface (non-mobile interface)"),
                href=prefix + "/browse",
                style="text-decoration: none; color: blue",
                title=_(
                    "The full interface gives you many more features, "
                    "but it may not work well on a small screen"
                ),
            ),
            style="text-align:center",
        )
    )
    return HTML(
        HEAD(
            TITLE(__appname__ + " Library"),
            LINK(rel="icon", href="//calibre-ebook.com/favicon.ico", type="image/x-icon"),
            LINK(rel="stylesheet", type="text/css", href=prefix + "/mobile/style.css"),
            LINK(rel="apple-touch-icon", href="/static/calibre.png"),
            META(name="robots", content="noindex"),
        ),  # End head
        body,
    )  # End html

def __init__(self, raw, codec, title):
    self.doctype = raw[:4]
    self.length, self.num_items = struct.unpack('>LL', raw[4:12])
    raw = raw[12:]
    pos = 0
    self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
    self.has_fake_cover = True
    self.start_offset = None
    left = self.num_items
    self.kf8_header = None
    self.uuid = self.cdetype = None
    self.page_progression_direction = None
    self.primary_writing_mode = None

    self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace'))

    while left > 0:
        left -= 1
        idx, size = struct.unpack('>LL', raw[pos:pos + 8])
        content = raw[pos + 8:pos + size]
        pos += size
        if idx >= 100 and idx < 200:
            self.process_metadata(idx, content, codec)
        elif idx == 203:
            self.has_fake_cover = bool(struct.unpack('>L', content)[0])
        elif idx == 201:
            co, = struct.unpack('>L', content)
            if co < NULL_INDEX:
                self.cover_offset = co
        elif idx == 202:
            self.thumbnail_offset, = struct.unpack('>L', content)
        elif idx == 501:
            try:
                self.cdetype = content.decode('ascii')
            except UnicodeDecodeError:
                self.cdetype = None
            # cdetype
            if content == b'EBSP':
                if not self.mi.tags:
                    self.mi.tags = []
                self.mi.tags.append(_('Sample Book'))
        elif idx == 502:
            # last update time
            pass
        elif idx == 503:  # Long title
            # Amazon seems to regard this as the definitive book title
            # rather than the title from the PDB header. In fact when
            # sending MOBI files through Amazon's email service if the
            # title contains non ASCII chars or non filename safe chars
            # they are messed up in the PDB header
            try:
                title = self.decode(content)
            except:
                pass
        elif idx == 524:  # Lang code
            try:
                lang = content.decode(codec)
                lang = canonicalize_lang(lang)
                if lang:
                    self.mi.language = lang
            except:
                pass
        elif idx == 525:
            try:
                pwm = content.decode(codec)
                if pwm:
                    self.primary_writing_mode = pwm
            except Exception:
                pass
        elif idx == 527:
            try:
                ppd = content.decode(codec)
                if ppd:
                    self.page_progression_direction = ppd
            except Exception:
                pass
        # else:
        #     print 'unknown record', idx, repr(content)
    if title:
        self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        # Author names in Amazon MOBI files are usually in LN, FN format,
        # try to detect and auto-correct that.
        m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
        if m is not None:
            if tweaks['author_sort_copy_method'] != 'copy':
                self.mi.authors.append(m.group(2) + ' ' + m.group(1))
            else:
                self.mi.authors.append(m.group())
            if self.mi.is_null('author_sort'):
                self.mi.author_sort = m.group()
        else:
            self.mi.authors.append(au)
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([
            x.strip() for x in clean_xml_chars(self.decode(content)).split(';')
        ])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 109:
        self.mi.rights = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None

def clean(x):
    if isinstance(x, str):
        x = clean_xml_chars(x)
    return x

def _generate(self, f, feeds, cutoff, extra_css=None, style=None):
    from calibre.utils.cleantext import clean_xml_chars

    def trim_title(title, clip=18):
        if len(title) > clip:
            tokens = title.split(' ')
            new_title_tokens = []
            new_title_len = 0
            if len(tokens[0]) > clip:
                return tokens[0][:clip] + '...'
            for token in tokens:
                if len(token) + new_title_len < clip:
                    new_title_tokens.append(token)
                    new_title_len += len(token)
                else:
                    new_title_tokens.append('...')
                    title = ' '.join(new_title_tokens)
                    break
        return title

    self.IS_HTML = False
    feed = feeds[f]

    # Construct the navbar
    navbar_t = TABLE(CLASS('touchscreen_navbar'))
    navbar_tr = TR()

    # Previous Section
    link = ''
    if f > 0:
        link = A(CLASS('feed_link'),
                 trim_title(feeds[f - 1].title),
                 href='../feed_%d/index.html' % int(f - 1))
    navbar_tr.append(TD(CLASS('feed_prev'), link))

    # Up to Sections
    link = A(_('Sections'), href="../index.html")
    navbar_tr.append(TD(CLASS('feed_up'), link))

    # Next Section
    link = ''
    if f < len(feeds) - 1:
        link = A(CLASS('feed_link'),
                 trim_title(feeds[f + 1].title),
                 href='../feed_%d/index.html' % int(f + 1))
    navbar_tr.append(TD(CLASS('feed_next'), link))
    navbar_t.append(navbar_tr)
    top_navbar = navbar_t
    bottom_navbar = copy.copy(navbar_t)
    # print "\n%s\n" % etree.tostring(navbar_t, pretty_print=True)

    # Build the page
    head = HEAD(TITLE(feed.title))
    if style:
        head.append(STYLE(style, type='text/css'))
    if extra_css:
        head.append(STYLE(extra_css, type='text/css'))
    body = BODY()
    div = DIV(
        top_navbar,
        H2(feed.title, CLASS('feed_title'))
    )
    body.append(div)

    if getattr(feed, 'image', None):
        div.append(DIV(IMG(
            alt=feed.image_alt if feed.image_alt else '',
            src=feed.image_url
        ), CLASS('calibre_feed_image')))
    if getattr(feed, 'description', None):
        d = DIV(clean_xml_chars(feed.description),
                CLASS('calibre_feed_description', 'calibre_rescale_80'))
        d.append(BR())
        div.append(d)

    for i, article in enumerate(feed.articles):
        if not getattr(article, 'downloaded', False):
            continue

        div_td = DIV(CLASS('article_summary'),
                     A(article.title, CLASS('summary_headline', 'calibre_rescale_120',
                                            href=article.url)))
        if article.author:
            div_td.append(DIV(article.author,
                              CLASS('summary_byline', 'calibre_rescale_100')))
        if article.summary:
            div_td.append(DIV(cutoff(article.text_summary),
                              CLASS('summary_text', 'calibre_rescale_100')))
        div.append(div_td)

    div.append(bottom_navbar)
    self.root = HTML(head, body)
    if self.html_lang:
        self.root.set('lang', self.html_lang)

def sanitize(s):
    return clean_xml_chars(clean_ascii_chars(force_unicode(s or '')))

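# Quick illustration with hypothetical input: force_unicode coerces bytes to
# text, clean_ascii_chars is assumed to strip ASCII control characters, and
# clean_xml_chars strips anything XML 1.0 forbids.
print(sanitize(b'Bell\x07 char'))  # -> 'Bell char'
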
def html5_parse(data, max_nesting_depth=100):
    import html5lib, warnings
    # HTML5 parsing algorithm idiocy: http://code.google.com/p/html5lib/issues/detail?id=195
    data = fix_self_closing_cdata_tags(data)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        try:
            data = html5lib.parse(data, treebuilder='lxml').getroot()
        except ValueError:
            from calibre.utils.cleantext import clean_xml_chars
            data = html5lib.parse(clean_xml_chars(data), treebuilder='lxml').getroot()

    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('html5lib resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)

    # html5lib has the most inelegant handling of namespaces I have ever seen
    # Try to reconstitute destroyed namespace info
    xmlns_declaration = '{%s}' % XMLNS_NS
    non_html5_namespaces = {}
    seen_namespaces = set()
    for elem in tuple(data.iter(tag=etree.Element)):
        elem.attrib.pop('xmlns', None)
        # Set lang correctly
        xl = elem.attrib.pop('xmlU0003Alang', None)
        if xl is not None and 'lang' not in elem.attrib:
            elem.attrib['lang'] = xl
        namespaces = {}
        for x in tuple(elem.attrib):
            if x.startswith('xmlnsU') or x.startswith(xmlns_declaration):
                # A namespace declaration
                val = elem.attrib.pop(x)
                if x.startswith('xmlnsU0003A'):
                    prefix = x[11:]
                    namespaces[prefix] = val
        remapped_namespaces = {}
        if namespaces:
            # Some destroyed namespace declarations were found
            p = elem.getparent()
            if p is None:
                # We handle the root node later
                non_html5_namespaces = namespaces
            else:
                idx = p.index(elem)
                p.remove(elem)
                elem = clone_element(elem, nsmap=namespaces)
                p.insert(idx, elem)
                remapped_namespaces = {ns: namespaces[ns] for ns in
                                       set(namespaces) - set(elem.nsmap)}
        b = barename(elem.tag)
        idx = b.find('U0003A')
        if idx > -1:
            prefix, tag = b[:idx], b[idx + 6:]
            ns = elem.nsmap.get(prefix, None)
            if ns is None:
                ns = non_html5_namespaces.get(prefix, None)
            if ns is None:
                ns = remapped_namespaces.get(prefix, None)
            if ns is not None:
                elem.tag = '{%s}%s' % (ns, tag)
        for b in tuple(elem.attrib):
            idx = b.find('U0003A')
            if idx > -1:
                prefix, tag = b[:idx], b[idx + 6:]
                ns = elem.nsmap.get(prefix, None)
                if ns is None:
                    ns = non_html5_namespaces.get(prefix, None)
                if ns is None:
                    ns = remapped_namespaces.get(prefix, None)
                if ns is not None:
                    elem.attrib['{%s}%s' % (ns, tag)] = elem.attrib.pop(b)
        seen_namespaces |= set(elem.nsmap.itervalues())

    nsmap = dict(html5lib.constants.namespaces)
    nsmap[None] = nsmap.pop('html')
    non_html5_namespaces.update(nsmap)
    nsmap = non_html5_namespaces

    data = clone_element(data, nsmap=nsmap, in_context=False)

    # Remove unused namespace declarations
    fnsmap = {k: v for k, v in nsmap.iteritems()
              if v in seen_namespaces and v != XMLNS_NS}
    return clone_element(data, nsmap=fnsmap, in_context=False)

def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html)
    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        try:
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
        head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(lopen(self.created_opf_path, 'wb'), ncx,
               ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                   ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

def build_index(books, num, search, sort, order, start, total, url_base, CKEYS,
                prefix, have_kobo_browser=False):
    logo = DIV(IMG(src=prefix+'/static/calibre.png', alt=__appname__), id='logo')
    search_box = build_search_box(num, search, sort, order, prefix)
    navigation = build_navigation(start, num, total, prefix+url_base)
    navigation2 = build_navigation(start, num, total, prefix+url_base)
    bookt = TABLE(id='listing')

    body = BODY(
        logo,
        search_box,
        navigation,
        HR(CLASS('spacer')),
        bookt,
        HR(CLASS('spacer')),
        navigation2
    )

    # Book list {{{
    for book in books:
        thumbnail = TD(
            IMG(type='image/jpeg', border='0',
                src=prefix+'/get/thumb/%s' % book['id']),
            CLASS('thumbnail'))

        data = TD()
        for fmt in book['formats'].split(','):
            if not fmt or fmt.lower().startswith('original_'):
                continue
            file_extension = "kepub.epub" if have_kobo_browser and fmt.lower() == "kepub" else fmt
            a = quote(ascii_filename(book['authors']))
            t = quote(ascii_filename(book['title']))
            s = SPAN(
                A(
                    fmt.lower(),
                    href=prefix+'/get/%s/%s-%s_%d.%s' % (fmt, a, t, book['id'],
                                                         file_extension.lower())
                ),
                CLASS('button'))
            s.tail = u''
            data.append(s)

        div = DIV(CLASS('data-container'))
        data.append(div)

        series = u'[%s - %s]' % (book['series'], book['series_index']) \
            if book['series'] else ''
        tags = u'Tags=[%s]' % book['tags'] if book['tags'] else ''

        ctext = ''
        for key in CKEYS:
            val = book.get(key, None)
            if val:
                ctext += '%s=[%s] ' % tuple(val.split(':#:'))

        first = SPAN(u'\u202f%s %s by %s' % (clean_xml_chars(book['title']),
                                             clean_xml_chars(series),
                                             clean_xml_chars(book['authors'])),
                     CLASS('first-line'))
        div.append(first)
        second = SPAN(u'%s - %s %s %s' % (book['size'], book['timestamp'],
                                          tags, ctext), CLASS('second-line'))
        div.append(second)

        bookt.append(TR(thumbnail, data))
    # }}}

    body.append(DIV(
        A(_('Switch to the full interface (non-mobile interface)'),
          href=prefix+"/browse",
          style="text-decoration: none; color: blue",
          title=_('The full interface gives you many more features, '
                  'but it may not work well on a small screen')),
        style="text-align:center"))
    return HTML(
        HEAD(
            TITLE(__appname__ + ' Library'),
            LINK(rel='icon', href='http://calibre-ebook.com/favicon.ico',
                 type='image/x-icon'),
            LINK(rel='stylesheet', type='text/css', href=prefix+'/mobile/style.css'),
            LINK(rel='apple-touch-icon', href="/static/calibre.png"),
            META(name="robots", content="noindex")
        ),  # End head
        body
    )  # End html

def clean(x):
    if isinstance(x, basestring):
        x = clean_xml_chars(x)
    return x