def parse_outline(raw, output_dir):
    """Write a toc.ncx into *output_dir* built from a pdftohtml outline
    document, when the outline has more than two entries."""
    cleaned = clean_xml_chars(
        xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    matches = etree.fromstring(cleaned).xpath('(//outline)[1]')
    if not matches:
        return
    from ebook_converter.ebooks.oeb.polish.toc import TOC, create_ncx
    toc = TOC()
    entries = [0]  # mutable counter shared with the recursive walker

    def walk(node, dest):
        for child in node.iterchildren('*'):
            if child.tag == 'outline':
                # Nested outlines hang off the most recently added entry.
                walk(child, dest.children[-1] if dest.children else dest)
            elif child.text:
                page = child.get('page', '1')
                dest.add(child.text, 'index.html', 'p' + page)
                entries[0] += 1

    walk(matches[0], toc)
    if entries[0] > 2:
        ncx_root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en',
                              'pdftohtml')
        with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
            f.write(etree.tostring(
                ncx_root, pretty_print=True, with_tail=False,
                encoding='utf-8', xml_declaration=True))
def _create_html_root(self, hhcpath, log, encoding):
    """Convert the CHM .hhc table-of-contents file at *hhcpath* into an
    HTML file on disk and return ``(htmlpath, toc)``.

    :param hhcpath: path to the .hhc extracted from the CHM container
    :param log: logger used for debug output
    :param encoding: encoding used to decode the raw .hhc bytes
    """
    hhcdata = self._read_file(hhcpath)
    hhcdata = hhcdata.decode(encoding)
    # Normalise markup (strip encoding declarations, resolve entities)
    # before handing it to lxml.
    hhcdata = xml_to_unicode(hhcdata, verbose=True,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]
    hhcroot = html.fromstring(hhcdata)
    toc = self._process_nodes(hhcroot)
    log.debug('Found %d section nodes' % toc.count())
    # The generated HTML lives next to the .hhc, with the same base name.
    htmlpath = os.path.splitext(hhcpath)[0] + ".html"
    base = os.path.dirname(os.path.abspath(htmlpath))

    def unquote(x):
        # URL-unquote, always returning text.
        if isinstance(x, str):
            x = x.encode('utf-8')
        return _unquote(x).decode('utf-8')

    def unquote_path(x):
        # Prefer the unquoted form only when it exists on disk and the
        # quoted form does not.
        y = unquote(x)
        if (not os.path.exists(os.path.join(base, x)) and
                os.path.exists(os.path.join(base, y))):
            x = y
        return x

    def donode(item, parent, base, subpath):
        # Recursively render TOC nodes as nested <div><a href=...> entries.
        for child in item:
            title = child.title
            if not title:
                continue
            raw = unquote_path(child.href or '')
            rsrcname = os.path.basename(raw)
            rsrcpath = os.path.join(subpath, rsrcname)
            # Fall back to the original href when the re-based path is
            # absent but the raw one exists.
            if (not os.path.exists(os.path.join(base, rsrcpath)) and
                    os.path.exists(os.path.join(base, raw))):
                rsrcpath = raw
            # Avoid double-quoting hrefs that already contain % escapes.
            if '%' not in rsrcpath:
                rsrcpath = urlquote(rsrcpath)
            if not raw:
                rsrcpath = ''
            c = builder.DIV(builder.A(title, href=rsrcpath))
            donode(child, c, base, subpath)
            parent.append(c)

    with open(htmlpath, 'wb') as f:
        if toc.count() > 1:
            # Multiple sections: synthesise an HTML page of nested links.
            path0 = toc[0].href
            path0 = unquote_path(path0)
            subpath = os.path.dirname(path0)
            base = os.path.dirname(f.name)
            root = builder.DIV()
            donode(toc, root, base, subpath)
            raw = html.tostring(builder.HTML(builder.BODY(root)),
                                encoding='utf-8', pretty_print=True)
            f.write(raw)
        else:
            # Single (or no) section: write the original markup through.
            f.write(as_bytes(hhcdata))
    return htmlpath, toc
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
                line_numbers=True, linenumber_attribute=None,
                replace_entities=True, fix_newlines=True):
    """Parse *raw* markup with html5-parser and return the lxml root.

    Raises ValueError if the parse did not produce the expected <html>
    root (namespaced unless *discard_namespaces* is set).
    """
    if isinstance(raw, bytes):
        raw = decoder(raw) if decoder is not None else xml_to_unicode(raw)[0]
    if replace_entities:
        raw = entities.xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    root = html5_parser.parse(
        clean_xml_chars(raw), maybe_xhtml=not discard_namespaces,
        line_number_attr=linenumber_attribute, keep_doctype=False,
        sanitize_names=True)
    if discard_namespaces:
        ok = root.tag == 'html'
    else:
        ok = (root.tag == '{%s}%s' % (const.XHTML_NS, 'html')
              and not root.prefix)
    if not ok:
        raise ValueError('Failed to parse correctly, root has tag: %s and '
                         'prefix: %s' % (root.tag, root.prefix))
    return root
def read_ncx_toc(self, toc, root=None):
    """Populate this TOC from the NCX document at path *toc*.

    :param toc: filesystem path to the .ncx file; its directory becomes
        ``self.base_path``
    :param root: optional pre-parsed lxml root; when None the file is
        read and decoded here
    :raises ValueError: when the NCX has no <navmap> element
    """
    self.base_path = os.path.dirname(toc)
    if root is None:
        with open(toc, 'rb') as f:
            raw = xml_to_unicode(f.read(), assume_utf8=True,
                                 strip_encoding_pats=True)[0]
        root = etree.fromstring(raw)
    # Element matching below is case-insensitive on local-name, via the
    # EXSLT regular-expressions extension.
    xpn = {'re': 'http://exslt.org/regular-expressions'}
    XPath = functools.partial(etree.XPath, namespaces=xpn)

    def get_attr(node, default=None, attr='playorder'):
        # Return the first attribute whose name ends with *attr*
        # (case-insensitively); real-world NCX files vary in casing.
        for name, val in node.attrib.items():
            if name and val and name.lower().endswith(attr):
                return val
        return default

    nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
    txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
    content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
    np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

    def process_navpoint(np, dest):
        # Recursively convert one <navPoint> (and its children) into TOC
        # entries under *dest*.
        try:
            play_order = int(get_attr(np, 1))
        except Exception:
            play_order = 1
        href = fragment = text = None
        nd = dest
        nl = nl_path(np)
        if nl:
            # Label text is the concatenation of all <text> children.
            nl = nl[0]
            text = ''
            for txt in txt_path(nl):
                text += etree.tostring(txt, method='text',
                                       encoding='unicode', with_tail=False)
        content = content_path(np)
        if content and text:
            content = content[0]
            # src is a URL: path is purl[2], fragment is purl[5].
            purl = urllib.parse.urlparse(content.get('src'))
            href = polyglot.unquote(purl[2])
            fragment = polyglot.unquote(purl[5])
            nd = dest.add_item(href, fragment, text)
            nd.play_order = play_order
        # Nested navPoints become children of the entry just added (or of
        # *dest* when this navPoint produced no entry of its own).
        for c in np_path(np):
            process_navpoint(c, nd)

    nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
    if not nm:
        raise ValueError('NCX files must have a <navmap> element.')
    nm = nm[0]
    for child in np_path(nm):
        process_navpoint(child, self)
def parse_html(markup):
    """Decode and sanitise *markup*, then parse it with the soup parser."""
    if isinstance(markup, str):
        # Already text: drop encoding declarations and fix entities.
        text = chardet.substitute_entites(
            chardet.strip_encoding_declarations(markup))
    else:
        # Raw bytes: let xml_to_unicode detect and decode the encoding.
        text = chardet.xml_to_unicode(markup, strip_encoding_pats=True,
                                      resolve_entities=True)[0]
    return html5_soup.parse(cleantext.clean_xml_chars(text),
                            return_root=False)
def parse(raw, decoder=None, log=None, line_numbers=True,
          linenumber_attribute=None, replace_entities=True,
          force_html5_parse=False):
    """Parse *raw* as XHTML, falling back to the HTML 5 parser on failure.

    :param raw: markup as str or bytes
    :param decoder: optional callable to decode bytes input
    :param linenumber_attribute: when set, each element gets this attribute
        with its source line number
    :param force_html5_parse: skip the strict XML attempt entirely
    :return: lxml root element
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        # Handle &#0; -- NUL bytes are not legal in XML
        raw = entities.xml_replace_entities(raw).replace('\0', '')
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Remove any preamble before the opening html tag as it can cause
    # problems, especially doctypes; preserve the original line numbers by
    # inserting newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break
    raw = strip_encoding_declarations(raw, limit=10 * 1024,
                                      preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        ans = etree.fromstring(raw)
        if ans.tag != '{%s}html' % const.XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # Fixed: was ans.iter(etree.element) -- lowercase `element`
            # does not exist on lxml.etree, so this raised AttributeError
            # and the broad handler below silently fell back to tag-soup
            # parsing. Passing the Element factory as the tag filter
            # iterates elements only (skips comments/PIs).
            for elem in ans.iter(etree.Element):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
def parse_opf(stream_or_path):
    """Parse an OPF package document and return its lxml root element.

    :param stream_or_path: an open binary file-like object, or a path
    :raises ValueError: when the input is empty or not an OPF document

    Fixed: when a path was passed, the file opened here was never closed
    (descriptor leak); it is now closed as soon as it has been read.
    """
    stream = stream_or_path
    opened_here = False
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
        opened_here = True
    try:
        raw = stream.read()
    finally:
        if opened_here:
            stream.close()
    if not raw:
        raise ValueError('Empty file: ' + getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
                                   resolve_entities=True, assume_utf8=True)
    # Drop any junk before the first tag (BOM residue, stray text).
    raw = raw[raw.find('<'):]
    root = etree.fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
def html2text(html):
    """Render HTML (str or bytes) to plain text via the html2text library."""
    from html2text import HTML2Text
    import re
    if isinstance(html, bytes):
        from ebook_converter.ebooks.chardet import xml_to_unicode
        html = xml_to_unicode(html, strip_encoding_pats=True,
                              resolve_entities=True)[0]
    # replace <u> tags with <span> as <u> becomes emphasis in html2text
    html = re.sub(r'<\s*(?P<solidus>/?)\s*[uU]\b(?P<rest>[^>]*)>',
                  r'<\g<solidus>span\g<rest>>', html)
    converter = HTML2Text()
    converter.body_width = 0
    converter.single_line_break = True
    converter.emphasis_mark = '*'
    converter.default_image_alt = 'Unnamed image'
    return converter.handle(html)
def fget(self):
    """Return the document meta info as a parsed minidom Document.

    :raises LRFException: when the file has no meta info, the
        decompressed size is unexpected, or decompression fails
    """
    if self.compressed_info_size == 0:
        raise LRFException("This document has no meta info")
    # The stored block is 4 bytes larger than the compressed payload —
    # presumably a size header; TODO confirm against the LRF spec.
    size = self.compressed_info_size - 4
    self._file.seek(self.info_start)
    try:
        src = zlib.decompress(self._file.read(size))
        if len(src) != self.uncompressed_info_size:
            # Fixed: the message previously contained a stray "\ "
            # artifact from a broken line continuation inside the literal.
            raise LRFException("Decompression of document meta info "
                               "yielded unexpected results")
        src = xml_to_unicode(src, strip_encoding_pats=True,
                             resolve_entities=True, assume_utf8=True)[0]
        return minidom.parseString(src)
    except zlib.error:
        raise LRFException("Unable to decompress document meta "
                           "information")
def parse_html_toc(data):
    """Yield (href, fragment, text) for every <a href> in an HTML TOC."""
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True,
                              resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True,
                 keep_doctype=False, sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        # urlparse 6-tuple: index 2 is the path, index 5 the fragment.
        parts = urllib.parse.urlparse(unquote(anchor.get('href')))
        frag = parts[5].strip() if parts[5] else None
        label = etree.tostring(anchor, method='text', encoding='unicode')
        yield parts[2].strip(), frag, label
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    """Return cover image data for *path_to_html*, trying an embedded SVG
    first, then a calibre cover, and finally rendering the page itself."""
    from ebook_converter.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        markup = xml_to_unicode(f.read(), strip_encoding_pats=True)[0]
    folder = os.path.dirname(path_to_html)
    data = None
    if SVG_NS in markup:
        try:
            data = extract_cover_from_embedded_svg(markup, folder, log)
        except Exception:
            pass  # best-effort: fall through to the next strategy
    if data is None:
        try:
            data = extract_calibre_cover(markup, folder, log)
        except Exception:
            pass  # best-effort: fall through to rendering
    if data is None:
        data = render_html_data(path_to_html, width, height)
    return data
def postprocess_book(self, oeb, opts, log):
    """Clean up a converted LIT book: drop stray <metadata>/<guide>
    elements from spine documents and re-flow books whose entire text
    sits inside a single <pre> tag.
    """
    from ebook_converter.ebooks.oeb.base import XPath, XHTML
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue
        # LIT conversion can leave OPF-ish elements inside the XHTML.
        for bad in ('metadata', 'guide'):
            metadata = XPath('//h:'+bad)(root)
            if metadata:
                for x in metadata:
                    x.getparent().remove(x)
        body = XPath('//h:body')(root)
        if body:
            body = body[0]
            if len(body) == 1 and body[0].tag == XHTML('pre'):
                # Whole document is a single <pre>: convert its raw text
                # into real paragraph markup.
                pre = body[0]
                from ebook_converter.ebooks.txt.processor import \
                    convert_basic, separate_paragraphs_single_line
                from ebook_converter.ebooks.chardet import xml_to_unicode
                self.log('LIT file with all text in singe <pre> tag '
                         'detected')
                html = separate_paragraphs_single_line(pre.text)
                html = convert_basic(html).replace(
                    '<html>', '<html xmlns="%s">' % const.XHTML_NS)
                html = xml_to_unicode(html, strip_encoding_pats=True,
                                      resolve_entities=True)[0]
                if opts.smarten_punctuation:
                    # SmartyPants skips text inside <pre> tags
                    from ebook_converter.ebooks.conversion import \
                        preprocess
                    html = preprocess.smarten_punctuation(html, self.log)
                root = etree.fromstring(html)
                body = XPath('//h:body')(root)
                pre.tag = XHTML('div')
                pre.text = ''
                # NOTE(review): XPath returns a list here, so this appends
                # copies of the matched <body> element(s) themselves —
                # confirm this is intended (vs. the body's children).
                for elem in body:
                    ne = copy.deepcopy(elem)
                    pre.append(ne)
def _get_fbroot(raw):
    """Decode raw FB2 bytes and return the parsed root element with the
    FB2 namespace ensured."""
    text = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    return ensure_namespace(etree.fromstring(text))
def convert_epub3_nav(self, nav_path, opf, log, opts):
    """Generate an NCX table of contents from an EPUB 3 nav document.

    Parses the nav document at *nav_path*, converts its toc <nav> into an
    NCX file written next to it, and updates the OPF manifest/spine to
    reference the new NCX. Also records the parsed nav on *opts* and, when
    a cover page was removed earlier, marks links pointing at it.
    """
    from lxml import etree
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.oeb.polish.parsing import parse
    from ebook_converter.ebooks.oeb.base import serialize
    from ebook_converter.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with open(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    # Skeleton NCX document; navPoints get appended into its <navMap/>.
    ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
                           'ncx/" version="2005-1" xml:lang="eng">'
                           '<navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % const.EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Build a single <navPoint> under *parent* from a nav <li>.
        href = text = None
        for x in li.iterchildren(base.tag('xhtml', 'a'),
                                 base.tag('xhtml', 'span')):
            # Prefer the element text; fall back to any title attributes.
            text = etree.tostring(
                x, method='text', encoding='unicode',
                with_tail=False).strip() or ' '.join(
                    x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                # Bare fragments must point back into the nav file itself.
                if href.startswith('#'):
                    href = bn + href
                break
        np = parent.makeelement(base.tag('ncx', 'navPoint'))
        parent.append(np)
        np.append(np.makeelement(base.tag('ncx', 'navLabel')))
        np[0].append(np.makeelement(base.tag('ncx', 'text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(base.tag('ncx', 'content'),
                                     attrib={'src': href}))
        return np

    def process_nav_node(node, toc_parent):
        # Mirror the nested <ol>/<li> structure as nested navPoints.
        for li in node.iterchildren(base.tag('xhtml', 'li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, base.tag('xhtml', 'ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, base.tag('xhtml', 'ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        # No usable toc nav found: nothing to convert.
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path),
                            delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, base.NCX_MIME,
                                      append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    url = os.path.relpath(nav_path).replace(os.sep, '/')
    opts.epub3_nav_href = base.urlnormalize(url)
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        # Tag links that point at the removed title page so later stages
        # can ignore them.
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = (os.path.relpath(
                os.path.join(base_path, urllib.parse.unquote(href)),
                base_path))
            abs_href = base.urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as f:
                f.write(base.serialize(root, 'application/xhtml+xml'))
def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
    """Parse arbitrary (possibly broken) HTML/XHTML into a normalised
    lxml tree in the XHTML namespace.

    Falls back progressively from strict XML to entity-fixed XML, then
    HTML 5, then HTML 4 parsing, and finally applies clean-ups: namespace
    coercion, guaranteed <head>/<title>/<body>, removal of MS Office
    markup, empty inline tags, and encoding <meta> elements.

    :param data: markup as str or bytes
    :param log: logger; defaults to the module-level LOG
    :param decoder: optional callable used to decode bytes input
    :param preprocessor: optional callable applied to the text pre-parse
    :param filename: name used in log messages only
    :param non_html_file_tags: root tags that should raise NotHTML
    :raises NotHTML: when the root tag is in *non_html_file_tags*
    """
    if log is None:
        log = LOG
    filename = force_unicode(filename, enc=filesystem_encoding)
    if not isinstance(data, str):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]
    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:
            # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute
            # names if fed HTML 4 with uppercase attribute names, so try
            # to detect and compensate for that.
            has_html4_doctype = re.search(
                r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(
                    r'&(%s);' % ('|'.join(list(user_entities.keys()))))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)
    if preprocessor is not None:
        data = preprocessor(data)
    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)
    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more' ' forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)
    # HTML 4 output (e.g. from kindlegen) may use uppercase tags and
    # attributes; lowercase everything so later XPath matching works.
    if has_html4_doctype or data.tag == 'HTML' or (
            len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in tuple(x.attrib.items()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                pass
    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag,
                          (str, bytes)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        # Re-home every element under the synthesised root.
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot
    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = const.XHTML_NS
        data = etree.tostring(data, encoding='unicode')
        try:
            data = etree.fromstring(data)
        except:
            # Progressive text-level repairs between re-parse attempts.
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>",
                    '')
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data)
    elif namespace(data.tag) != const.XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'),
                              nsmap={None: const.XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, (str, bytes)) and \
                    namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot
    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})
    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = 'Unknown'
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    # Ensure content is second attribute
    meta.set('content', 'text/html; charset=utf-8')
    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))
    # Remove microsoft office markup
    r = [
        x for x in data.iterdescendants(etree.Element)
        if 'microsoft-com' in x.tag
    ]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        # Splice *a* out of the tree, re-attaching its tail text to the
        # previous sibling (or the parent's text when *a* is first).
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)
    # Convert <br>s with content into paragraphs as ADE can't handle them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')
    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n '
        head.tail = '\n '
        # head is non-empty here: a <meta> was appended to it above.
        for child in head:
            child.tail = '\n '
        child.tail = '\n '
    return data
def get_metadata_(src, encoding=None):
    """Extract book metadata from raw HTML source.

    Meta data definitions as in
    https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    :param src: HTML source, str or bytes
    :param encoding: optional encoding used to decode bytes input; when
        absent the encoding is auto-detected
    :return: a Metadata object populated from comment/meta tags
    """
    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # Comment tags take precedence over <meta> tags.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'
    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'
    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))
    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)
    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)
    # HTML fields: escape markup-significant characters.
    # (Fixed: this chain had degenerated into no-ops such as
    # .replace('&', '&') and a broken ''' literal; proper HTML entity
    # escaping is restored.)
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;')
                    .replace('<', '&lt;').replace('>', '&gt;')
                    .replace('"', '&quot;').replace("'", '&#39;'))
    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)
    # SERIES: a trailing "[1.5]"-style suffix encodes the series index.
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index
    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except Exception:
            pass
    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags
    # IDENTIFIERS: first non-empty value per scheme wins.
    for (k, v) in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])
    return mi
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive (zip containing one top-level HTML file)
    into an OEB book.

    Extracts the archive into the current working directory, locates the
    top-level HTML file, runs it through the HTML input plugin and applies
    metadata/cover from a bundled OPF, if present.
    """
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.utils.zipfile import ZipFile
    self.log = log
    html = u''
    top_levels = []
    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()
    # Find the HTML file in the archive. It needs to be top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index.* file first.
    for x in top_levels:
        if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive;
    # HTMLZ supports a single HTML file, so a conversion of such an
    # archive probably won't turn out as the user expects. With multiple
    # HTML files, ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn('Multiple HTML files found in the archive. Only %s will '
                 'be used.' % index)
    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception('No top level HTML file found.')
    if not html:
        raise Exception('Top level HTML file %s is empty' % index)
    # Encoding: honour the user override, else auto-detect from a prefix.
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')
    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwd()
    # Write the decoded HTML to a fresh index*.html scratch file.
    htmlfile = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        htmlfile = u'index%d.html' % c
    with open(htmlfile, 'wb') as f:
        f.write(html.encode('utf-8'))
    # Temporarily disable pipeline debugging for the nested conversion.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile)
    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        cdata = None
        with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, mimetypes.guess_type(cover_name)[0],
                         data=cdata)
        oeb.guide.add('cover', 'Cover', href)
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an FB2 file into an OEB-ready directory.

    Parses the FB2 XML, extracts embedded stylesheets and binaries,
    transforms the document to XHTML via the bundled fb2.xsl stylesheet,
    writes index.xhtml / inline-styles.css / metadata.opf into the
    current working directory and returns the path of the generated OPF.
    """
    from ebook_converter.ebooks.metadata.fb2 import ensure_namespace
    from ebook_converter.ebooks.metadata.fb2 import get_fb2_data
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator
    from ebook_converter.ebooks.metadata.meta import get_metadata
    from ebook_converter.ebooks.chardet import xml_to_unicode
    self.log = log
    log.debug('Parsing XML...')
    raw = get_fb2_data(stream)[0]
    raw = raw.replace(b'\0', b'')
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True,
                         resolve_entities=True)[0]
    try:
        doc = etree.fromstring(raw)
    except etree.XMLSyntaxError:
        # Retry with bare ampersands escaped. (Fixed: this used to do
        # raw.replace('& ', '&'), which only deleted the space and left
        # the illegal bare '&' in place, so the reparse could not
        # possibly succeed.)
        doc = etree.fromstring(raw.replace('& ', '&amp; '))
    if doc is None:
        raise ValueError('The FB2 file is not valid XML')
    doc = ensure_namespace(doc)
    try:
        fb_ns = doc.nsmap[doc.prefix]
    except Exception:
        fb_ns = FB2NS
    NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS}
    # Collect all embedded CSS stylesheets into one blob.
    stylesheets = doc.xpath('//*[local-name() = "stylesheet" and '
                            '@type="text/css"]')
    css = ''
    for s in stylesheets:
        css += etree.tostring(s, encoding='unicode', method='text',
                              with_tail=False) + '\n\n'
    if css:
        import css_parser
        import logging
        parser = css_parser.CSSParser(
            fetcher=None, log=logging.getLogger('calibre.css'))
        XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS
        text = XHTML_CSS_NAMESPACE + css
        log.debug('Parsing stylesheet...')
        stylesheet = parser.parseString(text)
        stylesheet.namespaces['h'] = const.XHTML_NS
        css = stylesheet.cssText
        if isinstance(css, bytes):
            css = css.decode('utf-8', 'replace')
        # FB2 <style> elements become spans; name= selectors become
        # class= selectors to match the XSL output.
        css = css.replace('h|style', 'h|span')
        css = re.sub(r'name\s*=\s*', 'class=', css)
    self.extract_embedded_content(doc)
    log.debug('Converting XML to HTML...')
    with open(pkg_resources.resource_filename('ebook_converter',
                                              'data/fb2.xsl')) as f:
        ss = f.read()
    ss = ss.replace("__FB_NS__", fb_ns)
    if options.no_inline_fb2_toc:
        log.info('Disabling generation of inline FB2 TOC')
        ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                        re.DOTALL).sub('', ss)
    styledoc = etree.fromstring(ss)
    transform = etree.XSLT(styledoc)
    result = transform(doc)
    # Handle links of type note and cite
    notes = {
        a.get('href')[1:]: a
        for a in result.xpath('//a[@link_note and @href]')
        if a.get('href').startswith('#')
    }
    cites = {
        a.get('link_cite'): a
        for a in result.xpath('//a[@link_cite]') if not a.get('href', '')
    }
    all_ids = {x for x in result.xpath('//*/@id')}
    for cite, a in cites.items():
        note = notes.get(cite, None)
        if note:
            # Give the note a unique id if it lacks one, then point the
            # citing anchor at it.
            c = 1
            while 'cite%d' % c in all_ids:
                c += 1
            if not note.get('id', None):
                note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
            a.set('href', '#%s' % note.get('id'))
    for x in result.xpath('//*[@link_note or @link_cite]'):
        x.attrib.pop('link_note', None)
        x.attrib.pop('link_cite', None)
    # Remap image srcs to the files written by extract_embedded_content.
    for img in result.xpath('//img[@src]'):
        src = img.get('src')
        img.set('src', self.binary_map.get(src, src))
    index = transform.tostring(result)
    with open('index.xhtml', 'wb') as f:
        f.write(index.encode('utf-8'))
    with open('inline-styles.css', 'wb') as f:
        f.write(css.encode('utf-8'))
    stream.seek(0)
    mi = get_metadata(stream, 'fb2')
    if not mi.title:
        mi.title = 'Unknown'
    if not mi.authors:
        mi.authors = ['Unknown']
    # Prefer cover data from the metadata; fall back to <coverpage>.
    cpath = None
    if mi.cover_data and mi.cover_data[1]:
        with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
            f.write(mi.cover_data[1])
        cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
    else:
        for img in doc.xpath('//f:coverpage/f:image',
                             namespaces=NAMESPACES):
            href = img.get('{%s}href' % const.XLINK_NS,
                           img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                cpath = os.path.abspath(href)
                break
    opf = OPFCreator(os.getcwd(), mi)
    entries = [(f2, mimetypes.guess_type(f2)[0])
               for f2 in os.listdir(u'.')]
    opf.create_manifest(entries)
    opf.create_spine(['index.xhtml'])
    if cpath:
        opf.guide.set_cover(cpath)
    with open('metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.join(os.getcwd(), 'metadata.opf')