def smarten_punctuation(container, report):
    from ebook_converter.ebooks.conversion.preprocess import smarten_punctuation
    smartened = False
    for path in container.spine_items:
        name = container.abspath_to_name(path)
        changed = False
        with container.open(name, 'r+b') as f:
            html = container.decode(f.read())
            newhtml = smarten_punctuation(html, container.log)
            if newhtml != html:
                changed = True
                report('Smartened punctuation in: %s' % name)
                newhtml = strip_encoding_declarations(newhtml)
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
        if changed:
            # Remove any stale <meta http-equiv> encoding declarations; a
            # correct one will be added automatically when the file is
            # serialized
            root = container.parsed(name)
            for m in root.xpath(
                    'descendant::*[local-name()="meta" and @http-equiv]'):
                m.getparent().remove(m)
            container.dirty(name)
            smartened = True
    if not smartened:
        report('No punctuation that could be smartened found')
    return smartened
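# A minimal standalone sketch (the file name is hypothetical) of the
# rewrite-in-place pattern used above: the in-document encoding declaration is
# stripped and a UTF-8 BOM is written instead, so the two can never disagree.
import codecs

with open('chapter.html', 'r+b') as f:  # hypothetical spine item
    new_markup = '<html><body><p>Smartened\u2026</p></body></html>'
    f.seek(0)
    f.truncate()
    f.write(codecs.BOM_UTF8 + new_markup.encode('utf-8'))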
def parse_html(markup):
    if isinstance(markup, str):
        markup = chardet.strip_encoding_declarations(markup)
        markup = chardet.substitute_entites(markup)
    else:
        markup = chardet.xml_to_unicode(markup, strip_encoding_pats=True,
                                        resolve_entities=True)[0]
    markup = cleantext.clean_xml_chars(markup)
    return html5_soup.parse(markup, return_root=False)
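# Hedged usage sketch for parse_html above (the sample markup is made up):
# both input types converge on cleaned Unicode before the soup parse.
root = parse_html(b'<p>caf\xc3\xa9 &amp; tea</p>')  # bytes: xml_to_unicode path
root = parse_html('<p>caf\xe9 &amp; tea</p>')       # str: strip/substitute path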
def parse(raw, decoder=None, log=None, line_numbers=True,
          linenumber_attribute=None, replace_entities=True,
          force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        # Handle &#0;
        raw = entities.xml_replace_entities(raw).replace('\0', '')
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Remove any preamble before the opening html tag as it can cause
    # problems, especially doctypes. Preserve the original line numbers by
    # inserting newlines at the start.
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break
    raw = strip_encoding_declarations(raw, limit=10 * 1024,
                                      preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        ans = etree.fromstring(raw)
        if ans.tag != '{%s}html' % const.XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # etree.Element (not etree.element) is the lxml tag filter that
            # selects only element nodes
            for elem in ans.iter(etree.Element):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
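# Self-contained lxml sketch of why the preamble is replaced with newlines
# above: elem.sourceline is relative to the string lxml was given, so padding
# with the same number of newlines keeps reported line numbers aligned with
# the original input. The sample markup is made up.
from lxml import etree as _etree

raw = '<!-- banner -->\n<!-- doctype etc. -->\n<html><body><p>hi</p></body></html>'
idx = raw.find('<html')
padded = '\n' * raw.count('\n', 0, idx) + raw[idx:]
for elem in _etree.fromstring(padded).iter():
    print(elem.tag, elem.sourceline)  # all report line 3, matching the input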
def expand_mobi8_markup(mobi8_reader, resource_map, log):
    # First update all internal links that are based on offsets
    parts = update_internal_links(mobi8_reader, log)

    # Remove pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix,
                            mobi8_reader.linked_aids)

    # Handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    flows = update_flow_links(mobi8_reader, resource_map, log)

    # Insert inline flows into the markup
    insert_flows_into_markup(parts, flows, mobi8_reader, log)

    # Insert raster images into markup
    insert_images_into_markup(parts, resource_map, log)

    # Perform general markup cleanups
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    mobi8_reader.parts = parts
    mobi8_reader.flows = flows

    # Write out the parts and file flows
    os.mkdir('text')  # directory containing all parts
    spine = []
    for i, part in enumerate(parts):
        pi = mobi8_reader.partinfo[i]
        with open(os.path.join(pi.type, pi.filename), 'wb') as f:
            part = strip_encoding_declarations(part)
            part = part.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
            f.write(part.encode('utf-8'))
        spine.append(f.name)

    for i, flow in enumerate(flows):
        fi = mobi8_reader.flowinfo[i]
        if fi.format == 'file':
            if not os.path.exists(fi.dir):
                os.mkdir(fi.dir)
            with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
                f.write(flow.encode('utf-8'))

    return spine
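# Round-trip sketch of the per-part charset pinning done above: strip whatever
# declaration the source carried, then pin UTF-8 explicitly so the declared
# and actual encodings always match. The import path is an assumption based
# on this fork's layout; the sample markup is made up.
from ebook_converter.ebooks.chardet import strip_encoding_declarations  # assumed path

part = ("<?xml version='1.0' encoding='windows-1252'?>"
        '<html><head></head><body><p>x</p></body></html>')
part = strip_encoding_declarations(part)  # drop the stale declaration
part = part.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
payload = part.encode('utf-8')  # the bytes actually written are UTF-8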
def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        log = LOG

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, str):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(
                r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(
                    r'&(%s);' % ('|'.join(list(user_entities.keys()))))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Try with more and more drastic measures to parse
    try:
        data = etree.fromstring(data)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception(
                    'HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (
            len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in tuple(x.attrib.items()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except Exception:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (str, bytes)) and \
                    barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = const.XHTML_NS
        data = etree.tostring(data, encoding='unicode')

        try:
            data = etree.fromstring(data)
        except Exception:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data)
    elif namespace(data.tag) != const.XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'),
                              nsmap={None: const.XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, (str, bytes)) and \
                    namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = 'Unknown'
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    meta.set('content',
             'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element)
         if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        child.tail = '\n  '

    return data
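# Hedged usage sketch for the recovery ladder above (the input is made up, and
# the module-level defaults such as LOG are assumed to be in place): even
# uppercase HTML 4 style markup comes back as a normalized XHTML tree.
root = parse_html(b'<HTML><BODY><P>Some text</BODY></HTML>',
                  filename='example.html')
# By the time it is returned, root has <head>, a non-empty <title> and <body>
# in the XHTML namespace.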
def generate_html(comments):
    args = {
        'author': author,
        'comments': comments,
        'css': css,
        'footer': '',
        'pubdate': pubdate,
        'pubdate_label': 'Published',
        'publisher': publisher,
        'rating': rating,
        'rating_label': 'Rating',
        'searchable_tags': ' '.join(
            saxutils.escape(t) + 'ttt' for t in tags.tags_list),
        'series': series,
        'series_label': 'Series',
        'tags': tags,
        'tags_label': 'Tags',
        'title': title,
        'title_str': title_str,
        'xmlns': const.XHTML_NS,
    }
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(
                    mi.get(key),
                    m.get('display', {}).get('allow_half_stars', False))
            elif dt == 'comments':
                val = val or ''
                display = m.get('display', {})
                ctype = display.get('interpret_as') or 'html'
                if ctype == 'long-text':
                    val = ('<pre style="white-space:pre-wrap">%s</pre>' %
                           saxutils.escape(val))
                elif ctype == 'short-text':
                    val = '<span>%s</span>' % saxutils.escape(val)
                elif ctype == 'markdown':
                    val = markdown(val)
                else:
                    val = comments_to_html(val)
                args[dkey] = val
            else:
                args[dkey] = saxutils.escape(val)
            args[dkey + '_label'] = saxutils.escape(display_name)
        except Exception:
            # If the val (custom column contents) is None, don't add it
            # to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    has_data['series'] = bool(series)
    has_data['tags'] = bool(tags)
    has_data['rating'] = bool(rating)
    has_data['pubdate'] = bool(pubdate)

    return strip_encoding_declarations(generated_html)
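# Illustration of the custom-column key rewriting above: '#' is not valid in a
# str.format field name, so a column like '#genre' (a hypothetical column)
# becomes '_genre' and is referenced as {_genre} / {_genre_label} in
# template.xhtml.
dkey = '#genre'.replace('#', '_')
assert dkey == '_genre'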
def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(
        self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
                                 self.processed_html)
    self.processed_html = self.processed_html.replace('\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
                                 self.processed_html)

    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', entities.xml_entity_to_unicode,
                                 self.processed_html)
    image_name_map = self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()

    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html
                                   .replace('\x0c', '')
                                   .replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. '
                         'Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(
            self.processed_html)
        # These trip up the html5 parser causing all content to be placed
        # under the <guide> tag
        self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
                                     self.processed_html, flags=re.I)
        self.processed_html = re.sub(r'<guide>.+?</guide>', '',
                                     self.processed_html, flags=re.I)
        try:
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random '
                             'bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(
                self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False,
                         keep_doctype=False, sanitize_names=True)

    if root.tag != 'html':
        self.log.warning('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot

    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warning('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        # Iterate over copies: removing children while iterating over the
        # live element skips every other child
        for x in list(root):
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in list(h):
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in list(b):
                b.remove(x)
                body.append(x)
        root.append(head)
        root.append(body)

    for x in root.xpath('//script'):
        x.getparent().remove(x)

    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type': 'text/css',
                                     'href': 'styles.css',
                                     'rel': 'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv': 'Content-Type',
                                  'content': 'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'

    self.upshift_markup(root, image_name_map)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = (os.path.basename(htmlfile) +
                                      ref.attrib['href'])
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, str):
            data = data.encode('utf-8')
        with open(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(open(self.created_opf_path, 'wb'), ncx,
               ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
                   ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
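# Standalone sketch of the nested write_as_utf8 helper defined above (the
# _demo_ name and file names are illustrative only): a str payload is encoded
# to UTF-8 first, bytes are written verbatim, so callers never need to care
# which they hold.
def _demo_write_as_utf8(path, data):
    if isinstance(data, str):
        data = data.encode('utf-8')
    with open(path, 'wb') as f:
        f.write(data)

_demo_write_as_utf8('styles.css', 'p { margin: 0 }\n')   # str -> UTF-8 bytes
_demo_write_as_utf8('toc.ncx', b'<?xml version="1.0"?>')  # bytes as-is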