def fget(self):
    """Return the editor page's contents as sanitized HTML.

    Serializes the Qt page, strips encoding declarations and HTML
    comments, drops <script>/<style> elements, and returns the
    remaining body markup wrapped in a <div>/<p> as needed.  Returns
    an empty string when the document has no text and no images.
    """
    ans = u''
    try:
        check = unicode(self.page().mainFrame().toPlainText()).strip()
        raw = unicode(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        # Nothing worth keeping: no visible text and no images
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:  # narrowed from bare except: do not swallow KeyboardInterrupt
            # Fall back to the more forgiving soup-based parser
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding=unicode) for x in body
                      if x.tag not in ('script', 'style')]
        if len(elems) > 1:
            ans = u'<div>%s</div>'%(u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>'%ans
        ans = xml_replace_entities(ans)
    except Exception:  # narrowed from bare except
        # Best effort: report the failure and return whatever was built
        import traceback
        traceback.print_exc()
    return ans
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    """Scrape an OverDrive metadata page and fill in ``mi``.

    Populates pubdate, language, isbn, tags and comments on the
    Metadata object ``mi`` from the page at ``metadata_url`` fetched
    with browser ``br``.  Returns False when the page is missing (404)
    or cannot be parsed; returns None on success.
    """
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.soupparser import fromstring
    from calibre.library.comments import sanitize_comments_html
    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        # A 404 just means no detail page exists for this id
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    try:
        root = fromstring(raw)
    except Exception:  # narrowed from bare except
        return False
    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except Exception:  # narrowed from bare except: unparsable date is ignored
            pass
    if lang:
        # Map the site's language names to ISO 639-2 codes
        lang = lang[0].strip().lower()
        lang = {'english':'eng', 'french':'fra', 'german':'deu',
                'spanish':'spa'}.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding=unicode).strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)
    return None
def _html4_parse(data, prefer_soup=False):
    """Parse HTML 4 markup, clean it, and return it as an XML tree.

    The markup is first parsed leniently (lxml.html, or BeautifulSoup
    when ``prefer_soup`` is True), comment text is stripped of dashes
    so it survives XML serialization, and the result is re-parsed as
    strict XML, falling back to a recovering parser on syntax errors.
    """
    if prefer_soup:
        from calibre.utils.soupparser import fromstring
        root = fromstring(data)
    else:
        root = html.fromstring(data)
    root.attrib.pop('xmlns', None)
    # Dashes inside comments are illegal in XML; strip them
    for comment in root.iter(tag=etree.Comment):
        if comment.text:
            comment.text = comment.text.strip('-')
    serialized = etree.tostring(root, encoding=unicode)
    # Setting huge_tree=True causes crashes in windows with large files
    strict_parser = etree.XMLParser(no_network=True)
    try:
        return etree.fromstring(serialized, parser=strict_parser)
    except etree.XMLSyntaxError:
        return etree.fromstring(serialized, parser=RECOVER_PARSER)
def fget(self):
    """Return the editor page's contents as sanitized HTML.

    When the page carries a ``calibre-dont-sanitize`` meta tag the raw
    HTML is returned untouched.  Otherwise encoding declarations,
    comments and <script>/<style> elements are stripped and the body
    markup is returned, wrapped in a <div>/<p> as needed.  Returns an
    empty string when the document has no text and no images.
    """
    ans = u''
    try:
        if not self.page().mainFrame().documentElement().findFirst(
                'meta[name="calibre-dont-sanitize"]').isNull():
            # Bypass cleanup if special meta tag exists
            return unicode_type(self.page().mainFrame().toHtml())
        check = unicode_type(
            self.page().mainFrame().toPlainText()).strip()
        raw = unicode_type(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        # Nothing worth keeping: no visible text and no images
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:  # narrowed from bare except: do not swallow KeyboardInterrupt
            # Fall back to the more forgiving soup-based parser
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [
                html.tostring(x, encoding=unicode_type)
                for x in body if x.tag not in ('script', 'style')
            ]
        if len(elems) > 1:
            ans = u'<div>%s</div>' % (u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>' % ans
        ans = xml_replace_entities(ans)
    except Exception:  # narrowed from bare except
        # Best effort: report the failure and return whatever was built
        import traceback
        traceback.print_exc()
    return ans
def extract_content(self, output_dir, parse_cache):
    """Convert the decoded MOBI markup into an HTML file tree.

    Cleans up the raw MOBI HTML (broken tags, encoding declarations,
    entities), parses it (falling back to BeautifulSoup for malformed
    markup), normalizes the document to a single <html>/<head>/<body>
    structure, writes index.html/toc.ncx/styles.css/the OPF into
    output_dir, and records the parsed tree in ``parse_cache``.
    """
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    # Repair common MOBI tag breakage like '</</p>' and '</p<p>'
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
            self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
            self.processed_html)
    for pat in ENCODING_PATS:
        self.processed_html = pat.sub('', self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
        self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_ascii_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c',
                '').replace('\x14', ''))
    except Exception:  # narrowed from bare except
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from calibre.utils.soupparser import fromstring
        self.log.warning('Malformed markup, parsing using BeautifulSoup')
        try:
            root = fromstring(self.processed_html)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = fromstring(self.processed_html)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = fromstring(self.processed_html)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        # guide is None when the markup contained no <guide>
        pass

    def write_as_utf8(path, data):
        # Helper: write text (or bytes) to path as UTF-8
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = cStringIO.StringIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    # Use a context manager so the OPF file handle is always closed
    with lopen(self.created_opf_path, 'wb') as opf_file:
        opf.render(opf_file, ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = cStringIO.StringIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        # Close the file handle instead of leaking it
        with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as opf_file:
            opf.render(opf_file, ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def prepare_xhtml_pt1(self, f):
    """Normalize an (X)HTML content file to UTF-8 and re-serialize it.

    When FIX_CHARACTER_ENCODING is set: strips a UTF-8 BOM, detects the
    declared encoding, transcodes to UTF-8, rewrites the encoding
    declaration, and ensures an XML declaration on XHTML files.  Then
    parses the file (falling back to HTML soup for files without an XML
    declaration), records the body id, adds dictionary namespaces when
    needed, and stores the serialized tree back on ``f.data``.
    """
    if FIX_CHARACTER_ENCODING:
        new_data = f.data
        if new_data.startswith(b"\xef\xbb\xbf"):
            new_data = new_data[3:]
            log.info("Removed UTF-8 BOM from %s" % f.name)
        # Only the first 1K is scanned for an encoding declaration
        header = new_data[:1024].decode("utf-8", "ignore")
        for pat in ENCODING_PATS:
            m = re.search(pat, header, re.IGNORECASE)
            if m:
                enc = m.group(2).lower()
                if enc == "utf8":
                    enc = "utf-8"
                break
        else:
            log.info("Assuming UTF-8 encoding for %s" % f.name)
            enc = "utf-8"
        try:
            new_text = new_data.decode("utf-8")
        except UnicodeDecodeError:
            if enc == "utf-8":
                log.warning("Content failed to decode as UTF-8 in %s" % f.name)
            else:
                log.info("Changed encoding from %s to UTF-8 in %s" % (enc.upper(), f.name))
            new_text = new_data.decode(enc, errors="replace")
        if enc != "utf-8":
            for pat in ENCODING_PATS:
                # BUG FIX: re.IGNORECASE was passed positionally as the
                # count argument; it must be the flags keyword
                new_text, i = re.subn(pat, r"\1utf-8\3", new_text,
                                      flags=re.IGNORECASE)
                if i:
                    log.info(
                        "Changed encoding declaration from %s to UTF-8 in %s" %
                        (enc.upper(), f.name))
        if (f.ext == "xhtml" or f.mimetype == "application/xhtml+xml"
                ) and not new_text.strip().startswith("<?xml"):
            new_text = "<?xml version='1.0' encoding='utf-8'?>" + new_text
            log.info("Added XML declaration to %s" % f.name)
        # Put a newline between the XML declaration and the root tag
        new_text = re.sub(r"<\?xml([^\?]*?)\?><", r"<?xml\1?>\n<", new_text)
        f.data = new_text.encode("utf-8")
    try:
        document = self.parse_xhtml_file(f)
    except Exception:
        return
    body = tfind(document, "body")
    if body is not None:
        f.body_id = body.get("id")
    if self.is_dictionary:
        html = tfind(document, "html")
        for ns, url in DICTIONARY_NSMAP.items():
            # Add each dictionary namespace actually used in the markup
            if ns not in html.nsmap and ("<%s:" % ns).encode("utf8") in f.data:
                f.data = f.data.replace(b"<html", ("<html xmlns:%s=\"%s\"" %
                        (ns, url)).encode("utf8"), 1)
                log.info("Added %s XML namespace to %s" % (ns, f.name))
    if not f.data.startswith(b"<?xml"):
        log.info("Parsing %s as HTML soup" % f.name)
        document = soupparser.fromstring(f.data)
    f.data = etree.tostring(document, encoding="utf-8",
            pretty_print=False, xml_declaration=False)
def extract_content(self, output_dir, parse_cache):
    """Convert the decoded MOBI markup into an HTML file tree.

    Cleans up the raw MOBI HTML (broken tags, encoding declarations,
    entities), parses it (falling back to BeautifulSoup for malformed
    markup), normalizes the document to a single <html>/<head>/<body>
    structure, writes index.html/toc.ncx/styles.css/the OPF into
    output_dir, and records the parsed tree in ``parse_cache``.
    """
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    # Repair common MOBI tag breakage like '</</p>' and '</p<p>'
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
            self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
            self.processed_html)
    for pat in ENCODING_PATS:
        self.processed_html = pat.sub('', self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
        self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_ascii_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c',
                '').replace('\x14', ''))
    except Exception:  # narrowed from bare except
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from calibre.utils.soupparser import fromstring
        self.log.warning('Malformed markup, parsing using BeautifulSoup')
        try:
            root = fromstring(self.processed_html)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = fromstring(self.processed_html)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = fromstring(self.processed_html)
    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot
    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)
    for x in root.xpath('//script'):
        x.getparent().remove(x)
    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css',
        'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type',
        'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        # guide is None when the markup contained no <guide>
        pass
    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = cStringIO.StringIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    # Use context managers so file handles are always closed
    with open(self.created_opf_path, 'wb') as opf_file:
        opf.render(opf_file, ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        with open(ncx_path, 'wb') as ncx_file:
            ncx_file.write(ncx)
    with open('styles.css', 'wb') as s:
        s.write(self.base_css_rules + '\n\n')
        for cls, rule in self.tag_css_rules.items():
            if isinstance(rule, unicode):
                rule = rule.encode('utf-8')
            s.write('.%s { %s }\n\n' % (cls, rule))
    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = cStringIO.StringIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        with open(os.path.splitext(htmlfile)[0] + '.opf', 'wb') as opf_file:
            opf.render(opf_file, ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            with open(os.path.splitext(htmlfile)[0] + '.ncx', 'wb') as ncx_file:
                ncx_file.write(ncx)
from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.soupparser import fromstring from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception, e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = fromstring(raw) except: return False pub_date = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath( "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]" )
from lxml import html from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.soupparser import fromstring from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception, e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = fromstring(raw) except: return False pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") if pub_date: from calibre.utils.date import parse_date try: mi.pubdate = parse_date(pub_date[0].strip()) except: pass
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    """Scrape an OverDrive metadata page and fill in ``mi``.

    Populates pubdate, language, isbn, tags and comments on the
    Metadata object ``mi`` from the page at ``metadata_url`` fetched
    with browser ``br``.  Returns False when the page is missing (404)
    or cannot be parsed; returns None on success.
    """
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.soupparser import fromstring
    from calibre.library.comments import sanitize_comments_html
    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        # A 404 just means no detail page exists for this id
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         resolve_entities=True)[0]
    try:
        root = fromstring(raw)
    except Exception:  # narrowed from bare except
        return False
    pub_date = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath(
        "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath(
        "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
    )
    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except Exception:  # narrowed from bare except: unparsable date is ignored
            pass
    if lang:
        # Map the site's language names to ISO 639-2 codes
        lang = lang[0].strip().lower()
        lang = {
            'english': 'eng',
            'french': 'fra',
            'german': 'deu',
            'spanish': 'spa'
        }.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)
    return None