def get_soup(self, src, url=None):
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    set_soup_module(sys.modules[BeautifulSoup.__module__])
    soup = parse(usrc, return_root=False)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = parse(replace, return_root=False)

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def get_soup(self, src, url=None):
    nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
    nmassage.extend(self.preprocess_regexps)
    # Some websites have buggy doctype declarations that mess up beautifulsoup
    nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL|re.IGNORECASE), lambda m: '')]
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    soup = BeautifulSoup(usrc, markupMassage=nmassage)
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        soup = BeautifulSoup(
            xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0],
            markupMassage=nmassage)

    if self.keep_only_tags:
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        ans = safe_xml_fromstring(raw, recover=False)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder,
                      linenumber_attribute=linenumber_attribute)
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers,
                                namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix,
                         lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def get_metadata_from_reader(rdr):
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(
        xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        codecs.lookup(x)
        enc = x
    except:
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
def search(self, query, max_results=10, timeout=60):
    search_url = u'http://robot.litres.ru/pages/catalit_browser/?checkpoint=2000-01-02&'\
                 'search=%s&limit=0,%s'
    search_url = search_url % (urllib2.quote(query), max_results)

    counter = max_results
    br = browser()
    br.addheaders.append(['Accept-Encoding', 'gzip'])

    with closing(br.open(search_url, timeout=timeout)) as r:
        ungzipResponse(r, br)
        raw = xml_to_unicode(r.read(), strip_encoding_pats=True, assume_utf8=True)[0]

        parser = etree.XMLParser(recover=True, no_network=True)
        doc = etree.fromstring(raw, parser=parser)
        for data in doc.xpath('//*[local-name() = "fb2-book"]'):
            if counter <= 0:
                break
            counter -= 1

            try:
                sRes = self.create_search_result(data)
            except Exception as e:
                prints('ERROR: cannot parse search result #%s: %s' % (max_results - counter + 1, e))
                continue
            yield sRes
def search(self, query, max_results=15, timeout=60):
    search_url = self.shop_url + '/webservice/webservice.asmx/SearchWebService?'\
                 'searchText=%s&searchContext=ebook' % urllib2.quote(query)
    search_urls = [search_url]

    # add this as the first try if it looks like an ozon ID
    if re.match("^\d{6,9}$", query):
        ozon_detail = self.shop_url + '/webservices/OzonWebSvc.asmx/ItemDetail?ID=%s' % query
        search_urls.insert(0, ozon_detail)

    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()

    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format('ID'))
                s.title = data.xpath(xp_template.format('Name'))
                s.author = data.xpath(xp_template.format('Author'))
                s.price = data.xpath(xp_template.format('Price'))
                s.cover_url = data.xpath(xp_template.format('Picture'))
                s.price = format_price_in_RUR(s.price)
                yield s
def get_decoded_raw(name):
    from calibre.ebooks.chardet import xml_to_unicode, force_encoding
    with open(name, 'rb') as f:
        raw = f.read()
    syntax = syntax_from_mime(name, guess_type(name))
    if syntax is None:
        try:
            raw = raw.decode('utf-8')
        except ValueError:
            pass
    elif syntax != 'raster_image':
        if syntax in {'html', 'xml'}:
            raw = xml_to_unicode(raw, verbose=True)[0]
        else:
            m = re.search(br"coding[:=]\s*([-\w.]+)", raw[:1024], flags=re.I)
            if m is not None and m.group(1) != '8bit':
                enc = m.group(1)
                if enc == b'unicode':
                    enc = 'utf-8'
            else:
                enc = force_encoding(raw, verbose=True)
            try:
                raw = raw.decode(enc)
            except (LookupError, ValueError):
                try:
                    raw = raw.decode('utf-8')
                except ValueError:
                    pass
    return raw, syntax
def html(self):
    raw = original_html = self.toHtml()
    check = self.toPlainText().strip()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    raw = self.comments_pat.sub('', raw)
    if not check and '<img' not in raw.lower():
        return ''

    root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
        # Bypass cleanup if special meta tag exists
        return original_html

    try:
        cleanup_qt_markup(root)
    except Exception:
        import traceback
        traceback.print_exc()
    elems = []
    for body in root.xpath('//body'):
        if body.text:
            elems.append(body.text)
        elems += [html.tostring(x, encoding='unicode') for x in body
                  if x.tag not in ('script', 'style')]

    if len(elems) > 1:
        ans = '<div>%s</div>'%(u''.join(elems))
    else:
        ans = ''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>'%ans
    return xml_replace_entities(ans)
def decode(self, data):
    """Automatically decode :param:`data` into a `unicode` object."""
    def fix_data(d):
        return d.replace('\r\n', '\n').replace('\r', '\n')
    if isinstance(data, unicode):
        return fix_data(data)
    bom_enc = None
    if data[:4] in {b'\0\0\xfe\xff', b'\xff\xfe\0\0'}:
        bom_enc = {b'\0\0\xfe\xff': 'utf-32-be',
                   b'\xff\xfe\0\0': 'utf-32-le'}[data[:4]]
        data = data[4:]
    elif data[:2] in {b'\xff\xfe', b'\xfe\xff'}:
        bom_enc = {b'\xff\xfe': 'utf-16-le', b'\xfe\xff': 'utf-16-be'}[data[:2]]
        data = data[2:]
    elif data[:3] == b'\xef\xbb\xbf':
        bom_enc = 'utf-8'
        data = data[3:]
    if bom_enc is not None:
        try:
            self.used_encoding = bom_enc
            return fix_data(data.decode(bom_enc))
        except UnicodeDecodeError:
            pass
    try:
        self.used_encoding = 'utf-8'
        return fix_data(data.decode('utf-8'))
    except UnicodeDecodeError:
        pass
    data, self.used_encoding = xml_to_unicode(data)
    return fix_data(data)
def search(query, max_results=15, timeout=60):
    url = 'http://www.ozon.ru/?context=search&text=%s&store=1,0&group=div_book' % quote_plus(query)

    counter = max_results
    br = browser()

    with closing(br.open(url, timeout=timeout)) as f:
        raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
        root = parse_html(raw)
        for tile in root.xpath('//*[@class="bShelfTile inline"]'):
            if counter <= 0:
                break
            counter -= 1

            s = SearchResult(store_name='OZON.ru')
            s.detail_item = shop_url + tile.xpath('descendant::a[@class="eShelfTile_Link"]/@href')[0]
            s.title = tile.xpath('descendant::span[@class="eShelfTile_ItemNameText"]/@title')[0]
            s.author = tile.xpath('descendant::span[@class="eShelfTile_ItemPerson"]/@title')[0]
            s.price = ''.join(tile.xpath('descendant::div[contains(@class, "eShelfTile_Price")]/text()'))
            s.cover_url = 'http:' + tile.xpath('descendant::img/@data-original')[0]
            s.price = format_price_in_RUR(s.price)
            yield s
def search(self, query, max_results=15, timeout=60):
    search_url = (
        self.shop_url + "/webservice/webservice.asmx/SearchWebService?"
        "searchText=%s&searchContext=ebook" % urllib2.quote(query)
    )
    search_urls = [search_url]

    xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'
    counter = max_results
    br = browser()

    for url in search_urls:
        with closing(br.open(url, timeout=timeout)) as f:
            raw = xml_to_unicode(f.read(), strip_encoding_pats=True, assume_utf8=True)[0]
            doc = etree.fromstring(raw)
            for data in doc.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]'):
                if counter <= 0:
                    break
                counter -= 1

                s = SearchResult()
                s.detail_item = data.xpath(xp_template.format("ID"))
                s.title = data.xpath(xp_template.format("Name"))
                s.author = data.xpath(xp_template.format("Author"))
                s.price = data.xpath(xp_template.format("Price"))
                s.cover_url = data.xpath(xp_template.format("Picture"))
                s.price = format_price_in_RUR(s.price)
                yield s
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if ans.tag != '{%s}html' % html_ns:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
def make_query(self, q, abort, title=None, authors=None, identifiers={},
               max_pages=10, timeout=30):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    page_num = 1
    parser = etree.XMLParser(recover=True, no_network=True)
    br = self.browser

    seen = set()
    candidates = []
    total_found = 0
    while page_num <= max_pages and not abort.is_set():
        url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
        page_num += 1
        raw = br.open_novisit(url, timeout=timeout).read()
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        total, found, results = self.parse_feed(feed, seen, title, authors, identifiers)
        total_found += found
        candidates += results
        if total_found >= total or len(candidates) > 9:
            break

    return candidates
def parse_html(raw):
    import html5lib
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                           resolve_entities=True, assume_utf8=True)[0])
    return html5lib.parse(raw, treebuilder="lxml", namespaceHTMLElements=False).getroot()
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=60):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0],
            parser=parser)
        entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
        if entries:
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def _parse(self, raw, mimetype):
    mt = mimetype.lower()
    if mt.endswith('+xml'):
        parser = etree.XMLParser(no_network=True, huge_tree=not iswindows)
        raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True,
                             resolve_entities=True)[0].strip()
        idx = raw.find('<html')
        if idx == -1:
            idx = raw.find('<HTML')
        if idx > -1:
            pre = raw[:idx]
            raw = raw[idx:]
            if '<!DOCTYPE' in pre:
                user_entities = {}
                for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                    val = match.group(2)
                    if val.startswith('"') and val.endswith('"'):
                        val = val[1:-1]
                    user_entities[match.group(1)] = val
                if user_entities:
                    pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
                    raw = pat.sub(lambda m:user_entities[m.group(1)], raw)
        return etree.fromstring(raw, parser=parser)
    return raw
def fget(self):
    ans = u''
    try:
        check = unicode(self.page().mainFrame().toPlainText()).strip()
        raw = unicode(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ans

        try:
            root = html.fromstring(raw)
        except:
            root = fromstring(raw)

        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding=unicode) for x in body
                      if x.tag not in ('script', 'style')]

        if len(elems) > 1:
            ans = u'<div>%s</div>'%(u''.join(elems))
        else:
            ans = u''.join(elems)
            if not ans.startswith('<'):
                ans = '<p>%s</p>'%ans
        ans = xml_replace_entities(ans)
    except:
        import traceback
        traceback.print_exc()
    return ans
def parse_outline(raw, output_dir):
    from lxml import etree
    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
    raw = clean_xml_chars(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0])
    outline = etree.fromstring(raw, parser=RECOVER_PARSER).xpath('(//outline)[1]')
    if outline:
        from calibre.ebooks.oeb.polish.toc import TOC, create_ncx
        outline = outline[0]
        toc = TOC()
        count = [0]

        def process_node(node, toc):
            for child in node.iterdescendants('*'):
                if child.tag == 'outline':
                    parent = toc.children[-1] if toc.children else toc
                    process_node(child, parent)
                else:
                    page = child.get('page', '1')
                    toc.add(child.text, 'index.html', page)
                    count[0] += 1
        process_node(outline, toc)
        if count[0] > 2:
            root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml')
            with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
                f.write(etree.tostring(root, pretty_print=True, with_tail=False,
                                       encoding='utf-8', xml_declaration=True))
def postprocess_book(self, oeb, opts, log):
    from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue
        for bad in ('metadata', 'guide'):
            metadata = XPath('//h:'+bad)(root)
            if metadata:
                for x in metadata:
                    x.getparent().remove(x)
        body = XPath('//h:body')(root)
        if body:
            body = body[0]
            if len(body) == 1 and body[0].tag == XHTML('pre'):
                pre = body[0]
                from calibre.ebooks.txt.processor import convert_basic, \
                    separate_paragraphs_single_line
                from calibre.ebooks.chardet import xml_to_unicode
                from lxml import etree
                import copy
                html = separate_paragraphs_single_line(pre.text)
                html = convert_basic(html).replace('<html>', '<html xmlns="%s">'%XHTML_NS)
                html = xml_to_unicode(html, strip_encoding_pats=True, resolve_entities=True)[0]
                root = etree.fromstring(html)
                body = XPath('//h:body')(root)
                pre.tag = XHTML('div')
                pre.text = ''
                for elem in body:
                    ne = copy.deepcopy(elem)
                    pre.append(ne)
def get_series(title, authors, timeout=60):
    mi = Metadata(title, authors)
    if title and title[0] in _ignore_starts:
        title = title[1:]
    title = re.sub(r'^(A|The|An)\s+', '', title).strip()
    if not title:
        return mi
    if isinstance(title, unicode):
        title = title.encode('utf-8')
    title = urllib.quote_plus(title)

    author = authors[0].strip()
    if not author:
        return mi
    if ',' in author:
        author = author.split(',')[0]
    else:
        author = author.split()[-1]

    url = URL.format(author, title)
    br = browser()
    try:
        raw = br.open_novisit(url, timeout=timeout).read()
    except URLError as e:
        if isinstance(e.reason, socket.timeout):
            raise Exception('KDL Server busy, try again later')
        raise
    if 'see the full results' not in raw:
        return mi
    raw = xml_to_unicode(raw)[0]
    soup = BeautifulSoup(raw)
    searcharea = soup.find('div', attrs={'class': 'searcharea'})
    if searcharea is None:
        return mi
    ss = searcharea.find('div', attrs={'class': 'seriessearch'})
    if ss is None:
        return mi
    a = ss.find('a', href=True)
    if a is None:
        return mi
    href = a['href'].partition('?')[-1]
    data = urlparse.parse_qs(href)
    series = data.get('SeriesName', [])
    if not series:
        return mi
    series = series[0]
    series = re.sub(r' series$', '', series).strip()
    if series:
        mi.series = series
    ns = ss.nextSibling
    if ns.contents:
        raw = unicode(ns.contents[0])
        raw = raw.partition('.')[0].strip()
        try:
            mi.series_index = int(raw)
        except:
            pass
    return mi
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    from lxml import etree
    entry = XPath('//atom:entry')

    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    log('Making query:', query)
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
    if not entries and title and not abort.is_set():
        if identifiers:
            log('No results found, retrying without identifiers')
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        ntitle = cleanup_title(title)
        if ntitle and ntitle != title:
            log('No results found, retrying without sub-title')
            return self.identify(log, result_queue, abort, title=ntitle,
                                 authors=authors, timeout=timeout)

    # There is no point running these queries in threads as google
    # throttles requests returning 403 Forbidden errors
    self.get_all_details(br, log, entries, abort, result_queue, timeout)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    from html5_parser import parse
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.library.comments import sanitize_comments_html

    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]

    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        return False

    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except:
            pass
    if lang:
        lang = lang[0].strip().lower()
        lang = {'english': 'eng', 'french': 'fra', 'german': 'deu',
                'spanish': 'spa'}.get(lang, None)
        if lang:
            mi.language = lang

    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]

    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)

    return None
def read_ncx_toc(self, toc, root=None):
    self.base_path = os.path.dirname(toc)
    if root is None:
        with open(toc, 'rb') as f:
            raw = xml_to_unicode(f.read(), assume_utf8=True, strip_encoding_pats=True)[0]
        root = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True))
    xpn = {'re': 'http://exslt.org/regular-expressions'}
    XPath = functools.partial(etree.XPath, namespaces=xpn)

    def get_attr(node, default=None, attr='playorder'):
        for name, val in node.attrib.items():
            if name and val and name.lower().endswith(attr):
                return val
        return default

    nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
    txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
    content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
    np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

    def process_navpoint(np, dest):
        try:
            play_order = int(get_attr(np, 1))
        except:
            play_order = 1
        href = fragment = text = None
        nd = dest
        nl = nl_path(np)
        if nl:
            nl = nl[0]
            text = ''
            for txt in txt_path(nl):
                text += etree.tostring(txt, method='text', encoding='unicode', with_tail=False)
            content = content_path(np)
            if content and text:
                content = content[0]
                # if get_attr(content, attr='src'):
                purl = urlparse(content.get('src'))
                href, fragment = unquote(purl[2]), unquote(purl[5])
                nd = dest.add_item(href, fragment, text)
                nd.play_order = play_order

        for c in np_path(np):
            process_navpoint(c, nd)

    nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
    if not nm:
        raise ValueError('NCX files must have a <navmap> element.')
    nm = nm[0]

    for child in np_path(nm):
        process_navpoint(child, self)
def clean_html(raw):
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    return clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                            resolve_entities=True, assume_utf8=True)[0])
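A hedged composite sketch, not one of the collected sources above: it combines the decode-and-sanitize idiom of clean_html() with the recovering lxml parser that several other snippets in this file construct. The helper name to_tree and its raw_bytes argument are hypothetical; the calibre helpers and their keyword arguments are taken verbatim from the surrounding snippets.

def to_tree(raw_bytes):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars
    # Decode bytes to unicode, stripping encoding declarations and
    # resolving entities, then drop control chars that lxml rejects
    text = clean_ascii_chars(xml_to_unicode(
        raw_bytes, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)[0])
    # recover=True lets lxml build a tree even from slightly malformed markup
    parser = etree.XMLParser(recover=True, no_network=True)
    return etree.fromstring(text, parser=parser)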
def fetch_raw(self, log, url, br, testing,  # {{{
              identifiers={}, timeout=30):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    import html5lib

    try:
        raw = br.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('Query malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('DangDang timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r' % url
            log.exception(msg)
        return as_unicode(msg)

    raw = clean_ascii_chars(xml_to_unicode(
        raw, strip_encoding_pats=True, resolve_entities=True)[0])

    if testing:
        import tempfile
        with tempfile.NamedTemporaryFile(prefix='dangdang_results_',
                                         suffix='.html', delete=False) as f:
            f.write(raw.encode('utf-8'))
        print('Downloaded html for results page saved in', f.name)

    matches = []
    root = None  # initialized so the not-found path below cannot raise NameError
    # The marker string means 'Sorry, the page you want to visit was not found'
    found = '<title>对不起,您要访问的页面暂时没有找到' not in raw

    if found:
        try:
            root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
        except:
            msg = 'Failed to parse DangDang page for query: %r' % url
            log.exception(msg)
            return msg

    return found, root
def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True, from_epub=False, read_links=True):
    ppath = path.partition('#')[0]
    if not os.path.exists(path) and os.path.exists(ppath):
        path = ppath
    obj = super(SpineItem, cls).__new__(cls, path)
    with open(path, 'rb') as f:
        raw = f.read()
    if from_epub:
        # According to the spec, HTML in EPUB must be encoded in utf-8 or
        # utf-16. Furthermore, there exist epub files produced by the usual
        # incompetents that have utf-8 encoded HTML files that contain
        # incorrect encoding declarations. See
        # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
        # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
        # https://bugs.launchpad.net/bugs/1188843
        # So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
        # is the same algorithm as that used by the conversion pipeline (modulo
        # some BOM based detection). Sigh.
        try:
            raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
        except UnicodeDecodeError:
            raw, obj.encoding = xml_to_unicode(raw)
    else:
        raw, obj.encoding = xml_to_unicode(raw)
    obj.character_count = character_count(raw) if run_char_count else 10000
    obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
    obj.all_links = all_links(raw) if read_links else set()
    obj.verified_links = set()
    obj.start_page = -1
    obj.pages = -1
    obj.max_page = -1
    obj.index_entries = []
    if mime_type is None:
        mime_type = guess_type(obj)[0]
    obj.mime_type = mime_type
    obj.is_single_page = None
    return obj
def get_details(self):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % self.url
        self.log.exception(msg)
        return

    if self.domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                self.url = 'http://amazon.co.jp' + a.get('href')
                self.log('Black curtain redirect found, following')
                return self.get_details()

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % self.url
        msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(oraw, root)
def convert_epub3_nav(self, nav_path, opf, log):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
    from calibre.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or \
                ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                if href.startswith('#'):
                    href = bn + href
            break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src': href}))
        return np

    def process_nav_node(node, toc_parent):
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_id = opf.add_path_to_manifest(f.name, NCX_MIME)
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
def parse_details_page(url, log, timeout, browser, domain):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    oraw = raw
    if 'amazon.com.br' in url:
        raw = raw.decode('utf-8')  # amazon.com.br serves utf-8 but has an incorrect latin1 <meta> tag
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return

    if domain == 'jp':
        for a in root.xpath('//a[@href]'):
            if 'black-curtain-redirect.html' in a.get('href'):
                url = 'http://amazon.co.jp'+a.get('href')
                log('Black curtain redirect found, following')
                return parse_details_page(url, log, timeout, browser, domain)

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def test_lxml_unicode_parsing(self):
    from calibre.ebooks.chardet import xml_to_unicode
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'unicode-test.opf'), 'rb') as f:
        raw = f.read()
    text = xml_to_unicode(raw, strip_encoding_pats=True,
                          resolve_entities=True, assume_utf8=True)[0]
    self.assertIsNotNone(safe_xml_fromstring(text))
def read_ncx_toc(self, toc, root=None):
    self.base_path = os.path.dirname(toc)
    if root is None:
        raw = xml_to_unicode(open(toc, 'rb').read(), assume_utf8=True,
                             strip_encoding_pats=True)[0]
        root = etree.fromstring(raw, parser=etree.XMLParser(recover=True, no_network=True))
    xpn = {'re': 'http://exslt.org/regular-expressions'}
    XPath = functools.partial(etree.XPath, namespaces=xpn)

    def get_attr(node, default=None, attr='playorder'):
        for name, val in node.attrib.items():
            if name and val and name.lower().endswith(attr):
                return val
        return default

    nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
    txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
    content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
    np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

    def process_navpoint(np, dest):
        try:
            play_order = int(get_attr(np, 1))
        except:
            play_order = 1
        href = fragment = text = None
        nd = dest
        nl = nl_path(np)
        if nl:
            nl = nl[0]
            text = u''
            for txt in txt_path(nl):
                text += etree.tostring(txt, method='text', encoding='unicode', with_tail=False)
            content = content_path(np)
            if content and text:
                content = content[0]
                # if get_attr(content, attr='src'):
                purl = urlparse(content.get('src'))
                href, fragment = unquote(purl[2]), unquote(purl[5])
                nd = dest.add_item(href, fragment, text)
                nd.play_order = play_order

        for c in np_path(np):
            process_navpoint(c, nd)

    nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
    if not nm:
        raise ValueError('NCX files must have a <navmap> element.')
    nm = nm[0]

    for child in np_path(nm):
        process_navpoint(child, self)
def parse_html(markup):
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')

    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser)
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
    if not entries and identifiers and title and authors and \
            not abort.is_set():
        return self.identify(log, result_queue, abort, title=title,
                             authors=authors, timeout=timeout)
    # There is no point running these queries in threads as douban
    # throttles requests returning 403 Forbidden errors
    self.get_all_details(br, log, entries, abort, result_queue, timeout)
    return None
def get_details(self):
    '''
    Fetch the book's detail information from its detail page.
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib

    try:
        raw = self.browser.open_novisit(self.url, timeout=self.timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            self.log.error('URL malformed: %r' % self.url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = '17k.com timed out. Try again later.'
            self.log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % self.url
            self.log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        self.log.error('URL malformed: %r' % self.url)
        return

    try:
        root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse 17k.com details page: %r' % self.url
        self.log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse 17k.com details page: %r' % self.url
        msg += self.tostring(errmsg, method='text', encoding=unicode).strip()
        self.log.error(msg)
        return

    self.parse_details(oraw, root)
def render_cover(opf, opf_path, zf, reader=None):
    from calibre.ebooks import render_html_svg_workaround
    from calibre.utils.logging import default_log

    cpage = opf.first_spine_item()
    if not cpage:
        return
    if reader is not None and reader.encryption_meta.is_encrypted(cpage):
        return

    with TemporaryDirectory('_epub_meta') as tdir:
        with CurrentDir(tdir):
            zf.extractall()
            opf_path = opf_path.replace('/', os.sep)
            cpage = os.path.join(tdir, os.path.dirname(opf_path), cpage)
            if not os.path.exists(cpage):
                return

            if isosx:
                # On OS X trying to render a HTML cover which uses embedded
                # fonts more than once in the same process causes a crash in Qt
                # so be safe and remove the fonts as well as any @font-face
                # rules
                for f in walk('.'):
                    if os.path.splitext(f)[1].lower() in ('.ttf', '.otf'):
                        os.remove(f)
                ffpat = re.compile(br'@font-face.*?{.*?}', re.DOTALL | re.IGNORECASE)
                with open(cpage, 'r+b') as f:
                    raw = f.read()
                    f.truncate(0)
                    f.seek(0)
                    raw = ffpat.sub(b'', raw)
                    f.write(raw)
                from calibre.ebooks.chardet import xml_to_unicode
                raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
                from lxml import html
                for link in html.fromstring(raw).xpath('//link'):
                    href = link.get('href', '')
                    if href:
                        path = os.path.join(os.path.dirname(cpage), href)
                        if os.path.exists(path):
                            with open(path, 'r+b') as f:
                                raw = f.read()
                                f.truncate(0)
                                f.seek(0)
                                raw = ffpat.sub(b'', raw)
                                f.write(raw)

            return render_html_svg_workaround(cpage, default_log)
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def parse_opf(stream_or_path):
    stream = stream_or_path
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
    raw = stream.read()
    if not raw:
        raise ValueError('Empty file: ' + getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True,
                                   resolve_entities=True, assume_utf8=True)
    raw = raw[raw.find('<'):]
    root = etree.fromstring(raw, PARSER)
    if root is None:
        raise ValueError('Not an OPF file')
    return root
def __new__(cls, path, mime_type=None, read_anchor_map=True,
            run_char_count=True, from_epub=False, read_links=True):
    ppath = path.partition('#')[0]
    if not os.path.exists(path) and os.path.exists(ppath):
        path = ppath
    obj = super(SpineItem, cls).__new__(cls, path)
    with lopen(path, 'rb') as f:
        raw = f.read()
    if from_epub:
        # According to the spec, HTML in EPUB must be encoded in utf-8 or
        # utf-16. Furthermore, there exist epub files produced by the usual
        # incompetents that have utf-8 encoded HTML files that contain
        # incorrect encoding declarations. See
        # http://www.idpf.org/epub/20/spec/OPS_2.0.1_draft.htm#Section1.4.1.2
        # http://www.idpf.org/epub/30/spec/epub30-publications.html#confreq-xml-enc
        # https://bugs.launchpad.net/bugs/1188843
        # So we first decode with utf-8 and only if that fails we try xml_to_unicode. This
        # is the same algorithm as that used by the conversion pipeline (modulo
        # some BOM based detection). Sigh.
        try:
            raw, obj.encoding = raw.decode('utf-8'), 'utf-8'
        except UnicodeDecodeError:
            raw, obj.encoding = xml_to_unicode(raw)
    else:
        raw, obj.encoding = xml_to_unicode(raw)
    obj.character_count = character_count(raw) if run_char_count else 10000
    obj.anchor_map = anchor_map(raw) if read_anchor_map else {}
    obj.all_links = all_links(raw) if read_links else set()
    obj.verified_links = set()
    obj.start_page = -1
    obj.pages = -1
    obj.max_page = -1
    obj.index_entries = []
    if mime_type is None:
        mime_type = guess_type(obj)[0]
    obj.mime_type = mime_type
    obj.is_single_page = None
    return obj
def __init__(self, stream, logger):
    self.logger = logger
    src = stream.read()
    self.soup = BeautifulStoneSoup(xml_to_unicode(src)[0])

    self.objects = {}
    for obj in self.soup.findAll(objid=True):
        self.objects[obj['objid']] = obj

    self.parsed_objects = {}
    self.first_pass()
    self.second_pass()
    self.third_pass()
    self.fourth_pass()
    self.fifth_pass()
def parse_details_page(url, log, timeout, browser):
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring

    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('URL malformed: %r' % url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r' % url
            log.exception(msg)
        return

    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r' % url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r' % url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r' % url
        msg += tostring(errmsg, method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from HTMLParser import HTMLParser
    from lxml import etree, html

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath(u'//div[@class="bSearchResult"]')

        if entries_block:
            entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
            # for entry in entries:
            #     log.debug('entries %s' % etree.tostring(entry))
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        else:
            # Redirect page: trying to extract ozon_id from javascript data
            h = HTMLParser()
            entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
            id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
            # result containing ozon_id and entry_title
            entry_info = re.search(id_title_pat, entry_string)
            ozon_id = entry_info.group(1) if entry_info else None
            entry_title = entry_info.group(2) if entry_info else None
            if ozon_id:
                metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
                identifiers['ozon'] = ozon_id
                self.get_all_details(log, [metadata], abort, result_queue,
                                     identifiers, timeout, cachedPagesDict={})
            else:
                log.error('No SearchResults in Ozon.ru response found')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def fget(self):
    if self.compressed_info_size == 0:
        raise LRFException("This document has no meta info")
    size = self.compressed_info_size - 4
    self._file.seek(self.info_start)
    try:
        src = zlib.decompress(self._file.read(size))
        if len(src) != self.uncompressed_info_size:
            raise LRFException("Decompression of document meta info "
                               "yielded unexpected results")
        src = xml_to_unicode(src, strip_encoding_pats=True,
                             resolve_entities=True, assume_utf8=True)[0]
        return dom.parseString(src)
    except zlib.error:
        raise LRFException("Unable to decompress document meta information")
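A hedged sketch of the shared fallback pattern, not one of the collected sources above: several parse() variants in this file first attempt an XML parse and drop to HTML5 tag-soup parsing when that fails. The function name parse_leniently is hypothetical; it assumes raw is already-decoded text with encoding declarations stripped, as the snippets above arrange before parsing. The lxml and html5lib calls mirror ones used elsewhere in this file.

def parse_leniently(raw, log=None):
    import html5lib
    from lxml import etree
    try:
        # first try a strict-ish XML parse with network access disabled
        return etree.fromstring(raw, parser=etree.XMLParser(no_network=True))
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        # fall back to an HTML5 tag-soup parse that still yields an lxml tree
        return html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False).getroot()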