def _parse_multiple_apps(self, list_response): """Extracts app ids from a list's Response object, sends GET requests to each app, parses detailed info and returns all apps in a list. :param list_response: the Response object from a list request :return: a list of app dictionaries """ soup = html5_parser.parse(list_response.text, treebuilder='soup') app_ids = [ x.attrs['data-docid'] for x in soup.find_all('span', class_='preview-overlay-container') ] responses = multi_app_request(app_ids) apps = [] errors = [] for i, r in enumerate(responses): if r is not None and r.status_code == requests.codes.ok: # soup = BeautifulSoup(r.text, 'lxml', parse_only=app_strainer) soup = html5_parser.parse(r.text, treebuilder='soup') apps.append(self._parse_app_details(soup)) else: errors.append(app_ids[i]) if errors: self._log.error( "There was an error parsing the following apps: {errors}.". format(errors=", ".join(errors))) return apps
def content_parser(data, url=None, response_info=None, transport_encoding=None, default_encoding=DEFAULT_ENCODING, is_html=True): ''' Parse data (a bytes object) into an etree representation such as :py:mod:`xml.etree.ElementTree` or `lxml.etree` :param bytes data: The data to parse :param url: The URL of the document being parsed or None :param response_info: Information about the document (contains all HTTP headers as :class:`mimetools.Message`) :param transport_encoding: The character encoding for the document being parsed as specified in the HTTP headers or None. :param default_encoding: The character encoding to use if no encoding could be detected and no transport_encoding is specified :param is_html: If the document is to be parsed as HTML. ''' if not is_html: return try: from html5_parser import parse except ImportError: from html5lib import parse return parse(data, transport_encoding=transport_encoding, namespaceHTMLElements=False) else: return parse(data, transport_encoding=transport_encoding)
def html(self): raw = original_html = self.toHtml() check = self.toPlainText().strip() raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] raw = self.comments_pat.sub('', raw) if not check and '<img' not in raw.lower(): return '' try: root = parse(raw, maybe_xhtml=False, sanitize_names=True) except Exception: root = parse(clean_xml_chars(raw), maybe_xhtml=False, sanitize_names=True) if root.xpath('//meta[@name="calibre-dont-sanitize"]'): # Bypass cleanup if special meta tag exists return original_html try: cleanup_qt_markup(root) except Exception: import traceback traceback.print_exc() elems = [] for body in root.xpath('//body'): if body.text: elems.append(body.text) elems += [html.tostring(x, encoding='unicode') for x in body if x.tag not in ('script', 'style')] if len(elems) > 1: ans = '<div>%s</div>'%(''.join(elems)) else: ans = ''.join(elems) if not ans.startswith('<'): ans = '<p>%s</p>'%ans return xml_replace_entities(ans)
def content_parser(data, url=None, response_info=None, transport_encoding=None, default_encoding=DEFAULT_ENCODING, is_html=True): ''' Parse data (a bytes object) into an etree representation such as :py:mod:`xml.etree.ElementTree` or `lxml.etree` :param bytes data: The data to parse :param url: The URL of the document being parsed or None :param response_info: Information about the document (contains all HTTP headers as :class:`HTTPMessage`) :param transport_encoding: The character encoding for the document being parsed as specified in the HTTP headers or None. :param default_encoding: The character encoding to use if no encoding could be detected and no transport_encoding is specified :param is_html: If the document is to be parsed as HTML. ''' if not is_html: return try: from html5_parser import parse except Exception: from html5lib import parse kw = {'namespaceHTMLElements': False} if transport_encoding and isinstance(data, bytes): kw['transport_encoding'] = transport_encoding return parse(data, **kw) else: return parse(data, transport_encoding=transport_encoding)
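# The two content_parser variants above show the same fallback idiom at
# different stages of refinement: prefer the C-accelerated html5-parser and
# fall back to pure-Python html5lib when it is not installed. A minimal
# self-contained sketch of that idiom, assuming `data` is a bytes object
# (both libraries accept transport_encoding only for bytes input):
def parse_document(data, encoding=None):
    try:
        from html5_parser import parse
    except ImportError:
        from html5lib import parse
        # html5lib namespaces HTML elements by default; disable that to match
        # html5-parser's default lxml output
        return parse(data, treebuilder='lxml', namespaceHTMLElements=False,
                     transport_encoding=encoding)
    return parse(data, transport_encoding=encoding)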
def test_doctype(self): base = '\n<html><body><p>xxx</p></body></html>' for dt in ('html', 'html PUBLIC "-//W3C//DTD HTML 4.01//EN" ' '"http://www.w3.org/TR/html4/strict.dtd"'): dt = '<!DOCTYPE {}>'.format(dt) t = parse(dt + base).getroottree() self.ae(dt, t.docinfo.doctype) t = parse(dt + base, keep_doctype=False).getroottree() self.assertFalse(t.docinfo.doctype)
def search(query, max_results=10, timeout=60, write_raw_to=None): url = 'https://www.gutenberg.org/ebooks/search/?query={}&submit_search=Search'.format( quote_plus(query)) counter = max_results br = browser() raw = br.open(url).read() if write_raw_to is not None: with open(write_raw_to, 'wb') as f: f.write(raw) root = parse(raw) CSSSelect = Select(root) for li in CSSSelect('li.booklink'): if counter <= 0: break counter -= 1 s = SearchResult() a = next(CSSSelect('a.link', li)) s.detail_item = absurl(a.get('href')) s.title = etree.tostring(next(CSSSelect('span.title', li)), method='text', encoding='unicode').strip() try: s.author = etree.tostring(next(CSSSelect('span.subtitle', li)), method='text', encoding='unicode').strip() except StopIteration: s.author = "" for img in CSSSelect('img.cover-thumb', li): s.cover_url = absurl(img.get('src')) break # Get the formats and direct download links. details_doc = parse(br.open_novisit(s.detail_item).read()) doc_select = Select(details_doc) for tr in doc_select('table.files tr[typeof="pgterms:file"]'): for a in doc_select('a.link', tr): href = a.get('href') type = a.get('type') ext = mimetypes.guess_extension( type.split(';')[0]) if type else None if href and ext: url = absurl(href.split('?')[0]) ext = ext[1:].upper().strip() if ext not in s.downloads: s.downloads[ext] = url break s.formats = ', '.join(s.downloads.keys()) if not s.formats: continue yield s
def test_xmlns(self): root = parse('<html><p xmlns:foo="f">xxx<f:moo/>') self.ae(tostring(root), '<html><head/><body><p xmlns_foo="f">xxx<f_moo/></p></body></html>') root = parse('<p xmlns="x"><p xmlns:="y"><svg xmlns:xlink="xxx">') self.ae( tostring(root), '<html xmlns:xlink="http://www.w3.org/1999/xlink"><head/>' '<body><p/><p><svg/></p></body></html>') root = parse("""<p a:a="1" xmlns:a="a">""", maybe_xhtml=True) p = root[1][0] self.ae(p.attrib, {'{a}a': '1'})
def test_maybe_xhtml(self): for tag in 'title script style'.split(): html = '<html><head><{}/></head><body><p>xxx</p></body></html>'.format(tag) root = parse(html) root = parse(html, maybe_xhtml=True) self.ae(len(root[1]), 1) html = '<html><head></head><body><{}/><p>xxx</p></body></html>'.format(tag) root = parse(html, maybe_xhtml=True) self.ae(len(root[1]), 2) root = parse('<title/><title>t</title></title></title><link href="h">', maybe_xhtml=True) self.ae( tostring(root), '<html xmlns="http://www.w3.org/1999/xhtml"><head><title/>' '<title>t</title><link href="h"/></head><body/></html>')
def developer(self, developer, results=None, page=None, detailed=False): """Sends a POST request and retrieves a list of the developer's published applications on the Play Store. :param developer: developer name to retrieve apps from, e.g. 'Disney' :param results: the number of app results to retrieve :param page: the page number to retrieve :param detailed: if True, sends request per app for its full detail :return: a list of app dictionaries """ results = s.DEV_RESULTS if results is None else results page = 0 if page is None else page page_num = (results // 20) * page if not 0 <= page_num <= 12: raise ValueError( 'Page out of range. (results // 20) * page must be between 0 - 12' ) pagtok = self._pagtok[page_num] url = build_url('developer', developer) data = generate_post_data(results, 0, pagtok) response = send_request('POST', url, data) soup = html5_parser.parse(response.text, treebuilder='soup') if detailed: apps = self._parse_multiple_apps(response) else: apps = [ self._parse_card_info(app) for app in soup.select('div[data-uitype=500]') ] return apps
def implementation(self, fragment_context, html, expected, errors, test_name): if fragment_context: fragment_context = fragment_context.replace(' ', ':') bad = self.check_test(fragment_context, html, expected, errors, test_name) if bad is not None: raise unittest.SkipTest(bad) root = parse(html, namespace_elements=True, sanitize_names=False, fragment_context=fragment_context) output = serialize_construction_output( root, fragment_context=fragment_context) # html5lib doesn't yet support the template tag, but it appears in the # tests with the expectation that the template contents will be under the # word 'contents', so we need to reformat that string a bit. # expected = reformatTemplateContents(expected) error_msg = '\n'.join([ '\n\nInput:', html, '\nExpected:', expected, '\nReceived:', output ]) self.ae(expected, output, error_msg + '\n')
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: raw = entities.xml_replace_entities(raw) if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = clean_xml_chars(raw) root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True) if ((discard_namespaces and root.tag != 'html') or (not discard_namespaces and (root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))): raise ValueError('Failed to parse correctly, root has tag: %s and ' 'prefix: %s' % (root.tag, root.prefix)) return root
def generate_public_uti_map():
    from lxml import etree
    from polyglot.urllib import urlopen
    from html5_parser import parse
    raw = urlopen(
        'https://developer.apple.com/library/ios/documentation/Miscellaneous/Reference/UTIRef/Articles/System-DeclaredUniformTypeIdentifiers.html').read()
    root = parse(raw)
    tables = root.xpath('//table')[0::2]
    data = {}
    for table in tables:
        for tr in table.xpath('descendant::tr')[1:]:
            td = tr.xpath('descendant::td')
            identifier = etree.tostring(td[0], method='text', encoding=unicode_type).strip()
            tags = etree.tostring(td[2], method='text', encoding=unicode_type).strip()
            identifier = identifier.split()[0].replace('\u200b', '')
            exts = [x.strip()[1:].lower() for x in tags.split(',') if x.strip().startswith('.')]
            for ext in exts:
                data[ext] = identifier
    lines = ['PUBLIC_UTI_MAP = {']
    for ext in sorted(data):
        r = ("'" + ext + "':").ljust(16)
        lines.append((' ' * 4) + r + "'" + data[ext] + "',")
    lines.append('}')
    with open(__file__, 'r+b') as f:
        # Decode before re.sub: mixing a str pattern with bytes raises
        # TypeError on Python 3
        raw = f.read().decode('utf-8')
        f.seek(0)
        nraw = re.sub(r'^PUBLIC_UTI_MAP = .+?}', '\n'.join(lines), raw,
                      flags=re.MULTILINE | re.DOTALL)
        f.truncate(), f.write(nraw.encode('utf-8'))
def html5_parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: raw = xml_replace_entities(raw) if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = replace_chars.sub('', raw) from html5_parser import parse root = parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False) if (discard_namespaces and root.tag != 'html') or ( not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)): raise ValueError( 'Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) return root
def test_dom(self): root = parse(HTML, treebuilder='dom', namespace_elements=True) doc = root.ownerDocument self.ae(doc.doctype, DOCTYPE) self.ae(root.tagName, 'html') self.ae( dict(root.attributes.itemsNS()), dict([((u'xmlns', u'xmlns'), 'http://www.w3.org/1999/xhtml'), ((u'xmlns', u'xlink'), 'http://www.w3.org/1999/xlink'), ((None, u'xml_lang'), 'en'), ((None, u'lang'), 'en')])) script = doc.getElementsByTagName('script')[0] self.ae(script.firstChild.nodeValue, 'a < & " b') p = doc.getElementsByTagName('p')[0] self.ae(p.toxml(), '<p>A <span>test</span> of text and tail\n</p>') p = doc.getElementsByTagName('p')[-1] self.ae(dict(p.attributes.itemsNS()), dict([((None, u'xml_lang'), 'de')])) svg = doc.getElementsByTagName('svg')[0] self.ae( dict(svg.attributes.itemsNS()), { (None, 'viewBox'): 'v', (u'xmlns', u'xmlns'): 'http://www.w3.org/2000/svg' }) self.ae(dict(svg.firstChild.attributes.itemsNS()), dict([((XLINK, u'href'), 'h')])) self.ae(root.lastChild.nodeValue, COMMENT.replace('--', '\u2014'))
def insert_style(html, css): """ Insert a new <style> tag with CSS. Parameters ---------- html : string css : string Returns ------- render : string """ soup = html5_parser.parse(html, treebuilder='soup', return_root=False) # Ensure html includes a <head></head>. if not soup.head: head = soup.new_tag('head') soup.html.insert(0, head) style_tag = soup.new_tag("style", type="text/css") style_tag.string = css soup.head.append(style_tag) render = soup.prettify(formatter=None) return render
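# A quick usage sketch for insert_style; the exact whitespace in the output
# depends on BeautifulSoup's prettify():
rendered = insert_style('<html><body><p>Hi</p></body></html>',
                        'p { color: red; }')
# The result now contains, inside <head>:
#   <style type="text/css">p { color: red; }</style>
print(rendered)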
def search(self, query, page=None, detailed=False): """Sends a POST request and retrieves a list of applications matching the query term(s). :param query: search query term(s) to retrieve matching apps :param page: the page number to retrieve. Max is 12. :param detailed: if True, sends request per app for its full detail :return: a list of apps matching search terms """ page = 0 if page is None else page if page > len(self._pagtok) - 1: raise ValueError('Page out of range. Please choose a number between 0 - 12') pagtok = self._pagtok[page] data = generate_post_data(0, 0, pagtok) params = { 'q': quote_plus(query), 'c': 'apps' } response = send_request('POST', self._search_url, data, params) soup = html5_parser.parse(response.text, treebuilder='soup') if detailed: apps = self._parse_multiple_apps(response) else: apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')] return apps
def test_matching(self): root = parse(namespace_elements=True, html=''' <html id='root'> <head id='head'></head> <body id='body'> <p class="one red" id='p1'>simple <p class="two green" id='p2'>a'b"c ''') all_ids = root.xpath('//*/@id') def q(mt, query=''): r = Rule({'match_type': mt, 'query': query, 'actions': []}) ans = [] for tag in r.selector(root): ans.append(tag.get('id')) return ans def t(mt, query='', expected=[]): self.ae(expected, q(mt, query)) t('*', expected=all_ids) t('is', 'body', ['body']) t('is', 'p', ['p1', 'p2']) t('has_class', 'one', ['p1']) ei = list(all_ids) ei.remove('p1') t('not_has_class', 'one', ei) t('css', '#body > p.red', ['p1']) t('xpath', '//h:body', ['body']) t('contains_text', 'imple', ['p1']) t('contains_text', 'a\'b"c', ['p2'])
def get_images_from_polyglot_text(txt: str, base_dir: str = '', file_ext: str = 'txt') -> set: from calibre.ebooks.oeb.base import OEB_IMAGES from calibre import guess_type if not base_dir: base_dir = os.getcwd() images = set() def check_path(path: str) -> None: if path and not os.path.isabs(path) and guess_type( path)[0] in OEB_IMAGES and os.path.exists( os.path.join(base_dir, path)): images.add(path) if file_ext in ('txt', 'text', 'textile'): # Textile for m in re.finditer( r'(?mu)(?:[\[{])?\!(?:\. )?(?P<path>[^\s(!]+)\s?(?:\(([^\)]+)\))?\!(?::(\S+))?(?:[\]}]|(?=\s|$))', txt): path = m.group('path') check_path(path) if file_ext in ('txt', 'text', 'md', 'markdown'): # Markdown from markdown import Markdown html = HTML_TEMPLATE % ('', Markdown().convert(txt)) from html5_parser import parse root = parse(html) for img in root.iterdescendants('img'): path = img.get('src') check_path(path) return images
def format_template(template, page_num, height): template = template.replace('_TOP_LEVEL_SECTION_PAGES_', str(toplevel_pagenum_map[page_num - 1])) template = template.replace('_TOP_LEVEL_SECTION_PAGENUM_', str(toplevel_pages_map[page_num - 1])) template = template.replace('_TOTAL_PAGES_', str(pages_in_doc)) template = template.replace('_PAGENUM_', str(page_number_display_map[page_num])) template = template.replace( '_TITLE_', prepare_string_for_xml(pdf_metadata.title, True)) template = template.replace( '_AUTHOR_', prepare_string_for_xml(pdf_metadata.author, True)) template = template.replace( '_TOP_LEVEL_SECTION_', prepare_string_for_xml(toplevel_toc_map[page_num - 1])) template = template.replace( '_SECTION_', prepare_string_for_xml(page_toc_map[page_num - 1])) troot = parse(template, namespace_elements=True) ans = last_tag(troot)[0] style = ans.get('style') or '' style = ( 'margin: 0; padding: 0; height: {height}pt; border-width: 0;' 'display: flex; align-items: center; overflow: hidden; background-color: unset;' ).format(height=height) + style ans.set('style', style) for child in ans.xpath('descendant-or-self::*[@class]'): cls = frozenset(child.get('class').split()) q = 'even-page' if page_num % 2 else 'odd-page' if q in cls or q.replace('-', '_') in cls: style = child.get('style') or '' child.set('style', style + '; display: none') return ans
def html_to_lxml(raw): raw = '<div>%s</div>' % raw root = parse(raw, keep_doctype=False, namespace_elements=False, maybe_xhtml=False, sanitize_names=True) root = next(root.iterdescendants('div')) root.set('xmlns', "http://www.w3.org/1999/xhtml") raw = etree.tostring(root, encoding='unicode') try: return safe_xml_fromstring(raw, recover=False) except: for x in root.iterdescendants(): remove = [] for attr in x.attrib: if ':' in attr: remove.append(attr) for a in remove: del x.attrib[a] raw = etree.tostring(root, encoding='unicode') try: return safe_xml_fromstring(raw, recover=False) except: from calibre.ebooks.oeb.parse_utils import _html4_parse return _html4_parse(raw)
def parse_html(self, file, encoding=None, parser_type='html'): '''Return an iterator of elements found in the document. Args: file: A file object containing the document. encoding (str): The encoding of the document. parser_type (str): The type of parser to use. Accepted values: ``html``, ``xhtml``, ``xml``. Returns: iterator: Each item is an element from :mod:`.document.htmlparse.element` ''' if parser_type == 'html' or parser_type == 'xhtml': content = file.read() # Don't use maybe_xhtml=True for xhtml because it segfaults: # https://github.com/ludios/wpull/issues/15 tree = html5_parser.parse(content, transport_encoding=encoding) else: if encoding: lxml_encoding = to_lxml_encoding(encoding) or 'latin1' else: lxml_encoding = encoding parser = lxml.etree.XMLParser(encoding=lxml_encoding, recover=True) tree = lxml.etree.parse(file, parser=parser) for element in tree.getiterator(): yield element
def similar(self, app_id, results=None, detailed=False): """Sends a POST request and retrieves a list of applications similar to the specified app. :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa' :param results: the number of apps to retrieve at a time. :param detailed: if True, sends request per app for its full detail :return: a list of similar apps """ results = s.SIMILAR_RESULTS if results is None else results url = build_url('similar', app_id) data = generate_post_data(results) response = send_request('POST', url, data) soup = html5_parser.parse(response.text, treebuilder='soup') if detailed: apps = self._parse_multiple_apps(response) else: apps = [ self._parse_card_info(app) for app in soup.select('div[data-uitype=500]') ] return apps
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log): from html5_parser import parse from lxml import html from calibre.ebooks.chardet import xml_to_unicode from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = parse(raw, maybe_xhtml=False, sanitize_names=True) except Exception: return False pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") if pub_date: from calibre.utils.date import parse_date try: mi.pubdate = parse_date(pub_date[0].strip()) except: pass if lang: lang = lang[0].strip().lower() lang = {'english':'eng', 'french':'fra', 'german':'deu', 'spanish':'spa'}.get(lang, None) if lang: mi.language = lang if ebook_isbn: # print "ebook isbn is "+str(ebook_isbn[0]) isbn = check_isbn(ebook_isbn[0].strip()) if isbn: self.cache_isbn_to_identifier(isbn, ovrdrv_id) mi.isbn = isbn if subjects: mi.tags = [tag.strip() for tag in subjects[0].split(',')] if desc: desc = desc[0] desc = html.tostring(desc, method='html', encoding='unicode').strip() # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Remove comments desc = re.sub(r'(?s)<!--.*?-->', '', desc) mi.comments = sanitize_comments_html(desc) return None
def _get_text(html):
    "Extract textual content from HTML."
    soup = html5_parser.parse(html, treebuilder='soup', return_root=False)
    # Remove comment nodes so they are not collected as text.
    for element in soup.find_all(string=lambda text: isinstance(text, Comment)):
        element.extract()
    return soup.find_all(string=True)
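# Rough usage sketch for _get_text; the exact whitespace-only nodes depend on
# the parse, so the output shown is approximate:
nodes = _get_text('<p>Hello <!-- hidden --> <b>world</b></p>')
print([str(n) for n in nodes])  # roughly: ['Hello ', ' ', 'world']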
def create_doc(cls, data, **kwargs):
    if isinstance(data, str):
        doc = parse(data)
    elif isinstance(data, (dict, list)):
        doc = object_to_dom("root", data)
    elif isinstance(data, parsel.Selector):
        # Presumably already a parsed document, so use it directly; in the
        # original code this branch left `doc` unbound, making the return
        # below raise NameError.
        doc = data
    else:
        raise DocumentTypeNotSupportedForQuerier()
    return doc
def test_xlink(self): src = '''<html xmlns:xlink="xl"><svg><image xlink:href="x"/>''' root = parse(src, maybe_xhtml=True) self.ae( tostring(root), '''<html xmlns="http://www.w3.org/1999/xhtml" xmlns:xlink="xl"><head/><body>\ <svg xmlns="http://www.w3.org/2000/svg"><image xlink:href="x"/></svg></body></html>''' )
def pagure_html_diff_changed(html_string): parsed = html5_parser.parse(str(html_string)) elements = parsed.xpath("//section[contains(@class, 'commit_diff')]" "//div[contains(@class, 'card-header')]" "//a[contains(@class, 'font-weight-bold')]" "/text()") return set([str(x) for x in elements])
def parse_html(raw): try: from html5_parser import parse except ImportError: # Old versions of calibre import html5lib return html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) else: return parse(raw)
def _parse_page_html5(content: str) -> Optional[etree._Element]:
    """
    Internal function to parse a page and return the data.

    This uses html5_parser. Returns None when there is no usable content.
    """
    if content and content.lower() != 'none':
        return parse(content)
    return None
def test_line_numbers(self): root = parse('<html>\n<head>\n<body>\n<p><span>', line_number_attr='ln') self.ae(root.sourceline, 1) self.ae(int(root.get('ln')), 1) self.ae(root[0].sourceline, 2) self.ae(root[1].sourceline, 3) self.ae(root[1][0].sourceline, 4) self.ae(root[1][0][0].sourceline, 4) self.ae(root[1][0][0].get('ln'), '4')
def links_diff(a_text, b_text, a_headers=None, b_headers=None, content_type_options='normal'): """ Extracts all the outgoing links from a page and produces a diff of an HTML document that is simply a list of the text and URL of those links. It ignores links that merely navigate within the page. NOTE: this diff currently suffers from the fact that our diff server does not know the original URL of the content, so it can identify: >>> <a href="#anchor-in-this-page">Text</a> as an internal link, but not: >>> <a href="http://this.domain.com/this/page#anchor-in-this-page">Text</a> """ raise_if_not_diffable_html( a_text, b_text, a_headers, b_headers, content_type_options) a_soup = html5_parser.parse(a_text, treebuilder='soup', return_root=False) b_soup = html5_parser.parse(b_text, treebuilder='soup', return_root=False) a_links = sorted( set([Link.from_element(element) for element in _find_outgoing_links(a_soup)]), key=lambda link: link.text.lower() + f'({link.href})') b_links = sorted( set([Link.from_element(element) for element in _find_outgoing_links(b_soup)]), key=lambda link: link.text.lower() + f'({link.href})') matcher = SequenceMatcher(a=a_links, b=b_links) opcodes = matcher.get_opcodes() diff = list(_assemble_diff(a_links, b_links, opcodes)) return { 'change_count': _count_changes(diff), 'diff': diff, 'a_parsed': a_soup, 'b_parsed': b_soup }
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: raw = xml_replace_entities(raw) if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = clean_xml_chars(raw) root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True) if (discard_namespaces and root.tag != 'html') or ( not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)): raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) return root
def html5_parse(data, max_nesting_depth=100):
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, basestring) and len(x) == 0:  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)
    return data
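# html5_parse above relies on a node_depth helper that is not shown. A
# plausible minimal implementation, assumed from the name rather than taken
# from the project's actual code:
def node_depth(node):
    # Count the ancestors between this node and the root of the lxml tree
    depth = 0
    parent = node.getparent()
    while parent is not None:
        depth += 1
        parent = parent.getparent()
    return depth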
def details(self, app_id): """Sends a GET request and parses an application's details. :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa' :return: a dictionary of app details """ url = build_url('details', app_id) try: response = send_request('GET', url) soup = html5_parser.parse(response.text, treebuilder='soup') except requests.exceptions.HTTPError as e: raise ValueError('Invalid application ID: {app}. {error}'.format(app=app_id, error=e)) return self._parse_app_details(soup)
def parse_html_toc(data): from html5_parser import parse from calibre.utils.cleantext import clean_xml_chars from lxml import etree if isinstance(data, bytes): data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0] root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True) for a in root.xpath('//*[@href and local-name()="a"]'): purl = urlparse(unquote(a.get('href'))) href, fragment = purl[2], purl[5] if not fragment: fragment = None else: fragment = fragment.strip() href = href.strip() txt = etree.tostring(a, method='text', encoding='unicode') yield href, fragment, txt
def fix_resources(self, html, base_dir): from html5_parser import parse root = parse(html) changed = False for img in root.xpath('//img[@src]'): src = img.get('src') prefix = src.split(':', 1)[0].lower() if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src): src = os.path.join(base_dir, src) if os.access(src, os.R_OK): with open(src, 'rb') as f: data = f.read() f = self.shift_file(base_dir, os.path.basename(src), data) changed = True img.set('src', os.path.basename(f)) if changed: from lxml import etree html = etree.tostring(root, encoding='unicode') return html
def html(self): ans = u'' try: if not self.page().mainFrame().documentElement().findFirst('meta[name="calibre-dont-sanitize"]').isNull(): # Bypass cleanup if special meta tag exists return unicode_type(self.page().mainFrame().toHtml()) check = unicode_type(self.page().mainFrame().toPlainText()).strip() raw = unicode_type(self.page().mainFrame().toHtml()) raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] raw = self.comments_pat.sub('', raw) if not check and '<img' not in raw.lower(): return ans try: root = html.fromstring(raw) except Exception: root = parse(raw, maybe_xhtml=False, sanitize_names=True) elems = [] for body in root.xpath('//body'): if body.text: elems.append(body.text) elems += [html.tostring(x, encoding='unicode') for x in body if x.tag not in ('script', 'style')] if len(elems) > 1: ans = u'<div>%s</div>'%(u''.join(elems)) else: ans = u''.join(elems) if not ans.startswith('<'): ans = '<p>%s</p>'%ans ans = xml_replace_entities(ans) except: import traceback traceback.print_exc() return ans
def collection(self, collection, category=None, results=None, page=None, age=None, detailed=False): """Sends a POST request and fetches a list of applications belonging to the collection and an optional category. :param collection: the collection id, e.g. 'NEW_FREE'. :param category: (optional) the category name, e.g. 'GAME_ACTION'. :param results: the number of apps to retrieve at a time. :param page: page number to retrieve; limitation: page * results <= 500. :param age: an age range to filter by (only for FAMILY categories) :param detailed: if True, sends request per app for its full detail :return: a list of app dictionaries """ collection = self.collections[collection] category = '' if category is None else self.categories[category] results = s.NUM_RESULTS if results is None else results if results > 120: raise ValueError('Number of results cannot be more than 120.') page = 0 if page is None else page if page * results > 500: raise ValueError('Start (page * results) cannot be greater than 500.') params = {} if category.startswith('FAMILY') and age is not None: params['age'] = self.age[age] url = build_collection_url(category, collection) data = generate_post_data(results, page) response = send_request('POST', url, data, params) if detailed: apps = self._parse_multiple_apps(response) else: soup = html5_parser.parse(response.text, treebuilder='soup') apps = [self._parse_card_info(app) for app in soup.select('div[data-uitype=500]')] return apps
from html5_parser import parse
from lxml.etree import tostring

some_html = '<p>Hello, world'  # any HTML, as str or bytes
root = parse(some_html)
print(tostring(root))
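# The snippets in this collection exercise most of parse()'s keyword
# arguments; this short sketch gathers the commonly used ones (all of which
# appear in the examples above) in one place:
from html5_parser import parse

html = '<!DOCTYPE html><html>\n<body><p class="x">Hi'

# BeautifulSoup tree; return_root=False yields the soup document itself
soup = parse(html, treebuilder='soup', return_root=False)

# lxml tree with XHTML-namespaced elements, source line numbers recorded in
# the 'ln' attribute, and the doctype discarded
root = parse(html, namespace_elements=True, line_number_attr='ln',
             keep_doctype=False)
print(root.tag, root[1][0].get('ln'))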
def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html)
    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()
    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        try:
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot

    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)

    for x in root.xpath('//script'):
        x.getparent().remove(x)

    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
        head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css', 'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type', 'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'
    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(lopen(self.created_opf_path, 'wb'), ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
def parse_html(raw): raw = clean_html(raw) from html5_parser import parse return parse(raw)