def fragment_fromstring(html, create_parent=False, guess_charset=False, parser=None):
    """Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  A true
    value that is not a string (for example ``True``) selects a ``div``
    container.

    `guess_charset` and `parser` are handed to `fragments_fromstring`
    unchanged.

    Raises TypeError if `html` is not a string.  Raises etree.ParserError
    if no element is found, if more than one element is found, or if the
    element is followed by non-whitespace text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    if create_parent:
        # `create_parent` is either a tag name or a plain truthy flag.  Only
        # a string can be used as a tag name; anything else (e.g. True) must
        # fall back to 'div' instead of being formatted into the markup.
        if isinstance(create_parent, _strings):
            container = create_parent
        else:
            container = 'div'
        # NOTE(review): on Python 3 a bytes `html` would be interpolated via
        # its repr here -- confirm whether bytes input combined with
        # create_parent needs to be supported.
        html = '<%s>%s</%s>' % (container, html, container)

    # Leading text is rejected (no_leading_text=True), so the result is a
    # list of elements only.
    children = fragments_fromstring(html, True, guess_charset, parser)
    if not children:
        raise etree.ParserError('No elements found')
    if len(children) > 1:
        raise etree.ParserError('Multiple elements found')

    result = children[0]
    if result.tail and result.tail.strip():
        raise etree.ParserError('Element followed by text: %r' % result.tail)
    # Whitespace-only trailing text is discarded.
    result.tail = None
    return result
def fragments_fromstring(html, no_leading_text=False, guess_charset=False, parser=None):
    """Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is `True` and the text was not unicode but a
    bytestring, the `chardet` library will perform charset guessing on the
    string.  For unicode input `guess_charset` is ignored.

    If `parser` is None the module-level `html_parser` is used.

    Raises TypeError if `html` is not a string, and etree.ParserError if
    `no_leading_text` is true and there is non-whitespace leading text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')
    if parser is None:
        parser = html_parser

    # Charset guessing only applies to byte input.  html5lib's unicode input
    # path does not accept the `useChardet` keyword, so forward it only when
    # there is actually an encoding to detect.
    options = {}
    if isinstance(html, bytes):
        options['useChardet'] = guess_charset
    children = parser.parseFragment(html, 'div', **options)

    if children and isinstance(children[0], _strings):
        if no_leading_text:
            if children[0].strip():
                raise etree.ParserError('There is leading text: %r' % children[0])
            # Whitespace-only leading text is dropped silently.
            del children[0]
    return children
def fragments_fromstring(html, no_leading_text=False, guess_charset=None, parser=None):
    """Parse a string holding several HTML elements into a list.

    The returned list contains elements; its first item may instead be a
    string holding leading text.  With `no_leading_text` set, leading text
    that is not pure whitespace is an error, whitespace-only leading text is
    removed, and the list therefore holds elements only.

    A true `guess_charset` asks the `chardet` library to guess the charset
    of the string.  When it is left as None, nothing is forwarded for
    non-bytes input and ``useChardet=False`` is forwarded for bytes input.

    When `parser` is None the module-level `html_parser` does the parsing.

    Raises TypeError for non-string input and etree.ParserError for
    disallowed leading text.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    active_parser = html_parser if parser is None else parser

    # html5lib does not accept useChardet as an argument if it detected
    # that the html argument would produce unicode objects, so the keyword
    # is only built when there is a definite value to pass.
    if guess_charset is None and isinstance(html, bytes):
        guess_charset = False
    parse_kwargs = {} if guess_charset is None else {'useChardet': guess_charset}

    nodes = active_parser.parseFragment(html, 'div', **parse_kwargs)

    starts_with_text = bool(nodes) and isinstance(nodes[0], _strings)
    if starts_with_text and no_leading_text:
        leading = nodes[0]
        if leading.strip():
            raise etree.ParserError('There is leading text: %r' % leading)
        del nodes[0]
    return nodes
def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False):
    """Fetch `url` and return the response body parsed with lxml.

    :param url: address to fetch via ``self.get``.
    :param encoding: if truthy, overrides the response's encoding before the
        body text is read.
    :param user_agent: stored on ``self.user_agent`` before the request.
    :param cookies: passed to ``self.get``.
    :param xml: parse with ``etree.fromstring`` instead of
        ``lxml.html.fromstring``; links are not made absolute in that case.
    :returns: the parsed tree.  If the document carries a
        ``<meta http-equiv="refresh">`` with a target URL, that target is
        fetched instead (with the same options) and its tree is returned.
    :raises etree.ParserError: if the parser reports an empty document; the
        message includes the URL.
    """
    self.user_agent = user_agent

    response = self.get(url, cookies=cookies)
    if encoding:
        response.encoding = encoding

    try:
        text = response.text
        if xml:
            # Presumably removed because lxml refuses unicode strings that
            # carry an encoding declaration -- verify against ca_bc.
            text = text.replace('<?xml version="1.0" encoding="utf-8"?>', '')  # XXX ca_bc
            page = etree.fromstring(text)
        else:
            page = lxml.html.fromstring(text)
    except etree.ParserError:
        raise etree.ParserError('Document is empty {}'.format(url))

    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        # Refresh content looks like "0; url=http://..."; the target is
        # whatever follows the first '='.  A bare delay ("5") or a missing
        # content attribute has no target and is not followed.
        content = meta[0].attrib.get('content', '')
        target = content.partition('=')[2].strip().strip('\'"')
        if target:
            # Forward every option so the redirect does not reset the user
            # agent or lose cookies and the xml flag.
            return self.lxmlize(target, encoding, user_agent=user_agent, cookies=cookies, xml=xml)

    if not xml:
        page.make_links_absolute(url)
    return page
def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None):
    """Fetch `url` and return the response body parsed as an lxml HTML tree.

    :param url: address to fetch via ``self.get``.
    :param encoding: if truthy, overrides the response's encoding before the
        body text is read.
    :param user_agent: stored on ``self.user_agent`` before the request.
    :param cookies: passed to ``self.get``.
    :returns: the parsed tree with links made absolute against `url`.  If
        the document carries a ``<meta http-equiv="refresh">`` with a target
        URL, that target is fetched instead (with the same options) and its
        tree is returned.
    :raises etree.ParserError: if the parser reports an empty document; the
        message includes the URL.
    """
    self.user_agent = user_agent

    response = self.get(url, cookies=cookies)
    if encoding:
        response.encoding = encoding

    try:
        text = response.text
        # Site-specific clean-ups applied before parsing.
        text = text.replace(
            '"www.facebook.com/', '"https://www.facebook.com/')  # XXX ca_candidates
        # Drops a <script> element (plus the one character after it) that
        # directly follows the doctype.
        text = re.sub('(?<=<!DOCTYPE html>)<script .+?</script>.', '', text, flags=re.DOTALL)  # XXX ca_qc_longueuil
        page = lxml.html.fromstring(text)
    except etree.ParserError:
        raise etree.ParserError('Document is empty {}'.format(url))

    meta = page.xpath('//meta[@http-equiv="refresh"]')
    if meta:
        # Refresh content looks like "0; url=http://..."; the target is
        # whatever follows the first '='.  A bare delay ("5") or a missing
        # content attribute has no target and is not followed.
        content = meta[0].attrib.get('content', '')
        target = content.partition('=')[2].strip().strip('\'"')
        if target:
            # Forward every option so the redirect does not reset the user
            # agent or lose the cookies.
            return self.lxmlize(target, encoding, user_agent=user_agent, cookies=cookies)

    page.make_links_absolute(url)
    return page
def fragment_fromstring(html, create_parent=False, guess_charset=None, parser=None):
    """Parse a string containing exactly one HTML element.

    Without `create_parent`, the input must hold a single element with
    nothing but whitespace before or after it; that element is returned
    with its tail cleared.

    With a true `create_parent`, a new parent element is built to hold the
    parsed content and returned.  A string value is used as the parent's
    tag name, any other true value gives a ``div``.  Leading and trailing
    text is allowed in this mode.

    A true `guess_charset` makes the `chardet` library guess the charset of
    the string.  `parser` is forwarded to `fragments_fromstring`.

    Raises TypeError for non-string input.  Without `create_parent`, raises
    etree.ParserError if there is no element, more than one element, or
    non-whitespace text after the element.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    wrap = bool(create_parent)
    nodes = fragments_fromstring(
        html,
        no_leading_text=not wrap,
        guess_charset=guess_charset,
        parser=parser,
    )

    if not wrap:
        # Strict mode: exactly one element and no meaningful trailing text.
        if not nodes:
            raise etree.ParserError("No elements found")
        if len(nodes) > 1:
            raise etree.ParserError("Multiple elements found")
        only = nodes[0]
        tail = only.tail
        if tail and tail.strip():
            raise etree.ParserError("Element followed by text: %r" % tail)
        only.tail = None
        return only

    # Wrapping mode: gather everything under a freshly created parent.
    tag = create_parent if isinstance(create_parent, _strings) else "div"
    wrapper = Element(tag)
    if nodes:
        first = nodes[0]
        if isinstance(first, _strings):
            # Leading text becomes the text of the new parent.
            wrapper.text = first
            del nodes[0]
        wrapper.extend(nodes)
    return wrapper