def fromstring(html, guess_charset=None, parser=None):
    """Parse ``html`` and return the element that best represents it.

    The input may be a complete document or only a fragment; which one it
    is gets decided heuristically after parsing:

    * The object returned by ``document_fromstring`` is handed back as-is
      when the first 50 characters of the input (leading whitespace
      stripped, compared case-insensitively) begin with ``<html`` or
      ``<!doctype``, or when the parsed ``head`` element has children.
    * The single child of ``body`` is returned when the body holds exactly
      one element with no non-whitespace text before or after it.
    * Otherwise the ``body`` element itself is returned, retagged as
      ``div`` if ``_contains_block_level_tag`` is true for it and as
      ``span`` if not.

    ``guess_charset`` and ``parser`` are forwarded unchanged to
    ``document_fromstring``.  (The previous documentation states that
    charset guessing is done by the ``chardet`` library when
    ``guess_charset`` is true or the input is a byte string.)

    Raises ``TypeError`` if ``html`` is not an instance of ``_strings``.
    """
    if not isinstance(html, _strings):
        raise TypeError("string required")

    document = document_fromstring(html, parser=parser, guess_charset=guess_charset)

    # Sniff the beginning of the raw input: an explicit doctype or <html>
    # tag means the caller passed a full document.
    prefix = html[:50]
    if isinstance(prefix, bytes):
        # Byte prefixes cannot be compared with text on Python 3.  ASCII
        # decoding is enough for the markers we look for; anything else is
        # replaced rather than raising.
        prefix = prefix.decode("ascii", "replace")
    prefix = prefix.lstrip().lower()
    if prefix.startswith(("<html", "<!doctype")):
        return document

    # A populated <head> also indicates a full document.
    if len(_find_tag(document, "head")) > 0:
        return document

    body = _find_tag(document, "body")

    # Exactly one element and nothing but whitespace around it: the input
    # was most likely that single element.
    if len(body) == 1 and not (body.text and body.text.strip()):
        trailing = body[-1].tail
        if not (trailing and trailing.strip()):
            return body[0]

    # Several nodes were passed in.  Reuse the body as a neutral container,
    # since a real <body> tag would imply too much structure.
    body.tag = "div" if _contains_block_level_tag(body) else "span"
    return body
def fromstring(html, guess_charset=None, parser=None):
    """Parse a chunk of HTML into a single element or document.

    Nothing is assumed about whether ``html`` is a whole document or a
    fragment.  The text is parsed with ``document_fromstring`` (which
    receives ``parser`` and ``guess_charset`` unchanged) and the result is
    then narrowed down:

    1. Input whose first 50 characters start, after stripping leading
       whitespace and lower-casing, with ``<html`` or ``<!doctype`` is
       treated as a full document and the parse result is returned.
    2. A parse result whose ``head`` element is non-empty is likewise
       returned whole.
    3. If ``body`` contains exactly one element and only whitespace text
       around it, that element is returned.
    4. Otherwise ``body`` is renamed to ``div`` (when
       ``_contains_block_level_tag`` is true for it) or ``span`` and
       returned as a stand-in container.

    The previous documentation notes that charset guessing relies on the
    ``chardet`` library when ``guess_charset`` is true or the input is a
    byte string.

    Raises ``TypeError`` when ``html`` is not an instance of ``_strings``.
    """
    def is_blank(text):
        # True for None, the empty string, and whitespace-only text.
        return not text or not text.strip()

    if not isinstance(html, _strings):
        raise TypeError('string required')

    parsed = document_fromstring(
        html, parser=parser, guess_charset=guess_charset)

    # Inspect the raw input for markers that only a full document carries.
    document_markers = ('<html', '<!doctype')
    lead = html[:50]
    # Bytes must become text before the comparison on Python 3; ASCII is
    # sufficient for the marker characters and bad bytes are replaced.
    lead = lead.decode('ascii', 'replace') if isinstance(lead, bytes) else lead
    if lead.lstrip().lower().startswith(document_markers):
        return parsed

    # Children inside <head> mean the input was a full document as well.
    head_element = _find_tag(parsed, 'head')
    if len(head_element) != 0:
        return parsed

    body_element = _find_tag(parsed, 'body')

    # A lone element surrounded by nothing but whitespace is returned
    # directly instead of its wrapper.
    if (len(body_element) == 1
            and is_blank(body_element.text)
            and is_blank(body_element[-1].tail)):
        return body_element[0]

    # Mixed content: keep the body as the container but give it a tag that
    # carries less structural meaning than <body>.
    if _contains_block_level_tag(body_element):
        container_tag = 'div'
    else:
        container_tag = 'span'
    body_element.tag = container_tag
    return body_element
def fromstring(html, guess_charset=True, parser=None):
    """Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The text is parsed with ``document_fromstring``; ``parser`` and
    ``guess_charset`` are passed through to it unchanged.  The result is:

    * the parsed document, if the first 50 characters of the input start
      (ignoring leading whitespace and case) with ``<html`` or
      ``<!doctype``, or if the parsed ``head`` element is not empty;
    * the only child of ``body``, if the body holds exactly one element
      and no non-whitespace text around it;
    * otherwise the ``body`` element itself, renamed to ``div`` when
      ``_contains_block_level_tag`` is true for it and to ``span`` if not.

    Raises ``TypeError`` if ``html`` is not an instance of ``_strings``.
    """
    if not isinstance(html, _strings):
        raise TypeError('string required')

    doc = document_fromstring(html, parser=parser,
                              guess_charset=guess_charset)

    # document starts with doctype or <html>, full document!
    start = html[:50]
    if isinstance(start, bytes):
        # A byte-string input yields a bytes prefix, and on Python 3
        # bytes.startswith() raises TypeError when given text markers.
        # Decode as ASCII (replacing anything else): that is enough for
        # the marker characters in latin-1 and utf-8 input alike.
        # NOTE(review): only relevant if _strings admits bytes -- confirm.
        start = start.decode('ascii', 'replace')
    start = start.lstrip().lower()
    if start.startswith(('<html', '<!doctype')):
        return doc

    head = _find_tag(doc, 'head')

    # if the head is not empty we have a full document
    if len(head):
        return doc

    body = _find_tag(doc, 'body')

    # The body has just one element, so it was probably a single
    # element passed in
    if (len(body) == 1 and (not body.text or not body.text.strip())
            and (not body[-1].tail or not body[-1].tail.strip())):
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    body.tag = 'div' if _contains_block_level_tag(body) else 'span'
    return body