def pre_parse(self):
    """Scan the start of the response and return the encodings found.

    The response body is read in blocks of ``PRE_PARSE_CHUNK_SIZE`` and fed
    to an ``HTMLParser`` whose target is an ``HTMLEncodings`` object built
    from the HTTP ``content-type`` header.  Reading stops when the target
    becomes falsy, when the response is exhausted, or once at least
    ``MAX_PRE_PARSE_BYTES`` have been fed to the parser.

    While ``self.bom`` is still ``None``, the first non-empty block is
    checked for a byte order mark: ``self.bom`` is set to the matching
    prefix (or ``b''`` when none matches) and a ``('bom', encoding)`` entry
    is appended to the target's ``encodings`` list.

    Returns:
        The ``encodings`` attribute of the ``HTMLEncodings`` target.
    """
    content_type_header = self.response.headers.get('content-type', '')
    sniffer = HTMLEncodings(content_type_header)
    # The parser fails on non-ASCII input unless an encoding is given
    # explicitly, so a fixed single-byte encoding is passed here.
    html_parser = HTMLParser(target=sniffer, encoding='ISO-8859-1')
    consumed = 0

    while sniffer:
        block = self.response.read(PRE_PARSE_CHUNK_SIZE)

        if not block:
            # End of input: finish the parse, ignoring syntax errors from
            # a truncated or malformed document.
            try:
                html_parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            # The longest candidate prefix is 4 bytes, so a block must be
            # able to hold at least that many.
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest prefix first (4, then 3, then 2 bytes).
            for prefix_len in (4, 3, 2):
                prefix = block[:prefix_len]
                if prefix in BOM_ENC:
                    self.bom = prefix
                    sniffer.encodings.append(('bom', BOM_ENC[self.bom]))
                    # There can only be one BOM - stop here.
                    break

        html_parser.feed(block)
        consumed += len(block)
        if consumed >= MAX_PRE_PARSE_BYTES:
            break

    return sniffer.encodings
def pre_parse(self):
    """Rewind the response, scan its start and return the encodings found.

    The response is first repositioned with ``seek(0)``.  Its body is then
    read in blocks of ``PRE_PARSE_CHUNK_SIZE`` and fed to an ``HTMLParser``
    whose target is an ``HTMLEncodings`` object built from the HTTP
    ``content-type`` header.  Reading stops when the target becomes falsy,
    when the response is exhausted, or once at least
    ``MAX_PRE_PARSE_BYTES`` have been fed to the parser.

    While ``self.bom`` is still ``None``, the first non-empty block is
    checked for a byte order mark: ``self.bom`` is set to the matching
    prefix (or ``b''`` when none matches) and a ``('bom', encoding)`` entry
    is appended to the target's ``encodings`` list.

    Returns:
        The ``encodings`` attribute of the ``HTMLEncodings`` target.
    """
    content_type_header = self.response.headers.get('content-type', '')
    sniffer = HTMLEncodings(content_type_header)
    # The parser fails on non-ASCII input unless an encoding is given
    # explicitly, so a fixed single-byte encoding is passed here.
    html_parser = HTMLParser(target=sniffer, encoding='ISO-8859-1')
    consumed = 0

    # Start reading from the beginning of the response body.
    self.response.seek(0)

    while sniffer:
        block = self.response.read(PRE_PARSE_CHUNK_SIZE)

        if not block:
            # End of input: finish the parse, ignoring syntax errors from
            # a truncated or malformed document.
            try:
                html_parser.close()
            except XMLSyntaxError:
                pass
            break

        if self.bom is None:
            # The longest candidate prefix is 4 bytes, so a block must be
            # able to hold at least that many.
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            # Try the longest prefix first (4, then 3, then 2 bytes).
            for prefix_len in (4, 3, 2):
                prefix = block[:prefix_len]
                if prefix in BOM_ENC:
                    self.bom = prefix
                    sniffer.encodings.append(('bom', BOM_ENC[self.bom]))
                    # There can only be one BOM - stop here.
                    break

        html_parser.feed(block)
        consumed += len(block)
        if consumed >= MAX_PRE_PARSE_BYTES:
            break

    return sniffer.encodings
def parse_html(html, encoding="utf8"):
    """Parse markup into an element tree, trying stricter parsers first.

    Parsers are attempted in this order, moving on whenever one raises:

    1. ``etree.XML``
    2. ``HTMLParser`` through its ``feed()`` / ``close()`` interface
    3. ``etree.HTML``
    4. ``soupparser.fromstring`` (last resort; its errors propagate)

    Args:
        html: The markup to parse.  Anything that is not a ``unicode``
            instance is decoded with ``encoding`` first.
        encoding: Codec name used to decode non-unicode input.

    Returns:
        ``html`` itself, unchanged, when it is falsy (``None``, empty
        string); otherwise the object produced by the first parser that
        succeeds.

    Raises:
        Whatever ``html.decode`` raises for undecodable input, and whatever
        ``soupparser.fromstring`` raises if every parser fails.
    """
    if not html:
        return html

    # isinstance() rather than an exact type comparison, so that unicode
    # subclasses are treated as text and not pushed through .decode().
    if not isinstance(html, unicode):
        html = html.decode(encoding)

    # Each fallback catches Exception only: a bare ``except:`` would also
    # swallow KeyboardInterrupt and SystemExit raised while parsing.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; fall through to the HTML parsers

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed parser rejected the input; try the one-shot HTML parse

    try:
        return etree.HTML(html)
    except Exception:
        # Final fallback; failures here are reported to the caller.
        return soupparser.fromstring(html)
def parse_html(html, encoding='utf8'):
    """Parse markup into an element tree, trying stricter parsers first.

    Parsers are attempted in this order, moving on whenever one raises:

    1. ``etree.XML``
    2. ``HTMLParser`` through its ``feed()`` / ``close()`` interface
    3. ``etree.HTML``
    4. ``soupparser.fromstring`` (last resort; its errors propagate)

    Args:
        html: The markup to parse.  Anything that is not a ``unicode``
            instance is decoded with ``encoding`` first.
        encoding: Codec name used to decode non-unicode input.

    Returns:
        ``html`` itself, unchanged, when it is falsy (``None``, empty
        string); otherwise the object produced by the first parser that
        succeeds.

    Raises:
        Whatever ``html.decode`` raises for undecodable input, and whatever
        ``soupparser.fromstring`` raises if every parser fails.
    """
    if not html:
        return html

    # isinstance() rather than an exact type comparison, so that unicode
    # subclasses are treated as text and not pushed through .decode().
    if not isinstance(html, unicode):
        html = html.decode(encoding)

    # Each fallback catches Exception only: a bare ``except:`` would also
    # swallow KeyboardInterrupt and SystemExit raised while parsing.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; fall through to the HTML parsers

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed parser rejected the input; try the one-shot HTML parse

    try:
        return etree.HTML(html)
    except Exception:
        # Final fallback; failures here are reported to the caller.
        return soupparser.fromstring(html)