Exemple #1
0
    def pre_parse(self):
        """Collect candidate encodings by pre-parsing the start of the response.

        The response body is read in ``PRE_PARSE_CHUNK_SIZE`` pieces and fed
        to an ``HTMLParser`` whose target is an ``HTMLEncodings`` object built
        from the HTTP ``content-type`` header.  Reading stops when the
        response is exhausted, when the target evaluates as false (what makes
        it falsy is defined by ``HTMLEncodings``, not visible here), or once
        at least ``MAX_PRE_PARSE_BYTES`` bytes have been fed.

        As a side effect, if ``self.bom`` is ``None`` when the first chunk
        arrives, it is set to the byte-order mark found at the start of that
        chunk, or to ``b''`` when none of the 4-, 3- or 2-byte prefixes is a
        key of ``BOM_ENC``.

        Returns:
            ``target.encodings`` as populated by the header, the BOM check
            and the parser callbacks.
        """
        content_type_header = self.response.headers.get('content-type', '')
        encodings_target = HTMLEncodings(content_type_header)
        # Without an explicit encoding the parser fails on non-ASCII input.
        sniffer = HTMLParser(target=encodings_target, encoding='ISO-8859-1')
        bytes_fed = 0

        while encodings_target:
            data = self.response.read(PRE_PARSE_CHUNK_SIZE)
            if not data:
                # End of input: flush the parser, tolerating broken markup.
                try:
                    sniffer.close()
                except XMLSyntaxError:
                    pass
                break

            if self.bom is None:
                # The first chunk must be able to hold the longest BOM.
                assert PRE_PARSE_CHUNK_SIZE >= 4
                self.bom = b''
                # Longest candidate first, so a longer BOM is never
                # mistaken for a shorter one sharing the same prefix.
                for width in (4, 3, 2):
                    prefix = data[:width]
                    if prefix in BOM_ENC:
                        self.bom = prefix
                        encodings_target.encodings.append(
                            ('bom', BOM_ENC[prefix]))
                        # There can only be one BOM - stop here.
                        break

            sniffer.feed(data)
            bytes_fed += len(data)
            if bytes_fed >= MAX_PRE_PARSE_BYTES:
                break

        return encodings_target.encodings
Exemple #2
0
    def pre_parse(self):
        """Rewind the response and sniff candidate encodings from its start.

        After seeking the response back to offset 0, the body is read in
        ``PRE_PARSE_CHUNK_SIZE`` pieces and fed to an ``HTMLParser`` whose
        target is an ``HTMLEncodings`` object built from the HTTP
        ``content-type`` header.  The loop ends when the response runs out,
        when the target evaluates as false (the condition for that lives in
        ``HTMLEncodings``, not visible here), or once at least
        ``MAX_PRE_PARSE_BYTES`` bytes have been fed.

        As a side effect, if ``self.bom`` is ``None`` when the first chunk
        arrives, it is set to the byte-order mark found at the start of that
        chunk, or to ``b''`` when none of the 4-, 3- or 2-byte prefixes is a
        key of ``BOM_ENC``.

        Returns:
            ``target.encodings`` as populated by the header, the BOM check
            and the parser callbacks.
        """
        header_value = self.response.headers.get('content-type', '')
        collector = HTMLEncodings(header_value)
        # Without an explicit encoding the parser fails on non-ASCII input.
        html_parser = HTMLParser(target=collector, encoding='ISO-8859-1')
        consumed = 0

        # Always start from the beginning of the body.
        self.response.seek(0)
        while collector:
            block = self.response.read(PRE_PARSE_CHUNK_SIZE)
            if not block:
                # Nothing left to read: flush the parser, ignoring bad markup.
                try:
                    html_parser.close()
                except XMLSyntaxError:
                    pass
                break

            if self.bom is None:
                # The first chunk must be able to hold the longest BOM.
                assert PRE_PARSE_CHUNK_SIZE >= 4
                self.bom = b''
                # Try the longest prefix first (4, then 3, then 2 bytes).
                for size in (4, 3, 2):
                    candidate = block[:size]
                    if candidate in BOM_ENC:
                        self.bom = candidate
                        collector.encodings.append(
                            ('bom', BOM_ENC[candidate]))
                        # There can only be one BOM - stop here.
                        break

            html_parser.feed(block)
            consumed += len(block)
            if consumed >= MAX_PRE_PARSE_BYTES:
                break

        return collector.encodings
Exemple #3
0
def parse_html(html, encoding="utf8"):
    """Parse markup into an element tree, trying stricter parsers first.

    The parsers are attempted in this order, and the first one that does not
    raise supplies the result:

    1. ``etree.XML``
    2. ``HTMLParser`` via its ``feed()`` / ``close()`` interface
    3. ``etree.HTML``
    4. ``soupparser.fromstring`` (last resort; its errors propagate)

    Args:
        html: Markup as text or as an encoded byte string.  Falsy values
            (``None``, ``""``, ``b""``) are returned unchanged.
        encoding: Codec used to decode ``html`` when it is not already text.

    Returns:
        The object produced by the first parser that succeeds, or ``html``
        itself when it is falsy.

    Raises:
        Whatever ``html.decode`` or ``soupparser.fromstring`` raises.
        ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed by
        the fallback chain.
    """
    if not html:
        return html

    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    # isinstance() also accepts text subclasses, which must not be decoded
    # again (the old exact-type comparison sent them through ``.decode``).
    if not isinstance(html, type(u'')):
        html = html.decode(encoding)

    # Each fallback catches ``Exception`` rather than using a bare
    # ``except`` so interpreter-exit signals still propagate.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; fall through to the HTML parsers

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed-parser rejected the input; try the one-shot HTML parser

    try:
        return etree.HTML(html)
    except Exception:
        pass  # last resort below

    return soupparser.fromstring(html)
Exemple #4
0
def parse_html(html, encoding='utf8'):
    """Parse markup into an element tree, trying stricter parsers first.

    The parsers are attempted in this order, and the first one that does not
    raise supplies the result:

    1. ``etree.XML``
    2. ``HTMLParser`` via its ``feed()`` / ``close()`` interface
    3. ``etree.HTML``
    4. ``soupparser.fromstring`` (last resort; its errors propagate)

    Args:
        html: Markup as text or as an encoded byte string.  Falsy values
            (``None``, ``''``, ``b''``) are returned unchanged.
        encoding: Codec used to decode ``html`` when it is not already text.

    Returns:
        The object produced by the first parser that succeeds, or ``html``
        itself when it is falsy.

    Raises:
        Whatever ``html.decode`` or ``soupparser.fromstring`` raises.
        ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed by
        the fallback chain.
    """
    if not html:
        return html

    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    # isinstance() also accepts text subclasses, which must not be decoded
    # again (the old exact-type comparison sent them through ``.decode``).
    if not isinstance(html, type(u'')):
        html = html.decode(encoding)

    # Each fallback catches ``Exception`` rather than using a bare
    # ``except`` so interpreter-exit signals still propagate.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; fall through to the HTML parsers

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed-parser rejected the input; try the one-shot HTML parser

    try:
        return etree.HTML(html)
    except Exception:
        pass  # last resort below

    return soupparser.fromstring(html)