Python HTMLParser.feed Examples

Programming Language: Python

Namespace/Package Name: lxml.html

Class/Type: HTMLParser

Method/Function: feed

Examples at hotexamples.com: 5

Python HTMLParser.feed - 5 examples found. These are the top-rated real-world Python examples of lxml.html.HTMLParser.feed, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

HTMLParser(30)

feed(3)

close(2)

parse(1)

Example #1

Show file

File: htmlstream.py Project: petergao1987/wextracto

    def pre_parse(self):
        """Collect candidate encodings by parsing the start of the response.

        The response is read in ``PRE_PARSE_CHUNK_SIZE`` pieces and fed to an
        ``HTMLParser`` whose target is an ``HTMLEncodings`` object built from
        the HTTP ``content-type`` header.  While ``self.bom`` is still
        ``None``, the first chunk is also checked for a byte-order mark; the
        result is stored in ``self.bom`` (``b''`` when none is found).

        Reading stops as soon as the target becomes falsy, the response has
        no more data, or at least ``MAX_PRE_PARSE_BYTES`` bytes were fed.

        Returns:
            ``target.encodings``, including a ``('bom', ...)`` entry when a
            byte-order mark was recognised by this call.
        """
        content_type_header = self.response.headers.get('content-type', '')
        target = HTMLEncodings(content_type_header)
        # The encoding is set explicitly because the parser otherwise fails
        # on non-ASCII input.
        sniffer = HTMLParser(target=target, encoding='ISO-8859-1')
        bytes_fed = 0

        while target:
            data = self.response.read(PRE_PARSE_CHUNK_SIZE)
            if not data:
                # Nothing left to read: finish the parse, tolerating syntax
                # errors raised while closing.
                try:
                    sniffer.close()
                except XMLSyntaxError:
                    pass
                break

            if self.bom is None:
                assert PRE_PARSE_CHUNK_SIZE >= 4
                self.bom = b''
                # Try the longest prefix first (4, 3, then 2 bytes); there
                # can only be one BOM, so stop at the first match.
                for width in (4, 3, 2):
                    prefix = data[:width]
                    if prefix in BOM_ENC:
                        self.bom = prefix
                        target.encodings.append(('bom', BOM_ENC[prefix]))
                        break

            sniffer.feed(data)
            bytes_fed += len(data)
            if bytes_fed >= MAX_PRE_PARSE_BYTES:
                break

        return target.encodings

Example #2

Show file

    def pre_parse(self):
        """Collect candidate encodings by parsing the start of the response.

        The response is rewound with ``seek(0)``, then read in
        ``PRE_PARSE_CHUNK_SIZE`` pieces and fed to an ``HTMLParser`` whose
        target is an ``HTMLEncodings`` object built from the HTTP
        ``content-type`` header.  While ``self.bom`` is still ``None``, the
        first chunk is also checked for a byte-order mark; the result is
        stored in ``self.bom`` (``b''`` when none is found).

        Reading stops as soon as the target becomes falsy, the response has
        no more data, or at least ``MAX_PRE_PARSE_BYTES`` bytes were fed.

        Returns:
            ``target.encodings``, including a ``('bom', ...)`` entry when a
            byte-order mark was recognised by this call.
        """
        content_type_header = self.response.headers.get('content-type', '')
        target = HTMLEncodings(content_type_header)
        # The encoding is set explicitly because the parser otherwise fails
        # on non-ASCII input.
        sniffer = HTMLParser(target=target, encoding='ISO-8859-1')
        bytes_fed = 0

        # Always start reading from the beginning of the response.
        self.response.seek(0)
        while target:
            data = self.response.read(PRE_PARSE_CHUNK_SIZE)
            if not data:
                # Nothing left to read: finish the parse, tolerating syntax
                # errors raised while closing.
                try:
                    sniffer.close()
                except XMLSyntaxError:
                    pass
                break

            if self.bom is None:
                assert PRE_PARSE_CHUNK_SIZE >= 4
                self.bom = b''
                # Try the longest prefix first (4, 3, then 2 bytes); there
                # can only be one BOM, so stop at the first match.
                for width in (4, 3, 2):
                    prefix = data[:width]
                    if prefix in BOM_ENC:
                        self.bom = prefix
                        target.encodings.append(('bom', BOM_ENC[prefix]))
                        break

            sniffer.feed(data)
            bytes_fed += len(data)
            if bytes_fed >= MAX_PRE_PARSE_BYTES:
                break

        return target.encodings

Example #3

Show file

File: Util.py Project: c050226113/pack

 def translate_html_special_characters(string):
     """Return *string* with HTML character references decoded.

     Named and numeric references such as ``&amp;``, ``&lt;`` or ``&#39;``
     are replaced by the characters they stand for; all other text is
     returned unchanged.

     Args:
         string: Text that may contain HTML character references.

     Returns:
         The text with every character reference converted.
     """
     # NOTE(review): the previous body printed debugging output and then
     # called ``HTMLParser().parse(string)``, a method that neither lxml's
     # nor the standard library's HTMLParser provides.  Decoding entities is
     # assumed to be the intent, based on the function name - confirm.
     # Imported locally so this function does not rely on a module-level
     # import being present.
     from html import unescape

     return unescape(string)

Example #4

Show file

File: urlutils.py Project: qiaohui/pygaga

def parse_html(html, encoding="utf8"):
    """Parse *html*, falling back to progressively more lenient parsers.

    The parsers are tried in this order, and the first one that does not
    raise wins:

    1. ``etree.XML``
    2. a fresh ``HTMLParser`` driven through ``feed()`` / ``close()``
    3. ``etree.HTML``
    4. ``soupparser.fromstring``

    Args:
        html: Markup as text or as an encoded byte string.  Falsy values
            (``None``, ``""``, ``b""``) are returned unchanged.
        encoding: Codec used to decode *html* when it is not already text.

    Returns:
        The result of the first parser that succeeds, or *html* itself when
        it is falsy.

    Raises:
        Exception: whatever ``soupparser.fromstring`` raises when the last
            fallback fails as well.
    """
    if not html:
        return html

    # ``type(u'')`` is the text type on both Python 2 (``unicode``) and
    # Python 3 (``str``), so this check does not need the Python-2-only
    # ``unicode`` name.
    if not isinstance(html, type(u'')):
        html = html.decode(encoding)

    # Each fallback catches ``Exception`` rather than using a bare
    # ``except`` so KeyboardInterrupt and SystemExit still propagate.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; try the HTML parsers next

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed-based parsing failed; try etree.HTML next

    try:
        return etree.HTML(html)
    except Exception:
        # Last resort; errors raised here are reported to the caller.
        return soupparser.fromstring(html)

Example #5

Show file

def parse_html(html, encoding='utf8'):
    """Parse *html*, falling back to progressively more lenient parsers.

    The parsers are tried in this order, and the first one that does not
    raise wins:

    1. ``etree.XML``
    2. a fresh ``HTMLParser`` driven through ``feed()`` / ``close()``
    3. ``etree.HTML``
    4. ``soupparser.fromstring``

    Args:
        html: Markup as text or as an encoded byte string.  Falsy values
            (``None``, ``''``, ``b''``) are returned unchanged.
        encoding: Codec used to decode *html* when it is not already text.

    Returns:
        The result of the first parser that succeeds, or *html* itself when
        it is falsy.

    Raises:
        Exception: whatever ``soupparser.fromstring`` raises when the last
            fallback fails as well.
    """
    if not html:
        return html

    # ``type(u'')`` is the text type on both Python 2 (``unicode``) and
    # Python 3 (``str``), so this check does not need the Python-2-only
    # ``unicode`` name.
    if not isinstance(html, type(u'')):
        html = html.decode(encoding)

    # Each fallback catches ``Exception`` rather than using a bare
    # ``except`` so KeyboardInterrupt and SystemExit still propagate.
    try:
        return etree.XML(html)
    except Exception:
        pass  # not well-formed XML; try the HTML parsers next

    try:
        parser = HTMLParser()
        parser.feed(html)
        return parser.close()
    except Exception:
        pass  # feed-based parsing failed; try etree.HTML next

    try:
        return etree.HTML(html)
    except Exception:
        # Last resort; errors raised here are reported to the caller.
        return soupparser.fromstring(html)