import requests
from lxml import etree


def Schools():
    """Scrape every provincial listing page and return one
    (label, school, code, department, location, level, note) tuple per school."""
    global province
    mingdan = []   # mingdan ("名单"): the roster of schools
    province = []  # number of schools contributed by each province
    for url in Provinces():
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        html = etree.HTML(res.text, parser=etree.HTMLPullParser(encoding='utf-8'))
        # Skip the two header rows of the table.
        lines = html.xpath("//div[@class='tablebox']/table/tbody/tr")[2:]
        province.append(len(lines))
        for line in lines:
            label = line.xpath("td[1]/text()")[0]
            school = line.xpath("td[2]/text()")[0]
            code = line.xpath("td[3]/text()")[0]
            department = line.xpath("td[4]/text()")[0]
            location = line.xpath("td[5]/text()")[0]
            level = line.xpath("td[6]/text()")[0]
            # '民办' marks a privately run school; anything else is
            # treated as '公办' (public).
            if line.xpath("td[7]/text()") != ['民办']:
                note = '公办'
            else:
                note = '民办'
            mingdan.append(
                (label, school, code, department, location, level, note))
    return mingdan
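# Schools() also leans on a shared `headers` dict and a Provinces() iterable
# of per-province listing URLs that are not shown here. A minimal sketch of
# what they might look like; the index URL and the XPath are illustrative
# assumptions, not the original source:
headers = {'User-Agent': 'Mozilla/5.0'}


def Provinces():
    """Yield the URL of each provincial listing page (hypothetical)."""
    index = requests.get('https://example.com/school-list/', headers=headers)
    index.encoding = 'utf-8'
    for href in etree.HTML(index.text).xpath("//div[@class='provincebox']//a/@href"):
        yield href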
def pumper(html_generator):
    """
    Pulls HTML from source generator, feeds it to the parser and
    yields DOM elements.
    """
    source = html_generator()
    parser = etree.HTMLPullParser(events=('start', 'end'), remove_comments=True)
    while True:
        for element in parser.read_events():
            yield element
        try:
            parser.feed(next(source))
        except StopIteration:
            # forces close of any unclosed tags
            parser.feed('</html>')
            for element in parser.read_events():
                yield element
            break
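# A minimal way to drive pumper(): any zero-argument callable that yields
# HTML fragments works as the source. The two-chunk document below is made
# up for illustration.
def chunks():
    yield '<html><body><p>Hello'
    yield ' world</p><p>Bye</p>'


for event, element in pumper(chunks):
    if event == 'end' and element.tag == 'p':
        print(element.text)  # prints 'Hello world', then 'Bye'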
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com'
}
url = 'https://movie.douban.com/cinema/nowplaying/chongqing/'
resp = requests.get(url=url, headers=headers)
# resp.content is the raw payload (bytes, not decoded);
# resp.text is the decoded str.
text = resp.text
# Extract and parse the data.
parser = etree.HTMLPullParser(encoding='utf-8')
html = etree.HTML(text=text, parser=parser)
ul = html.xpath('//ul[@class="lists"]')[0]
lis = ul.xpath('./li')
movie = []
for li in lis:
    titles = li.xpath('./@data-title')[0]
    scores = li.xpath('./@data-score')[0]
    years = li.xpath('./@data-release')[0]
    times = li.xpath('./@data-duration')[0]
    places = li.xpath('./@data-region')[0]
    directors = li.xpath('./@data-director')[0]
    actors = li.xpath('./@data-actors')[0]
    posters = li.xpath('.//li[@class="poster"]//img/@src')[0]
    # Key names below are inferred from the fields collected above.
    movies = {
        'poster': posters,
        'title': titles,
        'score': scores,
        'release': years,
        'duration': times,
        'region': places,
        'director': directors,
        'actors': actors,
    }
    movie.append(movies)
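# Note that the pull parser above is never actually used incrementally:
# etree.HTML() parses the already-buffered resp.text in a single call, so a
# plain etree.HTMLParser would behave identically. A sketch of what truly
# streaming the response through HTMLPullParser could look like (same URL
# and headers as above):
parser = etree.HTMLPullParser(events=('end',), encoding='utf-8')
with requests.get(url, headers=headers, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=4096):
        parser.feed(chunk)
        for _, li in parser.read_events():
            if li.tag == 'li' and li.get('data-title'):
                print(li.get('data-title'), li.get('data-score'))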
from collections.abc import Iterator

from lxml import etree
from lxml.etree import ElementBase


def iterparse(source, encoding=None, events=None,
              include_meta_charset_tag=False, **kwargs):
    """Incrementally parse HTML document into ElementTree.

    TODO:
        1. Make iterparse function take in a factory argument which
           defines the output of the generator.
        2. Modify the links function to be a subclass of Iterator and it
           should be passable to iterparse as factory arg.
        3. Make an additional no-op iterator and a forms iterator.
    """
    if not hasattr(source, "read"):
        # XXX No implicit opening (or closing) of file-like sources here.
        raise TypeError("Expected a readable object, got %r" % source)

    encoding = encoding or 'iso-8859-1'  # rfc default web encoding
    parser = etree.HTMLPullParser(events=events, encoding=encoding, **kwargs)
    lookup = etree.ElementDefaultClassLookup(ElementBase)
    parser.set_element_class_lookup(lookup)

    def iterator():
        while True:
            for event, element in parser.read_events():
                for child in links(element):
                    if child is None:
                        continue
                    yield child
            data = source.read(0o3000)  # 0o3000 octal == 1536 bytes
            if not data:
                break
            parser.feed(data)
        if include_meta_charset_tag:
            # Write the inferred charset into the document so that readers
            # of the output use our specified encoding.
            parser.feed(('<meta charset="%s" />' % encoding).encode(
                encoding, 'xmlcharrefreplace'))
        try:
            root = parser.close()
        except etree.XMLSyntaxError:
            # Empty or hopelessly broken input: close a minimal document
            # instead. The parser could generate end events for html and
            # body tags which the parser itself inserted.
            parser.feed('<html></html>'.encode(encoding, 'xmlcharrefreplace'))
            root = parser.close()
        it.root = root
        # noinspection PyUnusedLocal
        root = None

    class IterParseIterator(Iterator):
        next = __next__ = iterator().__next__

    it = IterParseIterator()
    it.root = None
    del IterParseIterator

    return it
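# iterparse() depends on a links() helper from the same project, which
# presumably filters each parsed element down to the link elements of
# interest. With the stand-in below (an assumption, not the project's
# definition), the generator can be exercised end to end:
import io


def links(element):
    # Stand-in: pass through <a> elements, ignore everything else.
    if element.tag == 'a':
        yield element


doc = io.BytesIO(b'<html><body>'
                 b'<a href="/one">1</a><a href="/two">2</a>'
                 b'</body></html>')
for a in iterparse(doc, encoding='utf-8'):
    print(a.get('href'))  # prints /one, then /two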
import re

from aiohttp import ClientSession
from lxml import etree


async def fetch_playlist(playlistId: str, chunk_size: int = 1024):
    """
    Fetch all metadata of a YouTube playlist.

    Performs multiple asynchronous HTTP requests to obtain all metadata
    and playlist items. Does not provide rich YouTube metadata: fields
    that are included in browse_request queries for modern JS YT-Clients.

    Result:
        {
            'id': string,
            'title': string,
            'description': string,
            'thumbnail': string,
            'length': integer,
            'views': integer,
            'uploader': {
                'name': string,
                'url': string
            },
            'items': [
                {
                    'id': string,
                    'title': string,
                    'uploader': {
                        'name': string,
                        'url': string
                    },
                    'lengthSeconds': integer,
                }
            ]
        }
    """
    #
    # Initialize the result.
    #
    playlist = {'id': playlistId, 'items': []}

    #
    # Build the headers for the ClientSession.
    #
    headers = get_default_headers()
    # Only accept HTML.
    headers['Accept'] = 'text/html'

    #
    # Retrieve landing page.
    #

    # Open an auto-raising ClientSession with default cookie handling.
    async with ClientSession(headers=headers, raise_for_status=True) as session:
        # Step 1: Get the initial landing page.
        async with session.get('https://www.youtube.com/playlist',
                               params={'list': playlistId}) as response:
            # Assert that this really worked the way we wanted.
            assert response.status == 200
            assert response.content_type == 'text/html'
            encoding = response.get_encoding()
            assert is_valid_encoding(encoding)

            # Retrieve the '//div[@id=""]' node.
            is_content = lambda x: (x.tag, x.get('id')) == ('div', '')
            parser = etree.HTMLPullParser(events=('start', 'end'))
            content, discard = None, True
            while content is None and not response.content.at_eof():
                # Feed the parser the next chunk of data.
                parser.feed(
                    (await response.content.read(chunk_size)).decode(encoding))
                for event, node in parser.read_events():
                    if event == 'start':
                        if is_content(node):
                            # Content node reached, stop discarding.
                            discard = False
                        continue
                    if is_content(node):
                        # Content node finished, exit.
                        content = node
                        break
                    if discard:
                        # Discard everything before this point.
                        node.clear()
                        for ancestor in node.xpath('ancestor-or-self::*'):
                            while ancestor.getprevious() is not None:
                                del ancestor.getparent()[0]

        #
        # Parse the playlist header.
        #
        pl_header = content[0]
        assert pl_header.get('id') == 'pl-header'

        # Get the thumbnail.
        pl_thumb = pl_header[0][0]
        assert pl_thumb.tag == 'img'
        playlist['thumbnail'] = pl_thumb.get('src')

        # Get the title.
        pl_title = pl_header[1][0]
        assert pl_title.tag == 'h1'
        playlist['title'] = pl_title.text.strip()

        # Get the uploader.
        pl_uploader = pl_header[1][1][0][0]
        assert pl_uploader.tag == 'a'
        playlist['uploader'] = {
            'name': pl_uploader.text,
            'url': 'https://www.youtube.com' + pl_uploader.get('href')
        }

        # Get the length.
        pl_length = pl_header[1][1][1]
        assert pl_length.tag == 'li'
        playlist['length'] = parse_int(pl_length.text, aggressive=True)

        # Get the view count.
        pl_views = pl_header[1][1][2]
        assert pl_views.tag == 'li'
        playlist['views'] = parse_int(pl_views.text, aggressive=True)

        # Get the description.
        pl_description = pl_header[1][2][0]
        assert pl_description.tag == 'span'
        playlist['description'] = pl_description.text.strip()

        #
        # Parse the playlist items.
        #
        def parse_item(node):
            item = {'id': node.get('data-video-id')}

            # Get the video thumbnail.
            vid_thumb = node[2][0][0][0][0][0][0]
            assert vid_thumb.tag == 'img'
            item['thumbnail'] = vid_thumb.get('data-thumb')

            # Get video title.
            vid_title = node[3][0]
            assert vid_title.tag == 'a'
            item['title'] = vid_title.text.strip()

            # Get video uploader.
            vid_uploader = node[3][1][0]
            assert vid_uploader.tag == 'a'
            item['uploader'] = {
                'name': vid_uploader.text,
                'url': 'https://www.youtube.com' + vid_uploader.get('href')
            }

            # Get video length.
            vid_length = node[6][0][0][0]
            assert vid_length.tag == 'span'
            item['lengthSeconds'] = parse_ts(vid_length.text)

            return item

        pl_items = content[1][0][0][0][0]
        assert pl_items.get('id') == 'pl-load-more-destination'
        playlist['items'] += map(parse_item, pl_items)

        #
        # Fetch and parse all continuations.
        #
        load_more = index_s(content, 1, 0, 0, 1)
        # index_s may return None, so guard the tag assertion.
        assert load_more is None or load_more.tag == 'button'
        load_more = load_more.get(
            'data-uix-load-more-href') if load_more is not None else None
        while load_more is not None:
            # Request the continuation contents.
            async with session.get('https://www.youtube.com' + load_more,
                                   headers={'Accept': 'application/json'}
                                   ) as response:
                # Assert that this really worked the way we wanted.
                assert response.status == 200
                assert response.content_type == 'application/json'
                encoding = response.get_encoding()
                assert is_valid_encoding(encoding)

                # Parse the result data (large!).
                # TODO: This ought to be streamed as well.
                data = await response.json()
                assert 'content_html' in data
                assert 'load_more_widget_html' in data

                # Parse all new items.
                parser = etree.HTMLPullParser(events=('end',))
                parser.feed(data['content_html'])
                for _, node in parser.read_events():
                    if node.tag == 'tr':
                        playlist['items'] += [parse_item(node)]
                        # Discard everything before this point.
                        node.clear()
                        for ancestor in node.xpath('ancestor-or-self::*'):
                            while ancestor.getprevious() is not None:
                                del ancestor.getparent()[0]

                # Extract the next continuation link.
                match = re.search(r'data-uix-load-more-href=\"(.+?)\"',
                                  data['load_more_widget_html'])
                if match:
                    # Next continuation link.
                    load_more = match.group(1)
                else:
                    # No more continuations.
                    load_more = None

    return playlist
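# fetch_playlist() leans on several helpers defined elsewhere in its project:
# get_default_headers(), is_valid_encoding(), parse_int(), parse_ts() and
# index_s(). The stand-ins below are assumptions about their behavior,
# sketched only so the coroutine can be exercised (note the markup it scrapes
# is the old YouTube playlist layout):
import asyncio


def get_default_headers():
    return {'User-Agent': 'Mozilla/5.0'}


def is_valid_encoding(encoding):
    return encoding and encoding.lower() in ('utf-8', 'iso-8859-1')


def parse_int(text, aggressive=False):
    # aggressive=True keeps only the digits, e.g. '1,234 views' -> 1234.
    return int(re.sub(r'\D', '', text)) if aggressive else int(text)


def parse_ts(text):
    # 'h:mm:ss' or 'm:ss' -> total seconds, e.g. '4:13' -> 253.
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def index_s(node, *indices):
    # Safe nested indexing: returns None instead of raising IndexError.
    for i in indices:
        if node is None or len(node) <= i:
            return None
        node = node[i]
    return node


# playlist = asyncio.run(fetch_playlist('PL...'))  # hypothetical playlist id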