import requests
from lxml import etree


def Schools():
    """Scrape every provincial listing page and return one
    (label, school, code, department, location, level, note) tuple per school."""
    global province
    mingdan = []   # mingdan ("名单"): the roster of schools
    province = []  # number of schools contributed by each province
    for url in Provinces():
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        html = etree.HTML(res.text, parser=etree.HTMLPullParser(encoding='utf-8'))
        # Skip the two header rows of the table.
        lines = html.xpath("//div[@class='tablebox']/table/tbody/tr")[2:]
        province.append(len(lines))
        for line in lines:
            label = line.xpath("td[1]/text()")[0]
            school = line.xpath("td[2]/text()")[0]
            code = line.xpath("td[3]/text()")[0]
            department = line.xpath("td[4]/text()")[0]
            location = line.xpath("td[5]/text()")[0]
            level = line.xpath("td[6]/text()")[0]
            # '民办' marks a privately run school; anything else is
            # treated as '公办' (public).
            if line.xpath("td[7]/text()") != ['民办']:
                note = '公办'
            else:
                note = '民办'
            mingdan.append(
                (label, school, code, department, location, level, note))
    return mingdan
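# Schools() also leans on a shared `headers` dict and a Provinces() iterable
# of per-province listing URLs that are not shown here. A minimal sketch of
# what they might look like; the index URL and the XPath are illustrative
# assumptions, not the original source:
headers = {'User-Agent': 'Mozilla/5.0'}


def Provinces():
    """Yield the URL of each provincial listing page (hypothetical)."""
    index = requests.get('https://example.com/school-list/', headers=headers)
    index.encoding = 'utf-8'
    for href in etree.HTML(index.text).xpath("//div[@class='provincebox']//a/@href"):
        yield href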
def pumper(html_generator):
    """
    Pulls HTML from source generator, feeds it to the parser and
    yields DOM elements.
    """
    source = html_generator()
    parser = etree.HTMLPullParser(events=('start', 'end'), remove_comments=True)
    while True:
        for element in parser.read_events():
            yield element
        try:
            parser.feed(next(source))
        except StopIteration:
            # forces close of any unclosed tags
            parser.feed('</html>')
            for element in parser.read_events():
                yield element
            break
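# A minimal way to drive pumper(): any zero-argument callable that yields
# HTML fragments works as the source. The two-chunk document below is made
# up for illustration.
def chunks():
    yield '<html><body><p>Hello'
    yield ' world</p><p>Bye</p>'


for event, element in pumper(chunks):
    if event == 'end' and element.tag == 'p':
        print(element.text)  # prints 'Hello world', then 'Bye'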
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36',
    'Host': 'movie.douban.com',
    'Referer': 'https://movie.douban.com'
}
url = 'https://movie.douban.com/cinema/nowplaying/chongqing/'
resp = requests.get(url=url, headers=headers)
# resp.content is the raw payload (bytes, not decoded);
# resp.text is the decoded str.
text = resp.text
# Extract and parse the data.
parser = etree.HTMLPullParser(encoding='utf-8')
html = etree.HTML(text=text, parser=parser)
ul = html.xpath('//ul[@class="lists"]')[0]
lis = ul.xpath('./li')
movie = []
for li in lis:
    titles = li.xpath('./@data-title')[0]
    scores = li.xpath('./@data-score')[0]
    years = li.xpath('./@data-release')[0]
    times = li.xpath('./@data-duration')[0]
    places = li.xpath('./@data-region')[0]
    directors = li.xpath('./@data-director')[0]
    actors = li.xpath('./@data-actors')[0]
    posters = li.xpath('.//li[@class="poster"]//img/@src')[0]
    # Key names below are inferred from the fields collected above.
    movies = {
        'poster': posters,
        'title': titles,
        'score': scores,
        'release': years,
        'duration': times,
        'region': places,
        'director': directors,
        'actors': actors,
    }
    movie.append(movies)
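# Note that the pull parser above is never actually used incrementally:
# etree.HTML() parses the already-buffered resp.text in a single call, so a
# plain etree.HTMLParser would behave identically. A sketch of what truly
# streaming the response through HTMLPullParser could look like (same URL
# and headers as above):
parser = etree.HTMLPullParser(events=('end',), encoding='utf-8')
with requests.get(url, headers=headers, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=4096):
        parser.feed(chunk)
        for _, li in parser.read_events():
            if li.tag == 'li' and li.get('data-title'):
                print(li.get('data-title'), li.get('data-score'))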
from collections.abc import Iterator

from lxml import etree
from lxml.etree import ElementBase


def iterparse(source, encoding=None, events=None,
              include_meta_charset_tag=False, **kwargs):
    """Incrementally parse HTML document into ElementTree.

    TODO:
        1. Make iterparse function take in a factory argument which
           defines the output of the generator.
        2. Modify the links function to be a subclass of Iterator and it
           should be passable to iterparse as factory arg.
        3. Make an additional no-op iterator and a forms iterator.
    """
    if not hasattr(source, "read"):
        # XXX No implicit opening (or closing) of file-like sources here.
        raise TypeError("Expected a readable object, got %r" % source)

    encoding = encoding or 'iso-8859-1'  # rfc default web encoding
    parser = etree.HTMLPullParser(events=events, encoding=encoding, **kwargs)
    lookup = etree.ElementDefaultClassLookup(ElementBase)
    parser.set_element_class_lookup(lookup)

    def iterator():
        while True:
            for event, element in parser.read_events():
                for child in links(element):
                    if child is None:
                        continue
                    yield child
            data = source.read(0o3000)  # 0o3000 octal == 1536 bytes
            if not data:
                break
            parser.feed(data)
        if include_meta_charset_tag:
            # Write the inferred charset into the document so that readers
            # of the output use our specified encoding.
            parser.feed(('<meta charset="%s" />' % encoding).encode(
                encoding, 'xmlcharrefreplace'))
        try:
            root = parser.close()
        except etree.XMLSyntaxError:
            # Empty or hopelessly broken input: close a minimal document
            # instead. The parser could generate end events for html and
            # body tags which the parser itself inserted.
            parser.feed('<html></html>'.encode(encoding, 'xmlcharrefreplace'))
            root = parser.close()
        it.root = root
        # noinspection PyUnusedLocal
        root = None

    class IterParseIterator(Iterator):
        next = __next__ = iterator().__next__

    it = IterParseIterator()
    it.root = None
    del IterParseIterator

    return it
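# iterparse() depends on a links() helper from the same project, which
# presumably filters each parsed element down to the link elements of
# interest. With the stand-in below (an assumption, not the project's
# definition), the generator can be exercised end to end:
import io


def links(element):
    # Stand-in: pass through <a> elements, ignore everything else.
    if element.tag == 'a':
        yield element


doc = io.BytesIO(b'<html><body>'
                 b'<a href="/one">1</a><a href="/two">2</a>'
                 b'</body></html>')
for a in iterparse(doc, encoding='utf-8'):
    print(a.get('href'))  # prints /one, then /two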
import re

from aiohttp import ClientSession
from lxml import etree


async def fetch_playlist(playlistId: str, chunk_size: int = 1024):
    """
    Fetch all metadata of a YouTube playlist.

    Performs multiple asynchronous HTTP requests to obtain all metadata
    and playlist items. Does not provide rich YouTube metadata: fields
    that are included in browse_request queries for modern JS YT-Clients.

    Result:
        {
            'id': string,
            'title': string,
            'description': string,
            'thumbnail': string,
            'length': integer,
            'views': integer,
            'uploader': {
                'name': string,
                'url': string
            },
            'items': [
                {
                    'id': string,
                    'title': string,
                    'uploader': {
                        'name': string,
                        'url': string
                    },
                    'lengthSeconds': integer,
                }
            ]
        }
    """
    #
    # Initialize the result.
    #
    playlist = {'id': playlistId, 'items': []}

    #
    # Build the headers for the ClientSession.
    #
    headers = get_default_headers()
    # Only accept HTML.
    headers['Accept'] = 'text/html'

    #
    # Retrieve landing page.
    #

    # Open an auto-raising ClientSession with default cookie handling.
    async with ClientSession(headers=headers, raise_for_status=True) as session:
        # Step 1: Get the initial landing page.
        async with session.get('https://www.youtube.com/playlist',
                               params={'list': playlistId}) as response:
            # Assert that this really worked the way we wanted.
            assert response.status == 200
            assert response.content_type == 'text/html'
            encoding = response.get_encoding()
            assert is_valid_encoding(encoding)

            # Retrieve the '//div[@id=""]' node.
            is_content = lambda x: (x.tag, x.get('id')) == ('div', '')
            parser = etree.HTMLPullParser(events=('start', 'end'))
            content, discard = None, True
            while content is None and not response.content.at_eof():
                # Feed the parser the next chunk of data.
                parser.feed(
                    (await response.content.read(chunk_size)).decode(encoding))
                for event, node in parser.read_events():
                    if event == 'start':
                        if is_content(node):
                            # Content node reached, stop discarding.
                            discard = False
                        continue
                    if is_content(node):
                        # Content node finished, exit.
                        content = node
                        break
                    if discard:
                        # Discard everything before this point.
                        node.clear()
                        for ancestor in node.xpath('ancestor-or-self::*'):
                            while ancestor.getprevious() is not None:
                                del ancestor.getparent()[0]

        #
        # Parse the playlist header.
        #
        pl_header = content[0]
        assert pl_header.get('id') == 'pl-header'

        # Get the thumbnail.
        pl_thumb = pl_header[0][0]
        assert pl_thumb.tag == 'img'
        playlist['thumbnail'] = pl_thumb.get('src')

        # Get the title.
        pl_title = pl_header[1][0]
        assert pl_title.tag == 'h1'
        playlist['title'] = pl_title.text.strip()

        # Get the uploader.
        pl_uploader = pl_header[1][1][0][0]
        assert pl_uploader.tag == 'a'
        playlist['uploader'] = {
            'name': pl_uploader.text,
            'url': 'https://www.youtube.com' + pl_uploader.get('href')
        }

        # Get the length.
        pl_length = pl_header[1][1][1]
        assert pl_length.tag == 'li'
        playlist['length'] = parse_int(pl_length.text, aggressive=True)

        # Get the view count.
        pl_views = pl_header[1][1][2]
        assert pl_views.tag == 'li'
        playlist['views'] = parse_int(pl_views.text, aggressive=True)

        # Get the description.
        pl_description = pl_header[1][2][0]
        assert pl_description.tag == 'span'
        playlist['description'] = pl_description.text.strip()

        #
        # Parse the playlist items.
        #
        def parse_item(node):
            item = {'id': node.get('data-video-id')}

            # Get the video thumbnail.
            vid_thumb = node[2][0][0][0][0][0][0]
            assert vid_thumb.tag == 'img'
            item['thumbnail'] = vid_thumb.get('data-thumb')

            # Get video title.
            vid_title = node[3][0]
            assert vid_title.tag == 'a'
            item['title'] = vid_title.text.strip()

            # Get video uploader.
            vid_uploader = node[3][1][0]
            assert vid_uploader.tag == 'a'
            item['uploader'] = {
                'name': vid_uploader.text,
                'url': 'https://www.youtube.com' + vid_uploader.get('href')
            }

            # Get video length.
            vid_length = node[6][0][0][0]
            assert vid_length.tag == 'span'
            item['lengthSeconds'] = parse_ts(vid_length.text)

            return item

        pl_items = content[1][0][0][0][0]
        assert pl_items.get('id') == 'pl-load-more-destination'
        playlist['items'] += map(parse_item, pl_items)

        #
        # Fetch and parse all continuations.
        #
        load_more = index_s(content, 1, 0, 0, 1)
        # index_s may return None, so guard the tag assertion.
        assert load_more is None or load_more.tag == 'button'
        load_more = load_more.get(
            'data-uix-load-more-href') if load_more is not None else None
        while load_more is not None:
            # Request the continuation contents.
            async with session.get('https://www.youtube.com' + load_more,
                                   headers={'Accept': 'application/json'}
                                   ) as response:
                # Assert that this really worked the way we wanted.
                assert response.status == 200
                assert response.content_type == 'application/json'
                encoding = response.get_encoding()
                assert is_valid_encoding(encoding)

                # Parse the result data (large!).
                # TODO: This ought to be streamed as well.
                data = await response.json()
                assert 'content_html' in data
                assert 'load_more_widget_html' in data

                # Parse all new items.
                parser = etree.HTMLPullParser(events=('end',))
                parser.feed(data['content_html'])
                for _, node in parser.read_events():
                    if node.tag == 'tr':
                        playlist['items'] += [parse_item(node)]
                        # Discard everything before this point.
                        node.clear()
                        for ancestor in node.xpath('ancestor-or-self::*'):
                            while ancestor.getprevious() is not None:
                                del ancestor.getparent()[0]

                # Extract the next continuation link.
                match = re.search(r'data-uix-load-more-href=\"(.+?)\"',
                                  data['load_more_widget_html'])
                if match:
                    # Next continuation link.
                    load_more = match.group(1)
                else:
                    # No more continuations.
                    load_more = None

    return playlist
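# fetch_playlist() leans on several helpers defined elsewhere in its project:
# get_default_headers(), is_valid_encoding(), parse_int(), parse_ts() and
# index_s(). The stand-ins below are assumptions about their behavior,
# sketched only so the coroutine can be exercised (note the markup it scrapes
# is the old YouTube playlist layout):
import asyncio


def get_default_headers():
    return {'User-Agent': 'Mozilla/5.0'}


def is_valid_encoding(encoding):
    return encoding and encoding.lower() in ('utf-8', 'iso-8859-1')


def parse_int(text, aggressive=False):
    # aggressive=True keeps only the digits, e.g. '1,234 views' -> 1234.
    return int(re.sub(r'\D', '', text)) if aggressive else int(text)


def parse_ts(text):
    # 'h:mm:ss' or 'm:ss' -> total seconds, e.g. '4:13' -> 253.
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def index_s(node, *indices):
    # Safe nested indexing: returns None instead of raising IndexError.
    for i in indices:
        if node is None or len(node) <= i:
            return None
        node = node[i]
    return node


# playlist = asyncio.run(fetch_playlist('PL...'))  # hypothetical playlist id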