"""Chapter-2 scraping examples: regular expressions vs. BeautifulSoup."""
import re
from pprint import pprint

import html5lib
from bs4 import BeautifulSoup

from chp1.advanced_link_crawler import download

# --- 1. Regular-expression scraping --------------------------------------
url = 'http://example.webscraping.com/places/default/view/Aland-Islands-2'
# download() returns a <class 'str'>.  NOTE: urlopen(url).read() yields a
# bytes-like object that str-mode regexes cannot match; decode('utf-8')
# first to obtain a str before matching.
html = download(url)

# All table cell values on the page.
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html))
# Second cell only (the area value on this page layout).
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)[1])
# Brittle: depends on the exact attribute order/markup of the area row.
print(re.findall(r'<tr id="places_area__row"><td class="w2p_fl"><label id="places_area__label" class="readonly" for="places_area" >Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
# More robust: tolerates whitespace and either quote style around the class.
print(re.findall(r'''<tr id="places_area__row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', html))

# --- 2. BeautifulSoup -----------------------------------------------------
broken_html = '<ul class=country><li>Area<li>Population</ul>'

soup = BeautifulSoup(broken_html, 'html.parser')
# html.parser closes the tags but nests the <li> elements:
#   <ul class="country"><li>Area<li>Population</li></li></ul>

soup = BeautifulSoup(broken_html, 'html5lib')
# html5lib builds a more complete, correct tree:
#   <html><head></head><body><ul class="country">
#   <li>Area</li><li>Population</li></ul></body></html>

soup.li    # first <li> tag
soup.body  # <body> tag contents
soup.find('ul', attrs={'class': 'country'})
soup.find(attrs={'class': 'country'})  # <ul class="country"><li>Area</li><li>Population</li></ul>
"""Regular-expression scraping example for the United Kingdom page."""
import re

from chp1.advanced_link_crawler import download

url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
html = download(url)  # returns the decoded page as str

# All table cell values on the page.
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html))
# Second cell only (the area value on this page layout).
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)[1])
# Brittle: depends on the exact markup of the area row.
print(re.findall(r'<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
# More robust: tolerates whitespace and either quote style around the class.
print(re.findall(r'''<tr id="places_area__row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', html))
"""Benchmark the four chapter-2 scrapers against one downloaded page."""
import re
import time

from chp1.advanced_link_crawler import download
from chp2.all_scrapers import re_scraper, bs_scraper, \
    lxml_scraper, lxml_xpath_scraper

# How many times each scraper is run per timing.
NUM_ITERATIONS = 1000

html = download(
    'http://example.python-scraping.com/places/view/United-Kingdom-233')

scrapers = [
    ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper),
]

for name, scraper in scrapers:
    # Time NUM_ITERATIONS runs of this scraper.
    start = time.time()
    for _ in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            # Clear the module's compiled-pattern cache so the regex
            # scraper is not unfairly advantaged by caching.
            re.purge()
        result = scraper(html)
        # Sanity-check that the scraper extracted the expected value.
        assert result['area'] == '244,820 square kilometres'
    elapsed = time.time() - start
    print('%s: %.2f seconds' % (name, elapsed))