# Example 1: scraping page data with regular expressions (re)
import re
from chp1.advanced_link_crawler import download

# Scrape country attributes from an example page using regular expressions.
url = 'http://example.webscraping.com/places/default/view/Aland-Islands-2'
html = download(url)
# type(html) is <class 'str'>.
# urlopen(url).read() would return a bytes-like object, which str-mode
# regular expressions cannot match directly;
# urlopen(url).read().decode('utf-8') yields <class 'str'>, which regular
# expressions can match.

# Every table-cell value on the page.
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html))
# The second cell (the country's area).
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)[1])
# Anchor on the surrounding row markup so only the area cell matches.
print(re.findall(r'<tr id="places_area__row"><td class="w2p_fl"><label id="places_area__label" class="readonly" for="places_area" >Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
# More robust: skip over the label markup and tolerate attribute-quoting
# variations.  Raw string (r'''...''') so the \s escape is not flagged as an
# invalid escape sequence (SyntaxWarning on modern Python); the pattern's
# value is unchanged.
print(re.findall(r'''<tr id="places_area__row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', html))


# Example 2: repairing and navigating broken HTML with BeautifulSoup
from bs4 import BeautifulSoup
from pprint import pprint
import html5lib

# Compare how two parsers repair the same piece of broken HTML.
broken_html = '<ul class=country><li>Area<li>Population</ul>'

# html.parser closes the document but leaves the <li> tags nested:
# <ul class="country"><li>Area<li>Population</li></li></ul>
soup = BeautifulSoup(broken_html, 'html.parser')

# html5lib produces a complete, correctly nested document:
# <html><head></head><body><ul class="country"><li>Area</li><li>Population</li></ul></body></html>
# More complete, more correct.
soup = BeautifulSoup(broken_html, 'html5lib')

soup.li    # the first <li> element
soup.body  # the <body> element

soup.find('ul', attrs={'class': 'country'})
# The class attribute alone is enough to locate the element:
# <ul class="country"><li>Area</li><li>Population</li></ul>
soup.find(attrs={'class': 'country'})
# Ejemplo n.º 2 — duplicate of the regex scraper, run against a different URL
import re
from chp1.advanced_link_crawler import download

# Scrape country attributes for the United Kingdom page with regular expressions.
url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
html = download(url)

# Every table-cell value on the page.
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html))

# The second cell (the country's area).
print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)[1])

# Anchor on the surrounding row markup so only the area cell matches.
print(re.findall(r'<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html))

# More robust: skip over the label markup and tolerate attribute-quoting
# variations.  Raw string (r'''...''') so the \s escape is not flagged as an
# invalid escape sequence (SyntaxWarning on modern Python); the pattern's
# value is unchanged.
print(re.findall(r'''<tr id="places_area__row">.*?<td\s*class=["']w2p_fw["']>(.*?)</td>''', html))
import time
import re
from chp2.all_scrapers import re_scraper, bs_scraper, \
    lxml_scraper, lxml_xpath_scraper
from chp1.advanced_link_crawler import download

NUM_ITERATIONS = 1000  # number of times to test each scraper

# Download the page once; every scraper parses the same HTML string.
html = download(
    'http://example.python-scraping.com/places/view/United-Kingdom-233')

scrapers = [('Regular expressions', re_scraper), ('BeautifulSoup', bs_scraper),
            ('Lxml', lxml_scraper), ('Xpath', lxml_xpath_scraper)]

for name, scraper in scrapers:
    # Record start time of scrape.  perf_counter() is the monotonic,
    # highest-resolution clock intended for benchmarking; time.time() is
    # wall-clock and can jump (NTP adjustments), skewing the measurement.
    start = time.perf_counter()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            # Clear re's internal pattern cache so every iteration pays the
            # full compile cost, keeping the comparison with the other
            # scrapers fair.
            re.purge()
        result = scraper(html)
        # Check the scraped result is as expected.
        assert result['area'] == '244,820 square kilometres'
    # Record end time of scrape and output the total.
    end = time.perf_counter()
    print('%s: %.2f seconds' % (name, end - start))