Example #1
import re
import time

from user_agent_download import download

# regex_scraper, beautiful_soup_scraper and lxml_scraper are assumed to be
# defined in the same module (a sketch follows this example)


def main():
    times = {}
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    html = html.decode('utf-8')
    NUM_ITERATIONS = 1000  # number of times to test each scraper
    scrapers = [('Regular expressions', regex_scraper),
                ('Beautiful Soup', beautiful_soup_scraper),
                ('Lxml', lxml_scraper)]
    for name, scraper in scrapers:
        times[name] = []
        # record start time of scrape
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == regex_scraper:
                # the re module caches compiled patterns, so purge the
                # cache to get meaningful timings
                re.purge()
            result = scraper(html)
            # check the scraped result is as expected
            assert result['area'] == '244,820 square kilometres'
            # record the cumulative elapsed time after each iteration
            times[name].append(time.time() - start)
        # record end time of scrape and output the total
        end = time.time()
        print('{}: {:.2f} seconds'.format(name, end - start))
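The three scraper functions compared in main() are not shown in this listing. A minimal sketch of what they might look like, assuming each returns a dict of field values and that each table row sits on a single line of markup (as the patterns in the last example suggest); the FIELDS tuple here is trimmed to three fields for brevity:

import re

import lxml.html
from bs4 import BeautifulSoup

FIELDS = ('area', 'population', 'iso')  # trimmed; the real page has more


def regex_scraper(html):
    results = {}
    for field in FIELDS:
        # anchor on the row id, then capture the value cell's contents
        pattern = '<tr id="places_{}__row">.*?<td class="w2p_fw">(.*?)</td>'
        results[field] = re.search(pattern.format(field), html).group(1)
    return results


def beautiful_soup_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        row = soup.find('tr', id='places_{}__row'.format(field))
        results[field] = row.find('td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        selector = 'tr#places_{}__row > td.w2p_fw'.format(field)
        results[field] = tree.cssselect(selector)[0].text_content()
    return results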
Example #2
import itertools
import re

from bs4 import BeautifulSoup
from user_agent_download import download

# get_links is assumed to be the link-extraction helper sketched below


def soupTest2():
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    soup = BeautifulSoup(html, 'html.parser')
    # find the table row holding the area, then its value cell
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    print(td)
    area = td.text
    print(area)
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        print(url)
        html = download(url)
        html = html.decode('utf-8')
        for link in get_links(html):
            # queue every link that matches the regex; note this naive
            # version never resolves relative links and keeps no record of
            # what was already crawled, so it can loop forever on cycles
            if re.match(link_regex, link):
                crawl_queue.append(link)
                print(crawl_queue)
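Both crawler versions call a get_links helper that is not defined on this page. A minimal sketch, assuming it simply pulls every href value out of the decoded page with a regular expression:

import re


def get_links(html):
    """Return a list of href values found in the given HTML string."""
    # crude anchor-tag pattern; good enough for this site, not a real parser
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)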
def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage, so assume
            # we have reached the last country ID and can stop downloading
            break
        else:
            # success - can scrape the result
            # ...
            pass
Example #5
import urllib.parse


def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep a record of every URL seen so far to avoid re-crawling pages
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        print(url)
        html = download(url)
        html = html.decode('utf-8')
        for link in get_links(html):
            if re.match(link_regex, link):
                # resolve relative links into absolute URLs
                link = urllib.parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
        print(crawl_queue)
def iteration():
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume
                # we have passed the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
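A possible invocation of the de-duplicating crawler above; the link pattern is an assumption based on the URL layout used throughout these examples:

link_crawler('http://example.webscraping.com',
             '/places/default/(index|view)')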
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import lxml.html
from user_agent_download import download


def lxmlTest():
    # lxml can repair broken HTML such as missing closing tags and quotes
    broken_html = '<ul class=country><li>Area<li>Population</ul>'
    tree = lxml.html.fromstring(broken_html)
    fixed_html = lxml.html.tostring(tree, pretty_print=True)
    # tostring() returns bytes, so decode before printing
    print(fixed_html.decode('utf-8'))
    print('==lxmlTest Over==')


def scrape(html):
    tree = lxml.html.fromstring(html)
    # CSS selector support requires the cssselect package to be installed
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    print(area)
    return area


if __name__ == '__main__':
    lxmlTest()
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    scrape(html)
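The download function imported from user_agent_download is not included in these listings. A minimal sketch, assuming it sets a custom User-agent header and retries server-side errors:

import urllib.error
import urllib.request


def download(url, user_agent='wswp', num_retries=2):
    """Download url and return its bytes, or None after repeated errors."""
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx server errors
            return download(url, user_agent, num_retries - 1)
    return html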
Example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
from user_agent_download import download

url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
html = download(url)
html = html.decode('utf-8')

# match every table value on the page
print(re.findall('<td class="w2p_fw">(.*?)</td>', html))

# anchor the match on the full area row to extract only the area value
print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label class="readonly" for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
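The second pattern above breaks as soon as any attribute inside the row changes. A shorter variant that anchors only on the row id tends to be less brittle; appended to the script above, it reuses the same html string:

print(re.search('<tr id="places_area__row">.*?<td class="w2p_fw">(.*?)</td>',
                html).group(1))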