Example #1
import re
import time

from user_agent_download import download

# regex_scraper, beautiful_soup_scraper and lxml_scraper are assumed to be
# defined in the same module (a sketch follows this example)


def main():
    times = {}
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    html = html.decode('utf-8')
    NUM_ITERATIONS = 1000  # number of times to test each scraper
    scrapers = [('Regular expressions', regex_scraper),
                ('Beautiful Soup', beautiful_soup_scraper),
                ('Lxml', lxml_scraper)]
    for name, scraper in scrapers:
        times[name] = []
        # record start time of scrape
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == regex_scraper:
                # the re module caches compiled patterns, so purge the
                # cache to get meaningful timings
                re.purge()
            result = scraper(html)
            # check the scraped result is as expected
            assert result['area'] == '244,820 square kilometres'
            # record the cumulative elapsed time after each iteration
            times[name].append(time.time() - start)
        # record end time of scrape and output the total
        end = time.time()
        print('{}: {:.2f} seconds'.format(name, end - start))
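The three scraper functions compared in main() are not shown in this listing. A minimal sketch of what they might look like, assuming each returns a dict of field values and that each table row sits on a single line of markup (as the patterns in the last example suggest); the FIELDS tuple here is trimmed to three fields for brevity:

import re

import lxml.html
from bs4 import BeautifulSoup

FIELDS = ('area', 'population', 'iso')  # trimmed; the real page has more


def regex_scraper(html):
    results = {}
    for field in FIELDS:
        # anchor on the row id, then capture the value cell's contents
        pattern = '<tr id="places_{}__row">.*?<td class="w2p_fw">(.*?)</td>'
        results[field] = re.search(pattern.format(field), html).group(1)
    return results


def beautiful_soup_scraper(html):
    soup = BeautifulSoup(html, 'html.parser')
    results = {}
    for field in FIELDS:
        row = soup.find('tr', id='places_{}__row'.format(field))
        results[field] = row.find('td', class_='w2p_fw').text
    return results


def lxml_scraper(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in FIELDS:
        selector = 'tr#places_{}__row > td.w2p_fw'.format(field)
        results[field] = tree.cssselect(selector)[0].text_content()
    return results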
Example #2
import itertools
import re

from bs4 import BeautifulSoup
from user_agent_download import download

# get_links is assumed to be the link-extraction helper sketched below


def soupTest2():
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    soup = BeautifulSoup(html, 'html.parser')
    # find the table row holding the area, then its value cell
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    print(td)
    area = td.text
    print(area)
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    while crawl_queue:
        url = crawl_queue.pop()
        print(url)
        html = download(url)
        html = html.decode('utf-8')
        for link in get_links(html):
            # queue every link that matches the regex; note this naive
            # version never resolves relative links and keeps no record of
            # what was already crawled, so it can loop forever on cycles
            if re.match(link_regex, link):
                crawl_queue.append(link)
                print(crawl_queue)
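Both crawler versions call a get_links helper that is not defined on this page. A minimal sketch, assuming it simply pulls every href value out of the decoded page with a regular expression:

import re


def get_links(html):
    """Return a list of href values found in the given HTML string."""
    # crude anchor-tag pattern; good enough for this site, not a real parser
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)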
def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage, so assume
            # we have reached the last country ID and can stop downloading
            break
        else:
            # success - can scrape the result
            # ...
            pass
Example #5
import urllib.parse


def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    # keep a record of every URL seen so far to avoid re-crawling pages
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        print(url)
        html = download(url)
        html = html.decode('utf-8')
        for link in get_links(html):
            if re.match(link_regex, link):
                # resolve relative links into absolute URLs
                link = urllib.parse.urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
        print(crawl_queue)
def iteration():
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume
                # we have passed the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
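A possible invocation of the de-duplicating crawler above; the link pattern is an assumption based on the URL layout used throughout these examples:

link_crawler('http://example.webscraping.com',
             '/places/default/(index|view)')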
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import lxml.html
from user_agent_download import download


def lxmlTest():
    # lxml can repair broken HTML such as missing closing tags and quotes
    broken_html = '<ul class=country><li>Area<li>Population</ul>'
    tree = lxml.html.fromstring(broken_html)
    fixed_html = lxml.html.tostring(tree, pretty_print=True)
    # tostring() returns bytes, so decode before printing
    print(fixed_html.decode('utf-8'))
    print('==lxmlTest Over==')


def scrape(html):
    tree = lxml.html.fromstring(html)
    # CSS selector support requires the cssselect package to be installed
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text_content()
    print(area)
    return area


if __name__ == '__main__':
    lxmlTest()
    html = download(
        'http://example.webscraping.com/places/default/view/United-Kingdom-239'
    )
    scrape(html)
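The download function imported from user_agent_download is not included in these listings. A minimal sketch, assuming it sets a custom User-agent header and retries server-side errors:

import urllib.error
import urllib.request


def download(url, user_agent='wswp', num_retries=2):
    """Download url and return its bytes, or None after repeated errors."""
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx server errors
            return download(url, user_agent, num_retries - 1)
    return html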
Example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
from user_agent_download import download

url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
html = download(url)
html = html.decode('utf-8')

# match every table value on the page
print(re.findall('<td class="w2p_fw">(.*?)</td>', html))

# anchor the match on the full area row to extract only the area value
print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label class="readonly" for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
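The second pattern above breaks as soon as any attribute inside the row changes. A shorter variant that anchors only on the row id tends to be less brittle; appended to the script above, it reuses the same html string:

print(re.search('<tr id="places_area__row">.*?<td class="w2p_fw">(.*?)</td>',
                html).group(1))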