import re


def crawl_sitemap(url):
    # download the sitemap file as sitemap.xml
    sitemap = Download(url)
    # extract the sitemap links from the <loc> tags
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode("utf-8"))
    # download each link
    for link in links:
        html = Download(link)
import re


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = Download(url)
    # expected output: Downloading: http://example.webscraping.com/sitemap.xml
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = Download(link)
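The crawl_sitemap, link_crawler, and iteration snippets all call a Download helper that is not defined here. Judging from how it is used (called with a URL, returning the response body, or None when the request fails), a minimal stand-in could look like the sketch below; the retry logic and the user agent string are assumptions for illustration, not taken from the original examples.

import urllib.request
from urllib.error import URLError


def Download(url, num_retries=2):
    """Minimal stand-in for the Download helper assumed by these snippets.

    Returns the raw response body (bytes), or None if the download fails.
    """
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-Agent': 'wswp'})
    try:
        html = urllib.request.urlopen(request).read()
    except URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx server errors (assumed behaviour)
            return Download(url, num_retries - 1)
    return html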
import re


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs still to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
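Both link_crawler variants also rely on a get_links helper that is not shown. A regex-based sketch, assuming the downloaded HTML is already a decoded string, might be:

import re


def get_links(html):
    """Return a list of href values found in the given HTML string."""
    # simple regex for anchor tags; assumes html is a str, not bytes
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)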
import itertools


def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        # url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage,
            # so assume we have reached the last country ID and can stop
            break
        else:
            # success - can scrape the result
            # ...
            pass
import re
import urlparse  # Python 2; in Python 3 use: from urllib.parse import urljoin


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # only queue links we have not seen before
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
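An illustrative call for this crawler, using a regex that matches the example site's index and view pages (the exact pattern here is an assumption, not part of the original snippet):

link_crawler('http://example.webscraping.com', '/(index|view)')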
Example #6
import itertools


def iteration():
    """Exit the program after N consecutive download errors."""
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we
                # have passed the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
Example #7
from common import Download
from parsers import HtmlParser

dl = Download()
parse = HtmlParser()

# download the mtime.com Beijing theater page and parse the movie data from it
content = dl.download('http://theater.mtime.com/China_Beijing/')
res = parse._parse_movies(content)
print(res)
Example #8
    def __init__(self):
        # wire together the crawler's components: URL queue manager,
        # downloader, HTML parser, and data output
        self.manager = URLManager()
        self.down = Download()
        self.parser = HtmlParser()
        self.output = DataOutput()
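Example #8 shows only the constructor. For context, a sketch of how a spider built from these four components might run its main loop is below; every method name used on URLManager, HtmlParser, and DataOutput here is a hypothetical placeholder for illustration, not something confirmed by the original snippet.

    def crawl(self, root_url):
        # hypothetical main loop; the component method names below are
        # assumed for illustration and may differ in the real classes
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url():
            url = self.manager.get_new_url()
            html = self.down.download(url)
            new_urls, data = self.parser.parse(url, html)
            self.manager.add_new_urls(new_urls)
            self.output.store_data(data)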