import re


def crawl_sitemap(url):
    # download the sitemap file as sitemap.xml
    sitemap = Download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode("utf-8"))
    # download each link
    for link in links:
        html = Download(link)
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = Download(url)
    #> Downloading: http://example.webscraping.com/sitemap.xml
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = Download(link)
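# Download(url) is used throughout these snippets but is not defined in this excerpt.
# A minimal sketch of such a downloader (an assumption, using urllib from the standard
# library and returning None on failure, which the error-handling code below relies on):
import urllib.request
import urllib.error


def Download(url):
    """Download a web page, returning None if an error occurs."""
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html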
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
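# get_links() is called above but is not defined in this excerpt. A minimal sketch
# (an assumption), extracting the href attribute of anchor tags with a regular expression:
def get_links(html):
    """Return a list of links found in html."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)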
import itertools


def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        # url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage,
            # so assume we have reached the last country ID and can stop downloading
            break
        else:
            # success - can scrape the result
            # ...
            pass
import urlparse  # Python 2 module; in Python 3, use: from urllib import parse as urlparse


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # only queue links that have not been seen before
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
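# Example call (an assumption about the intended usage, based on the example site used
# above): crawl the index and view pages of the example site.
link_crawler('http://example.webscraping.com', '/(index|view)')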
def iteration():
    """Exit after the maximum number of consecutive download errors."""
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we have
                # reached the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
from common import Download
from parsers import HtmlParser

dl = Download()
parse = HtmlParser()
content = dl.download('http://theater.mtime.com/China_Beijing/')
res = parse._parse_movies(content)
print(res)
def __init__(self):
    # wire together the crawler components: URL manager, downloader, parser and data output
    self.manager = URLManager()
    self.down = Download()
    self.parser = HtmlParser()
    self.output = DataOutput()
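# A minimal sketch of how these components might be wired together in a crawl loop.
# The method names on URLManager, HtmlParser and DataOutput below (add_new_url,
# has_new_url, get_new_url, parse, add_new_urls, store_data, output_html) are
# assumptions for illustration only; they are not defined in this excerpt.
def crawl(self, root_url):
    self.manager.add_new_url(root_url)                  # assumed: seed the URL manager
    while self.manager.has_new_url():                   # assumed: any URLs left to crawl?
        url = self.manager.get_new_url()                # assumed: take the next URL
        html = self.down.download(url)                  # download the page
        new_urls, data = self.parser.parse(url, html)   # assumed: extract links and data
        self.manager.add_new_urls(new_urls)             # assumed: queue newly found links
        self.output.store_data(data)                    # assumed: buffer scraped data
    self.output.output_html()                           # assumed: write the results out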