import re

# Download is the download helper defined elsewhere in the source; in this
# listing it returns the raw bytes of the requested page (a sketch of such a
# helper follows).


def crawl_sitemap(url):
    # download the sitemap file as sitemap.xml
    sitemap = Download(url)
    # extract the sitemap links from the <loc> tags
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode("utf-8"))
    # download each link
    for link in links:
        html = Download(link)
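The sitemap crawler above, and the link crawlers later in this section, call Download directly as a function, but the helper itself is not shown here. As a rough idea of what it does, here is a minimal sketch assuming a plain urllib fetch with a custom User-Agent and basic error handling; the 'wswp' user-agent string is illustrative, and the real helper in the source may add retries, caching or proxy support.

import urllib.error
import urllib.request


def Download(url, user_agent='wswp'):
    """Minimal sketch of a download helper: fetch a URL and return its bytes,
    or None if the request fails."""
    print('Downloading:', url)
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
    return html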
import datetime

# URLManager, Download, HtmlParser and DataOutput are the project's own
# helper classes, imported from its other modules.


class SpiderMain(object):
    def __init__(self):
        self.manager = URLManager()
        self.down = Download()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.down.download(root_url)
        movie_ids = self.parser.parse_urls(content)
        count = 0
        for mid in movie_ids:
            if count > 10:
                break
            # build the Mtime rating API URL for this movie ID
            movie_link = ('http://service.library.mtime.com/Movie.api'
                          '?Ajax_CallBack=true'
                          '&Ajax_CallBackType=Mtime.Library.Services'
                          '&Ajax_CallBackMethod=GetMovieOverviewRating'
                          '&Ajax_CrossDomain=1'
                          '&Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F{0}%2F'
                          '&t={1}'
                          '&Ajax_CallBackArgument0={2}').format(
                              mid,
                              datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"),
                              mid)
            res = self.down.download(movie_link)
            self.parser.parser_json(res)
            count += 1
        self.output.store_data(self.parser.items)
        self.output.close_connect()
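A driver for the spider is not shown here. A minimal one, assuming the Beijing theater page used in the test snippet at the end of this section as the entry URL, could look like this:

if __name__ == '__main__':
    spider = SpiderMain()
    # seed with the Mtime Beijing theater listing page
    spider.crawl('http://theater.mtime.com/China_Beijing/')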
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = Download(url)
    #>Downloading: http://example.webscraping.com/sitemap.xml
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = Download(link)
import re

# Download and get_links are helpers defined elsewhere in the source
# (a sketch of get_links follows this listing).


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
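The get_links helper used above is not shown in this listing. A minimal sketch, assuming it simply extracts every href attribute from an already-decoded page string with a regular expression, might be:

import re


def get_links(html):
    """Sketch of get_links: return a list of href values found in the page."""
    # match the value of every href attribute, single- or double-quoted
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)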
import itertools


def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        # url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage, so assume we
            # have reached the last country ID and can stop downloading
            break
        else:
            # success - can scrape the result
            # ...
            pass
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    # keep track of which URLs have been seen before
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # only queue links that have not been seen before
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
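With the deduplicating version in place, the crawler can be started with a seed URL and a pattern restricting which links to follow; the index/view pattern below is just an illustrative choice for the example site.

# follow only the country index and view pages of the example site
link_crawler('http://example.webscraping.com', '/(index|view)')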
import itertools


def iteration():
    """Exit once N consecutive downloads have failed."""
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we
                # have reached the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
from common import Download
from parsers import HtmlParser

dl = Download()
parse = HtmlParser()
content = dl.download('http://theater.mtime.com/China_Beijing/')
res = parse._parse_movies(content)
print(res)
def __init__(self):
    self.manager = URLManager()
    self.down = Download()
    self.parser = HtmlParser()
    self.output = DataOutput()