def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
             timeout=60):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.cache = cache
    self.num_retries = None  # we will set this per request
    self.timeout = timeout
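Both the constructor above and the crawlers below rely on a Throttle helper that spaces out requests to the same domain; it is not shown in these listings. The following is a minimal sketch inferred from how it is called (Throttle(delay) and throttle.wait(url)), not the definitive implementation:

import time
from urllib.parse import urlparse


class Throttle:
    """Add a delay between downloads to the same domain (assumed interface)."""
    def __init__(self, delay):
        self.delay = delay   # seconds to wait between requests to one domain
        self.domains = {}    # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()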
import re
from urllib.parse import urljoin


def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxy=None, delay=3, max_depth=4, scrape_callback=None):
    """ Crawl from the given start URL following links matched by link_regex.
        In the current implementation, we do not actually scrape any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                              (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxy (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests
                         to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            scrape_callback (function): function to call after each download
                                        (default: None)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
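The crawler also depends on three helpers that are not reproduced in these listings: get_robots_parser, get_links, and the plain download function. A rough sketch of what they could look like, based only on the call signatures used above (the regex-based link extraction is an assumption, not the original implementation):

import re
from urllib import robotparser

import requests


def get_robots_parser(robots_url):
    """Return a RobotFileParser loaded from the given robots.txt URL."""
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp


# hypothetical regex-based link extractor; a real version might use an HTML parser
webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)


def get_links(html):
    """Return a list of href values found in the given HTML."""
    return webpage_regex.findall(html)


def download(url, user_agent='wswp', proxy=None, num_retries=2):
    """Fetch a URL and return its HTML text, retrying on 5xx responses."""
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:
                # retry server errors with one fewer retry remaining
                return download(url, user_agent, proxy, num_retries - 1)
    except requests.exceptions.RequestException as e:
        print('Download error:', e)
        html = None
    return html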
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxy=None, delay=3, max_depth=4, scrape_callback=None):
    """ Crawl from the given start URL, following every link matched by the
        regular expression.
    """
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    crawl_queue = [start_url]
    # previously a set that stored every crawled link
    # seen = set(crawl_queue)
    # seen is now a dict, which also records the depth at which each link was found
    seen = {}
    data = []
    # throttle downloads to limit the request rate
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check whether the URL may be crawled (as defined in robots.txt)
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            # maximum depth reached, so don't queue this page's links
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url=url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            # filter the page's HTML for links matching our regular expression
            for link in get_links(html):
                if re.search(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    # check whether this link has already been seen
                    if abs_link not in seen:
                        # seen.add(abs_link)
                        # record the link one level deeper than the current page
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = None  # we will set this per request
        self.cache = cache

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error, so ignore the result loaded from the cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, so we still need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            if self.cache:
                # save the result to the cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxy=None, delay=3, max_depth=4, scrape_callback=None):
    crawl_queue = [start_url]
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxy)
            if not html:
                continue
            if scrape_callback:
                data.extend(scrape_callback(url, html) or [])
            for link in get_links(html):
                if re.search(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
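As a rough usage sketch (the start URL, link pattern, and callback below are placeholders, not values from the original text), the crawler could be invoked like this:

# hypothetical scrape_callback: receives (url, html) and may return a list of rows
def print_title(url, html):
    match = re.search(r'<title>(.*?)</title>', html, re.IGNORECASE | re.DOTALL)
    if match:
        print(url, '->', match.group(1).strip())
    return []


# placeholder start URL and link regex
link_crawler('http://example.com', r'/(index|view)/',
             max_depth=2, scrape_callback=print_title)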
def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}):
    self.throttle = Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = None  # we will set this per request
    self.cache = cache
from random import choice

import requests


class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it

            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download and return the page content

            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
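A short usage sketch, assuming the Downloader instance is called in place of the plain download function; the URL, proxy entries, and cache object below are placeholders, not values from the original:

# hypothetical usage; URL and proxy values are placeholders
cache = {}  # any dict-like object works here (e.g. a disk- or Redis-backed cache)
D = Downloader(delay=3, user_agent='wswp', cache=cache,
               proxies=[{'http': 'http://some.proxy:8080',
                         'https': 'http://some.proxy:8080'}])
html = D('http://example.com', num_retries=2)  # downloaded and stored in cache
html = D('http://example.com')                 # second call is served from the cache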