def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None): crawl_queue = [seed_url] seen = {seed_url: 0} rp = robotparser.RobotFileParser() while crawl_queue: url = crawl_queue.pop() rp.set_url(url + '/robots.txt') rp.read() user_agent = 'wswp' if rp.can_fetch(user_agent, url): throttle = Throttle.Throttle(5) throttle.wait(url) html = download(url) links = [] if scrape_callback: links.extend(scrape_callback(url, html) or []) depth = seen[url] if depth != max_depth: for link in get_links(html): if re.match(link_regex, link): link = urlparse.urljoin(seed_url, link) if link not in seen: seen[link] = depth + 1 # seen.add(link) crawl_queue.append(link) else: print 'Blocked by robots.txt:', url
def link_crawler(seed_url, link_regex): """ crawlfrom the given seed URL following links matched by link_regex :param seed_url: :param link_regex: :return: """ #read the robots.txt rp = robotparser.RobotFileParser() rp.set_url('http://example.webscraping.com/robots.txt') rp.read() #set the agent's name user_agent = "667's Python Spider" #set the delay for crawl speed 5 second th = Throttle.Throttle(5) #set the crawl queue for crawled url crawl_queue = [seed_url] visited = set(crawl_queue) while crawl_queue: url = crawl_queue.pop() if rp.can_fetch(user_agent, url): th.wait(url) html = download_network_page(url) print html # filter for links matching out regular expression for link in get_links(html): if re.match(link_regex, link): link = urlparse.urljoin(seed_url, link) if link not in visited: visited.add(link) crawl_queue.append(link)
def __init__(self, delay=5, user_agent='wswp', proxies=None, num_tries=1, catch=None):
    """Store downloader configuration and build the rate limiter.

    :param delay: minimum seconds between requests to the same domain
    :param user_agent: User-Agent header value to send with requests
    :param proxies: optional proxy configuration, stored as-is
    :param num_tries: number of attempts to make per URL
    :param catch: opaque object stored as-is (semantics not visible here;
        presumably a cache/collector — confirm against the using code)
    """
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_tries = num_tries
    self.catch = catch
    # Rate limiter enforcing `delay` between same-domain requests.
    self.throttle = Throttle(delay)
def linked_download(seed_url, linked_rex=None, user_agent='wswp', proxy=None, max_depth=2, delay=3):
    """Crawl pages reachable from seed_url whose links match linked_rex.

    :param seed_url: starting URL; relative links are resolved against it
    :param linked_rex: regex searched (re.search) against each candidate link
    :param user_agent: User-Agent for robots.txt checks and downloads
    :param proxy: optional proxy dict for the opener and download()
    :param max_depth: stop expanding a page once its depth equals this value
    :param delay: minimum seconds between requests to the same domain
    """
    print("linked_download start")
    # Rate limiter: at most one request per domain every `delay` seconds.
    throttle = Throttle.Throttle(delay)
    # Map url -> depth at which it was discovered (seed defaults to 0).
    searched_urls = {}
    url_list = [seed_url]
    # Install a global opener carrying the user agent and optional proxy.
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(opener)
    # Parsed robots.txt for the seed site.
    rp = get_robots(seed_url)
    while url_list:
        url = url_list.pop()
        # Honour robots.txt for this user agent.
        if rp.can_fetch(user_agent, url):
            depth = searched_urls.get(url, 0)
            if depth != max_depth:
                throttle.wait(url)
                html = download(url, user_agent, proxy)
                # Extract all <a> links from the page.
                linked_urls = get_linked_url(html.decode('utf-8'))
                for url_item in linked_urls:
                    if re.search(linked_rex, url_item):
                        # BUG FIX: resolve to an absolute URL *before* the
                        # seen-check. The original recorded the relative
                        # link in searched_urls but queued the absolute
                        # one, so the stored depth was never found on pop
                        # (depth stayed 0) and pages could be re-queued
                        # under both forms.
                        url_item = urlparse.urljoin(seed_url, url_item)
                        if url_item not in searched_urls:
                            searched_urls[url_item] = depth + 1
                            url_list.append(url_item)
        else:
            # Denied by robots.txt.
            print('Blocked by robots.txt:' + url)
# In[1]: get_ipython().run_line_magic('matplotlib', 'inline') import numpy as np import pandas as pd from data_pre import * from Throttle import * # In[2]: data = load_data() # In[3]: m = Throttle(data) m.update_vars(m.data) m.filter_obs() # In[4]: m.update_vars(m.data2) m.fit_by_batches() # In[5]: # Estimated parameters for the first 5 batches m.mus[:5], m.Sigs[:5] # In[6]:
def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=2, cache=None):
    """Store downloader configuration and build the rate limiter.

    :param delay: minimum seconds between requests to the same domain
    :param user_agent: User-Agent header value to send with requests
    :param proxies: optional proxy configuration, stored as-is
    :param num_retries: how many times to retry a failed download
    :param cache: optional cache object (None disables caching)
    """
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache
    # Rate limiter enforcing `delay` between same-domain requests.
    self.throttle = Throttle.Throttle(delay)
# coding=utf-8 import urllib2 import re import urlparse import Throttle # 设置下载限速(秒) throttle = Throttle.Throttle(5) data_list = [] def download(url, user_agent = 'wswp', proxy=None, re_times = 2): '''可以设置用户代理的下载方法''' print 'DownLoad....', url # 限制下载速度 throttle.wait(url) # 设置请求头 headers = {'User-agent': user_agent} request = urllib2.Request(url, headers = headers) opener = urllib2.build_opener() # 添加代理的支持 if proxy: proxy_params = {urlparse.urlparse(url).scheme:proxy} opener.add_handler(urllib2.ProxyHandler(proxy_params)) try: html = opener.open(request).read() # html = urllib2.urlopen(request).read() except urllib2.URLError as e: print "DownLoad Error: ", e.reason