import random
import urllib2
import urlparse

from Throttle import Throttle  # local rate-limiting helper module (assumed name); a sketch follows below


class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_tries=1, catch=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_tries = num_tries
        self.catch = catch

    def __call__(self, url):
        result = None
        if self.catch:
            try:
                result = self.catch[url]
            except KeyError:
                # this url is not yet cached
                pass
            else:
                if self.num_tries > 0 and 500 <= result['code'] < 600:
                    # the cached result was a server error, so ignore it and re-download
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {
                # a full browser User-Agent string is used here instead of
                # self.user_agent, to make requests look like an ordinary browser
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
            }
            result = self.download(url, headers, proxy, self.num_tries)
            if self.catch:
                self.catch[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_tries, data=None):
        print 'downloading:', url
        request = urllib2.Request(url, headers=headers)
        opener = urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        html = None
        code = None
        try:
            response = opener.open(request, timeout=30)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Url error:', e
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_tries > 0 and 500 <= code < 600:
                    # retry server errors while attempts remain
                    return self.download(url, headers, proxy, num_tries - 1, data)
        except Exception as e:
            print 'error:', e
        return {'html': html, 'code': code}
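The Downloader above depends on a Throttle helper that is not shown in these snippets. Below is a minimal sketch of the per-domain delay pattern it appears to assume; the attribute names here are illustrative rather than taken from the original module.

import time
import urlparse


class Throttle:
    """Delay repeated requests to the same domain so crawling stays polite."""

    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to one domain
        self.domains = {}     # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()

With a helper like this available, the class could be exercised with something like d = Downloader(delay=5); html = d('http://example.webscraping.com'), though that entry point is an assumption, not part of the original code.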
import re
import robotparser
import urlparse

import Throttle


def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None):
    # download() and get_links() are assumed to be defined elsewhere in this module
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    rp = robotparser.RobotFileParser()
    throttle = Throttle.Throttle(5)
    user_agent = 'wswp'
    while crawl_queue:
        url = crawl_queue.pop()
        # read robots.txt from the root of the crawled site
        parts = urlparse.urlparse(url)
        rp.set_url('%s://%s/robots.txt' % (parts.scheme, parts.netloc))
        rp.read()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
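A hypothetical invocation of the crawler above, using the example site and a link pattern in the style of the other snippets here; both values are placeholders rather than taken from the original code.

# Crawl index and view pages of the example site, two levels deep,
# without a scrape callback (download() and get_links() must be defined).
link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2)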
def __call__(self, url):
    """
    Try the cache first: if the page is already cached there is no need to
    download or throttle. On a cache miss (or a cached server error) download
    the page, throttling before the request.
    """
    result = None
    if self.cache:
        try:
            result = self.cache[url]
        except KeyError:
            # this url is not yet in the cache
            pass
        else:
            if self.num_retries > 0 and 500 <= result['code'] < 600:
                # the cached result was a server error, so it is unusable;
                # since num_retries > 0, download it again
                result = None
    if result is None:
        # a real download happens here (not a cache hit), so throttle
        # the request to avoid getting banned
        self.throttle.wait(url)
        proxy = random.choice(self.proxies) if self.proxies else None
        headers = {'User-agent': self.user_agent}
        result = self.download(url, headers, proxy=proxy, num_retries=self.num_retries)
        if self.cache:
            # store the downloaded page in the cache
            self.cache[url] = result
    return result['html']
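The cache passed to this Downloader only has to support cache[url] reads that raise KeyError on a miss and cache[url] = result writes. A plain dict already satisfies that; the tiny stand-in below just makes the expected protocol explicit and is not part of the original code.

class MemoryCache:
    """Minimal in-memory cache keyed by URL, holding result dicts."""

    def __init__(self):
        self._store = {}

    def __getitem__(self, url):
        # raises KeyError on a miss, exactly as __call__ expects
        return self._store[url]

    def __setitem__(self, url, result):
        # result is the {'html': ..., 'code': ...} dict returned by download()
        self._store[url] = result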
def link_crawler(seed_url, link_regex):
    """
    Crawl from the given seed URL, following links matched by link_regex.
    :param seed_url: URL to start crawling from
    :param link_regex: regular expression that crawlable links must match
    """
    # read the robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    # set the agent's name
    user_agent = "667's Python Spider"
    # set the crawl delay to 5 seconds
    th = Throttle.Throttle(5)
    # queue of URLs still to crawl
    crawl_queue = [seed_url]
    visited = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            th.wait(url)
            html = download_network_page(url)
            print html
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in visited:
                        visited.add(link)
                        crawl_queue.append(link)
def linked_download(seed_url, linked_rex=None, user_agent='wswp', proxy=None, max_depth=2, delay=3):
    # download every page reachable from seed_url whose link matches linked_rex
    print("linked_download start")
    # throttle object that enforces the crawl delay
    throttle = Throttle.Throttle(delay)
    # urls already seen, mapped to their crawl depth
    searched_urls = {}
    # list of urls still to visit
    url_list = [seed_url]
    # install an opener with the user-agent and proxy settings
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(opener)
    # read robots.txt
    rp = get_robots(seed_url)
    # crawl until the url list is exhausted
    while url_list:
        # pop the next url
        url = url_list.pop()
        # check whether robots.txt allows this agent to fetch the url
        if rp.can_fetch(user_agent, url):
            # crawl depth recorded for this url (0 if unseen)
            depth = searched_urls.get(url, 0)
            # stop following links once the maximum depth is reached
            if depth != max_depth:
                # sleep if the last request to this domain was too recent
                throttle.wait(url)
                # fetch the html for this url
                html = download(url, user_agent, proxy)
                # extract every <a> link from the page
                linked_urls = get_linked_url(html.decode('utf-8'))
                # queue the links that match the given pattern
                for url_item in linked_urls:
                    if linked_rex is None or re.search(linked_rex, url_item):
                        # resolve the link into an absolute url
                        url_item = urllib.parse.urljoin(seed_url, url_item)
                        # skip links that were already crawled
                        if url_item not in searched_urls:
                            # record the link and its depth, then queue it
                            searched_urls[url_item] = depth + 1
                            url_list.append(url_item)
        else:
            # fetching this url is disallowed by robots.txt
            print('Blocked by robots.txt: ' + url)
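linked_download relies on get_robots and get_linked_url, which are not included in these snippets. The sketches below show one plausible shape for them, assuming robots.txt sits at the site root and that links are ordinary <a href="..."> attributes; treat the names and the regex as assumptions.

import re
import urllib.parse
import urllib.robotparser


def get_robots(seed_url):
    # parse robots.txt from the root of the seed url's site
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp


def get_linked_url(html):
    # return every href found in an <a> tag on the page
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)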
# In[1]:
get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from data_pre import *
from Throttle import *

# In[2]:
data = load_data()

# In[3]:
m = Throttle(data)
m.update_vars(m.data)
m.filter_obs()

# In[4]:
m.update_vars(m.data2)
m.fit_by_batches()

# In[5]:
# Estimated parameters for the first 5 batches
m.mus[:5], m.Sigs[:5]

# In[6]:
def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=2, cache=None):
    self.throttle = Throttle.Throttle(delay)
    self.user_agent = user_agent
    self.proxies = proxies
    self.num_retries = num_retries
    self.cache = cache
# coding=utf-8
import urllib2
import re
import urlparse

import Throttle

# download rate limit in seconds between requests to the same domain
throttle = Throttle.Throttle(5)
data_list = []


def download(url, user_agent='wswp', proxy=None, re_times=2):
    """Download a page, optionally through a proxy, with a configurable user agent."""
    print 'DownLoad....', url
    # enforce the download delay
    throttle.wait(url)
    # set the request headers
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    opener = urllib2.build_opener()
    # add proxy support
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    html = None
    try:
        html = opener.open(request).read()
        # html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "DownLoad Error: ", e.reason
        # retry server errors while attempts remain
        if hasattr(e, 'code') and 500 <= e.code < 600 and re_times > 0:
            return download(url, user_agent, proxy, re_times - 1)
    return html
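A quick, hypothetical way to exercise this module interactively; the URL is just the example site used elsewhere in these snippets, and the proxy address is a placeholder.

# Fetch one page directly, then (optionally) through a local proxy.
html = download('http://example.webscraping.com')
# html = download('http://example.webscraping.com', proxy='127.0.0.1:8080')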
    sleep(0.2)
    self.setLights(kOff)
    sleep(0.2)


if __name__ == "__main__":
    gLog.open()

    # Start the simulator and controller
    sak = StartAndKill()
    sak.start("simulator")
    sak.start("controller")

    # Create the communication resources for 1 user
    comRes = CommunicationResources(name='throttle-test', host='localhost', port=1235, numberOfPackages=1)
    myThrottle = Throttle(name='Bill', comPkg=comRes.getNextPackage())

    # Tell the throttle to read the layout file
    gLog.print("Main reading layout")
    msg = myThrottle.readLayout("../../runSoftware/Layout.xml")
    sleep(2)

    testing = 9  # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< set test case
    print("Testing option == {0}".format(testing))

    if testing == 1:
        # Initialize train 1111
        gLog.print("Main initializing train")
        msg = myThrottle.initTrain(1111, [5, 1])
        gLog.print("physAdd = {0}, physSlot = {1}, virtAdd = {2}, virtSlot = {3}".