import re
from queue import Queue


def link_crawler(seed_url, link_regex, user_agent, proxyD=None, headers=None,
                 num_retries=1, delay=2, max_depth=-1, max_urls=-1):
    """Crawl from seed_url, following links that match link_regex."""
    rp = get_robots(seed_url)
    crawl_queue = Queue(0)
    crawl_queue.put(seed_url)
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    num_urls = 0
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent
    while not crawl_queue.empty():
        url = crawl_queue.get()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            pagina = download(url, user_agent, proxyD, num_retries)
            links = []
            depth = seen[url]
            if depth != max_depth:
                links = get_links(pagina)
                # keep only the links that match link_regex
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen and re.match(link_regex, link):
                        seen[link] = depth + 1
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.put(link)
            # check whether the download limit has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print('Blocked by robots.txt:', url)
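# Throttle itself is not defined in this section, although link_crawler and the
# two Downloader classes below call Throttle(delay) and throttle.wait(url).
# A minimal sketch of one plausible implementation, assuming the usual
# per-domain politeness delay (sleep until `delay` seconds have passed since
# the last request to the same domain):
import time
from urllib.parse import urlparse


class Throttle:
    """Pause between downloads to the same domain."""

    def __init__(self, delay):
        self.delay = delay   # minimum seconds between requests to one domain
        self.domains = {}    # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()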
# Python 2 / urllib2 version of the downloader.
import random
import urllib2
import urlparse


class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        """Executed when the object itself is called."""
        # throttle the download
        self.throttle.wait(url)
        # if the proxy list is not empty, pick a random proxy
        proxy = random.choice(self.proxies) if self.proxies else None
        headers = {'User-agent': self.user_agent}
        return self.download(url, headers, proxy, self.num_retries)

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers)
        opener = urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download error:', e.reason
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'code': code, 'html': html}
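# A short usage sketch for the urllib2-based Downloader above (illustrative
# only; it assumes a Throttle class such as the sketch earlier is available in
# the same module):
D = Downloader(delay=5, user_agent='wswp', num_retries=1)
result = D('http://example.webscraping.com')
print(result['code'])       # HTTP status code, or None if the request failed outright
print(len(result['html']))  # length of the downloaded page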
# Python 3 / requests version of the downloader, with caching and a proxy pool.
import requests


class Downloader(object):
    def __init__(self, delay=1, headers=None, cookies=None, proxies=None, num_retries=5, cache=None):
        self.throttle = Throttle(delay)
        self.headers = headers
        self.cookies = cookies
        self.proxies = ProxiesPool()
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                # load from the cache
                result = self.cache[url]
                if result['code'] == 200:
                    print('Loading html from cache ' + url)
                else:
                    result = None
            except KeyError:
                pass
        if result is None:
            result = self.download(url, self.headers, self.cookies, self.proxies, self.num_retries)
            if self.cache:
                self.cache[url] = result  # each cache entry is a dict {'html': html, 'code': code}
        return result['html']

    def download(self, url, headers, cookies, proxies, num_retries):
        print('Downloading ' + url)
        html = ''  # an empty string is returned on exceptions and repeated 5XX responses
        code = None
        proxies_to_pass = None
        proxy_ip = proxies.get_proxy() if proxies else None  # pick a random proxy IP
        if proxy_ip:
            proxies_to_pass = {'http': "http://{}".format(proxy_ip[0])}
        if num_retries > 0:
            try:
                self.throttle.wait(url)
                print(proxies_to_pass)
                r = requests.get(url, headers=headers, cookies=cookies,
                                 proxies=proxies_to_pass, timeout=30)
                code = r.status_code
                print(code)
                r.raise_for_status()
                if code == 200:
                    html = r.text
                    # print(html)
                elif code and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, cookies, proxies, num_retries - 1)
            except requests.exceptions.HTTPError:
                print('----Download error: ' + r.reason)
                return self.download(url, headers, cookies, proxies, num_retries - 1)
            except requests.exceptions.Timeout:
                print('----Download error: timeout, retrying with another proxy.')
                return self.download(url, headers, cookies, proxies, num_retries - 1)
            except Exception:
                print('----Unknown error: possibly cannot connect to the proxy.')
                return self.download(url, headers, cookies, proxies, num_retries - 1)
        else:
            print('----Download failed: retry limit reached.')
        result = {'html': html, 'code': code}
        if self.cache:
            self.cache[url] = result
        # print(result)
        return result
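# ProxiesPool is referenced above but not defined in this section. A minimal
# sketch under the assumption that get_proxy() returns a tuple whose first
# element is an "ip:port" string (to match proxy_ip[0] above); the pool
# contents here are placeholders, not real proxies:
import random


class ProxiesPool(object):
    def __init__(self, proxies=None):
        # hypothetical seed list of ("ip:port",) tuples
        self.proxies = proxies or [('127.0.0.1:8118',)]

    def get_proxy(self):
        # return a random proxy entry, or None when the pool is empty
        return random.choice(self.proxies) if self.proxies else None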
def main():
    if len(sys.argv) > 1:
        config_path = sys.argv[1]
    else:
        config_path = './configs/config_default.txt'

    if not Path(config_path).is_file():
        logging.error("Could not find config file!")
        sys.exit(1)  # exiting with error code

    # load config
    config = configparser.ConfigParser()
    config.read(config_path)
    log_dir = config['PATHS']['log_dir']
    log_file_name = config['PATHS']['log_file_name']

    # check if the logging directory is present
    if not Path(log_dir).is_dir():
        logging.error("Logging directory is not present!")
        sys.exit(1)  # exiting with error code

    file_handler = TimedRotatingFileHandler(os.path.join(
        os.path.dirname(__file__), log_dir, log_file_name),
        when='midnight', interval=1)
    console_handler = logging.StreamHandler()
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler])
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    logging.getLogger("apscheduler.scheduler").setLevel(logging.WARNING)
    logging.getLogger("apscheduler.executors.default").setLevel(logging.WARNING)
    logging.getLogger("chardet.charsetprober").setLevel(logging.WARNING)

    logger.info("=======Starting=Crawler=========")

    # store config preferences in variables
    article_download_pattern = ([
        (int(config['ARTICLE_DOWNLOAD_PATTERN']['number']),
         int(config['ARTICLE_DOWNLOAD_PATTERN']['delay'])),
    ])  # [(application number, period in seconds) ... ]
    number_download_worker = int(config['CRAWLING']['number_download_worker'])
    website_request_timeout = int(config['REQUESTS']['website_request_timeout'])
    rss_feed_crawl_period = int(config['CRAWLING']['rss_feed_crawl_period'])
    rss_feed_request_timeout = int(config['REQUESTS']['rss_feed_request_timeout'])
    warmup_iterations = int(config['CRAWLING']['warmup_iterations'])
    throttle_velocity = float(config['CRAWLING']['throttle_velocity'])
    max_offset = int(config['CRAWLING']['max_offset'])
    downloads_path = config['PATHS']['downloads']
    crawled_rss_articles_path = config['PATHS']['rss_articles']
    feed_path = config['PATHS']['feeds_list']
    requests_path = config['PATHS']['requests']

    # partly validating the config
    if not Path(feed_path).is_file():
        logging.error("Could not find RSS feeds list file!")
        sys.exit(1)  # exiting with error code
    parent_dir = os.path.dirname(requests_path)
    if not Path(parent_dir).is_dir():
        logging.error("Could not find requests directory!")
        sys.exit(1)  # exiting with error code

    writer = Writer()
    writer.start()
    throttle = Throttle(request_velocity=throttle_velocity)
    rss_requester = Requester(tag="RSS Requester", path=requests_path, throttle=throttle)
    website_requester = Requester(tag="Website Requester", path=requests_path, throttle=throttle)
    scheduler = Scheduler(patterns=article_download_pattern)
    crawler = Crawler(requester=rss_requester,
                      scheduler=scheduler,
                      feed_path=feed_path,
                      crawled_rss_articles_path=crawled_rss_articles_path,
                      rss_feed_crawl_period=rss_feed_crawl_period,
                      rss_feed_request_timeout=rss_feed_request_timeout,
                      warmup_iterations=warmup_iterations,
                      max_offset=max_offset)
    crawler.start()

    for i in range(number_download_worker):
        logger.info("Starting download worker #%d", i)
        DownloadWorker(requester=website_requester,
                       timeout=website_request_timeout,
                       path=downloads_path).start()

    while True:
        time.sleep(60)
        logger.debug("Number of threads running: %d", threading.active_count())
        process = psutil.Process(os.getpid())
        ram_usage = process.memory_full_info()  # percent = absolute / mem.total
        logger.info("RAM usage: %s%%, %s", process.memory_percent(), ram_usage)
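# A possible ./configs/config_default.txt covering every key that main() reads.
# The section and key names come from the code above; the values are
# placeholder guesses, not taken from the original project:
[PATHS]
log_dir = ./logs
log_file_name = crawler.log
downloads = ./data/downloads
rss_articles = ./data/rss_articles.json
feeds_list = ./configs/feeds.txt
requests = ./data/requests/requests.log

[ARTICLE_DOWNLOAD_PATTERN]
number = 3
delay = 3600

[CRAWLING]
number_download_worker = 4
rss_feed_crawl_period = 300
warmup_iterations = 5
throttle_velocity = 1.0
max_offset = 100

[REQUESTS]
website_request_timeout = 30
rss_feed_request_timeout = 10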
import re
import urlparse

# Inside the crawl loop of link_crawler, after `html` has been downloaded for `url`:
depth = seen[url]
# stop following links once the agreed maximum depth has been reached
if depth != max_depth:
    # add every matching URL on the page to the queue
    for link in get_links(html):
        if re.search(link_regex, link):
            # make the link absolute
            link = urlparse.urljoin(seed_url, link)
            # skip links already seen, to avoid downloading them twice
            if link not in seen:
                seen[link] = depth + 1
                crawl_queue.append(link)


def get_links(html):
    """Return every link found in a page."""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


# callback function
import lxml.html

def scrae_call(url, html):
    FIELDS = ['area']
    if re.search('/view', url):
        tree = lxml.html.fromstring(html)
        row = [tree.cssselect('tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
               for field in FIELDS]
        print url, row


import Callback

link_crawler('http://example.webscraping.com/places/default/index', '/(index|view)',
             throttle=Throttle(1), max_depth=-1,
             scrae_callback=Callback.ScraeCallback())
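# A quick, illustrative check of get_links() on a hand-written HTML fragment
# (the anchor URLs below are made up for the example, following the
# /places/default/... patterns used above):
sample = ('<a href="/places/default/view/Andorra-6">Andorra</a> '
          '<a class="nav" href="/places/default/index/1">Next</a>')
print(get_links(sample))
# -> ['/places/default/view/Andorra-6', '/places/default/index/1']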
if __name__ == "__main__":
    gLog.open()
    name = "Bill"

    # Start the simulator and controller
    sak = StartAndKill()
    sak.start("simulator")
    sak.start("controller")

    # Create the communication resources for 5 users
    comRes = CommunicationResources(name='gui thr test', host='localhost',
                                    port=1235, numberOfPackages=5)

    # Create a throttle to read the layout file
    throttle = Throttle(name=name, comPkg=comRes.getNextPackage())
    gLog.print("Main: read layout file")
    msg = throttle.readLayout("../../runSoftware/Layout.xml")
    sleep(2)
    responseFlag, code = (msg.responseFlag, msg.code)
    gLog.print("Main: finished reading layout file: responseFlag = {0} and code = {1}".format(responseFlag, code))
    if responseFlag != 1:
        print("ABEND")
        print("Error in XML file with flag = {0} and code = {1}".format(responseFlag, code))
        print("THE END")
        input("press enter to quit")
        throttle.close()

    # Start four gui throttles
    gLog.print("Main: begin start four GuiThrottleProcess")
    for i in range(4):
from Controller import Controller

# Sensor object initialization
a = BMP180()
rp = MPU6050()
initialAltitude = a.getAltitude()
(initialRoll, initialPitch) = rp.getAngles()
initialYaw = 0  # currently unused

# Motor object initialization
frequency = 55.5
maxThrottle = 10
minThrottle = 5
# motorNumber, motorPin, frequency, maxThrottle, minThrottle
t1 = Throttle(1, 12, frequency, maxThrottle, minThrottle)
t2 = Throttle(2, 13, frequency, maxThrottle, minThrottle)
t3 = Throttle(3, 22, frequency, maxThrottle, minThrottle)
t4 = Throttle(4, 18, frequency, maxThrottle, minThrottle)

# Controller object initialization
# roll controller variables
rSetPoint = 0
rIntegralThreshold = 90
rKP = 0.9
rKI = 0
rKD = 0.1

# pitch controller variables
pSetPoint = 0
pIntegralThreshold = 90
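# Back-of-the-envelope check of those motor constants (this assumes the
# Throttle class maps its min/max values to PWM duty-cycle percentages, which
# is not shown in the code above): at frequency = 55.5 Hz the PWM period is
# 1 / 55.5 s ≈ 18.0 ms, so minThrottle = 5 % gives a pulse of ≈ 0.9 ms and
# maxThrottle = 10 % gives ≈ 1.8 ms, roughly the standard 1-2 ms ESC
# control-pulse range.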