async def page_download(self, url, parser):
    """Download *url* and hand the page text to ``self.page_parser``.

    A direct download is tried first. If it yields no content, up to
    ``self.retry`` further attempts are made, each through a proxy picked at
    random from ``Sqlhandler.select(10)``.

    Args:
        url: Address of the page to download.
        parser: Passed through unchanged as the second argument of
            ``self.page_parser``.

    Returns:
        Always ``None``; the result is delivered via ``self.page_parser``.
    """
    content = await self.fetch(url)
    if content:
        self.page_parser(content, parser)
        return None

    # presumably returns stored proxy records; each one is indexed with
    # 'ip' and 'port' below -- confirm against Sqlhandler.
    proxy_list = Sqlhandler.select(10)
    if not proxy_list:
        return None

    # Every attempt counts against the budget. self.fetch reports failure by
    # returning None rather than raising, so counting only exceptions would
    # retry forever when the chosen proxies are dead.
    for _ in range(self.retry):
        try:
            random_proxy = random.choice(proxy_list)
            use_proxy = 'http://%s:%s' % (random_proxy['ip'],
                                          random_proxy['port'])
            proxy_content = await self.fetch(url, use_proxy)
            if proxy_content:
                self.page_parser(proxy_content, parser)
                return None
        except Exception:
            # Best effort: a malformed proxy record or a parser error just
            # uses up this attempt; the next iteration tries another proxy.
            continue

    async with self.lock:
        proxylog.info(
            'Retry many times pagedown Failed:{},'.format(url))
    return None
async def crawl(self):
    """Run ``self.max_tasks`` workers until ``self.proxyq`` is fully processed.

    Each worker runs ``self.work()`` on ``self.loop``. Once the queue's
    ``join()`` returns, the workers are cancelled and awaited.
    """
    proxylog.info('start proxy crawl')
    # ensure_future is the documented way to schedule a coroutine; direct
    # asyncio.Task instantiation is discouraged.
    workers = [
        asyncio.ensure_future(self.work(), loop=self.loop)
        for _ in range(self.max_tasks)
    ]
    try:
        await self.proxyq.join()
        proxylog.info('proxy crawl done')
    finally:
        # Cancel even when join() is interrupted (e.g. crawl itself is
        # cancelled) so no worker is left running in the background.
        for worker in workers:
            worker.cancel()
        if workers:
            # Wait for the cancellations to be delivered so the tasks are
            # finished, not merely flagged, when crawl returns.
            await asyncio.gather(*workers, return_exceptions=True)
async def fetch(self, url, proxys=None):
    """Download *url* with ``self.session`` and return the response text.

    Args:
        url: Address to request.
        proxys: Optional proxy URL forwarded as the ``proxy`` argument of
            ``self.session.get``; ``None`` means no proxy is passed.

    Returns:
        The response body as text when the status is 200, otherwise ``None``.
        Timeouts and any other exception raised during the request are
        logged and also reported as ``None``.
    """
    async with self.lock:
        proxylog.info('start fetch: {}'.format(url))

    content = None
    try:
        # NOTE(review): async-timeout 4+ only supports the ``async with``
        # form -- confirm the pinned version accepts it (>= 1.2).
        async with async_timeout.timeout(self.timeout):
            async with self.session.get(url,
                                        headers=self.get_headers(),
                                        proxy=proxys) as response:
                # Explicit check instead of ``assert``: assertions are
                # stripped under ``python -O``, which would let error pages
                # through as valid content.
                if response.status == 200:
                    content = await response.text()
    except Exception:
        # Best effort: any failure is treated as "no content".
        content = None

    if content is None:
        async with self.lock:
            proxylog.info('pagedown Failed:{}'.format(url))
    return content
async def sucess_proxy_count(self):
    """Increment ``self.sucess_proxy_num`` and log the new value.

    Both steps happen while holding ``self.lock``.
    """
    # ``async with`` replaces ``with await self.lock``, which was deprecated
    # in Python 3.7 and removed in 3.9.
    async with self.lock:
        self.sucess_proxy_num += 1
        proxylog.info('Now Sucess Proxy Number is {}'.format(
            self.sucess_proxy_num))
async def total_proxy_count(self):
    """Increment ``self.total_proxy_num``, logging every 50th value.

    The increment and the optional log call happen while holding
    ``self.lock``.
    """
    # ``async with`` replaces ``with await self.lock``, which was deprecated
    # in Python 3.7 and removed in 3.9.
    async with self.lock:
        self.total_proxy_num += 1
        if self.total_proxy_num % 50 == 0:
            proxylog.info('Now Total Proxy Number is {} '.format(
                self.total_proxy_num))
async def work(self):
    """Consume seeds from ``self.proxyq`` until the task is cancelled.

    Each seed is a mapping whose ``'flag'`` selects the action:

    * ``1`` -- download ``seed['url']`` via ``self.page_download``.
    * ``2`` -- validate the proxy ``seed['ip']:seed['port']``; when valid,
      add ``speed``/``protocol``/``proxytypes`` to the seed and insert it
      through ``self.sqlhelper``.
    * ``3`` -- re-validate the proxy; when valid update its speed through
      ``self.sqlhelper``, otherwise delete the seed from it.

    Seeds with any other flag are dropped. ``task_done()`` is called exactly
    once for every seed taken from the queue, whatever the outcome, so
    ``self.proxyq.join()`` cannot block on a seed that failed or was not
    recognised.
    """
    try:
        while True:
            seed = await self.proxyq.get()
            try:
                seed_flag = seed['flag']
                if seed_flag == 1:
                    await self.page_download(seed['url'], seed)
                elif seed_flag in (2, 3):
                    ip = seed['ip']
                    port = seed['port']
                    proxys = 'http://%s:%s' % (ip, port)
                    valid, speed = await self.proxy_validator(proxys)
                    if seed_flag == 2:
                        if valid:
                            try:
                                seed['speed'] = speed
                                protocol, proxy_types = \
                                    await self.protocolcheck(proxys)
                                seed['protocol'] = protocol
                                seed['proxytypes'] = proxy_types
                                self.sqlhelper.insert(seed)
                                async with self.lock:
                                    proxylog.info(
                                        'sucess insert {} into database'.format(
                                            proxys))
                            except Exception:
                                async with self.lock:
                                    proxylog.exception(
                                        'Failed insert {} into database'.format(
                                            proxys))
                    elif valid:
                        try:
                            update_speed = {'speed': speed}
                            update_proxy = {'ip': ip, 'port': port}
                            self.sqlhelper.update(update_proxy, update_speed)
                            async with self.lock:
                                proxylog.info(
                                    'sucess update {} in database'.format(
                                        proxys))
                        except Exception:
                            async with self.lock:
                                proxylog.exception(
                                    'Failed update {} in database'.format(
                                        proxys))
                    else:
                        try:
                            self.sqlhelper.delete(seed)
                        except Exception:
                            async with self.lock:
                                proxylog.exception(
                                    'Failed delete {} in database'.format(
                                        proxys))
                # Any other flag: nothing to do; the seed is still marked
                # done in ``finally`` below.
            except asyncio.CancelledError:
                raise
            except Exception:
                # A failing download/validation must not kill the worker:
                # log it and move on to the next seed.
                async with self.lock:
                    proxylog.exception(
                        'Failed to process seed {}'.format(seed))
            finally:
                self.proxyq.task_done()
    except asyncio.CancelledError:
        # Cancellation is how this worker is stopped; exit quietly.
        pass