Esempio n. 1
0
 async def page_download(self, url, parser):
     """Download *url* and pass the page to ``self.page_parser``.

     A direct fetch is tried first.  If it yields nothing, up to
     ``self.retry`` further attempts are made, each through a proxy picked
     at random from ``Sqlhandler.select(10)`` (presumably a batch of stored
     proxies -- confirm against ``Sqlhandler``).

     Args:
         url: Address of the page to download.
         parser: Opaque value forwarded unchanged to ``self.page_parser``.

     Returns:
         Always ``None``; the result is delivered through
         ``self.page_parser``.
     """
     content = await self.fetch(url)
     if content:
         self.page_parser(content, parser)
         return None

     proxy_list = Sqlhandler.select(10)
     if not proxy_list:
         return None

     attempts = 0
     while attempts < self.retry:
         # Count every attempt, not only the ones that raise: ``fetch``
         # reports failure by returning a falsy value, so counting only
         # exceptions would spin forever on a page no proxy can reach.
         attempts += 1
         try:
             random_proxy = random.choice(proxy_list)
             use_proxy = 'http://%s:%s' % (random_proxy['ip'],
                                           random_proxy['port'])
             proxy_content = await self.fetch(url, use_proxy)
             if proxy_content:
                 self.page_parser(proxy_content, parser)
                 return None
         except Exception:
             # Best effort: a malformed proxy record or a parser error
             # simply uses up this attempt.
             continue

     # ``async with`` replaces ``with await lock``, which was removed from
     # asyncio in Python 3.9.
     async with self.lock:
         proxylog.info(
             'Retry many times pagedown Failed:{},'.format(url))
     return None
Esempio n. 2
0
 async def crawl(self):
     """Run the worker tasks until the proxy queue has been fully processed.

     Starts ``self.max_tasks`` tasks running ``self.work()`` on
     ``self.loop``, waits for ``self.proxyq.join()`` and then cancels every
     worker.
     """
     proxylog.info('start proxy crawl')

     workers = []
     for _ in range(self.max_tasks):
         workers.append(asyncio.Task(self.work(), loop=self.loop))

     # join() returns once every queued seed has been marked done.
     await self.proxyq.join()
     proxylog.info('proxy crawl done')

     # The workers loop forever, so they have to be stopped explicitly.
     for worker in workers:
         worker.cancel()
Esempio n. 3
0
 async def fetch(self, url, proxys=None):
     """Fetch *url* and return the response body as text.

     Args:
         url: Address to request with ``self.session.get``.
         proxys: Optional proxy URL passed to the session as ``proxy``.

     Returns:
         The response text when the request completes with status 200
         within ``self.timeout``; ``None`` on any failure (the failure is
         logged).
     """
     # ``async with`` replaces ``with await lock``, which was removed from
     # asyncio in Python 3.9.
     async with self.lock:
         proxylog.info('start fetch: {}'.format(url))
     try:
         # The synchronous ``with`` form of async_timeout is deprecated;
         # the asynchronous form is the supported one.
         async with async_timeout.timeout(self.timeout):
             async with self.session.get(url,
                                         headers=self.get_headers(),
                                         proxy=proxys) as response:
                 # Explicit check rather than ``assert``: asserts vanish
                 # under ``python -O`` and error pages would then be
                 # returned as if they were valid content.
                 if response.status != 200:
                     raise ValueError(
                         'unexpected HTTP status {}'.format(
                             response.status))
                 return await response.text()
     except Exception:
         async with self.lock:
             proxylog.info('pagedown Failed:{}'.format(url))
         return None
Esempio n. 4
0
 async def sucess_proxy_count(self):
     """Increment ``self.sucess_proxy_num`` and log the new value.

     The update and the log call both happen while ``self.lock`` is held.
     """
     # ``async with`` replaces ``with await lock``, which was removed from
     # asyncio in Python 3.9.
     async with self.lock:
         self.sucess_proxy_num += 1
         proxylog.info('Now Sucess Proxy Number is {}'.format(
             self.sucess_proxy_num))
Esempio n. 5
0
 async def total_proxy_count(self):
     """Increment ``self.total_proxy_num``, logging every 50th value.

     The update and the log call both happen while ``self.lock`` is held.
     """
     # ``async with`` replaces ``with await lock``, which was removed from
     # asyncio in Python 3.9.
     async with self.lock:
         self.total_proxy_num += 1
         # Only every 50th increment is logged.
         if self.total_proxy_num % 50 == 0:
             proxylog.info('Now Total Proxy Number is {} '.format(
                 self.total_proxy_num))
Esempio n. 6
0
    async def work(self):
        """Consume seeds from ``self.proxyq`` until the task is cancelled.

        Each seed is a mapping whose ``'flag'`` selects the action:

        * ``1`` -- download ``seed['url']`` with ``self.page_download``.
        * ``2`` -- validate the proxy ``seed['ip']:seed['port']``; when it
          is valid, record its speed, protocol and type on the seed and
          insert it through ``self.sqlhelper``.
        * ``3`` -- re-validate the proxy; update its speed when it is
          valid, delete it through ``self.sqlhelper`` otherwise.

        Seeds with any other flag are ignored.  Every seed taken from the
        queue is marked done exactly once, whatever happens while it is
        processed, so ``self.proxyq.join()`` cannot hang on a seed that
        failed or was not recognised.
        """
        try:
            while True:
                seed = await self.proxyq.get()
                try:
                    seed_flag = seed['flag']
                    if seed_flag == 1:
                        await self.page_download(seed['url'], seed)
                    elif seed_flag == 2:
                        proxys = 'http://%s:%s' % (seed['ip'], seed['port'])
                        valid, speed = await self.proxy_validator(proxys)
                        if valid:
                            try:
                                seed['speed'] = speed
                                protocol, proxy_types = (
                                    await self.protocolcheck(proxys))
                                seed['protocol'] = protocol
                                seed['proxytypes'] = proxy_types
                                self.sqlhelper.insert(seed)
                                # ``async with`` replaces the
                                # ``with await lock`` form removed in
                                # Python 3.9.
                                async with self.lock:
                                    proxylog.info(
                                        'sucess insert {} into database'
                                        .format(proxys))
                            except Exception:
                                async with self.lock:
                                    proxylog.exception(
                                        'Failed insert {} into database'
                                        .format(proxys))
                    elif seed_flag == 3:
                        ip = seed['ip']
                        port = seed['port']
                        proxys = 'http://%s:%s' % (ip, port)
                        valid, speed = await self.proxy_validator(proxys)
                        if valid:
                            try:
                                self.sqlhelper.update(
                                    {'ip': ip, 'port': port},
                                    {'speed': speed})
                                async with self.lock:
                                    proxylog.info(
                                        'sucess update {} in database'
                                        .format(proxys))
                            except Exception:
                                async with self.lock:
                                    proxylog.exception(
                                        'Failed update {} in database'
                                        .format(proxys))
                        else:
                            try:
                                self.sqlhelper.delete(seed)
                            except Exception:
                                async with self.lock:
                                    proxylog.exception(
                                        'Failed delete {} in database'
                                        .format(proxys))
                except asyncio.CancelledError:
                    # Never swallow cancellation (it is an ``Exception``
                    # subclass before Python 3.8).
                    raise
                except Exception:
                    # One bad seed must not kill the worker: with every
                    # worker dead the queue would never drain.
                    async with self.lock:
                        proxylog.exception(
                            'Failed to process seed {}'.format(seed))
                finally:
                    # Single accounting point for the seed fetched above.
                    self.proxyq.task_done()
        except asyncio.CancelledError:
            # Cancellation is the normal way for the worker to stop.
            pass