Example 1
 async def crawl(self):
     """Run the crawler, returning an empty list if anything goes wrong."""
     logger.info(f'{self._name} starting crawl...')
     try:
         return await self.do_crawl()
     except Exception as e:
         logger.exception(f'{self._name} crawl failed: {e}')
     return []
Example 2
 def get_all_proxies(self):
     """Return every stored proxy, or an empty list on failure."""
     session = self._DBSession()
     try:
         return session.query(ProxyEntity).all()
     except Exception as e:
         logger.exception(e)
     finally:
         session.close()
     return []
Example 3
 def get_all_in_page(self):
     """Return all proxies with positive reliability, or None on failure."""
     session = self._DBSession()
     try:
         return session.query(ProxyEntity).filter(
             ProxyEntity.reliability > 0).all()
     except Exception as e:
         logger.exception(e)
     finally:
         session.close()
     return None
Example 4
 def get_one_in_page(self):
     """Return the proxy with the highest reliability, or None."""
     session = self._DBSession()
     try:
         return session.query(ProxyEntity).order_by(
             desc(ProxyEntity.reliability)).first()
     except Exception as e:
         logger.exception(e)
     finally:
         session.close()
     return None
Example 5
 def get_unknown_anonymity_proxies(self):
     """Return usable proxies (reliability > 0) whose anonymity is still unknown."""
     session = self._DBSession()
     try:
         return (session.query(ProxyEntity).filter(
             ProxyEntity.reliability > 0).filter(
                 ProxyEntity.proxy_cover ==
                 ProxyCoverEnum.UNKNOWN.value).all())
     except Exception as e:
         logger.exception(e)
     finally:
         session.close()
     return []
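Examples 2 to 5 repeat the same open-session / query / log / close boilerplate. Below is a minimal sketch of how that pattern could be factored into one helper; run_query, query_fn and default are illustrative names, not part of the original project.

 def run_query(session_factory, query_fn, default=None):
     """Hypothetical helper: open a session, run query_fn(session), log any
     exception, always close the session, and fall back to `default` on
     failure, mirroring the DAO methods above."""
     session = session_factory()
     try:
         return query_fn(session)
     except Exception as e:
         logger.exception(e)
         return default
     finally:
         session.close()

 # Example 2 could then be written as:
 # def get_all_proxies(self):
 #     return run_query(self._DBSession,
 #                      lambda s: s.query(ProxyEntity).all(), default=[])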
Example 6
 def update_anonymity(self, url, value):
     """Update the proxy_cover (anonymity) flag of the proxy stored at `url`."""
     conn = self._get_connect()
     cursor = conn.cursor()
     try:
         # Values are interpolated straight into the SQL string, which is
         # open to injection; see the parameterized sketch after this example.
         cursor.execute(f"""
         UPDATE {DB["table_name"]} SET proxy_cover = {value}
         WHERE url='{url}'
         """)
         conn.commit()
     except Exception as e:
         logger.exception(e)
     finally:
         cursor.close()
         conn.close()
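Because Example 6 formats `value` and `url` directly into the statement, a crafted URL could change the SQL. A hedged alternative sketch is shown below; it assumes a DB-API driver that uses %s placeholders (e.g. pymysql), which the original snippet does not confirm, and the table name still comes from trusted config because identifiers cannot be bound as parameters.

 def update_anonymity(self, url, value):
     conn = self._get_connect()
     cursor = conn.cursor()
     try:
         # The driver escapes the bound values; only the table name
         # (trusted config) is formatted into the string.
         cursor.execute(
             f'UPDATE {DB["table_name"]} SET proxy_cover = %s WHERE url = %s',
             (value, url))
         conn.commit()
     except Exception as e:
         logger.exception(e)
     finally:
         cursor.close()
         conn.close()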
Example 7
 async def crawl(self):
     """Crawl every configured URL page by page and collect the parsed proxies."""
     logger.info(f'{self._name} starting crawl...')
     res = []
     for url in self._urls:
         try:
             for page in self.get_page_range():
                 async with aiohttp.ClientSession() as session:
                     async with session.get(self.get_page_url(url, page),
                                            headers=HEADERS) as resp:
                         # aiohttp ignores assignments to resp.encoding, so the
                         # desired encoding is passed to text() directly.
                         body = await resp.text(encoding=self.get_encoding())
                         res.extend(self.do_crawl(body))
                         await asyncio.sleep(self.get_interval())
         except Exception as e:
             logger.exception(f'{self._name} crawl failed, url: {url}: {e}')
     return res
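Example 7 opens a fresh aiohttp.ClientSession for every single page. The sketch below keeps one session per URL so connections are pooled across pages; the attributes and helper methods it calls (HEADERS, get_page_range, get_page_url, get_encoding, get_interval, do_crawl) are taken from the example above, while the restructuring itself is only an illustration.

 async def crawl(self):
     logger.info(f'{self._name} starting crawl...')
     res = []
     for url in self._urls:
         try:
             # One session per URL: the connection pool is reused across pages
             # instead of being rebuilt for every request.
             async with aiohttp.ClientSession() as session:
                 for page in self.get_page_range():
                     async with session.get(self.get_page_url(url, page),
                                            headers=HEADERS) as resp:
                         body = await resp.text(encoding=self.get_encoding())
                         res.extend(self.do_crawl(body))
                     await asyncio.sleep(self.get_interval())
         except Exception as e:
             logger.exception(f'{self._name} crawl failed, url: {url}: {e}')
     return res

A concrete crawler subclass would then be driven with something like asyncio.run(crawler.crawl()).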