async def crawl(self):
    logger.info(f'{self._name} started crawling...')
    try:
        return await self.do_crawl()
    except Exception as e:
        logger.exception(f'{self._name} crawl failed: {e}')
        return []
def get_all_proxies(self):
    session = self._DBSession()
    try:
        return session.query(ProxyEntity).all()
    except Exception as e:
        logger.exception(e)
    finally:
        session.close()
    return []
def get_all_in_page(self):
    session = self._DBSession()
    try:
        return session.query(ProxyEntity).filter(
            ProxyEntity.reliability > 0).all()
    except Exception as e:
        logger.exception(e)
    finally:
        session.close()
    # return an empty list (not None) on failure, consistent with the other
    # list-returning helpers
    return []
def get_one_in_page(self):
    session = self._DBSession()
    try:
        return session.query(ProxyEntity).order_by(
            desc(ProxyEntity.reliability)).first()
    except Exception as e:
        logger.exception(e)
    finally:
        session.close()
    return None
def get_unknown_anonymity_proxies(self):
    session = self._DBSession()
    try:
        return (session.query(ProxyEntity)
                .filter(ProxyEntity.reliability > 0)
                .filter(ProxyEntity.proxy_cover == ProxyCoverEnum.UNKNOWN.value)
                .all())
    except Exception as e:
        logger.exception(e)
    finally:
        session.close()
    return []
def update_anonymity(self, url, value):
    conn = self._get_connect()
    cursor = conn.cursor()
    try:
        # Parameterized query instead of f-string interpolation to avoid SQL
        # injection; the placeholder style is driver-specific (%s for pymysql,
        # ? for sqlite3), so adjust it to match self._get_connect().
        cursor.execute(
            f'UPDATE {DB["table_name"]} SET proxy_cover = %s WHERE url = %s',
            (value, url))
        conn.commit()
    except Exception as e:
        logger.exception(e)
    finally:
        cursor.close()
        conn.close()
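# A minimal ORM-based sketch of the same update, assuming ProxyEntity maps the
# url and proxy_cover columns and that self._DBSession is available here (as it
# is for the query helpers above); staying inside the SQLAlchemy session avoids
# driver-specific placeholder styles altogether.
def update_anonymity_orm(self, url, value):
    session = self._DBSession()
    try:
        session.query(ProxyEntity) \
            .filter(ProxyEntity.url == url) \
            .update({ProxyEntity.proxy_cover: value})
        session.commit()
    except Exception as e:
        logger.exception(e)
        session.rollback()
    finally:
        session.close()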
async def crawl(self):
    logger.info(f'{self._name} started crawling...')
    res = []
    for url in self._urls:
        try:
            for page in self.get_page_range():
                async with aiohttp.ClientSession() as session:
                    async with session.get(self.get_page_url(url, page),
                                           headers=HEADERS) as resp:
                        # aiohttp's ClientResponse has no settable encoding
                        # attribute; pass the page encoding to text() instead.
                        temp = self.do_crawl(
                            await resp.text(encoding=self.get_encoding()))
                        res.extend(temp)
                await asyncio.sleep(self.get_interval())
        except Exception as e:
            logger.exception(f'{self._name} crawl failed, url: {url}, e: {e}')
    return res
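# A hedged usage sketch for driving the crawlers: SomeConcreteCrawler is a
# hypothetical subclass name; only the awaitable crawl() contract returning a
# list of proxies comes from the methods above.
import asyncio

async def main():
    crawlers = [SomeConcreteCrawler()]
    results = await asyncio.gather(*(c.crawl() for c in crawlers))
    proxies = [proxy for sub in results for proxy in sub]
    logger.info(f'fetched {len(proxies)} proxies in total')

if __name__ == '__main__':
    asyncio.run(main())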