def config():
    """Site definition for 快代理 (kuaidaili) free proxy listings.

    Enumerates both the 'inha' (domestic) and 'intr' (international)
    sections, pages 1-9 each.
    """
    site = SiteData()
    site.name = '快代理'
    site.enabled = True
    template = 'https://www.kuaidaili.com/free/{}/{}'
    site.pages = []
    for section in ('inha', 'intr'):
        for page_no in range(1, 10):
            site.pages.append(template.format(section, page_no))
    return site
def config():
    """Site definition for 免费代理IP库 (jiangxianli), pages 1-4."""
    site = SiteData()
    site.name = '免费代理IP库'
    page_urls = []
    for page_no in range(1, 5):
        page_urls.append('http://ip.jiangxianli.com/?page=%d' % page_no)
    site.pages = page_urls
    return site
def config():
    """Site definition for 齐乐分享 — a single bulk-export API endpoint."""
    site = SiteData()
    site.name = '齐乐分享'
    # One fixed export URL (tqsl=1000 asks the endpoint for up to 1000 entries).
    export_url = 'https://bbs.76fx.com/ip/pt.php?sxb=&tqsl=1000&port=&export=&ktip=&sxa=&Api=2'
    site.pages = [export_url]
    return site
def config():
    """Site definition for the clarketm/proxy-list raw text dump on GitHub."""
    site = SiteData()
    site.name = 'Github proxy list'
    raw_list_url = 'https://raw.githubusercontent.com/clarketm/proxy-list/master/proxy-list-raw.txt'
    site.pages = [raw_list_url]
    return site
def config():
    """Site definition for 云代理 ip3366: types 1-2, pages 1-4 of each."""
    site = SiteData()
    site.name = '云代理 ip3366'
    site.enabled = True
    urls = []
    for proxy_type in range(1, 3):
        for page_no in range(1, 5):
            urls.append('http://www.ip3366.net/free/?stype=%s&page=%s' % (proxy_type, page_no))
    site.pages = urls
    return site
def config():
    """Site definition for 小幻HTTP代理 (ip.ihuan.me).

    Pagination is driven dynamically from base_url/current_page rather than
    a precomputed page list; fetching goes through an upstream proxy.
    """
    site = SiteData()
    site.name = '小幻HTTP代理'
    site.use_proxy = True
    site.base_url = 'https://ip.ihuan.me/'
    site.pages = [site.base_url]
    # Walk at most 20 pages starting from page 1.
    site.current_page = 1
    site.page_limit = 20
    return site
def config():
    """Site definition for 西刺代理 (xicidaili).

    Covers the four listing categories (nn/nt/wn/wt), pages 1-4 each,
    and fetches through an upstream proxy.
    """
    site = SiteData()
    site.name = '西刺代理'
    site.enabled = True
    site.use_proxy = True
    categories = ('nn', 'nt', 'wn', 'wt')
    site.pages = []
    for category in categories:
        for page_no in range(1, 5):
            site.pages.append('http://www.xicidaili.com/{}/{}'.format(category, page_no))
    return site
async def crawl_site(self, site: SiteData, page_limit: int = 0):
    """Crawl the configured pages of *site* within a single HTTP session.

    A ``page_limit`` of 0 means "visit every page"; any other value
    restricts the crawl to the first ``page_limit`` entries of
    ``site.pages``. Pages that exhaust their retries are logged and
    skipped; the per-site politeness interval is honored after every
    page, successful or not.
    """
    request_headers = {'User-Agent': self.get_user_agent()}
    request_headers.update(site.headers)
    timeout = aiohttp.ClientTimeout(Config.DEFAULT_REQUEST_TIME_OUT)
    async with aiohttp.ClientSession(timeout=timeout, headers=request_headers) as session:
        if page_limit == 0:
            targets = site.pages
        else:
            targets = site.pages[:page_limit]
        for url in targets:
            try:
                await self.crawl_single_page(session, site, site.to_request(url))
            except MaxRetryException as e:
                # Give up on this page only; carry on with the rest.
                Logger.warn('[get] Max retry skip, message: %s' % str(e))
                continue
            finally:
                # Politeness delay between pages, even after a failure.
                if site.page_interval:
                    await asyncio.sleep(site.page_interval)
def config():
    """Site definition for the Spys.me plain-text proxy list."""
    site = SiteData()
    site.name = 'Spys.me'
    list_url = 'http://spys.me/proxy.txt'
    site.pages = [list_url]
    return site
def config():
    """Site definition for Proxy daily — a single landing page."""
    site = SiteData()
    site.name = 'Proxy daily'
    landing_page = 'https://proxy-daily.com/'
    site.pages = [landing_page]
    return site
def config():
    """Site definition for 全网代理IP (goubanjia) — single front page."""
    site = SiteData()
    site.name = '全网代理IP'
    front_page = 'http://www.goubanjia.com/'
    site.pages = [front_page]
    return site