async def parse_site(self, session, site: SiteData, resp: SiteResponse):
    # Look up the parser registered for this site; silently skip unknown sites.
    parser = self._parsers.get(site.key)
    if not parser:
        return
    try:
        result = parser(resp)
        if self._test_model:
            # Test mode: show the result instead of persisting it.
            await self.show_result(session, site, result, resp=resp)
        else:
            await self.save_parse_result(session, site, result)
    except Exception as e:
        Logger.error('[get] Parse error, message: %s' % str(e))
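# Hypothetical sketch (not from the original module) of how a parser might be
# registered so parse_site can find it: a parser is just a callable taking a
# SiteResponse and returning whatever save_parse_result / show_result expect.
# The parse_example function, the 'example-site' key, and the assumption that
# SiteResponse exposes the page body as `.text` are all illustrative.
def parse_example(resp: SiteResponse):
    # e.g. pull "ip:port" pairs out of the raw page text
    return [line.strip() for line in resp.text.splitlines() if ':' in line]

# self._parsers is keyed by site.key, so registration could look like:
#     crawler._parsers['example-site'] = parse_example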
async def main():
    argv = sys.argv[1] if len(sys.argv) > 1 else None
    # A URL argument means "fetch the IP list over HTTP" instead of from disk.
    if argv and '://' in argv:
        return await load_from_url(argv)
    # Otherwise collect every *.ip.txt file in the current directory.
    ip_file_lists = [name for name in os.listdir('.')
                     if name.endswith('.ip.txt')]
    if argv:
        if argv not in ip_file_lists:
            Logger.error("file %s doesn't exist" % argv)
            return
        ip_file_lists = [argv]
    for fn in ip_file_lists:
        await load_file(fn)
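# Usage sketch (assumed; the real script name is not shown in this section):
#
#     python crawler.py                          # parse every *.ip.txt in cwd
#     python crawler.py 20240101.ip.txt          # parse one specific file
#     python crawler.py https://example.com/ips  # fetch the list from a URL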
async def crawl_single_page(self, session, site, request: SiteRequestData):
    proxy = None
    if request.use_proxy:
        # Pick an HTTPS-capable proxy when the target URL is https.
        random_proxy = await IPFactory.get_random_ip(
            request.url.startswith('https'))
        if random_proxy:
            proxy = random_proxy.to_http()
    try:
        async with session.get(request.url, proxy=proxy) as resp:
            text = await resp.text()
            if not text:
                raise EmptyResponseException('empty text')
            site_resp = SiteResponse(text, url=request.url, site=site)
            await self.parse_site(session, site, site_resp)
    except Exception as e:
        Logger.error('[get] Get page %s error, message: %s'
                     % (request.url, str(e)))
        # Surface the failure as a RetryException so the caller can retry.
        raise RetryException() from e
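# Hypothetical caller-side retry loop (not part of the original module):
# crawl_single_page signals transient failures by raising RetryException, so a
# driver can retry a bounded number of times. The crawl_with_retry name and
# max_retries parameter are illustrative.
async def crawl_with_retry(self, session, site, request: SiteRequestData,
                           max_retries=3):
    for _ in range(max_retries):
        try:
            return await self.crawl_single_page(session, site, request)
        except RetryException as e:
            # log the failure and back off before the next attempt
            await self.handle_task_exception(e)
    Logger.error('[get] Giving up on %s after %d attempts'
                 % (request.url, max_retries))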
async def handle_task_exception(self, e):
    Logger.error('[error] ' + str(e))
    # Back off briefly before the caller retries.
    await asyncio.sleep(5)
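# Assumed entry point: the module's actual startup code is not shown in this
# section, but the standard asyncio idiom for driving main() would be:
if __name__ == '__main__':
    asyncio.run(main())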