class ServerCtrlMiddleware(Middleware):

    logger = getLogger(__name__)

    def process_request(self, request):
        s = request.spider
        if s.status == 'PAUSE':
            self.logger.debug(f'PAUSE {s.name} {request}')
            s._hanged.append(request)
            if s.urlfilter:
                s.urlfilter.delete(_to_feature(request))
            raise DropRequest
        elif s.status == 'RUNNING':
            return request
        elif s.status in ['STOP', 'CLOSE']:
            if s.urlfilter:
                s.urlfilter.delete(_to_feature(request))
            self.logger.debug(f'STOP/CLOSE {s.name} {request}')
            raise DropRequest

    def process_response(self, response):
        spider = response.spider
        if spider.status == 'STOP':
            raise DropResponse
        return response

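# Illustrative sketch (not part of the source): how a spider's status gates
# traffic through ServerCtrlMiddleware. The status names are those checked in
# the branches above ('CREATED' is the initial value set by BaseSpider below);
# `spider` stands in for a real spider instance.
#
#   spider.status = 'RUNNING'  # requests pass through unchanged
#   spider.status = 'PAUSE'    # requests are parked in spider._hanged, then dropped
#   spider.status = 'STOP'     # requests are dropped and in-flight responses discarded
#   spider.status = 'CLOSE'    # requests are dropped, like 'STOP'
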
def __init__(self, settings):
    self.req_limits = settings.gets('CONCURRENCY')
    self.recv_req = []
    self.waiting = False
    self.spiders = None
    self.logger = getLogger(__name__)
    self.logger.debug('Loaded scheduler.')

def __init__(self, settings):
    super(WebCrawler, self).__init__()
    self.settings = settings
    self.logger = getLogger(__name__)
    self.semaphore = asyncio.Semaphore(
        self.settings['project'].CONCURRENCY)
    self._install_requester()

def __init__(self, settings):
    self.settings = settings
    self._spiders = {}
    self.project_path = settings['project'].PROJECT_NAME
    self._found = defaultdict(list)
    self.warn_only = True
    self.logger = getLogger(__name__)

def __init__(self, settings, spiders):
    self._settings = settings
    self.spiders = spiders
    self._attrs = ('mw', 'resp_mw', 'req_mw')
    self.logger = getLogger(__name__)
    MiddleWareManager.logger = self.logger
    self.load_middlewares()

def __init__(self, settings):
    self.settings = settings
    self.spider_loader = SpiderLoader(settings)
    self.spiders = self.spider_loader.load_all_spiders()
    self.crawler = WebCrawler(settings)
    self.scheduler = Scheduler(settings)
    self.looper = Looper()
    self.spider_hub = SpiderHub(settings, self.crawler)
    self.logger = getLogger(__name__)

def __init__(self, *args, **kwargs):
    super(BaseSpider, self).__init__(*args, **kwargs)
    self.status = 'CREATED'
    self.requests = []
    self.session = None
    self.logger = getLogger(__name__)
    self._load_settings()
    self._load_filters()
    self._init_session()

def __init__(self, settings, crawler):
    super(SpiderHub, self).__init__()
    self.settings = settings
    self._success_counter = 0
    self._failed_counter = 0
    self._exception_counter = 0
    self.active = False
    self.looper = None
    self._crawler = crawler
    self.logger = getLogger(__name__)
    self._set_queue()

class MediaRequester(CrawlRequester):

    _down_type = 'media'
    logger = getLogger(__name__)

    async def crawl(self, request):
        delay = request.delay
        url = request.url
        session = request.spider.session
        proxy = request.proxy
        buffer = request.spider.settings.DEFAULT_DOWNLOAD_BUFFER
        path = os.path.normpath(request.save_path)
        # refuse to download into a directory that does not exist
        if not os.path.exists(os.path.dirname(path)):
            self.logger.error(f'No path: {os.path.dirname(path)}.')
            return
        name = os.path.basename(path)
        try:
            self.logger.info(f'Downloading {name}.')
            async with self._crawler.semaphore:
                resp = await send_async_http(
                    session,
                    request.method,
                    url,
                    path=path,
                    retries=request.retry,
                    timeout=request.timeout,
                    proxies=proxy,
                    buffer=buffer
                )
                if resp is None:
                    return
                body = resp['body']
                exception = resp['exception']
                if exception and body is not True:
                    return Response(url, status=-1, request=request,
                                    exc=exception)
                await asyncio.sleep(delay)
                size = get_file_size(size=int(resp['size']))
                self.logger.info(f'Finished downloading: [{name} {size}]')
                return
        except asyncio.CancelledError:
            self.logger.info(f'Task "{request}" canceled.')
            return Response(url, status=0, request=request)
        except Exception as e:
            return Response(url, status=-1, request=request, exc=e.__class__())

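# Assumed shape of the dict returned by send_async_http for a media download,
# inferred only from the keys read above ('body', 'exception', 'size'); this is
# a sketch of the contract, not its definitive definition:
#
#   resp = {
#       'body': True,        # True once the file has been streamed to `path`
#       'exception': None,   # the exception raised during the download, if any
#       'size': 1048576,     # downloaded size in bytes (illustrative value)
#   }
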
class RetryPagesMiddleware(Middleware):

    logger = getLogger(__name__)

    def process_response(self, response):
        spider = response.spider
        codes = spider.settings.REQUESTS_ERROR_RETRY_STATUS
        if spider.settings.REQUESTS_FAIL_RETRY_ENABLE:
            if response.status != 200:
                # retry on configured error statuses (but not local failures,
                # status -1) or on known retryable exceptions
                if (response.status in codes and response.status != -1) or \
                        (response.exception.__class__ in exceptions):
                    _tried = response.request._tried
                    if _tried > spider.settings.REQUESTS_FAIL_RETRY_DEPTH:
                        return response
                    response.request._tried = _tried + 1
                    response.request.proxy = None
                    self.logger.debug(
                        f'{response.request} scheduled to retry. Tried: {_tried}'
                    )
                    spider._retries.append(response.request)
        return response

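# Hypothetical settings sketch for RetryPagesMiddleware. The option names are
# exactly those read above; the values are illustrative defaults, not the
# project's actual ones:
#
#   REQUESTS_FAIL_RETRY_ENABLE = True
#   REQUESTS_ERROR_RETRY_STATUS = [403, 500, 502, 503]
#   REQUESTS_FAIL_RETRY_DEPTH = 3
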
def __init__(self):
    self.rparser = {}
    self.rubbish = set()
    self.logger = getLogger(__name__)

def __init__(self):
    self.loop = asyncio.get_event_loop()
    self.logger = getLogger(__name__)

class HttpProxyMiddleware(Middleware):

    inited = False
    invalid_pool = {}
    proxy_pool = set()
    logger = getLogger(__name__)

    def _proxy_invalid(self, proxy, url):
        # a proxy is considered invalid only per domain
        domain = parse_url(url).netloc
        if proxy in self.invalid_pool:
            if domain in self.invalid_pool[proxy]:
                return True
        return False

    def process_request(self, request):
        if not request.spider.settings.HTTP_PROXY_ENABLE:
            request.proxy = None
            return request
        _type = request.down_type
        proxy = request.proxy
        url = request.url
        if proxy:
            if not is_proxy_valid(proxy):
                if request.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    request.proxy = self.get_proxy(request)
                    if request.proxy:
                        self.logger.warning(
                            f'Filling a new proxy {request.proxy} to {url}.')
                else:
                    self.logger.error(f'Not a valid http proxy: {proxy}')
                    request.proxy = None
                return request
            elif self._proxy_invalid(proxy, url):
                self.logger.warning(
                    f'Proxy {proxy} was invalid for {url} before.')
                if request.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    request.proxy = self.get_proxy(request)
                    if request.proxy:
                        self.logger.warning(
                            f'Filling a new proxy {request.proxy} to {url}.')
                else:
                    self.logger.warning(f'Dropped proxy {proxy} for {url}.')
                    request.proxy = None
                return request
            request.proxy = gen_proxy(proxy, _type)
            self.logger.debug(
                f'[{request.spider.name}]Using proxy {request.proxy} '
                f'for {request.method}-{request.url}')
        else:
            _proxy = None
            while 1:
                _proxy = self.get_proxy(request)
                if _proxy is None:
                    break
                proxy = extract_ip_port(_proxy)
                if self._proxy_invalid(proxy, url):
                    continue
                break
            request.proxy = _proxy
        return request

    def process_response(self, response):
        settings = response.spider.settings
        fakes = settings.HTTP_PROXY_FAKE_STATUS
        domain = parse_url(response.url).netloc
        if not response.spider.settings.HTTP_PROXY_ENABLE:
            return response
        if response.request.proxy and response.status != 200 \
                and response.status not in fakes:
            # mark the proxy as invalid for this domain
            proxy = extract_ip_port(response.request.proxy)
            if proxy not in self.invalid_pool:
                self.invalid_pool[proxy] = set()
            self.logger.debug(f'Proxy {proxy} is invalid for {domain}.')
            self.invalid_pool[proxy].add(domain)
        elif response.request.proxy and (response.status == 200
                                         or response.status in fakes):
            # the proxy worked: clear any invalid mark and recycle it
            proxy = extract_ip_port(response.request.proxy)
            if proxy in self.invalid_pool:
                self.invalid_pool[proxy].discard(domain)
            self.proxy_pool.add(proxy)
        return response

    def get_proxy(self, req):
        http_proxy = req.spider.settings.HTTP_PROXY
        if http_proxy:
            if is_proxy_valid(http_proxy):
                proxy = gen_proxy(http_proxy, req.down_type)
                return proxy
            elif is_url(http_proxy):
                return http_proxy
            else:
                if not req.spider.settings.HTTP_PROXY_FILL_ENABLE:
                    self.logger.debug(f'Invalid proxy format: {http_proxy}')
                    return
        _proxy = self.get_proxy_by_api(req)
        proxy = gen_proxy(_proxy, req.down_type)
        return proxy

    def get_proxy_by_api(self, request):
        domain = parse_url(request.url).netloc

        def _get_from_pool():
            while self.proxy_pool:
                proxy = self.proxy_pool.pop()
                if proxy not in self.invalid_pool or \
                        (domain not in self.invalid_pool.get(proxy)):
                    return proxy
                else:
                    continue

        proxy = _get_from_pool()
        if not proxy:
            self.logger.debug('No proxy in proxy pool. Getting some.')
            while 1:
                spider = request.spider
                req = amipy.Request(spider, spider.settings.HTTP_PROXY_API,
                                    delay=0, ignore=True)
                crawler = spider.binding_hub._crawler
                looper = spider.binding_hub.looper
                coro = crawler.requesters[req.down_type].crawl(req)
                resp = looper.run_coroutine(coro)
                if not resp:
                    # resp may be None here, so its status cannot be reported
                    self.logger.error('Getting a http proxy by api failed.')
                    continue
                _results = [i.strip() for i in resp.text().split('\n')]
                results = [is_proxy_valid(i)[0]
                           for i in _results if is_proxy_valid(i)]
                self.proxy_pool.update(results)
                self.logger.debug(
                    f'Got {len(results)} http proxies from HTTP_PROXY_API.')
                proxy = _get_from_pool()
                if not proxy:
                    continue
                break
        return proxy

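# Hypothetical settings sketch for HttpProxyMiddleware. The option names mirror
# the attribute accesses above; the values and the API URL are illustrative
# assumptions only:
#
#   HTTP_PROXY_ENABLE = True
#   HTTP_PROXY = None              # or a fixed proxy, e.g. 'http://127.0.0.1:8080'
#   HTTP_PROXY_API = 'http://proxy-api.example.com/fetch'  # newline-separated proxies
#   HTTP_PROXY_FILL_ENABLE = True  # replace invalid proxies automatically
#   HTTP_PROXY_FAKE_STATUS = [403] # statuses that don't invalidate the proxy
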