# Imports needed by the revisions below. NOTE: the frontera-internal paths
# (manager, settings_adapter, StatsManager) are assumptions based on the
# frontera package layout.
from collections import deque
from logging import getLogger
from time import time

import six
from scrapy.core.scheduler import Scheduler
from scrapy.http import Request

from frontera.contrib.scrapy.manager import ScrapyFrontierManager
from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter
from frontera.contrib.scrapy.schedulers import StatsManager


class FronteraScheduler(Scheduler):

    def __init__(self, crawler, manager=None):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings, manager)
        self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
        self._delay_next_call = 0.0
        self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        frontier_request = response.meta[b'frontier_request']
        self.frontier.page_crawled(response)  # removed frontier part from .meta
        # putting it back, to persist .meta from the original request
        response.meta[b'frontier_request'] = frontier_request
        self.frontier.links_extracted(response.request, links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        # Pull a new batch from the frontier only while the downloader has
        # spare concurrency and the DELAY_ON_EMPTY back-off has expired.
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():
            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info['key_type'],
                                                       overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get(b'redirect_times', 0) > 0

    def _get_downloader_info(self):
        # Tell the frontier whether downloader slots are keyed by IP or by
        # domain, and which keys currently exceed OVERUSED_SLOT_FACTOR.
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in six.iteritems(downloader.slots):
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
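# Scrapy itself never calls process_spider_output() or process_exception() on
# a scheduler; the companion SchedulerSpiderMiddleware and
# SchedulerDownloaderMiddleware (required per the docstring of the final
# revision below) forward those hooks here. A minimal sketch of that
# delegation, assuming the middlewares simply look the scheduler up on the
# running engine; illustrative, not the verbatim frontera implementation:


class BaseSchedulerMiddleware(object):

    def __init__(self, crawler):
        self.crawler = crawler

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    @property
    def scheduler(self):
        # the scheduler instance lives on the engine slot of the running crawl
        return self.crawler.engine.slot.scheduler


class SchedulerSpiderMiddleware(BaseSchedulerMiddleware):

    def process_spider_output(self, response, result, spider):
        # requests yielded by the spider are diverted into the frontier,
        # everything else is re-yielded untouched
        return self.scheduler.process_spider_output(response, result, spider)


class SchedulerDownloaderMiddleware(BaseSchedulerMiddleware):

    def process_exception(self, request, exception, spider):
        # download failures are reported to the frontier as request errors
        return self.scheduler.process_exception(request, exception, spider)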
# This revision reads FRONTERA_SETTINGS directly and logs through the legacy
# (Python 2 era) scrapy.log API:
from scrapy import log


class FronteraScheduler(Scheduler):

    def __init__(self, crawler):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        frontier_settings = crawler.settings.get('FRONTERA_SETTINGS', None)
        if not frontier_settings:
            log.msg('FRONTERA_SETTINGS not found! Using default Frontera settings...', log.WARNING)
        self.frontier = ScrapyFrontierManager(frontier_settings)
        self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
        self._delay_next_call = 0.0

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        self.frontier.page_crawled(response=response, links=links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        log.msg('Starting frontier', log.INFO)
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        log.msg('Finishing frontier (%s)' % reason, log.INFO)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return len(self) > 0

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():
            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info['key_type'],
                                                       overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get('redirect_times', 0) > 0

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in downloader.slots.iteritems():
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
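# The revision above is configured through FRONTERA_SETTINGS in the Scrapy
# project settings, naming a separate Frontera settings module. A minimal
# sketch of that wiring; the module path 'myproject.frontera_settings' is a
# hypothetical name, while BACKEND, MAX_NEXT_REQUESTS and DELAY_ON_EMPTY are
# standard Frontera settings:

# settings.py (Scrapy side)
FRONTERA_SETTINGS = 'myproject.frontera_settings'

# myproject/frontera_settings.py (Frontera side)
BACKEND = 'frontera.contrib.backends.memory.FIFO'
MAX_NEXT_REQUESTS = 64
DELAY_ON_EMPTY = 5.0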
class FronteraScheduler(Scheduler):

    def __init__(self, crawler):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings)
        self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
        self._delay_next_call = 0.0
        self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        self.frontier.page_crawled(response=response, links=links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():
            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info['key_type'],
                                                       overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get('redirect_times', 0) > 0

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in downloader.slots.iteritems():
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
class FronteraScheduler(Scheduler):
    """
    Custom Scheduler for Scrapy. Adapts the Frontera manager interface to
    Scrapy.

    Important remarks:

    - it doesn't enqueue the majority of requests produced by middlewares or
      by direct calls to the engine; see the enqueue_request method, and
      override it if needed,
    - it requires SchedulerSpiderMiddleware and SchedulerDownloaderMiddleware.
    """

    def __init__(self, crawler, manager=None):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings, manager)
        self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
        self._redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        self._delay_next_call = 0.0
        self.logger = getLogger('frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        # add directly to the in-memory queue if the request arrives as part
        # of a redirect chain from RedirectMiddleware
        if self._redirect_enabled and self._request_is_redirected(request):
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        # add as a seed if the request is explicitly marked as one
        if self._request_is_seed(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        self.logger.warning("Request to URL %s is skipped.", request.url)
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        frontier_request = response.meta[b'frontier_request']
        self.frontier.page_crawled(response)  # removed frontier part from .meta
        # putting it back, to persist .meta from the original request
        response.meta[b'frontier_request'] = frontier_request
        self.frontier.links_extracted(response.request, links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():
            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info['key_type'],
                                                       overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get('redirect_times', 0) > 0

    def _request_is_seed(self, request):
        return bool(request.meta.get('seed', False))

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in six.iteritems(downloader.slots):
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
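# Wiring the final revision into a Scrapy project. The SCHEDULER and
# middleware paths follow the frontera package layout, and the priority 1000
# mirrors frontera's documented examples; both are assumptions, not taken
# from the code above.

# settings.py
SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'
SPIDER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
}
DOWNLOADER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
}

# In this revision only requests explicitly marked as seeds enter the
# frontier; everything else a spider yields reaches it via links_extracted().
# A minimal spider sketch (the spider name and URL are illustrative):
from scrapy import Spider, Request


class SeedingSpider(Spider):
    name = 'seeding_example'

    def start_requests(self):
        # meta={'seed': True} is what _request_is_seed() checks for
        yield Request('http://example.com', meta={'seed': True})

    def parse(self, response):
        # plain requests are collected by process_spider_output() as links
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)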