Example #1
# Imports assumed by these examples (paths per frontera's source layout;
# adjust to your installed version). StatsManager is a small stats-recording
# helper defined alongside this scheduler in
# frontera.contrib.scrapy.schedulers.frontier, so it needs no import there.
from collections import deque
from logging import getLogger
from time import time

import six
from scrapy.core.scheduler import Scheduler
from scrapy.http import Request

from frontera.contrib.scrapy.manager import ScrapyFrontierManager
from frontera.contrib.scrapy.settings_adapter import ScrapySettingsAdapter


class FronteraScheduler(Scheduler):
    def __init__(self, crawler, manager=None):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings, manager)
        self._delay_on_empty = self.frontier.manager.settings.get(
            'DELAY_ON_EMPTY')
        self._delay_next_call = 0.0
        self.logger = getLogger(
            'frontera.contrib.scrapy.schedulers.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                # extracted requests go to the frontier, not back to the engine
                links.append(element)
            else:
                yield element
        frontier_request = response.meta[b'frontier_request']
        self.frontier.page_crawled(response)  # removed frontier part from .meta
        # putting it back, to persist .meta from original request
        response.meta[b'frontier_request'] = frontier_request
        self.frontier.links_extracted(response.request, links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():

            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(
                key_type=info['key_type'], overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            # when the frontier returned nothing, back off for DELAY_ON_EMPTY seconds
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except Exception:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get(b'redirect_times', 0) > 0

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in six.iteritems(downloader.slots):
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get(
                    'OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
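
A minimal usage sketch (a hypothetical spider, assuming a recent Scrapy): the
spider's initial requests are not redirects, so enqueue_request() routes them
to the frontier as seeds, while requests extracted in parse() are intercepted
by process_spider_output() and handed to the frontier instead of being yielded
back to the Scrapy engine.

import scrapy


class ExampleSpider(scrapy.Spider):
    name = 'example'  # hypothetical, for illustration only

    def start_requests(self):
        # not a redirect -> FronteraScheduler.enqueue_request() calls
        # frontier.add_seeds([request])
        yield scrapy.Request('https://example.com')

    def parse(self, response):
        # these Requests are collected as `links` by process_spider_output()
        # and passed to links_extracted(); items would be yielded through
        for href in response.css('a::attr(href)').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse)
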
Example #2
# Same imports as in Example #1. This older variant passes the extracted
# links to page_crawled() directly instead of calling links_extracted().
class FronteraScheduler(Scheduler):

    def __init__(self, crawler):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings)
        self._delay_on_empty = self.frontier.manager.settings.get('DELAY_ON_EMPTY')
        self._delay_next_call = 0.0
        self.logger = getLogger('frontera.contrib.scrapy.schedulers.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        self.frontier.page_crawled(response=response,
                                   links=links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():

            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info['key_type'], overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except Exception:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get('redirect_times', 0) > 0

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in six.iteritems(downloader.slots):  # works on Python 2 and 3
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get('OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
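
The DELAY_ON_EMPTY logic in _get_next_request() is easy to misread because of
conditional-expression precedence; here is a standalone sketch with made-up
values (5.0 is frontera's documented default, but treat that as an assumption):

from time import time

DELAY_ON_EMPTY = 5.0
requests = []  # what get_next_requests() returned this tick

# parsed as: (time() + DELAY_ON_EMPTY) if not requests else 0.0 -- so an
# empty batch schedules the next frontier poll DELAY_ON_EMPTY seconds out
delay_next_call = time() + DELAY_ON_EMPTY if not requests else 0.0
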
Example #3
# Same imports as in Example #1.
class FronteraScheduler(Scheduler):
    def __init__(self, crawler, manager=None):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        self.redirect_enabled = crawler.settings.get("REDIRECT_ENABLED")
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings, manager)
        self._delay_on_empty = self.frontier.manager.settings.get("DELAY_ON_EMPTY")
        self._delay_next_call = 0.0
        self.logger = getLogger("frontera.contrib.scrapy.schedulers.FronteraScheduler")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        if not self._request_is_redirected(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        elif self.redirect_enabled:
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        frontier_request = response.meta[b"frontier_request"]
        self.frontier.page_crawled(response)  # removed frontier part from .meta
        # putting it back, to persist .meta from original request
        response.meta[b"frontier_request"] = frontier_request
        self.frontier.links_extracted(response.request, links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if (
            not self.frontier.manager.finished
            and len(self) < self.crawler.engine.downloader.total_concurrency
            and self._delay_next_call < time()
        ):

            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(key_type=info["key_type"], overused_keys=info["overused_keys"])
            for request in requests:
                self._add_pending_request(request)
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except Exception:
            return "?"

    def _request_is_redirected(self, request):
        return request.meta.get(b"redirect_times", 0) > 0

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {"key_type": "ip" if downloader.ip_concurrency else "domain", "overused_keys": []}
        for key, slot in six.iteritems(downloader.slots):
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get("OVERUSED_SLOT_FACTOR"):
                info["overused_keys"].append(key)
        return info
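
A standalone sketch of the overused-slot heuristic in _get_downloader_info(),
with hypothetical slot data: a downloader slot key counts as overused once its
active/concurrency ratio exceeds OVERUSED_SLOT_FACTOR (5.0 is frontera's
documented default, assumed here):

OVERUSED_SLOT_FACTOR = 5.0

# hypothetical downloader slots: {key: (active requests, allowed concurrency)}
slots = {"example.com": (12, 2), "another.org": (3, 2)}

overused = [key for key, (active, concurrency) in slots.items()
            if active / float(concurrency) > OVERUSED_SLOT_FACTOR]
print(overused)  # ['example.com'], since 12 / 2 = 6.0 > 5.0
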
Example #4
# Same imports as in Example #1.
class FronteraScheduler(Scheduler):
    """
    Custom Scheduler for Scrapy.
    Adapts Frontera manager interface to Scrapy.

    Important remarks:
     - it doesn't enqueue the majority of requests produced by middlewares or direct
       calls to the engine; see the self.enqueue_request method and override it if needed,
     - requires SchedulerSpiderMiddleware and SchedulerDownloaderMiddleware.
    """
    def __init__(self, crawler, manager=None):
        self.crawler = crawler
        self.stats_manager = StatsManager(crawler.stats)
        self._pending_requests = deque()
        settings = ScrapySettingsAdapter(crawler.settings)
        self.frontier = ScrapyFrontierManager(settings, manager)
        self._delay_on_empty = self.frontier.manager.settings.get(
            'DELAY_ON_EMPTY')
        self._redirect_enabled = crawler.settings.get('REDIRECT_ENABLED')
        self._delay_next_call = 0.0
        self.logger = getLogger(
            'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def enqueue_request(self, request):
        # add directly to in-memory queue if request comes as part of redirect chain from RedirectMiddleware
        if self._redirect_enabled and self._request_is_redirected(request):
            self._add_pending_request(request)
            self.stats_manager.add_redirected_requests()
            return True
        # add as seed if request is explicitly marked as seed
        if self._request_is_seed(request):
            self.frontier.add_seeds([request])
            self.stats_manager.add_seeds()
            return True
        self.logger.warning("Request to URL %s is skipped.", request.url)
        return False

    def next_request(self):
        request = self._get_next_request()
        if request:
            self.stats_manager.add_returned_requests()
        return request

    def process_spider_output(self, response, result, spider):
        links = []
        for element in result:
            if isinstance(element, Request):
                links.append(element)
            else:
                yield element
        frontier_request = response.meta[b'frontier_request']
        self.frontier.page_crawled(response)  # removed frontier part from .meta
        # putting it back, to persist .meta from original request
        response.meta[b'frontier_request'] = frontier_request
        self.frontier.links_extracted(response.request, links)
        self.stats_manager.add_crawled_page(response.status, len(links))

    def process_exception(self, request, exception, spider):
        error_code = self._get_exception_code(exception)
        self.frontier.request_error(request=request, error=error_code)
        self.stats_manager.add_request_error(error_code)

    def open(self, spider):
        self.frontier.set_spider(spider)
        self.logger.info("Starting frontier")
        if not self.frontier.manager.auto_start:
            self.frontier.start()

    def close(self, reason):
        self.logger.info("Finishing frontier (%s)", reason)
        self.frontier.stop()
        self.stats_manager.set_iterations(self.frontier.manager.iteration)
        self.stats_manager.set_pending_requests(len(self))

    def __len__(self):
        return len(self._pending_requests)

    def has_pending_requests(self):
        return not self.frontier.finished()

    def _get_next_request(self):
        if not self.frontier.manager.finished and \
                len(self) < self.crawler.engine.downloader.total_concurrency and \
                self._delay_next_call < time():

            info = self._get_downloader_info()
            requests = self.frontier.get_next_requests(
                key_type=info['key_type'], overused_keys=info['overused_keys'])
            for request in requests:
                self._add_pending_request(request)
            # when the frontier returned nothing, back off for DELAY_ON_EMPTY seconds
            self._delay_next_call = time() + self._delay_on_empty if not requests else 0.0
        return self._get_pending_request()

    def _add_pending_request(self, request):
        return self._pending_requests.append(request)

    def _get_pending_request(self):
        return self._pending_requests.popleft() if self._pending_requests else None

    def _get_exception_code(self, exception):
        try:
            return exception.__class__.__name__
        except Exception:
            return '?'

    def _request_is_redirected(self, request):
        return request.meta.get('redirect_times', 0) > 0

    def _request_is_seed(self, request):
        return bool(request.meta.get('seed', False))

    def _get_downloader_info(self):
        downloader = self.crawler.engine.downloader
        info = {
            'key_type': 'ip' if downloader.ip_concurrency else 'domain',
            'overused_keys': []
        }
        for key, slot in six.iteritems(downloader.slots):
            overused_factor = len(slot.active) / float(slot.concurrency)
            if overused_factor > self.frontier.manager.settings.get(
                    'OVERUSED_SLOT_FACTOR'):
                info['overused_keys'].append(key)
        return info
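
For completeness, a sketch of the Scrapy settings that wire this scheduler in,
following the docstring's remark that both scheduler middlewares are required.
Module paths and priorities follow frontera's documentation; verify them
against your installed version.

# settings.py (sketch)
SCHEDULER = 'frontera.contrib.scrapy.schedulers.frontier.FronteraScheduler'

SPIDER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerSpiderMiddleware': 1000,
}
DOWNLOADER_MIDDLEWARES = {
    'frontera.contrib.scrapy.middlewares.schedulers.SchedulerDownloaderMiddleware': 1000,
}

# With Example #4, only requests explicitly marked as seeds are enqueued:
#     yield scrapy.Request(url, meta={'seed': True})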