Esempio n. 1
0
 def open(self, spider):
     """Attach the scheduler to *spider* and create its queue and dupe filter.

     Logs a message through ``spider.log`` when the freshly created queue
     already reports a non-zero length.
     """
     self.spider = spider
     self.queue = SpiderQueue(self.server, spider, self.queue_key)

     # The dupe filter key is the module-level template filled in with the
     # spider's name.
     dupefilter_key = DUPEFILTER_KEY % {'spider': spider.name}
     self.df = RFPDupeFilter(self.server, dupefilter_key)

     # Nothing to report when the queue starts out empty.
     if not len(self.queue):
         return
     spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Esempio n. 2
0
class Scheduler(object):
    """Scheduler that keeps its request queue and dupe filter in Redis.

    ``queue`` and ``df`` only exist once :meth:`open` has run; the other
    methods rely on them being present.
    """

    def __init__(self, server, persist, queue_key):
        """Remember the Redis client and the scheduler options.

        server    -- Redis client handed to the queue and the dupe filter
        persist   -- when false, :meth:`close` clears queue and dupe filter
        queue_key -- key passed through to ``SpiderQueue``
        """
        self.queue_key = queue_key
        self.persist = persist
        self.server = server

    def __len__(self):
        """Return the length reported by the request queue."""
        size = len(self.queue)
        return size

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from ``crawler.settings``.

        Each option falls back to the module-level default of the same
        purpose when the setting is absent.
        """
        conf = crawler.settings
        redis_host = conf.get("REDIS_HOST", REDIS_HOST)
        redis_port = conf.get("REDIS_PORT", REDIS_PORT)
        keep_state = conf.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        key = conf.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        client = redis.Redis(redis_host, redis_port)
        return cls(client, keep_state, key)

    def open(self, spider):
        """Attach to *spider* and create its queue and dupe filter."""
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)

        dupefilter_key = DUPEFILTER_KEY % {"spider": spider.name}
        self.df = RFPDupeFilter(self.server, dupefilter_key)

        # Nothing to report when the queue starts out empty.
        if not len(self.queue):
            return
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        """Clear the dupe filter and the queue unless ``persist`` is set."""
        if self.persist:
            return
        self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        """Push *request* unless the dupe filter reports it as already seen.

        Requests flagged ``dont_filter`` skip the dupe filter entirely.
        """
        if request.dont_filter or not self.df.request_seen(request):
            self.queue.push(request)

    def next_request(self):
        """Return whatever the queue pops next."""
        upcoming = self.queue.pop()
        return upcoming

    def has_pending_requests(self):
        """Return True when the scheduler's length is greater than zero."""
        pending = len(self)
        return pending > 0
Esempio n. 3
0
class Scheduler(object):
    """Redis-based scheduler with an optional duplicate filter.

    ``queue`` (and ``df`` when the dupe filter is enabled) are created in
    :meth:`open`; the other methods rely on them being present.
    """

    def __init__(self, server, persist, queue_key, dupe_filter):
        """Store the scheduler configuration.

        server      -- Redis client passed to the queue and the dupe filter
        persist     -- when false, :meth:`close` clears the stored state
        queue_key   -- key passed through to ``SpiderQueue``
        dupe_filter -- when false, no dupe filter is created or consulted
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.dupe_filter = dupe_filter

    def __len__(self):
        """Return the length reported by the request queue."""
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from ``crawler.settings``.

        Missing settings fall back to the module-level defaults;
        ``DUPE_FILTER`` defaults to ``True``.
        """
        # NOTE: this class is indented with spaces only. Mixing tabs and
        # spaces in one block is rejected by Python 3 with a TabError.
        settings = crawler.settings
        host = settings.get('REDIS_HOST', REDIS_HOST)
        port = settings.get('REDIS_PORT', REDIS_PORT)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        dupe_filter = settings.get('DUPE_FILTER', True)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, dupe_filter)

    def open(self, spider):
        """Attach to *spider*, create the queue and (optionally) the filter."""
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        # ``self.df`` only exists when the dupe filter is enabled; every
        # other use of it is guarded by the same flag.
        if self.dupe_filter:
            self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        """Clear the dupe filter (if any) and the queue unless persisting."""
        if not self.persist:
            if self.dupe_filter:
                self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        """Push *request* unless the dupe filter reports it as already seen.

        The filter is skipped for ``dont_filter`` requests and when the
        scheduler was configured without a dupe filter.
        """
        if not request.dont_filter and self.dupe_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return whatever the queue pops next."""
        return self.queue.pop()

    def has_pending_requests(self):
        """Return True when the scheduler's length is greater than zero."""
        return len(self) > 0
Esempio n. 4
0
 def open(self, spider):
     """Attach the scheduler to *spider* and create its queue and dupe filter.

     Logs a message through ``spider.log`` when the freshly created queue
     already reports a non-zero length.
     """
     self.spider = spider
     self.queue = SpiderQueue(self.server, spider, self.queue_key)

     # The dupe filter key is the module-level template filled in with the
     # spider's name.
     dupefilter_key = DUPEFILTER_KEY % {"spider": spider.name}
     self.df = RFPDupeFilter(self.server, dupefilter_key)

     # Nothing to report when the queue starts out empty.
     if not len(self.queue):
         return
     spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Esempio n. 5
0
class Scheduler(object):
    """Redis-based scheduler with a configurable queue class.

    ``queue`` and ``df`` are created in :meth:`open`; the other methods
    rely on them being present.
    """

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
            Passed to the queue and to the dupe filter.
        persist : bool
            When false, :meth:`close` clears the queue and the dupe filter.
        queue_key : str
            Key passed through to the queue class.
        queue_cls : queue class
            Called as ``queue_cls(server, spider, queue_key)`` in :meth:`open`.
        dupefilter_key : str
            Template formatted with ``{"spider": spider.name}`` to build the
            dupe filter key.
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key

    def __len__(self):
        """Return the length reported by the request queue."""
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a scheduler from a settings object.

        Missing settings fall back to the module-level defaults. The queue
        class is given as a path and resolved with ``load_object``.
        """
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        queue_cls = load_object(settings.get("SCHEDULER_QUEUE_CLASS", QUEUE_CLASS))
        dupefilter_key = settings.get("DUPEFILTER_KEY", DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        """Build a scheduler from ``crawler.settings``."""
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        """Attach to *spider* and create its queue and dupe filter."""
        self.spider = spider
        # Use the configured queue class; a hard-coded queue type here would
        # silently ignore the SCHEDULER_QUEUE_CLASS setting.
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {"spider": spider.name})
        # notice if there are requests already in the queue to resume the crawl
        # (read the length once so the check and the logged count agree)
        pending = len(self.queue)
        if pending:
            spider.log("Resuming crawl (%d requests scheduled)" % pending)

    def close(self, reason):
        """Clear the dupe filter and the queue unless ``persist`` is set."""
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        """Push *request* unless the dupe filter reports it as already seen.

        Requests flagged ``dont_filter`` skip the dupe filter entirely.
        """
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return whatever the queue pops next."""
        return self.queue.pop()

    def has_pending_requests(self):
        """Return True when the scheduler's length is greater than zero."""
        return len(self) > 0