Example #1
import redis

# Project-specific helpers and defaults; the import paths below are assumed:
# from .queue import SpiderQueue
# from .dupefilter import RFPDupeFilter
# from .defaults import REDIS_CONFIG, SCHEDULER_PERSIST, QUEUE_KEY, DUPEFILTER_KEY

class Scheduler(object):
    """Redis-based scheduler that holds requests in a local in-memory queue
    until the spider has logged in, then schedules them through Redis."""

    def __init__(self, server, persist, queue_key):
        self.server = server        # redis.Redis connection
        self.persist = persist      # keep queue/dupefilter state between runs
        self.queue_key = queue_key  # Redis key of the shared request queue
        # in-memory queue used while the spider is not yet logged in
        self.own_queue = []

    def __len__(self):
        # self.queue is only created in open(), so len() assumes an open spider
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        redis_config = settings.get('REDIS_CONFIG', REDIS_CONFIG)
        persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        server = redis.Redis(host=redis_config['host'], port=redis_config['port'])
        return cls(server, persist, queue_key)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # drop any leftover state when persistence is disabled
        if not self.persist:
            self.df.clear()
            self.queue.clear()

        # notice if there are requests already in the queue and resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        # nothing to release here; the Redis queue and dupefilter either
        # persist across runs or are cleared the next time open() is called
        pass

    def enqueue_request(self, request):
        # drop duplicates unless the request explicitly disables filtering
        if not request.dont_filter and self.df.request_seen(request):
            return
        if self.spider.logined:
            # after login, push to the shared Redis queue
            self.queue.push(request)
        else:
            # before login, hold the request in the local in-memory queue
            self.own_queue.append(request)

    def next_request(self):
        if self.spider.logined:
            return self.queue.pop()
        if self.own_queue:
            # drain the in-memory queue while the spider is still logging in
            return self.own_queue.pop()

    def has_pending_requests(self):
        if self.spider.logined:
            return len(self) > 0
        return len(self.own_queue) > 0
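
For context, here is a minimal sketch of how a scheduler like this could be wired into a Scrapy project. The module path myproject.scheduler, the concrete setting values, and the way the spider flips its logined flag are illustrative assumptions, not part of the example above; only the setting names REDIS_CONFIG, SCHEDULER_PERSIST, and SCHEDULER_QUEUE_KEY are taken from from_settings.

# settings.py (sketch; module path and values are assumptions)
SCHEDULER = 'myproject.scheduler.Scheduler'
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_KEY = 'myproject:requests'
REDIS_CONFIG = {'host': 'localhost', 'port': 6379}

# spider sketch: requests stay in the scheduler's own_queue until logined is True
import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    logined = False
    start_urls = ['https://example.com/login']

    def parse(self, response):
        # ... submit the login form here, then mark the session as logged in
        self.logined = True
        yield scrapy.Request('https://example.com/data', callback=self.parse_data)

    def parse_data(self, response):
        pass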