def open(self, spider):
    """Attach this scheduler to *spider*: build its Redis-backed queue and
    duplicate filter, and log if a previous crawl is being resumed."""
    self.spider = spider
    self.queue = SpiderQueue(self.server, spider, self.queue_key)
    df_key = DUPEFILTER_KEY % {'spider': spider.name}
    self.df = RFPDupeFilter(self.server, df_key)
    # notice if there are requests already in the queue
    pending = len(self.queue)
    if pending:
        spider.log("Resuming crawl (%d requests scheduled)" % pending)
class Scheduler(object):
    """Redis-based scheduler.

    Pending requests live in a Redis-backed spider queue and duplicates are
    filtered through a Redis-backed ``RFPDupeFilter``, so a crawl can be
    shared between processes and resumed after a shutdown (when ``persist``
    is true).
    """

    def __init__(self, server, persist, queue_key):
        """
        Parameters
        ----------
        server : redis.Redis
            Redis connection shared by the queue and the dupefilter.
        persist : bool
            If true, keep queue/dupefilter state in Redis on close so the
            crawl can be resumed later.
        queue_key : str
            Key (template) under which the request queue is stored.
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key

    def __len__(self):
        # Number of requests still queued; valid only after open().
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a Scheduler from a settings object.

        Restored from previously commented-out code so the construction
        logic lives in one place; ``from_crawler`` delegates here.
        """
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key)

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the scheduler from the crawler settings."""
        return cls.from_settings(crawler.settings)

    def open(self, spider):
        """Called when a spider opens: create its queue and dupefilter."""
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
        # notice if there are requests already in the queue
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        """Drop the Redis state unless persistence was requested."""
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        """Queue *request*, unless it is filtered out as a duplicate."""
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return the next request to crawl, popped from the Redis queue."""
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
class Scheduler(object):
    """Redis-based scheduler with an optional duplicate filter.

    When ``dupe_filter`` is falsy, no ``RFPDupeFilter`` is created and every
    request (barring ``dont_filter`` handling) is queued.
    """

    def __init__(self, server, persist, queue_key, dupe_filter):
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.dupe_filter = dupe_filter

    def __len__(self):
        return len(self.queue)

    @classmethod
    def from_crawler(cls, crawler):
        # Pull connection and behaviour options out of the crawler settings.
        cfg = crawler.settings
        redis_host = cfg.get('REDIS_HOST', REDIS_HOST)
        redis_port = cfg.get('REDIS_PORT', REDIS_PORT)
        keep_state = cfg.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
        key = cfg.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
        use_filter = cfg.get('DUPE_FILTER', True)
        connection = redis.Redis(redis_host, redis_port)
        return cls(connection, keep_state, key, use_filter)

    def open(self, spider):
        self.spider = spider
        self.queue = SpiderQueue(self.server, spider, self.queue_key)
        if self.dupe_filter:
            self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
        # notice if there are requests already in the queue
        pending = len(self.queue)
        if pending:
            spider.log("Resuming crawl (%d requests scheduled)" % pending)

    def close(self, reason):
        if self.persist:
            return
        if self.dupe_filter:
            self.df.clear()
        self.queue.clear()

    def enqueue_request(self, request):
        filtering = not request.dont_filter and self.dupe_filter
        if filtering and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0
def open(self, spider):
    """Prepare the scheduler for *spider*.

    Creates the spider's Redis queue and duplicate filter; if the queue
    already holds requests, the crawl is a resumption and that is logged.
    """
    self.spider = spider
    queue = SpiderQueue(self.server, spider, self.queue_key)
    self.queue = queue
    self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {"spider": spider.name})
    # notice if there are requests already in the queue
    if len(queue) > 0:
        spider.log("Resuming crawl (%d requests scheduled)" % len(queue))
class Scheduler(object):
    """Redis-based scheduler.

    The queue implementation is configurable via ``SCHEDULER_QUEUE_CLASS``
    and the dupefilter key via ``DUPEFILTER_KEY``.
    """

    def __init__(self, server, persist, queue_key, queue_cls, dupefilter_key):
        """Initialize scheduler.

        Parameters
        ----------
        server : Redis instance
        persist : bool
            Keep queue/dupefilter state in Redis on close so the crawl can
            be resumed.
        queue_key : str
        queue_cls : queue class
            Callable invoked as ``queue_cls(server, spider, queue_key)``.
        dupefilter_key : str
            ``%``-template keyed by ``spider`` for the dupefilter's Redis key.
        """
        self.server = server
        self.persist = persist
        self.queue_key = queue_key
        self.queue_cls = queue_cls
        self.dupefilter_key = dupefilter_key

    def __len__(self):
        # Number of requests still queued; valid only after open().
        return len(self.queue)

    @classmethod
    def from_settings(cls, settings):
        """Build a Scheduler from a settings object."""
        host = settings.get("REDIS_HOST", REDIS_HOST)
        port = settings.get("REDIS_PORT", REDIS_PORT)
        persist = settings.get("SCHEDULER_PERSIST", SCHEDULER_PERSIST)
        queue_key = settings.get("SCHEDULER_QUEUE_KEY", QUEUE_KEY)
        queue_cls = load_object(settings.get("SCHEDULER_QUEUE_CLASS", QUEUE_CLASS))
        dupefilter_key = settings.get("DUPEFILTER_KEY", DUPEFILTER_KEY)
        server = redis.Redis(host, port)
        return cls(server, persist, queue_key, queue_cls, dupefilter_key)

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the scheduler from the crawler settings."""
        settings = crawler.settings
        return cls.from_settings(settings)

    def open(self, spider):
        """Called when a spider opens: create its queue and dupefilter."""
        self.spider = spider
        # BUG FIX: instantiate the configured queue class. The original
        # hard-coded SpiderQueue here, which silently ignored the
        # SCHEDULER_QUEUE_CLASS setting loaded in from_settings().
        self.queue = self.queue_cls(self.server, spider, self.queue_key)
        self.df = RFPDupeFilter(self.server, self.dupefilter_key % {"spider": spider.name})
        # notice if there are requests already in the queue to resume the crawl
        if len(self.queue):
            spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

    def close(self, reason):
        """Drop the Redis state unless persistence was requested."""
        if not self.persist:
            self.df.clear()
            self.queue.clear()

    def enqueue_request(self, request):
        """Queue *request*, unless it is filtered out as a duplicate."""
        if not request.dont_filter and self.df.request_seen(request):
            return
        self.queue.push(request)

    def next_request(self):
        """Return the next request to crawl, popped from the Redis queue."""
        return self.queue.pop()

    def has_pending_requests(self):
        return len(self) > 0