Ejemplo n.º 1
0
class Scheduler(object):
  """Redis-based scheduler"""

  def __init__(self, redis, persist, queue_key):
    self.server = redis
    self.persist = persist
    self.queue_key = queue_key
    # in-memory queue
    self.own_queue = []

  def __len__(self):
    return len(self.queue)

  @classmethod
  def from_settings(cls, settings):
    host = settings.get('REDIS_HOST', REDIS_HOST)
    port = settings.get('REDIS_PORT', REDIS_PORT)
    persist = settings.get('SCHEDULER_PERSIST', SCHEDULER_PERSIST)
    queue_key = settings.get('SCHEDULER_QUEUE_KEY', QUEUE_KEY)
    server = redis.Redis(host, port)
    return cls(server, persist, queue_key)

  @classmethod
  def from_crawler(cls, crawler):
    settings = crawler.settings
    return cls.from_settings(settings)

  def open(self, spider):
    self.spider = spider
    self.queue = SpiderQueue(self.server, spider, self.queue_key)
    self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
    # notice if there are requests already in the queue
    if not self.persist:
      self.df.clear()
      self.queue.clear()

    if len(self.queue):
      spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))

  def close(self, reason):
    pass

  def enqueue_request(self, request):
    if not request.dont_filter and self.df.request_seen(request):
      return
    if self.spider.logined:
      self.queue.push(request)
    else:
      self.own_queue.append(request)

  def next_request(self):
    if self.spider.logined:
      return self.queue.pop()
    if len(self.own_queue) > 0:
      return self.own_queue.pop()

  def has_pending_requests(self):
    if self.spider.logined:
      return len(self) > 0
    return len(self.own_queue)
Ejemplo n.º 2
0
  def open(self, spider):
    self.spider = spider
    self.queue = SpiderQueue(self.server, spider, self.queue_key)
    self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
    # notice if there are requests already in the queue
    if not self.persist:
      self.df.clear()
      self.queue.clear()

    if len(self.queue):
      spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
Ejemplo n.º 3
0
  def open(self, spider):
    self.spider = spider
    self.queue = SpiderQueue(self.server, spider, self.queue_key)
    self.df = RFPDupeFilter(self.server, DUPEFILTER_KEY % {'spider': spider.name})
    # notice if there are requests already in the queue
    if not self.persist:
      self.df.clear()
      self.queue.clear()

    if len(self.queue):
      spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))