Example #1
 def _dq(self):
     activef = join(self.dqdir, 'active.json')
     if exists(activef):
         with open(activef) as f:
             prios = json.load(f)
     else:
         prios = ()
     q = PriorityQueue(self._newdq, startprios=prios)
     if q:
         logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                     {'queuesize': len(q)}, extra={'spider': self.spider})
     return q
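For context: _dq() only reads active.json; the file is written when the scheduler shuts down. A sketch of the counterpart close(), modeled on Scrapy schedulers of this vintage (abbreviated, not necessarily the exact source these examples came from):

 def close(self, reason):
     # Persist the list of priorities that still hold requests, so the next
     # run's _dq() can rebuild the disk-backed PriorityQueue from them.
     if self.dqs:
         prios = self.dqs.close()
         with open(join(self.dqdir, 'active.json'), 'w') as f:
             json.dump(prios, f)
     return self.df.close(reason)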
Example #2
 def _dq(self):
     activef = join(self.dqdir, 'active.json')
     if exists(activef):
         with open(activef) as f:
             prios = json.load(f)
     else:
         prios = ()
     q = PriorityQueue(self._newdq, startprios=prios)
     if q:
         log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                 spider=self.spider, queuesize=len(q))
     return q
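Example #2 is the same method written against Scrapy's legacy scrapy.log.msg() API rather than the stdlib logging used in Example #1. Both snippets assume imports along these lines (a sketch; exact module paths depend on the Scrapy version):

 import json
 import logging
 from os.path import join, exists
 from queuelib import PriorityQueue  # the queue container Scrapy's scheduler uses

 logger = logging.getLogger(__name__)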
Example #3
 def __init__(self, crawler, dupefilter, jobdir=None, dqclass=None,
              mqclass=None, logunser=False, stats=None, run_as_daemon=False):
     self.df = dupefilter
     self.dqdir = self._dqdir(jobdir)
     self.dqclass = dqclass
     self.mqclass = mqclass
     self.logunser = logunser
     self.stats = stats
     self.run_as_daemon = run_as_daemon
     self.dqs = None
     self.mqs = PriorityQueue(self._newmq)
     # Scheduler.__init__(self, dupefilter, jobdir, dqclass, mqclass, logunser, stats)
     self.redis_handler = redis_handler('localhost', 6379, 0)
     self.redis_handler.connect_db()
     crawler.signals.connect(self.enqueue, signal=signals.request_scheduled)
 def open(self, spider):
     # Open the scheduler: bind the spider, then instantiate mqs and dqs.
     self.spider = spider

     # PriorityQueue: see the imports. mqs = memory queues; the container is
     # the PriorityQueue implementation Scrapy ships with, and each entry in
     # it is a memory queue (scrapy.squeues.LifoMemoryQueue).
     self.mqs = PriorityQueue(self._newmq)

     # When JOBDIR is set, self.dqdir is a subdirectory of it:
     # self.dqdir = settings['JOBDIR'] + '/requests.queue'
     # _dqdir() creates the directory while resolving its path.
     # self._dq() returns a disk-backed PriorityQueue, loading any state
     # persisted in active.json back into memory.
     self.dqs = self._dq() if self.dqdir else None

     # Delegate to the dupefilter. RFPDupeFilter does not override open();
     # its base class BaseDupeFilter.open() is a bare pass, so with the
     # default scheduler and dupefilter classes this returns None.
     return self.df.open()
 def _dq(self):
     # self.dqdir = settings['JOBDIR'] + '/requests.queue'
     # activef = settings['JOBDIR'] + '/requests.queue' + '/active.json'
     activef = join(self.dqdir, 'active.json')

     # If the file exists, load its JSON contents (the priorities that were
     # still active when the previous run shut down) into prios.
     if exists(activef):
         with open(activef) as f:
             prios = json.load(f)

     # Otherwise prios is an empty tuple.
     else:
         prios = ()

     q = PriorityQueue(self._newdq, startprios=prios)
     if q:
         logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                     {'queuesize': len(q)},
                     extra={'spider': self.spider})
     return q
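The redis_handler class used in __init__ above is not shown in this example. A minimal hypothetical stand-in covering the two calls the snippet makes, redis_handler(host, port, db) and connect_db(), assuming the redis-py client:

 import redis

 class redis_handler:
     # Hypothetical helper; the original class is not part of this example.
     def __init__(self, host, port, db):
         self.host, self.port, self.db = host, port, db
         self.conn = None

     def connect_db(self):
         # redis-py connection; assumes a Redis server reachable at host:port
         self.conn = redis.StrictRedis(host=self.host, port=self.port, db=self.db)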
Example #6
 def open(self, spider):
     self.spider = spider
     self.mqs = PriorityQueue(self._newmq)
     self.dqs = self._dq() if self.dqdir else None
     return self.df.open()
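open() hands the factory methods _newmq and _newdq to PriorityQueue, which calls the factory once per priority level. In Scrapy schedulers of this vintage the factories look roughly like this (a sketch from memory of the upstream source):

 def _newmq(self, priority):
     # one in-memory queue per priority level
     return self.mqclass()

 def _newdq(self, priority):
     # one on-disk queue per priority level, e.g. <JOBDIR>/requests.queue/p0
     return self.dqclass(join(self.dqdir, 'p%s' % priority))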
Example #7
 def __init__(self, settings):
     self.settings = settings
     self.mq_class = load_object(settings['SCHEDULER_MEMORY_QUEUE'])
     self.mqs = PriorityQueue(self.priority)
     self.status = ScheduleStatus()
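queuelib's PriorityQueue pops from the numerically lowest active priority first, which is why Scrapy negates request.priority when pushing. A self-contained sketch, assuming queuelib's FifoMemoryQueue as the per-priority backend:

 from queuelib import PriorityQueue
 from queuelib.queue import FifoMemoryQueue

 pq = PriorityQueue(lambda priority: FifoMemoryQueue())
 pq.push(b'low', priority=0)
 pq.push(b'high', priority=-1)  # the lowest number pops first
 assert pq.pop() == b'high'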
Example #8
 def push(self, request, priority=0):
     slot = request.get('meta', {}).get('scheduler_slot', None)
     if slot not in self.pqueues:
         self.pqueues[slot] = PriorityQueue(self.qfactory)
         self._slots.append(slot)
     self.pqueues[slot].push(request, priority)
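The matching pop() is not shown in this example. A hypothetical counterpart that rotates through the registered slots, so that one slot's backlog cannot starve the others:

 def pop(self):
     # Hypothetical; rotate the slot list so each slot gets a turn.
     for _ in range(len(self._slots)):
         slot = self._slots.pop(0)
         self._slots.append(slot)
         request = self.pqueues[slot].pop()
         if request is not None:
             return request
     return None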