def _dq(self):
    """Build the disk-backed priority queue, restoring persisted state.

    Reads ``active.json`` from the job directory (if present) to recover
    the priority levels that were active when the previous crawl stopped,
    and seeds a new ``PriorityQueue`` of disk queues with them.  Logs a
    resume message when the restored queue is non-empty.
    """
    state_file = join(self.dqdir, 'active.json')
    prios = ()
    if exists(state_file):
        with open(state_file) as fp:
            prios = json.load(fp)
    queue = PriorityQueue(self._newdq, startprios=prios)
    if queue:
        # A non-empty queue means we are resuming an interrupted crawl.
        logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                    {'queuesize': len(queue)}, extra={'spider': self.spider})
    return queue
def _dq(self):
    """Build the disk-backed priority queue, restoring persisted state.

    The previous run's active priorities are loaded from ``active.json``
    under the job directory; when that file is absent the queue starts
    empty.  Uses the legacy ``log.msg`` API to announce a resumed crawl.
    """
    state_path = join(self.dqdir, 'active.json')
    if not exists(state_path):
        prios = ()
    else:
        with open(state_path) as fp:
            prios = json.load(fp)
    q = PriorityQueue(self._newdq, startprios=prios)
    if q:
        log.msg(format="Resuming crawl (%(queuesize)d requests scheduled)",
                spider=self.spider, queuesize=len(q))
    return q
def __init__(self, crawler, dupefilter, jobdir=None, dqclass=None,
             mqclass=None, logunser=False, stats=None, run_as_daemon=False,
             redis_host='localhost', redis_port=6379, redis_db=0):
    """Scheduler that mirrors scheduled requests into Redis.

    Parameters
    ----------
    crawler : crawler whose ``request_scheduled`` signal we subscribe to.
    dupefilter : request duplicate filter (stored as ``self.df``).
    jobdir : optional job directory; passed to ``self._dqdir`` to derive
        the disk-queue directory (presumably ``jobdir/requests.queue`` —
        confirm against ``_dqdir``).
    dqclass / mqclass : disk- and memory-queue classes.
    logunser : whether to log unserializable requests.
    stats : stats collector.
    run_as_daemon : daemon-mode flag consumed elsewhere.
    redis_host / redis_port / redis_db : Redis connection parameters.
        Previously hard-coded to ``'localhost', 6379, 0``; now keyword
        arguments with those same defaults, so existing callers are
        unaffected.
    """
    self.df = dupefilter
    self.dqdir = self._dqdir(jobdir)
    self.dqclass = dqclass
    self.mqclass = mqclass
    self.logunser = logunser
    self.stats = stats
    self.run_as_daemon = run_as_daemon
    self.dqs = None
    self.mqs = PriorityQueue(self._newmq)
    self.redis_handler = redis_handler(redis_host, redis_port, redis_db)
    self.redis_handler.connect_db()
    # Mirror every request the engine schedules into Redis via self.enqueue.
    crawler.signals.connect(self.enqueue, signal=signals.request_scheduled)
def open(self, spider):
    """Bind *spider* to this scheduler and set up its queues.

    Creates the in-memory priority queue (one memory queue per priority
    level), resumes the disk-backed queue when a job directory was
    configured (``self.dqdir`` set from JOBDIR), and finally delegates
    to the dupefilter's ``open()`` — which, for the default
    RFPDupeFilter, is inherited from BaseDupeFilter and simply returns
    None.
    """
    self.spider = spider
    # Memory queues are always available.
    self.mqs = PriorityQueue(self._newmq)
    # Disk queues only exist when persistence was requested; _dq() also
    # reloads any state saved by a previous run.
    self.dqs = self._dq() if self.dqdir else None
    return self.df.open()
def _dq(self):
    """Create the disk-backed priority queue, reloading saved state.

    ``self.dqdir`` is the request-queue directory derived from JOBDIR.
    ``active.json`` inside it (written by a previous run) holds the
    priority levels that were active; when the file is missing the
    queue is started with an empty tuple of priorities.
    """
    active_path = join(self.dqdir, 'active.json')
    if exists(active_path):
        with open(active_path) as state:
            startprios = json.load(state)
    else:
        startprios = ()
    dq = PriorityQueue(self._newdq, startprios=startprios)
    if dq:
        # Requests survived from a previous run: announce the resume.
        logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                    {'queuesize': len(dq)}, extra={'spider': self.spider})
    return dq
def open(self, spider):
    """Attach *spider* and initialize the memory/disk queues.

    Returns whatever the dupefilter's ``open()`` returns (None for the
    default BaseDupeFilter implementation).
    """
    self.spider = spider
    self.mqs = PriorityQueue(self._newmq)
    if self.dqdir:
        # Persistence enabled: rebuild the disk queue (restores state).
        self.dqs = self._dq()
    else:
        self.dqs = None
    return self.df.open()
def __init__(self, settings):
    """Create a scheduler configured from *settings*.

    Resolves the memory-queue class named by the
    ``SCHEDULER_MEMORY_QUEUE`` setting, builds the priority queue keyed
    by ``self.priority``, and creates a fresh ``ScheduleStatus``.
    """
    self.settings = settings
    # Dotted path -> class object for the memory queue implementation.
    mq_path = settings['SCHEDULER_MEMORY_QUEUE']
    self.mq_class = load_object(mq_path)
    self.mqs = PriorityQueue(self.priority)
    self.status = ScheduleStatus()
def push(self, request, priority=0):
    """Enqueue *request* on the priority queue for its scheduler slot.

    The slot is taken from ``request['meta']['scheduler_slot']`` (None
    when absent).  A queue for a previously unseen slot is created
    lazily and the slot is recorded in ``self._slots``.
    """
    slot = request.get('meta', {}).get('scheduler_slot')
    queue = self.pqueues.get(slot)
    if queue is None:
        # First request for this slot: create its queue and register it.
        queue = self.pqueues[slot] = PriorityQueue(self.qfactory)
        self._slots.append(slot)
    queue.push(request, priority)