def init_request(self):
    urls = self.task.get_exist_url()
    self.urlcount += len(urls)
    for url in urls:
        request = Request(url.url, url.method, url.params, url.referer)
        if self.visited[request] < self.duplicates:
            # Queue only URLs that are not static resources and not yet finished
            if not discard(request.url) and not url.end_time:
                request.id = url.id
                self.pendings.put(request)
                DEBUG("-----request:%s not crawled, adding to queue" % request)
            self.visited[request] += 1
        else:
            #DEBUG("duplicates url:%s" % request)
            pass
    return self.urlcount
def addRequest(self, request):
    """
    0. judge whether to discard, e.g. .css, .png
    1. judge max depth
    2. judge whether duplicate
    3. judge max url count
    """
    if self.visited[request] < self.duplicates:
        self.judgeUrlCount()
        if not discard(request.url):
            request.id = pipeline(request)
            self.pendings.put(request)
            #DEBUG("--*--:%s" % request)
        else:
            # URLs such as .png are recorded via pipeline() but not queued
            pipeline(request)
        self.visited[request] += 1
    else:
        #DEBUG("duplicates url:%s" % request)
        pass
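# ---------------------------------------------------------------------------
# Sketch: both methods above rely on supporting pieces defined elsewhere in
# this project -- a visited counter that yields 0 for never-seen requests, a
# hashable Request (so it can key that counter), and a discard() filter for
# static resources. The stand-ins below are illustrative assumptions (the
# suffix list and the hashed fields are guesses, not the project's actual
# definitions of Request/discard/visited).
import collections
from urllib.parse import urlparse

# Assumed static-resource suffixes; the real discard() may filter more.
STATIC_SUFFIXES = ('.css', '.js', '.png', '.jpg', '.gif', '.ico')

def discard(url):
    """Return True for static-resource URLs that should not be queued."""
    return urlparse(url).path.lower().endswith(STATIC_SUFFIXES)

class Request(object):
    """Minimal stand-in: hashed on (url, method) so visited[] can count repeats."""
    def __init__(self, url, method, params=None, referer=None):
        self.url = url
        self.method = method
        self.params = params
        self.referer = referer
        self.id = None

    def __hash__(self):
        return hash((self.url, self.method))

    def __eq__(self, other):
        return (self.url, self.method) == (other.url, other.method)

    def __str__(self):
        return "%s %s" % (self.method, self.url)

# defaultdict(int) makes `visited[request] < duplicates` valid on first sight:
visited = collections.defaultdict(int)
visited[Request("http://example.com/a", "GET")] += 1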