def findPendingUrls4JinghuaByStatusAndSpiderName(self, spiderName):
    """Fetch pending elite-travelogue ("Jinghua") urls: only the urls, no item download.

    Selects priority-1 url documents with 300 <= status < 1001 from the
    spider's url collection, ordered by priority.
    """
    pending_filter = {"status": {"$gte": 300, "$lt": 1001}, "priority": 1}
    return mongoApt.find(self.urlDbnamekey,
                         self.urlCollectionsMap[spiderName],
                         whereJson=pending_filter,
                         sortField='priority')
def findUrlsForDupfilter(self, spiderName):
    """Load url records (status < 400) whose md5 values seed the duplicate filter."""
    return mongoApt.find(self.urlDbnamekey,
                         self.urlCollectionsMap[spiderName],
                         whereJson={"status": {"$lt": 400}},
                         sortField='status')
def findUnparsedPageByStatus(self, spiderName):
    """Return a cursor over 'Page' documents awaiting parsing (0 < status < 200)."""
    unparsed = {'status': {'$lt': 200, '$gt': 0}}
    return mongoApt.find(spiderName, 'Page', whereJson=unparsed)
def findPendingUrlsByStatusAndSpiderName(self, spiderName, statusBegin=400, statusEnd=800):
    """Urls not yet downloaded or that failed (statusBegin <= status < statusEnd).

    Used to recover the corresponding spider; results come back ordered
    by priority.
    """
    status_range = {"$gte": statusBegin, "$lt": statusEnd}
    return mongoApt.find(self.urlDbnamekey,
                         self.urlCollectionsMap[spiderName],
                         whereJson={"status": status_range},
                         sortField='priority')
def getRequestsToSupplyPendingreqeust(self, spiderName):
    """Pull fresh urls (status == 1000) from the db and build requests to refill pendingRequest.

    :return: list of at most ``self.urlIncreasement`` requests built by
        ``self.makeRequest``, taken in priority order.
    """
    cursor = mongoApt.find(self.urlDbnamekey,
                           self.urlCollectionsMap[spiderName],
                           whereJson={"status": 1000},
                           sortField='priority',
                           limitNum=self.urlIncreasement)
    requests = []
    # limitNum already caps the cursor; the counter is a second, defensive cap.
    for taken, doc in enumerate(cursor, start=1):
        requests.append(self.makeRequest(doc["url"],
                                         callBackFunctionName=doc["callBack"],
                                         urlId=doc['_id'],
                                         priority=doc["priority"]))
        if taken >= self.urlIncreasement:
            break
    return requests
def getRequestWithUpdateStrategy(self, spiderName):
    """Build re-crawl requests for already-downloaded urls per the update strategy.

    Selects urls with status < 400 that carry an ``updateInterval`` field;
    a url is re-requested once ``updateInterval`` days have passed since its
    last crawl time (``dateTime``) and its last status was 200 or 304.  Each
    request carries an ``If-Modified-Since`` header so the server can answer
    304 Not Modified when the page is unchanged.

    :param spiderName: key into ``self.urlCollectionsMap`` selecting the url collection
    :return: list of requests built by ``self.makeRequest``
    """
    whereJson = {
        "status": {"$lt": 400},
        "spiderName": spiderName,
        'updateInterval': {'$exists': True}
    }
    cursor = mongoApt.find(self.urlDbnamekey,
                           self.urlCollectionsMap[spiderName],
                           whereJson=whereJson,
                           sortField='status')
    requests = []
    for p in cursor:
        # The 'updateInterval' membership re-check is defensive — the query
        # already demands the field exists.  Due when:
        #   now - updateInterval days > last crawl time.
        if 'updateInterval' in p and p['status'] in [200, 304] \
                and datetime.datetime.now() - datetime.timedelta(days=p["updateInterval"]) > p["dateTime"]:
            meta = {}
            headers = {}
            if 'reference' in p:
                meta['reference'] = p['reference']
            # NOTE(review): self.updateStrategy is used as a document *key* —
            # presumably a field name (e.g. a last-modified marker); confirm.
            if self.updateStrategy in p:
                meta[self.updateStrategy] = p[self.updateStrategy]
            # Conditional-GET header derived from the last crawl time.
            headers['If-Modified-Since'] = self.getGMTFormatDate(p['dateTime'])
            req = self.makeRequest(p["url"],
                                   callBackFunctionName=p["callBack"],
                                   meta=meta,
                                   urlId=p['_id'],
                                   priority=p["priority"],
                                   headers=headers)
            requests.append(req)
    return requests
def findKerwordsForSespider(self):
    """Load the search keywords used by the search-engine (Se) spider."""
    return mongoApt.find(self.urlDbnamekey, 'keyword')