def init():
    """Create the unique indexes the crawler relies on (idempotent).

    A unique index on ``url`` prevents the same page from being stored
    twice in either collection.
    """
    db = tools.getConnectedDB()
    # create_index is a no-op when the index already exists; ensure_index
    # is deprecated (and removed in modern pymongo), so use create_index.
    db.urls.create_index('url', unique=True)
    db.text_info.create_index('url', unique=True)
# encoding=utf8 import sys sys.path.append("..") import threading import time import utils.tools as tools import base.constance as Constance from html_parser.parsers import * from base.collector import Collector from utils.log import log db = tools.getConnectedDB() class PaserControl(threading.Thread): def __init__(self): super(PaserControl, self).__init__() self._collector = Collector() self._urlCount = int(tools.getConfValue("html_parser", "url_count")) self._interval = int(tools.getConfValue("html_parser", "sleep_time")) def run(self): while True: try: urls = self._collector.getUrls(self._urlCount) print("取到的url大小 %d" % len(urls)) # 判断是否结束 if self._collector.isFinished(): log.debug("-------------- 结束 --------------") break
class Collector(threading.Thread, Singleton):
    """Singleton background thread that buffers TODO urls from MongoDB.

    run() periodically appends a fresh batch of TODO urls to the shared
    ``_urls`` list; consumers drain it through getUrls(). ``mylock``
    guards all access to the shared buffer.
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # True stops the run() loop
    _urls = []           # shared url buffer, guarded by mylock
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset urls left in DOING by an interrupted previous run
    # back to TODO so they get picked up again.
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)
    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the buffer until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        # BUG FIX: the original assigned False here, so stop() could never
        # actually terminate the run() loop (the other Collector variants
        # in this file correctly assign True).
        Collector._threadStop = True

    def __inputData(self):
        """Fetch the next batch of TODO urls into the buffer, mark them DOING."""
        # Don't grow the buffer past the configured limit.
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return

        mylock.acquire()  # guard the shared buffer
        # try/finally guarantees the lock is released even if the DB raises;
        # the original would leak the lock and deadlock every consumer.
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))
            if DEBUG:
                # Debug mode: only urls at exactly depth DEPTH.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            elif website == 'all':
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
            else:
                # Restrict to a single website.
                websiteId = tools.getWebsiteId(website)
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "website_id": websiteId,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)

            urlsList = list(urlsList)
            Collector._urls.extend(urlsList)

            # Mark the fetched urls as DOING so the next poll skips them.
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
        finally:
            mylock.release()

    def getUrls(self, count):
        """Pop up to ``count`` urls off the front of the shared buffer."""
        mylock.acquire()
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls
class Collector(threading.Thread, Singleton):
    """Singleton thread that stages TODO urls from MongoDB in a fixed-size
    ring buffer (``_urls``) for consumers to drain via getUrls().

    The ring is tracked with ``_readPos``/``_writePos`` cursors over a list
    of capacity ``_maxSize``; ``mylock`` guards buffer mutation. After
    ``_allowedNullTimes`` consecutive empty polls the collector stops itself
    and triggers the export.
    """

    _db = tools.getConnectedDB()
    _threadStop = False   # set True by stop(); ends run() and marks finished
    _urls = []            # ring-buffer storage (filled up to _maxSize slots)
    _nullTimes = 0        # consecutive polls that found the buffer empty
    _readPos = -1         # index of the last slot handed out
    _writePos = -1        # index of the last slot written
    _maxSize = int(tools.getConfValue("collector", "max_size"))
    _interval = int(tools.getConfValue("collector", "sleep_time"))
    _allowedNullTimes = int(
        tools.getConfValue("collector", 'allowed_null_times'))
    _website = tools.getConfValue("collector", "website")
    _depth = int(tools.getConfValue("collector", "depth"))
    _urlCount = int(tools.getConfValue("collector", "url_count"))

    # On startup, reset tasks left in DOING back to TODO.
    # NOTE(review): the reset itself is commented out in this version; only
    # the timing scaffolding remains.
    beginTime = time.time()
    # _db.urls.update({'status':Constance.DOING}, {'$set':{'status':Constance.TODO}}, multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))
    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the ring buffer until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch the next batch of TODO urls and write them into the ring."""
        log.debug('buffer size %d' % self.getMaxReadSize())
        log.debug('buffer can write size = %d' % self.getMaxWriteSize())
        if self.getMaxWriteSize() == 0:
            # Ring is full; skip this cycle.
            log.debug("collector 已满 size = %d" % self.getMaxReadSize())
            return

        beginTime = time.time()
        # Never fetch more than the ring can currently absorb.
        urlCount = Collector._urlCount if Collector._urlCount <= self.getMaxWriteSize(
        ) else self.getMaxWriteSize()
        if DEBUG:
            # Debug mode: only urls at exactly depth DEPTH.
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": DEPTH
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        elif Collector._website == 'all':
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
        else:
            # Restrict to a single website.
            websiteId = tools.getWebsiteId(Collector._website)
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "website_id": websiteId,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        endTime = time.time()
        urlsList = list(urlsList)
        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        # Store the urls into the ring buffer.
        self.putUrls(urlsList)

        # Mark the fetched urls as DOING so the next poll skips them.
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url, {'$set': {
                'status': Constance.DOING
            }})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        if self.isAllHaveDone():
            self.stop()
            exportData.export()

    def isFinished(self):
        # True once the collector has stopped itself.
        return Collector._threadStop

    def isAllHaveDone(self):
        # Count consecutive empty polls; done after _allowedNullTimes in a row.
        if Collector._urls == []:
            Collector._nullTimes += 1
            if Collector._nullTimes >= Collector._allowedNullTimes:
                return True
        else:
            Collector._nullTimes = 0
        return False

    def getMaxWriteSize(self):
        """Number of free slots in the ring (readPos == writePos means empty)."""
        size = 0
        if Collector._readPos == Collector._writePos:
            size = Collector._maxSize
        elif Collector._readPos < Collector._writePos:
            size = Collector._maxSize - (Collector._writePos -
                                         Collector._readPos)
        else:
            size = Collector._readPos - Collector._writePos
        return size

    def getMaxReadSize(self):
        # Occupied slots = capacity - free slots.
        return Collector._maxSize - self.getMaxWriteSize()

    def putUrls(self, urlsList):
        """Write ``urlsList`` into the ring after _writePos, wrapping at capacity."""
        urlCount = len((urlsList))
        endPos = urlCount + Collector._writePos + 1
        # If the batch runs past the end of the ring, the excess wraps to the
        # front of the list.
        overflowEndPos = endPos - Collector._maxSize  # size of the wrapped part
        inPos = endPos if endPos <= Collector._maxSize else Collector._maxSize
        urlsListCutPos = inPos - Collector._writePos - 1  # non-wrapped count
        beginTime = time.time()
        mylock.acquire()  # guard the shared ring
        Collector._urls[Collector._writePos +
                        1:inPos] = urlsList[:urlsListCutPos]
        if overflowEndPos > 0:
            Collector._urls[:overflowEndPos] = urlsList[urlsListCutPos:]
        mylock.release()
        log.debug('put url time ' + str(time.time() - beginTime) + " size " +
                  str(len(urlsList)))
        # NOTE(review): the write cursor is advanced outside the lock — racy
        # if more than one producer thread ever exists; confirm this class is
        # the single producer.
        Collector._writePos += urlCount
        Collector._writePos %= Collector._maxSize

    @tools.log_function_time
    def getUrls(self, count):
        """Read up to ``count`` urls after _readPos, wrapping at capacity."""
        mylock.acquire()  # guard the shared ring
        urls = []
        # Clamp to what is actually readable right now.
        count = count if count <= self.getMaxReadSize(
        ) else self.getMaxReadSize()
        endPos = Collector._readPos + count + 1
        if endPos > Collector._maxSize:
            # Wraps: take the tail of the ring, then the wrapped head.
            urls.extend(Collector._urls[Collector._readPos + 1:])
            urls.extend(Collector._urls[:endPos % Collector._maxSize])
        else:
            urls.extend(Collector._urls[Collector._readPos + 1:endPos])
        Collector._readPos += len(urls)
        Collector._readPos %= Collector._maxSize
        mylock.release()
        return urls
class Collector(threading.Thread, Singleton):
    """Singleton background thread that buffers TODO urls from MongoDB.

    run() periodically refills the shared ``_urls`` list; consumers drain
    it with getUrls(). When the buffer stays empty for
    ``allowed_null_times`` consecutive polls the collector stops itself
    and triggers the data export.
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # True once stop() is called; checked by run()/isFinished()
    _urls = []           # shared url buffer, guarded by mylock
    _nullTimes = 0       # consecutive refills that found the buffer empty
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset urls left in DOING by an interrupted previous run
    # back to TODO so they get picked up again.
    beginTime = time.time()
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))
    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        # Keep refilling the buffer until stop() flips the flag.
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch the next batch of TODO urls into the buffer, mark them DOING."""
        # Don't grow the buffer past the configured limit.
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return

        mylock.acquire()  # guard the shared buffer
        # BUG FIX: the original never released the lock when the DB calls
        # raised, deadlocking every consumer; try/finally guarantees release.
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))

            beginTime = time.time()
            if DEBUG:
                # Debug mode: only urls at exactly depth DEPTH.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            elif website == 'all':
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
            else:
                # Restrict to a single website.
                websiteId = tools.getWebsiteId(website)
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "website_id": websiteId,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            endTime = time.time()
            urlsList = list(urlsList)
            log.debug('get url time ' + str(endTime - beginTime) + " size " +
                      str(len(urlsList)))

            beginTime = time.time()
            Collector._urls.extend(urlsList)
            log.debug('put get url time ' + str(time.time() - beginTime) +
                      " size " + str(len(urlsList)))

            # Mark the fetched urls as DOING so the next poll skips them.
            beginTime = time.time()
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
            endTime = time.time()
            log.debug('update url time ' + str(endTime - beginTime))

            if self.isAllHaveDone():
                self.stop()
                exportData.export()
        finally:
            mylock.release()

    def isFinished(self):
        # True once the collector has stopped itself (all urls processed).
        return Collector._threadStop

    def isAllHaveDone(self):
        """Count consecutive empty polls; True after allowed_null_times in a row."""
        allowedNullTimes = int(
            tools.getConfValue("collector", 'allowed_null_times'))
        if Collector._urls == []:
            Collector._nullTimes += 1
            if Collector._nullTimes >= allowedNullTimes:
                return True
        else:
            Collector._nullTimes = 0
        return False

    @tools.log_function_time
    def getUrls(self, count):
        """Pop up to ``count`` urls off the front of the shared buffer."""
        mylock.acquire()
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls