Example 1
import threading
import time
import Queue  # Python 2 standard-library queue module

# UrlModel, Downloader, Parser, Storage, ThreadPool, logger, timestamp and
# PRINT_TIME_INTERVAL are assumed to be provided by the crawler's other modules.

class Scheduler(object):
    '''Initialize and start all crawler modules, and dispatch URLs to the downloader modules.'''
    def __init__(self, dbName, threadNum, logLevel, startUrls, depth, keyword, downloadMode):
        self.__threadNum = threadNum
        self.__startUrls = startUrls
        self.__depth = depth
        self.__keyword = keyword
        self.__downloadMode = downloadMode
        self.__dbName = dbName
        self.__logLevel = logLevel
        
        self.__exitEvent = threading.Event()
        # The URL queue holds URL nodes waiting to be downloaded
        self.__urlQueue = Queue.Queue()
        # The HTML queue holds downloaded HTML nodes waiting to be parsed
        self.__htmlQueue = Queue.Queue()
        # The data queue holds parsed HTML nodes that qualify for storage in the database
        self.__dataQueue = Queue.Queue()
        # One download queue per downloader thread, filled by the main loop
        self.__downloadQueueList = []
        # Create the thread pool: threadNum downloaders plus the parser and storage workers
        self.__threadPool = ThreadPool(threadNum + 2)
        # Intended to indicate whether any downloader is still fetching;
        # checked together with the queue sizes before shutting down
        self.__downloadingFlag = 0


    def __initUrlQueue(self, urlList):
        '''Wrap raw URLs in the internal UrlModel data format.'''
        for url in urlList:
            urlNode = UrlModel(url, '', timestamp(), 0)
            self.__urlQueue.put(urlNode)


    def start(self):
        '''Create and start each module, then run the dispatch loop.'''
        logger.debug('Init start urls...')
        self.__initUrlQueue(self.__startUrls)
        
        # Start threadNum downloaders and assign each one its own download queue
        logger.debug('Put downloader to thread pool...')
        for i in range(self.__threadNum):
            dlQueue = Queue.Queue()
            self.__downloadQueueList.append(dlQueue)
            downloadReq = Downloader(dlQueue, self.__downloadMode, self.__htmlQueue, self.__exitEvent, self.__downloadingFlag)
            self.__threadPool.putRequest(downloadReq)

        # Create the parser module and add it to the thread pool
        logger.debug('Put parser to thread pool...')
        parserReq = Parser(self.__depth, self.__startUrls, self.__keyword, self.__htmlQueue, self.__dataQueue, self.__urlQueue, self.__exitEvent)
        self.__threadPool.putRequest(parserReq)

        # Create the storage module and add it to the thread pool
        logger.debug('Put storage to thread pool...')
        storageReq = Storage(self.__dbName, self.__dataQueue, self.__exitEvent)
        self.__threadPool.putRequest(storageReq)

        # The main loop dispatches URLs to the download queues and logs status
        logger.debug('start main loop...')
        lastTime = time.time()
        while True:
            for dlQueue in self.__downloadQueueList:
                if self.__urlQueue.qsize() > 0 and dlQueue.qsize() < 1:
                    node = self.__urlQueue.get()
                    dlQueue.put(node)

            now = time.time()
            if now - lastTime > PRINT_TIME_INTERVAL:
                logger.info('URL QUEUE SIZE : %d', self.__urlQueue.qsize())
                logger.info('HTML QUEUE SIZE : %d', self.__htmlQueue.qsize())
                logger.info('DATA QUEUE SIZE : %d', self.__dataQueue.qsize())
                logger.info('REPEAT SET SIZE : %d', parserReq.getRepeatSetSize())
                # Delay the exit check so the program does not quit right after startup
                if now - lastTime > 30:
                    if self.__urlQueue.qsize() < 1 and self.__htmlQueue.qsize() < 1 and \
                            self.__dataQueue.qsize() < 1 and self.__downloadingFlag < 1:
                        self.__exitEvent.set()
                        self.__threadPool.close(True)
                        return
                lastTime = now
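
A minimal usage sketch (not part of the original listing). All constructor
arguments below are hypothetical placeholders; only the parameter names and
their order follow the __init__ signature above, and the accepted values for
logLevel and downloadMode depend on the rest of the project.

if __name__ == '__main__':
    scheduler = Scheduler(
        dbName='crawler.db',                    # hypothetical database name
        threadNum=4,                            # number of downloader threads
        logLevel='DEBUG',                       # hypothetical log-level value
        startUrls=['http://example.com/'],      # seed URLs for the crawl
        depth=2,                                # maximum crawl depth
        keyword='python',                       # keyword the parser filters on
        downloadMode='html',                    # hypothetical download-mode value
    )
    # start() blocks in the main dispatch loop until every queue drains and
    # the exit event fires, then closes the thread pool and returns.
    scheduler.start()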