class Spider(object):
    """Crawler driver: feeds URL jobs into a ThreadPool and waits for them.

    Collaborators (defined elsewhere in the project — not visible here):
    ``logger`` (module-level logging.Logger), ``ThreadPool`` (job queue
    backed by ``dbfile``), ``time`` (stdlib), and the job callable
    ``knowquiz.modules.get_my_blog``.
    """

    def __init__(self, thread_num, logfile, debug_level, dbfile, keyword, degree):
        """Store crawl configuration.

        :param thread_num: number of worker threads for the pool
        :param logfile: log file path (stored only; not used in this class)
        :param debug_level: logging verbosity (stored only; not used here)
        :param dbfile: database file handed to ThreadPool
        :param keyword: search keyword passed to each job
        :param degree: crawl depth/degree passed to each job
        """
        self.urls = []  # NOTE(review): never read in this class — presumably used elsewhere
        self.logfile = logfile
        self.debug_level = debug_level
        self.dbfile = dbfile
        self.thread_num = thread_num
        self.keyword = keyword
        self.degree = degree

    def run(self, urls):
        """Queue one ``get_my_blog`` job per URL, then block until the pool drains.

        Polls ``check_job()`` every 5 seconds; a KeyboardInterrupt during the
        wait asks the pool to stop its jobs.

        :param urls: iterable of URL strings to crawl
        """
        logger.info("开始运行爬虫程序")
        self.thread_pool = ThreadPool(self.dbfile, self.thread_num)
        # Hoisted out of the loop: re-importing per URL was redundant
        # (Python caches modules, but the lookup still ran every iteration).
        # import spider template
        from knowquiz.modules import get_my_blog
        for url in urls:
            # Lazy %-args: message is only formatted if the level is enabled.
            logger.info("添加任务 %s 到队列中", get_my_blog.__name__)
            args = (url, self.keyword, self.degree)
            self.thread_pool.add_job(get_my_blog, args)
        while self.thread_pool.check_job() > 0:
            try:
                logger.info("当前可用任务 %d", self.thread_pool.check_job())
                time.sleep(5)
            except KeyboardInterrupt:
                # User interrupt: ask the pool to stop; the while condition
                # re-checks the remaining job count afterwards.
                self.thread_pool.stop_job()

    def quit(self):
        """Shut the crawler down, stopping all pool workers if a pool exists."""
        logger.warning("退出程序")  # fix: logger.warn is deprecated since Python 3.3
        # Guard: self.thread_pool only exists after run() has been called;
        # calling quit() first previously raised AttributeError.
        pool = getattr(self, "thread_pool", None)
        if pool is not None:
            pool.stop_all()
def run(self, urls):
    """Enqueue a crawl job for each URL, then poll until the pool is drained.

    Builds a ThreadPool from this spider's dbfile/thread_num settings,
    submits one ``get_my_blog`` job per URL (with keyword and degree),
    and sleeps in 5-second intervals while jobs remain. A
    KeyboardInterrupt during the wait tells the pool to stop its jobs.

    :param urls: iterable of URL strings to crawl
    """
    logger.info("开始运行爬虫程序")
    self.thread_pool = ThreadPool(self.dbfile, self.thread_num)
    for target in urls:
        # import spider template
        from knowquiz.modules import get_my_blog
        logger.info("添加任务 %s 到队列中" % get_my_blog.__name__)
        self.thread_pool.add_job(get_my_blog, (target, self.keyword, self.degree))
    while self.thread_pool.check_job() > 0:
        try:
            logger.info("当前可用任务 %d" % self.thread_pool.check_job())
            time.sleep(5)
        except KeyboardInterrupt:
            self.thread_pool.stop_job()