def testcase_ThreadPool_get_result_success(self):
    """Test case 3: get_result — after all tasks complete, the summed result is 1.

    Submits two jobs (0 and 1) to a ThreadPool, waits for completion, then
    drains the result queue. Each result is a JSON string whose 'url' field
    is accumulated; the expected total is 1 (0 + 1).
    """
    jobs = list(xrange(2))
    pool = ThreadPool(3, test_function, jobs, 0)
    pool.wait_allcomplete()
    # 'total' instead of 'sum' — avoid shadowing the builtin.
    total = 0
    while True:
        try:
            res = pool.get_result()
            arr_res = json.loads(res)
            total += int(arr_res['url'])
        except Queue.Empty as e:
            # Queue drained — all results consumed.
            # NOTE(review): self.logging looks unusual for a TestCase;
            # confirm a mixin provides it (plain `logging` may be intended).
            self.logging.info(e)
            break
    self.assertEqual(1, total)
def begin_crawl(self, conf):
    """Entry point of the crawler.

    Builds runtime parameters from the config, loads the seed URLs, then
    crawls breadth-first up to max_depth levels. At each level the pages
    are fetched by a ThreadPool, parsed for target URLs (saved to disk)
    and for next-level URLs (fed into the next iteration).

    Args:
        conf: configuration file handed to build_params.

    Returns:
        True if enough target URLs were collected, False on init/seed
        failure or when the crawl finishes without reaching the quota.
    """
    ret = self.build_params(conf)
    # Keep the strict False comparison: build_params signals failure with False.
    if ret == False:
        logging.info("初始化失败,退出....")
        return False

    crawl_depth = int(self.params['max_depth'])

    # Load the seed URLs, one per line.
    with open(self.params['url_list_file']) as fopen:
        arr_source_url = [line.strip() for line in fopen]
    if not arr_source_url:
        logging.info("种子文件为空,请检查种子文件,我走了")
        return False

    thread_count = self.params['thread_count']
    interval = self.params['crawl_interval']
    # Global socket timeout so hung fetches don't stall worker threads.
    socket.setdefaulttimeout(self.params['crawl_timeout'])

    for depth in xrange(crawl_depth):
        # Fetch every URL of the current level, then parse the pages.
        work_manager = ThreadPool(
            thread_count, self.crawl_html, arr_source_url, interval)
        work_manager.wait_allcomplete()
        while True:
            try:
                logging.info("开始解析网页")
                # Worker threads store {url, html} as a JSON string.
                res = json.loads(work_manager.get_result())
                origin_url, html = res['url'], res['html']
                # parse_html returns (next-level urls, js urls, target urls).
                next_url, js_url, ret = Crawler.parse_html(
                    origin_url, html, self.params['target_url'])
                # Also mine js files for target urls and unvisited next-level urls.
                for url in js_url:
                    resp = self.crawl_html(url)
                    tmp_next_url, _, tmp_ret = Crawler.parse_html(
                        url, resp, self.params['target_url'])
                    next_url.extend(tmp_next_url)
                    ret.extend(tmp_ret)
                arr_target_url = Crawler.filter_unvisited_url(
                    ret, self.target_visited_url)
                for t_url in arr_target_url:
                    self.save_file(t_url)
                    # Stop as soon as the quota is reached.
                    if self.is_enough():
                        return True
                # Next level crawls only URLs not visited before.
                arr_source_url = Crawler.filter_unvisited_url(
                    next_url, self.visited_url)
            except Queue.Empty as e:
                # Result queue drained — this level is done.
                logging.warning(e)
                # Lazy %-args: formatting is deferred to the logging layer.
                logging.info("第 %d 级抓取完毕...", depth)
                break

    if self.is_enough():
        return True
    else:
        return False