def test_parse_links_from_page(self):
    """Check that parse_links_from_page returns absolute URLs for page links.

    The fixture holds one ``<a href>`` and one ``<img src>``, both relative;
    the expected links are those targets joined onto the page URL's directory.
    """
    page_url = "http://pycm.baidu.com:8081/3/page3_4.html"
    html = """ <div> <a href="a.cpp"> </a> <img src="image.jpg" style="display: block;"> </div> """
    expected = [
        'http://pycm.baidu.com:8081/3/a.cpp',
        'http://pycm.baidu.com:8081/3/image.jpg',
    ]

    found = spider_util.parse_links_from_page(html, page_url)

    # Order-insensitive comparison: only the set of links matters here.
    self.assertItemsEqual(found, expected)
def process(self, url, depth):
    """
    Handle the crawl task for a single page.

    Downloads the page at ``url``. When the downloader returns non-empty
    content, the content is saved and every link parsed from it that is not
    already in ``self.url_set`` is queued with depth ``depth + 1`` and then
    recorded in ``self.url_set``.

    Args:
        url: page path
        depth: depth of the current page
    """
    logging.info("thread [%d] process begin, url: %s, depth: %s", self.idx, url, depth)

    page = self.page_downloader.download(url)
    if page:
        self.page_saver.save_to_file(url, page)
        for found in spider_util.parse_links_from_page(page, url):
            # Skip links that have already been scheduled.
            if found in self.url_set:
                continue
            self.url_queue.put((found, depth + 1))
            self.url_set.add(found)

    logging.info("thread [%d] process end, url: %s, depth: %s", self.idx, url, depth)