def _process_html(self, html):
    """Extract URLs from *html* that match the target's grab pattern and
    push each unique one into the `sohu::url` set in redis.

    Links pointing at static resource files (as judged by
    StringUtil.is_resource_url) are discarded.
    """
    soup = BeautifulSoup(html)
    # Every tag whose href matches the configured grab regex.
    matched = soup.findAll(
        True, {'href': re.compile(self._target['grab_url_reg'])})
    # De-duplicate while filtering out resource-file links.
    urls = set(
        tag['href'] for tag in matched
        if not StringUtil.is_resource_url(tag['href'])
    )
    # Persist the collected URL set into redis, one member at a time.
    # NOTE(review): redis-py's native set-add command is `sadd`; `sset` is
    # presumably a project-specific wrapper method — confirm it exists.
    for url in urls:
        self._redis.sset('sohu::url', url)
    print(" >>>>finish push %s urls in `sohu::url` of redis" % len(urls))
def _process_html(self, html):
    """Parse *html* for hrefs matching the target grab regex and store
    each unique, non-resource URL in the redis set `sohu::url`.

    NOTE(review): this definition duplicates the `_process_html` earlier in
    the file and, if both are in the same class, shadows it — looks like an
    accidental paste; confirm and keep only one copy.
    """
    href_pattern = re.compile(self._target['grab_url_reg'])
    collected = set()
    for node in BeautifulSoup(html).findAll(True, {'href': href_pattern}):
        link = node['href']
        # Skip links to static resource files (images, css, js, ...).
        if StringUtil.is_resource_url(link):
            continue
        collected.add(link)
    # Push the de-duplicated URLs into redis one by one.
    # NOTE(review): `sset` is presumably a project redis wrapper; the
    # redis-py set-add command is `sadd` — confirm against the client.
    for link in collected:
        self._redis.sset('sohu::url', link)
    print(" >>>>finish push %s urls in `sohu::url` of redis" % len(collected))