Example #1
0
	def _process_html(self, html):
		'''
		Extract the wanted links from an HTML document and hand them to redis.

		Every tag whose ``href`` attribute matches the regular expression in
		``self._target['grab_url_reg']`` is considered. Hrefs for which
		``StringUtil.is_resource_url`` returns a truthy value are skipped.
		Each remaining distinct URL is passed to ``self._redis.sset`` under
		the key ``sohu::url``, and a summary line with the URL count is
		printed.

		html -- markup handed to ``BeautifulSoup`` for parsing.
		'''
		document = BeautifulSoup(html)
		href_filter = {'href': re.compile(self._target['grab_url_reg'])}
		# Lazily pull the href out of every matching tag (one lookup per tag)
		hrefs = (tag['href'] for tag in document.findAll(True, href_filter))
		# Drop resource files; the set collapses duplicate links
		page_urls = set(
			href for href in hrefs
			if not StringUtil.is_resource_url(href)
		)
		# Store the collected URL set in redis
		for page_url in page_urls:
			self._redis.sset('sohu::url', page_url)
		summary = "     >>>>finish push %s urls in `sohu::url` of redis" % len(page_urls)
		print(summary)
Example #2
0
    def _process_html(self, html):
        '''
        Parse the URLs found in the page content and store them in redis.

        Tags are selected when their ``href`` attribute matches the pattern
        configured in ``self._target['grab_url_reg']``. An href is kept only
        if ``StringUtil.is_resource_url`` reports a falsy value for it. The
        distinct hrefs that remain are passed, one call each, to
        ``self._redis.sset`` with the key ``sohu::url``; a summary line with
        the number of URLs is printed afterwards.

        html -- markup handed to ``BeautifulSoup`` for parsing.
        '''
        collected = set()
        parsed = BeautifulSoup(html)
        href_pattern = re.compile(self._target['grab_url_reg'])
        for tag in parsed.findAll(True, {'href': href_pattern}):
            link = tag['href']
            # Resource files are discarded; everything else is collected
            if not StringUtil.is_resource_url(link):
                collected.add(link)

        # Store the parsed URL set in redis
        for link in collected:
            self._redis.sset('sohu::url', link)

        total = len(collected)
        print("     >>>>finish push %s urls in `sohu::url` of redis" % total)