コード例 #1
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testMoreTag(self):
     """A descendant selector should match only the element with the class."""
     markup = '''
     <div class="share">
         <a class="my" href="#"></a>
         <a class="your" href="#">OK</a>
     </div>
     '''
     selector = 'div.share .my'
     matches = findAll(selector, BeautifulSoup(markup))
     self.assertEqual(1, len(matches))
コード例 #2
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testPosition(self):
     """Adjacent-sibling plus child combinators should reach all three links."""
     markup = '''
     <h2>title</h2>
     <ul>
         <li><a href="#">nothing</a></li>
         <li><a href="#">ok</a></li>
         <li><a href="#">come on!</a></li>
     </ul>
     '''
     selector = "h2 + ul > li > a"
     matches = findAll(selector, BeautifulSoup(markup))
     self.assertEqual(3, len(matches))
コード例 #3
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testTagWithAttrs(self):
     """Tag + class + id + attribute selector should single out exactly one div."""
     markup = '''
     <div class="item" id="one" width="100">
         hey
     </div>
     <div class="item" id="two">
         man,
     </div>
     <div class="the">
         fuckup
     </div>
     '''
     selector = "div.item#one[width=100]"
     matches = findAll(selector, BeautifulSoup(markup))
     self.assertEqual(1, len(matches))
コード例 #4
0
ファイル: htmllib.py プロジェクト: alexliyu/mobilesystem
def parsehtml(html_content, feed, url, feed_url):
	"""Extract the article fragment selected by *feed* from a fetched page.

	html_content -- raw (undecoded) HTML of the fetched page
	feed -- configuration object carrying the selector fields
	        start_target / allow_target / mid_target / end_target / stop_target
	url -- page URL, passed through to GetAllclean (presumably for link
	       fixing -- TODO confirm against GetAllclean)
	feed_url -- feed URL, used only in log messages

	Returns the cleaned article markup, or None when the selector matched
	nothing or parsing failed.
	"""
	start_target = feed.start_target
	allow_target = feed.allow_target
	mid_target = feed.mid_target
	end_target = feed.end_target
	stop_target = feed.stop_target
	pagehtml = decoding(html_content)
	target = decoding(start_target)
	# An empty value or the literal 'nohtml' means "do not truncate at a
	# stop marker"; otherwise re-encode the marker for the substring split.
	if not stop_target or stop_target == 'nohtml':
		stop_target = None
	else:
		stop_target = encoding(stop_target, 'utf-8')
	try:
		# Parse the whole page, then narrow it to the start selector below.
		soup = BeautifulSoup(pagehtml, fromEncoding="utf-8")
		try:
			tret = findAll(target, soup)
			# findAll returns a list of nodes; str() of the list itself is
			# not usable markup, so join the stringified nodes instead.
			tret = ''.join('%s' % tmpstr for tmpstr in tret)
			if stop_target:
				# Keep only the content before the stop marker.
				tret = tret.split(stop_target)[0]
			# Re-parse the extracted fragment so GetAllclean gets a soup.
			# NOTE: the original called minisoup.prettify() here and
			# discarded the result -- prettify() returns a string and does
			# not mutate the tree, so the call was a no-op and was removed.
			minisoup = BeautifulSoup(tret, fromEncoding="utf-8")
			ret = GetAllclean(mid_target, end_target, allow_target, url, minisoup)
			if len(ret) == 0:
				logging.error('The feed %s `s target %s is bad target', feed_url, target)
				return None
			logging.info('add the new article now,the new one len to %s,type to %s', len(ret), type(ret))
			return ret
		except Exception as data:
			logging.error('something is wrong at url %s ,the error is %s ', feed_url, data)
			return None
	except Exception:
		logging.error('Could not parse this,the html has misstake is %s', feed_url)
		return None
コード例 #5
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testMixSelection(self):
     """A chained id/class child selector should match both highlighted links."""
     markup = '''
     <div id="header">
         <div id="name">
             <a class="target">test</a>
             <a class="highlight">right</a>
             <a class="highlight">ok</a>
         </div>
         <div id="your">
         </div>
     </div>
     <div id="body">fk
     </div>
     '''
     selector = "#header > div#name > a.highlight"
     matches = findAll(selector, BeautifulSoup(markup))
     self.assertEqual(2, len(matches))
コード例 #6
0
ファイル: htmllib.py プロジェクト: alexliyu/lincdm
def parsehtml(html_content, feed, feed_url, url):
	"""Extract the article fragment selected by *feed* from a fetched page.

	html_content -- raw (undecoded) HTML of the fetched page
	feed -- configuration object carrying the selector fields
	        start_target / allow_target / mid_target / end_target / stop_target
	feed_url -- feed URL, used only in log messages
	url -- page URL, passed through to GetAllclean (presumably for link
	       fixing -- TODO confirm against GetAllclean)

	Returns the cleaned article markup, or None when the selector matched
	nothing or parsing failed.
	"""
	start_target = feed.start_target
	allow_target = feed.allow_target
	mid_target = feed.mid_target
	end_target = feed.end_target
	stop_target = feed.stop_target
	pagehtml = decoding(html_content)
	target = decoding(start_target)
	# An empty value or the literal 'nohtml' means "do not truncate at a
	# stop marker"; otherwise re-encode the marker for the substring split.
	if not stop_target or stop_target == 'nohtml':
		stop_target = None
	else:
		stop_target = encoding(stop_target, 'utf-8')
	try:
		# Parse the whole page, then narrow it to the start selector below.
		soup = BeautifulSoup(pagehtml, fromEncoding="utf-8")
		try:
			tret = findAll(target, soup)
			# findAll returns a list of nodes; str() of the list itself is
			# not usable markup, so join the stringified nodes instead.
			tret = ''.join('%s' % tmpstr for tmpstr in tret)
			if stop_target:
				# Keep only the content before the stop marker.
				tret = tret.split(stop_target)[0]
			# Re-parse the extracted fragment so GetAllclean gets a soup.
			# NOTE: the original called minisoup.prettify() here and
			# discarded the result -- prettify() returns a string and does
			# not mutate the tree, so the call was a no-op and was removed.
			minisoup = BeautifulSoup(tret, fromEncoding="utf-8")
			ret = GetAllclean(mid_target, end_target, allow_target, url, minisoup)
			if len(ret) == 0:
				logging.error('The feed %s `s target %s is bad target', feed_url, target)
				return None
			logging.info('add the new article now,the new one len to %s,type to %s', len(ret), type(ret))
			return ret
		except Exception as data:
			logging.error('something is wrong at url %s ,the error is %s ', feed_url, data)
			return None
	except Exception:
		logging.error('Could not parse this,the html has misstake is %s', feed_url)
		return None
コード例 #7
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testId(self):
     """An id selector should match only the element carrying that id."""
     document = BeautifulSoup('<div id="header">hey</div><div id="hello">you</div>')
     matches = findAll("#header", document)
     self.assertEqual(1, len(matches))
コード例 #8
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testClass(self):
     """A class selector should match every element with that class, any tag."""
     document = BeautifulSoup('<a class="iwill" href="#"></a><a class="item" href="#">hello</a><img class="item" src="cc.png"/>')
     matches = findAll(".item", document)
     self.assertEqual(2, len(matches))
コード例 #9
0
ファイル: testSelector.py プロジェクト: maddogfyg/iCrawler
 def testTag(self):
     """A bare tag selector should match only elements of that tag."""
     document = BeautifulSoup("<h1>hello</h1><h3>heyhey</h3>")
     matches = findAll("h3", document)
     self.assertEqual(1, len(matches))