def testMoreTag(self):
    # A two-part selector ('div.share .my') is run against a div.share that
    # holds one anchor with class "my" and one with class "your"; exactly
    # one match is expected.
    selector = 'div.share .my'
    markup = ''' <div class="share"> <a class="my" href="#"></a> <a class="your" href="#">OK</a> </div> '''
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(1, len(matches))
def testPosition(self):
    # The selector chains '+' and '>' combinators; the markup has an h2
    # followed by a ul whose three li items each contain one anchor, and
    # all three anchors are expected back.
    selector = "h2 + ul > li > a"
    markup = ''' <h2>title</h2> <ul> <li><a href="#">nothing</a></li> <li><a href="#">ok</a></li> <li><a href="#">come on!</a></li> </ul> '''
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(3, len(matches))
def testTagWithAttrs(self):
    # The selector combines tag, class, id and an attribute test. Only the
    # first of the three divs carries class "item", id "one" and
    # width="100", so a single match is expected.
    selector = "div.item#one[width=100]"
    markup = ''' <div class="item" id="one" width="100"> hey </div> <div class="item" id="two"> man, </div> <div class="the"> fuckup </div> '''
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(1, len(matches))
def parsehtml(html_content, feed, url, feed_url):
    """Extract and clean the targeted fragment of a fetched page.

    The page is decoded and parsed, narrowed to the nodes that
    ``findAll`` returns for ``feed.start_target``, optionally truncated at
    the first occurrence of ``feed.stop_target``, re-parsed, and handed to
    ``GetAllclean``.

    Args:
        html_content: Fetched page content; passed through ``decoding``.
        feed: Object exposing ``start_target``, ``allow_target``,
            ``mid_target``, ``end_target`` and ``stop_target``.
        url: Forwarded unchanged to ``GetAllclean``.
        feed_url: Used only in log messages.

    Returns:
        The result of ``GetAllclean`` when its length is non-zero;
        ``None`` when it is empty or when any parsing/cleaning step
        raises (the failure is logged).
    """
    # Read every feed attribute up front so that a malformed feed object
    # fails loudly here instead of being swallowed by the handlers below.
    start_target = feed.start_target
    allow_target = feed.allow_target
    mid_target = feed.mid_target
    end_target = feed.end_target
    stop_target = feed.stop_target

    pagehtml = decoding(html_content)
    target = decoding(start_target)

    # An empty stop target, or the literal 'nohtml', disables truncation.
    if not stop_target or stop_target == 'nohtml':
        stop_target = None
    else:
        stop_target = encoding(stop_target, 'utf-8')

    # Step 1: parse the whole page.
    try:
        soup = BeautifulSoup(pagehtml, fromEncoding="utf-8")
    except Exception:
        logging.error('Could not parse this,the html has misstake is %s', feed_url)
        return None

    # Step 2: select, truncate, re-parse and clean the target fragment.
    try:
        matched = findAll(target, soup)
        # findAll yields a collection of nodes; formatting each node
        # separately and joining gives plain markup, whereas str() of the
        # whole collection would not.
        fragment = ''.join('%s' % node for node in matched)
        if stop_target:
            # Keep only what precedes the first occurrence of the marker.
            fragment = fragment.split(stop_target)[0]
        minisoup = BeautifulSoup(fragment, fromEncoding="utf-8")
        # NOTE(review): the return value is discarded. The call is kept
        # because an exception raised here is currently turned into a
        # None result by the handler below -- confirm before removing.
        minisoup.prettify()
        ret = GetAllclean(mid_target, end_target, allow_target, url, minisoup)
        if len(ret) == 0:
            logging.error('The feed %s `s target %s is bad target', feed_url, target)
            return None
        logging.info('add the new article now,the new one len to %s,type to %s', len(ret), type(ret))
        return ret
    except Exception as data:
        logging.error('something is wrong at url %s ,the error is %s ', feed_url, data)
        return None
def testMixSelection(self):
    # The selector mixes id, tag#id and tag.class parts joined by '>'.
    # Inside #header > div#name there are three anchors, two of which have
    # class "highlight"; those two are the expected matches.
    selector = "#header > div#name > a.highlight"
    markup = ''' <div id="header"> <div id="name"> <a class="target">test</a> <a class="highlight">right</a> <a class="highlight">ok</a> </div> <div id="your"> </div> </div> <div id="body">fk </div> '''
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(2, len(matches))
def parsehtml(html_content, feed, feed_url, url):
    """Extract and clean the targeted fragment of a fetched page.

    The page is decoded and parsed, narrowed to the nodes that
    ``findAll`` returns for ``feed.start_target``, optionally truncated at
    the first occurrence of ``feed.stop_target``, re-parsed, and handed to
    ``GetAllclean``.

    Args:
        html_content: Fetched page content; passed through ``decoding``.
        feed: Object exposing ``start_target``, ``allow_target``,
            ``mid_target``, ``end_target`` and ``stop_target``.
        feed_url: Used only in log messages.
        url: Forwarded unchanged to ``GetAllclean``.

    Returns:
        The result of ``GetAllclean`` when its length is non-zero;
        ``None`` when it is empty or when any parsing/cleaning step
        raises (the failure is logged).
    """
    # Read every feed attribute up front so that a malformed feed object
    # fails loudly here instead of being swallowed by the handlers below.
    start_target = feed.start_target
    allow_target = feed.allow_target
    mid_target = feed.mid_target
    end_target = feed.end_target
    stop_target = feed.stop_target

    pagehtml = decoding(html_content)
    target = decoding(start_target)

    # An empty stop target, or the literal 'nohtml', disables truncation.
    if not stop_target or stop_target == 'nohtml':
        stop_target = None
    else:
        stop_target = encoding(stop_target, 'utf-8')

    # Step 1: parse the whole page.
    try:
        soup = BeautifulSoup(pagehtml, fromEncoding="utf-8")
    except Exception:
        logging.error('Could not parse this,the html has misstake is %s', feed_url)
        return None

    # Step 2: select, truncate, re-parse and clean the target fragment.
    try:
        matched = findAll(target, soup)
        # findAll yields a collection of nodes; formatting each node
        # separately and joining gives plain markup, whereas str() of the
        # whole collection would not.
        fragment = ''.join('%s' % node for node in matched)
        if stop_target:
            # Keep only what precedes the first occurrence of the marker.
            fragment = fragment.split(stop_target)[0]
        minisoup = BeautifulSoup(fragment, fromEncoding="utf-8")
        # NOTE(review): the return value is discarded. The call is kept
        # because an exception raised here is currently turned into a
        # None result by the handler below -- confirm before removing.
        minisoup.prettify()
        ret = GetAllclean(mid_target, end_target, allow_target, url, minisoup)
        if len(ret) == 0:
            logging.error('The feed %s `s target %s is bad target', feed_url, target)
            return None
        logging.info('add the new article now,the new one len to %s,type to %s', len(ret), type(ret))
        return ret
    except Exception as data:
        logging.error('something is wrong at url %s ,the error is %s ', feed_url, data)
        return None
def testId(self):
    # A bare id selector against two divs with different ids; only the
    # div whose id is "header" is expected.
    selector = "#header"
    markup = '<div id="header">hey</div><div id="hello">you</div>'
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(1, len(matches))
def testClass(self):
    # A bare class selector; the markup has one anchor with class "iwill"
    # plus an anchor and an img that both have class "item", so two
    # matches are expected regardless of tag name.
    selector = ".item"
    markup = '<a class="iwill" href="#"></a><a class="item" href="#">hello</a><img class="item" src="cc.png"/>'
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(2, len(matches))
def testTag(self):
    # A bare tag selector; the markup has one h1 and one h3, and only the
    # h3 is expected.
    selector = "h3"
    markup = "<h1>hello</h1><h3>heyhey</h3>"
    matches = findAll(selector, BeautifulSoup(markup))
    self.assertEqual(1, len(matches))