def testIgnoreComment(self):
    # An HTML comment in the input must not show up in the rendered output.
    expected = '''<html></html>'''
    markup_with_comment = '''<html><!--some comment--></html>'''
    parsed = CustomizedSoup(markup_with_comment)
    rendered = parsed.renderContents()
    self.assertEqual(expected, rendered)
def testIgnoreStyle(self):
    # <style> elements are expected to be absent from the rendered output,
    # whether they are empty or carry CSS rules.
    expected = '''<html></html>'''
    samples = (
        # empty style element
        '''<html><style type="text/css"></style></html>''',
        # style element together with its content
        '''<html><style type="text/css">body {some style}</style></html>''',
    )
    for markup in samples:
        rendered = CustomizedSoup(markup).renderContents()
        self.assertEqual(expected, rendered)
def parsebbsDomDetail(self, dom_block_str, config):
    """Scrape at most ten row records out of a DOM block given as a string.

    The block string is parsed with CustomizedSoup and matched against the
    row pattern from ``config['dom_row_pattern']`` through a Scraper. Each
    matched row is extracted, handed to ``self.fixitem`` together with
    ``config``, and has its ``'title'`` entry passed through ``unescape``.

    Args:
        dom_block_str: markup of the block that contains the rows.
        config: mapping read for ``'dom_row_pattern'`` and, when a failure
            is logged, ``'locate'``; it is also passed to ``self.fixitem``.

    Returns:
        A list holding the extracted values of the first ten matched rows
        (fewer if fewer rows matched), in match order.

    Raises:
        Exception: any error raised while parsing is logged and re-raised
            unchanged.
    """
    max_rows = 10  # only the first ten rows are kept
    try:
        dom_row_pattern = config['dom_row_pattern']
        # TODO: the round trip string -> dom -> block dom -> block string
        # -> block dom -> row dom -> row string is unreasonable and needs
        # to be revised; here the block string is turned into a DOM again.
        doc = CustomizedSoup(dom_block_str)
        scraper = Scraper(dom_row_pattern)  # scrapes the individual rows
        matched_rows = scraper.match(doc)

        parsed_result = []
        for item in matched_rows:
            value = scraper.extract(item)
            # NOTE(review): link prefixing with config['root'] used to be
            # done inline here; presumably self.fixitem covers it now -
            # confirm.
            self.fixitem(value, config)
            value['title'] = unescape(value['title'])  # safe title
            parsed_result.append(value)
            if len(parsed_result) >= max_rows:
                break
    except Exception:
        # Log with traceback, then let the caller deal with the failure.
        logging.exception("failed to parse bbs in Domdetail ;schoolname= %s", config['locate'])
        raise
    return parsed_result
def testIgnoreEmpytString(self):
    # Text consisting only of newlines is expected to be dropped on rendering.
    only_newlines = '''<html>\n\n\n\n</html>'''
    rendered = CustomizedSoup(only_newlines).renderContents()
    self.assertEqual('''<html></html>''', rendered)

    # Two documents that differ only in whitespace around tags and text
    # are expected to compare equal once parsed.
    spaced = ''' <html> <head>some text </head> </html> '''
    compact = '''<html><head>some text</head></html>'''
    self.assertEqual(CustomizedSoup(spaced), CustomizedSoup(compact))
def testIgnoreScript(self):
    # <script> elements are expected to be absent from the rendered output.
    expected = '''<html></html>'''
    script_samples = [
        # external script, lower-case tag
        '''<html><script type="text/javascript" src="http://image2.sina.com.cn/home/sinaflash.js"></script></html>''',
        # external script, upper-case opening tag
        '''<html><SCRIPT type="text/javascript" src="http://image2.sina.com.cn/home/sinaflash.js"></script></html>''',
        # inline script together with its content
        '''<html><script type="text/javascript">funcion some(){}</script></html>''',
    ]
    for markup in script_samples:
        soup = CustomizedSoup(markup)
        self.assertEqual(expected, soup.renderContents())
def __init__(self, pattern):
    """Parse ``pattern`` with CustomizedSoup and keep its first node.

    The first entry of the parsed document's ``contents`` is stored as
    ``self.pattern``.
    """
    parsed_pattern = CustomizedSoup(pattern)
    first_node = parsed_pattern.contents[0]
    self.pattern = first_node