def testIgnoreComment(self):
     # An HTML comment in the input is expected to be absent from the
     # rendered output.
     markup = '''<html><!--some comment--></html>'''
     rendered = CustomizedSoup(markup).renderContents()
     self.assertEqual('''<html></html>''', rendered)
 def testIgnoreStyle(self):
     # Both variants are expected to render with the <style> element (and
     # anything inside it) removed.
     stripped = '''<html></html>'''
     variants = (
         # empty style element
         '''<html><style type="text/css"></style></html>''',
         # style element carrying CSS text
         '''<html><style type="text/css">body {some style}</style></html>''',
     )
     for markup in variants:
         soup = CustomizedSoup(markup)
         self.assertEqual(stripped, soup.renderContents())
Example #3
0
    def parsebbsDomDetail(self, dom_block_str, config):
        """Scrape up to ten rows out of a DOM block string.

        The block string is parsed with ``CustomizedSoup`` and matched against
        ``config['dom_row_pattern']`` through a ``Scraper``. Every match is
        extracted, handed to ``self.fixitem`` together with ``config``, and
        its ``'title'`` entry is replaced by ``unescape(title)``.

        Args:
            dom_block_str: markup of the block to search for rows.
            config: mapping read for ``'dom_row_pattern'`` and, when an error
                is logged, ``'locate'``.

        Returns:
            A list with the extracted row values, at most ten entries long.

        Raises:
            Exception: any error raised while parsing or extracting is logged
                and then re-raised unchanged.
        """
        max_rows = 10  # only the first ten matched rows are kept
        try:
            dom_row_pattern = config['dom_row_pattern']
            # TODO: the block arrives as a string and is parsed into a DOM
            # again here (string -> dom -> block string -> dom -> row); this
            # round trip should be revised.
            doc = CustomizedSoup(dom_block_str)
            scraper = Scraper(dom_row_pattern)

            parsed_result = []
            for count, item in enumerate(scraper.match(doc), 1):
                value = scraper.extract(item)
                # The return value is not used; presumably fixitem adjusts
                # ``value`` in place -- confirm against its definition.
                self.fixitem(value, config)
                value['title'] = unescape(value['title'])
                parsed_result.append(value)
                if count >= max_rows:
                    break
        except Exception:
            logging.error("failed to parse bbs in Domdetail ;schoolname= %s",
                          config['locate'])
            raise
        # The collected rows were previously discarded; hand them back.
        return parsed_result
    def testIgnoreEmpytString(self):
        # Newline-only text between the tags is expected to be absent from
        # the rendered output.
        blank_markup = '''<html>\n\n\n\n</html>'''
        rendered = CustomizedSoup(blank_markup).renderContents()
        self.assertEqual('''<html></html>''', rendered)

        # Indented, multi-line markup is expected to compare equal to its
        # compact single-line form once both are parsed.
        compact = '''<html><head>some text</head></html>'''
        pretty = '''
            <html>
                <head>some text
                </head>
            </html>
            '''
        self.assertEqual(CustomizedSoup(pretty), CustomizedSoup(compact))
 def testIgnoreScript(self):
     # Every variant is expected to render with the script element (and
     # anything inside it) removed.
     stripped = '''<html></html>'''
     variants = (
         # external script, lowercase tag
         '''<html><script type="text/javascript" src="http://image2.sina.com.cn/home/sinaflash.js"></script></html>''',
         # external script, uppercase opening tag
         '''<html><SCRIPT type="text/javascript" src="http://image2.sina.com.cn/home/sinaflash.js"></script></html>''',
         # inline script with a body
         '''<html><script type="text/javascript">funcion some(){}</script></html>''',
     )
     for markup in variants:
         soup = CustomizedSoup(markup)
         self.assertEqual(stripped, soup.renderContents())
Example #6
0
 def __init__(self, pattern):
     # Parse the pattern markup and keep the first entry of the parsed
     # result's ``contents`` as the pattern to match against.
     parsed = CustomizedSoup(pattern)
     self.pattern = parsed.contents[0]