Example #1
0
class HTMLParserTC(unittest.TestCase):
    """Exercise MaayHTMLParser.parseString() and MaayHTMLParser.parseFile().

    Both entry points are unpacked as a 4-tuple
    ``(title, text, links, offset)``; ``offset`` is never checked here.
    """

    def setUp(self):
        """Create a fresh parser for every test."""
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """A body-only document yields an empty title and no links."""
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        # Expected text is ROW_TEXT with its accented 'é' reduced to 'e'.
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """Check the title, text and links extracted from SIMPLE_HTML."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """Parse encoded.html, passing 'iso-8859-1' explicitly.

        NOTE(review): the second argument is presumably the file's
        encoding -- confirm against parseFile()'s signature.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """Parse encoded.html without the extra argument.

        The expected results are the same as in the explicit case.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        """Placeholder: lists encoding names but asserts nothing yet."""
        # NOTE(review): these look like (raw, normalized) pairs flattened
        # into a single list. Nothing is called or asserted, so this test
        # always passes -- confirm the intended target and add assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def test_parseDifficultFile(self):
        """Parse node22.html and check its title and a minimum text length."""
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() fails.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
class HTMLParserTC(unittest.TestCase):
    """Tests for MaayHTMLParser's string and file parsing entry points.

    Each parse call is unpacked as ``(title, text, links, offset)``;
    the ``offset`` element is not asserted on by any test below.
    """

    def setUp(self):
        """Build the parser instance used by the test about to run."""
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """parseString() on a title-less body: empty title, no links."""
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        # The accented 'é' of ROW_TEXT is expected to come back as 'e'.
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """parseString() on SIMPLE_HTML: title, flattened text and links."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """parseFile() on encoded.html with 'iso-8859-1' given explicitly.

        NOTE(review): 'iso-8859-1' is presumably the encoding hint --
        verify against parseFile()'s parameters.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """parseFile() on encoded.html with only the filename supplied."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        """Unfinished test: only declares sample encoding names."""
        # NOTE(review): the list reads like alternating input/expected
        # values, but no function is invoked and nothing is asserted, so
        # the test cannot fail -- confirm what it was meant to exercise.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def test_parseDifficultFile(self):
        """parseString() on the contents of node22.html."""
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Always close the file, including when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
 def setUp(self):
     """Store a new MaayHTMLParser instance on self.parser."""
     self.parser = MaayHTMLParser()
class HTMLParserTC(unittest.TestCase):
    """Exercise MaayHTMLParser.parseString() and MaayHTMLParser.parseFile().

    Both entry points are unpacked as a 4-tuple
    ``(title, text, links, offset)``; ``offset`` is never checked here.
    Extracted text is passed through normalizeText() before comparison.
    """

    def setUp(self):
        """Create a fresh parser for every test."""
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """A body-only document yields an empty title and no links."""
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        # Expected text is RAW_TEXT with its accented 'é' reduced to 'e'.
        self.assertEqual(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
           or no title could be found
        """
        # NOTE(review): the second argument is presumably the name used
        # as fallback title -- confirm against parseFile()'s signature.
        title, text, links, offset = self.parser.parseFile(
            join(DATADIR, "notitle.html"), 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """Check the title, text and links extracted from SIMPLE_HTML."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """Parse encoded.html, passing 'iso-8859-1' explicitly.

        NOTE(review): the third argument is presumably the file's
        encoding -- confirm against parseFile()'s signature.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """Parse encoded.html without the 'iso-8859-1' argument.

        The expected results are the same as in the explicit case.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        """Placeholder: lists encoding names but asserts nothing yet."""
        # NOTE(review): these look like (raw, normalized) pairs flattened
        # into a single list. Nothing is called or asserted, so this test
        # always passes -- confirm the intended target and add assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # The leading underscore keeps the method out of unittest's
        # default "test*" discovery, so it is not run.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() fails.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
 def setUp(self):
     """Store a new MaayHTMLParser instance on self.parser."""
     self.parser = MaayHTMLParser()
class HTMLParserTC(unittest.TestCase):
    """Tests for MaayHTMLParser's string and file parsing entry points.

    Each parse call is unpacked as ``(title, text, links, offset)``;
    the ``offset`` element is not asserted on by any test below.
    Text results go through normalizeText() before being compared.
    """

    def setUp(self):
        """Build the parser instance used by the test about to run."""
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """parseString() on a title-less body: empty title, no links."""
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        # The accented 'é' of RAW_TEXT is expected to come back as 'e'.
        self.assertEqual(normalizeText(text),
                         RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
           or no title could be found
        """
        # NOTE(review): 'notitle.html' as second argument is presumably
        # the fallback title source -- verify against parseFile().
        path = join(DATADIR, "notitle.html")
        title, text, links, offset = self.parser.parseFile(
            path, 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """parseString() on SIMPLE_HTML: title, normalized text and links."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """parseFile() on encoded.html with 'iso-8859-1' given explicitly.

        NOTE(review): 'iso-8859-1' is presumably the encoding hint --
        verify against parseFile()'s parameters.
        """
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """parseFile() on encoded.html without the 'iso-8859-1' argument."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        """Unfinished test: only declares sample encoding names."""
        # NOTE(review): the list reads like alternating input/expected
        # values, but no function is invoked and nothing is asserted, so
        # the test cannot fail -- confirm what it was meant to exercise.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # Disabled: the "_" prefix hides it from unittest's default
        # "test*" method discovery.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Always close the file, including when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)