class HTMLParserTC(unittest.TestCase):
    """Tests for ``MaayHTMLParser.parseString`` and ``parseFile``.

    Every parse call is unpacked as ``(title, text, links, offset)``;
    ``offset`` is never checked by these tests.
    """

    def setUp(self):
        # Each test gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        # A document made only of a <body> has no title and no links.
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        # Same expectations as testParseSimpleHtml, but the input is read
        # from a file and the encoding is passed explicitly.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        # No encoding argument is given to parseFile() here.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # NOTE(review): this test asserts nothing yet; it only builds the
        # fixture below. The values look like (input, expected) pairs laid
        # out flat -- confirm and add the missing assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
            ]

    def test_parseDifficultFile(self):
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
class HTMLParserTC(unittest.TestCase):
    """Tests for ``MaayHTMLParser.parseString`` and ``parseFile``.

    Every parse call is unpacked as ``(title, text, links, offset)``;
    ``offset`` is never checked by these tests.
    """

    def setUp(self):
        # Each test gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        # A document made only of a <body> has no title and no links.
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        # Same expectations as testParseSimpleHtml, but the input is read
        # from a file and the encoding is passed explicitly.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        # No encoding argument is given to parseFile() here.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text,
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # NOTE(review): this test asserts nothing yet; it only builds the
        # fixture below. The values look like (input, expected) pairs laid
        # out flat -- confirm and add the missing assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
            ]

    def test_parseDifficultFile(self):
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
class HTMLParserTC(unittest.TestCase):
    """Tests for ``MaayHTMLParser.parseString`` and ``parseFile``.

    Every parse call is unpacked as ``(title, text, links, offset)``;
    ``offset`` is never checked by these tests. Extracted text is passed
    through ``normalizeText`` before being compared.
    """

    def setUp(self):
        # Each test gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        self.assertEqual(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
        or no title could be found
        """
        title, text, links, offset = self.parser.parseFile(
            join(DATADIR, "notitle.html"), 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        # Same expectations as testParseSimpleHtml, but the input is read
        # from a file and the encoding is passed explicitly.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        # No encoding argument is given to parseFile() here.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # NOTE(review): this test asserts nothing yet; it only builds the
        # fixture below. The values look like (input, expected) pairs laid
        # out flat -- confirm and add the missing assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
            ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # The leading underscore keeps unittest from collecting this test.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)
class HTMLParserTC(unittest.TestCase):
    """Tests for ``MaayHTMLParser.parseString`` and ``parseFile``.

    Every parse call is unpacked as ``(title, text, links, offset)``;
    ``offset`` is never checked by these tests. Extracted text is passed
    through ``normalizeText`` before being compared.
    """

    def setUp(self):
        # Each test gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        self.assertEqual(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
        or no title could be found
        """
        title, text, links, offset = self.parser.parseFile(
            join(DATADIR, "notitle.html"), 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        # Same expectations as testParseSimpleHtml, but the input is read
        # from a file and the encoding is passed explicitly.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        # No encoding argument is given to parseFile() here.
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # NOTE(review): this test asserts nothing yet; it only builds the
        # fixture below. The values look like (input, expected) pairs laid
        # out flat -- confirm and add the missing assertions.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
            ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # The leading underscore keeps unittest from collecting this test.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)