Python MaayHTMLParser.parseFileの例

プログラミング言語: Python

名前空間/パッケージ名: maay.texttool

クラス/型: MaayHTMLParser

メソッド/関数: parseFile

hotexamples.comのコード掲載数: 4

Python MaayHTMLParser.parseFile - 4件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのmaay.texttool.MaayHTMLParser.parseFileの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

parseString(2)

parseFile(2)

MaayHTMLParser(1)

コード例 #1

ファイルを表示

class HTMLParserTC(unittest.TestCase):
    """Tests for MaayHTMLParser.parseString() and MaayHTMLParser.parseFile().

    Both calls are unpacked as a ``(title, text, links, offset)`` tuple.
    ``offset`` is never checked by any test in this class.
    """

    def setUp(self):
        # A new parser instance is created before every test method.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """A body-only document gives an empty title and no links."""
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        # The expected text is ROW_TEXT with every 'é' replaced by 'e'.
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """SIMPLE_HTML yields the expected title, text and two links."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """parseFile() with an explicit 'iso-8859-1' second argument."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """parseFile() on the same file without passing an encoding."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # TODO: this test asserts nothing yet, so it can never fail.
        # The list presumably holds alternating (raw name, expected
        # normalized name) entries -- confirm, then feed each pair to the
        # encoding-normalization helper and compare the results.
        data = [
            'latin1',
            'ISO-8859-1',
            'iso-88591',
            'ISO-8859-1',
        ]

    def test_parseDifficultFile(self):
        """parseString() copes with the non-compliant 'node22.html' file."""
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Close the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)

コード例 #2

ファイルを表示

ファイル: test_texttool.py プロジェクト: BackupTheBerlios/maay-svn

class HTMLParserTC(unittest.TestCase):
    """Exercise MaayHTMLParser through parseString() and parseFile().

    Each call is unpacked into ``title, text, links, offset``; the
    ``offset`` value is not asserted on anywhere in this class.
    """

    def setUp(self):
        # Every test method gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """Text wrapped only in <body> has no title and no links."""
        html = '<body>%s</body>' % ROW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        self.assertEqual(title, '')
        # Expected text: ROW_TEXT with 'é' replaced by plain 'e'.
        self.assertEqual(text, ROW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """Check title, text and links extracted from SIMPLE_HTML."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """Parse 'encoded.html' passing 'iso-8859-1' as second argument."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename, 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """Parse 'encoded.html' with only the filename given."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(text, 'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # TODO: no assertion is made here, so this test always passes.
        # Each row looks like a (raw name, expected normalized name)
        # pair -- verify, then check them against the normalization helper.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def test_parseDifficultFile(self):
        """Parse 'node22.html' and check its title and a minimum text size."""
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the file handle even if reading fails.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)

コード例 #3

ファイルを表示

ファイル: test_texttool.py プロジェクト: BackupTheBerlios/maay-svn

class HTMLParserTC(unittest.TestCase):
    """Tests for MaayHTMLParser.parseString() and MaayHTMLParser.parseFile().

    Both calls are unpacked as a ``(title, text, links, offset)`` tuple.
    The extracted text is passed through normalizeText() before being
    compared; ``offset`` is never checked.
    """

    def setUp(self):
        # A new parser instance is created before every test method.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """A body-only document gives an empty title and no links."""
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        # The expected text is RAW_TEXT with every 'é' replaced by 'e'.
        self.assertEqual(normalizeText(text), RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
           or no title could be found
        """
        # The second argument is the name expected back as the title.
        title, text, links, offset = self.parser.parseFile(
            join(DATADIR, "notitle.html"), 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """SIMPLE_HTML yields the expected title, text and two links."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """parseFile() with an explicit 'iso-8859-1' third argument."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """parseFile() on the same file without passing an encoding."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(
            filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(
            normalizeText(text),
            'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # TODO: this test asserts nothing yet, so it can never fail.
        # The list presumably holds alternating (raw name, expected
        # normalized name) entries -- confirm, then feed each pair to the
        # encoding-normalization helper and compare the results.
        data = [
            'latin1',
            'ISO-8859-1',
            'iso-88591',
            'ISO-8859-1',
        ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # The leading underscore in the method name keeps unittest from
        # collecting this test.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Close the handle even when read() raises.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)

コード例 #4

ファイルを表示

ファイル: test_texttool.py プロジェクト: BackupTheBerlios/maay-svn

class HTMLParserTC(unittest.TestCase):
    """Exercise MaayHTMLParser through parseString() and parseFile().

    Each call is unpacked into ``title, text, links, offset``. Text is
    run through normalizeText() before comparison; ``offset`` is not
    asserted on anywhere in this class.
    """

    def setUp(self):
        # Every test method gets its own parser instance.
        self.parser = MaayHTMLParser()

    def testParseRaw(self):
        """Text wrapped only in <body> has no title and no links."""
        html = '<body>%s</body>' % RAW_TEXT
        title, text, links, offset = self.parser.parseString(html)
        # parseString() should return an empty title when none is
        # available in the HTML
        self.assertEqual(title, '')
        # Expected text: RAW_TEXT with 'é' replaced by plain 'e'.
        self.assertEqual(normalizeText(text),
                         RAW_TEXT.replace(u'é', 'e'))
        self.assertEqual(links, [])

    def testTitleGuess(self):
        """Make sure the title is the filename when we treat a text file
           or no title could be found
        """
        # The name given as second argument is what the title must equal.
        title, text, links, offset = self.parser.parseFile(join(DATADIR, "notitle.html"), 'notitle.html')
        self.assertEqual(title, 'notitle.html')
        self.assertEqual(normalizeText(text), "maille maay")
        self.assertEqual(links, [])

    def testParseSimpleHtml(self):
        """Check title, text and links extracted from SIMPLE_HTML."""
        title, text, links, offset = self.parser.parseString(SIMPLE_HTML)
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(normalizeText(text),
                         'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileWithEncoding(self):
        """Parse 'encoded.html' passing 'iso-8859-1' as third argument."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename, 'encoded.html', 'iso-8859-1')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(normalizeText(text),
                         'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def testParseHtmlFileAndGuessEncoding(self):
        """Parse 'encoded.html' without passing an encoding."""
        filename = join(DATADIR, 'encoded.html')
        title, text, links, offset = self.parser.parseFile(filename, 'encoded.html')
        self.assertEqual(title, 'maille Maay')
        self.assertEqual(normalizeText(text),
                         'hello ete world this is a link and this is another link')
        self.assertEqual(links, ['something.com', 'somethingelse.com'])

    def test_normalizeHTMLEncoding(self):
        # TODO: no assertion is made here, so this test always passes.
        # Each row looks like a (raw name, expected normalized name)
        # pair -- verify, then check them against the normalization helper.
        data = [
            'latin1', 'ISO-8859-1',
            'iso-88591', 'ISO-8859-1',
        ]

    def _test_parseDifficultFile(self):
        """test_parseDifficultFile: This test fails for now"""
        # Because of the leading underscore, unittest does not collect
        # this method as a test.
        # This file has got some weird, non HTML compliant content
        # and is not handled properly by HTMLParser
        stream = open(join(DATADIR, 'node22.html'))
        try:
            data = stream.read()
        finally:
            # Release the file handle even if reading fails.
            stream.close()
        title, text, links, offset = self.parser.parseString(data)
        self.assertEqual(title, u'21 Porting to Python 2.3')
        self.assertTrue(len(text) > 10)