Esempio n. 1
0
 def test_valid_http_equiv_is_required(self):
     # A charset inside a <meta> tag must be ignored when the tag has no
     # http-equiv attribute or an unrecognised one: after each such tag the
     # reader's _encoding is still expected to be 'ISO-8859-1'.
     html_reader = HtmlReader()
     ignored_tags = (
         '<meta content="text/html; charset=utf-8" />',
         '<meta http-equiv="Invalid" content="text/html; charset=utf-8" />',
     )
     for tag in ignored_tags:
         html_reader.feed(tag)
         assert_equals(html_reader._encoding, 'ISO-8859-1')
Esempio n. 2
0
 def test_encoding_is_read_from_meta_tag(self):
     # Each Content-Type <meta> tag fed to the same reader should set
     # _encoding to the charset exactly as it is spelled in the tag.
     html_reader = HtmlReader()
     tag_and_charset = (
         ('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />',
          'utf-8'),
         ('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">',
          'UTF-8'),
     )
     for meta_tag, charset in tag_and_charset:
         html_reader.feed(meta_tag)
         assert_equals(html_reader._encoding, charset)
Esempio n. 3
0
 def test_encoding_and_entityrefs(self):
     # The &auml; entity reference is expected to show up as its own piece of
     # the current cell, and to be joined with the following text once the
     # row is closed and stored under the 'Setting' table.
     html_reader = HtmlReader()
     html_reader.populator = PopulatorMock()
     for chunk in ('<meta content="text/html; charset=utf-8" />',
                   '<table><tr><td>Setting</td></tr>',
                   '<tr><td>&auml;iti'):
         html_reader.feed(chunk)
     assert_equals(html_reader.current_cell, [u'\xe4', u'iti'])
     html_reader.feed('</tr>')
     setting_rows = html_reader.populator.tables['Setting']
     assert_equals(setting_rows[0], [u'\xe4iti'])
 def test_encoding_and_entityrefs(self):
     # Feeds a table whose second row starts with "&auml;iti" and checks two
     # things: the open cell holds the decoded entity and the trailing text
     # as separate items, and the finished row holds them as one string.
     parser = HtmlReader()
     parser.populator = PopulatorMock()
     opening_markup = [
         '<meta content="text/html; charset=utf-8" />',
         '<table><tr><td>Setting</td></tr>',
         '<tr><td>&auml;iti',
     ]
     for fragment in opening_markup:
         parser.feed(fragment)
     assert_equals(parser.current_cell, [u'\xe4', u'iti'])
     parser.feed('</tr>')
     first_row = parser.populator.tables['Setting'][0]
     assert_equals(first_row, [u'\xe4iti'])
Esempio n. 5
0
 def test_encoding_and_entityrefs(self):
     # While the cell is still open, the decoded &auml; and the text after it
     # are expected as two separate entries in current_cell; after </tr> the
     # stored 'Setting' row is expected to contain them joined together.
     html_reader = HtmlReader()
     html_reader.populator = PopulatorMock()
     fragments = (
         '<meta content="text/html; charset=utf-8" />',
         "<table><tr><td>Setting</td></tr>",
         "<tr><td>&auml;iti",
     )
     for fragment in fragments:
         html_reader.feed(fragment)
     assert_equals(html_reader.current_cell, [u"\xe4", u"iti"])
     html_reader.feed("</tr>")
     stored_rows = html_reader.populator.tables["Setting"]
     assert_equals(stored_rows[0], [u"\xe4iti"])
class TestHtmlReader(unittest.TestCase):
    """State and row-parsing tests for ``HtmlReader``.

    Each test feeds HTML fragments to a fresh reader whose ``populator`` is a
    ``PopulatorMock`` and then checks the reader's ``state`` and/or the
    populator's ``current`` and ``tables`` attributes.
    """

    def setUp(self):
        # Build a new reader with a mock populator attached for every test.
        reader = HtmlReader()
        reader.populator = PopulatorMock()
        self.reader = reader

    def test_initial_state(self):
        # <table> should switch the reader from IGNORE to INITIAL and
        # </table> should switch it back to IGNORE.
        reader = self.reader
        reader.state = reader.IGNORE
        reader.feed('<table>')
        assert_equals(reader.state, reader.INITIAL)
        reader.feed('</table>')
        assert_equals(reader.state, reader.IGNORE)

    def test_start_valid_table(self):
        # A first row naming a valid table should put the reader in PROCESS
        # and make that name the populator's current table.
        reader = self.reader
        for table_name in VALID_TABLES:
            reader.feed('<table>')
            header_row = ROW_TEMPLATE % (table_name, 'Value 1', 'Value2')
            reader.feed(header_row)
            assert_equals(reader.state, reader.PROCESS)
            assert_equals(reader.populator.current, table_name)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)

    def test_process_invalid_table(self):
        # Names presumably absent from VALID_TABLES: the reader is expected
        # to stay in IGNORE, select no table and store no rows.
        reader = self.reader
        for table_name in ("Foo", "VaribleTable"):
            reader.feed('<table>')
            header_row = ROW_TEMPLATE % (table_name, 'Value 1', 'Value2')
            reader.feed(header_row)
            assert_equals(reader.state, reader.IGNORE)
            assert_none(reader.populator.current)
            ignored_row = ROW_TEMPLATE % ('This', 'row', 'is ignored')
            reader.feed(ignored_row)
            assert_equals(reader.state, reader.IGNORE)
            stored_tables = reader.populator.tables.values()
            assert_equals(len(stored_tables), 0)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)

    def test_br(self):
        # <br> and <br /> inside a cell are expected to become newlines.
        cells_with_breaks = ('x<br>y', '1<br />2', '<br><br>')
        expected_row = ['x\ny', '1\n2', '\n\n']
        reader = self.reader
        for table_name in VALID_TABLES:
            reader.feed('<table>')
            reader.feed(ROW_TEMPLATE % (table_name, 'Value 1', 'Value2'))
            reader.feed(ROW_TEMPLATE % cells_with_breaks)
            reader.feed('</table>')
            stored_rows = reader.populator.tables[table_name]
            assert_equals(stored_rows, [expected_row])

    def test_processing(self):
        # Baseline: rows built from the standard template.
        self._row_processing(ROW_TEMPLATE)

    def test_missing_end_td(self):
        # Rows where some or all </td> end tags are left out.
        for template in ('<tr><td>%s<td>%s</td><td>%s</td></tr>',
                         '<tr><td>%s<td>%s<td>%s</tr>'):
            self._row_processing(template)

    def test_missing_end_tr(self):
        # Rows without the closing </tr>.
        template = '<tr><td>%s<td>%s</td><td>%s</td>'
        self._row_processing(template)

    def test_extra_end_tr(self):
        # Rows followed by a surplus </tr>.
        template = '<tr><td>%s<td>%s</td><td>%s</td></tr></tr>'
        self._row_processing(template)

    def test_missing_start_tr(self):
        # Rows that lack the opening <tr> (and carry a doubled </tr>).
        template = '<td>%s<td>%s</td><td>%s</td></tr></tr>'
        self._row_processing(template)

    def _row_processing(self, row_template):
        # Shared driver: for every valid table name, feed a header row and
        # three data rows formatted with row_template, then check that the
        # reader is in PROCESS before </table>, in IGNORE after it, and that
        # the populator holds exactly the data rows for that table.
        reader = self.reader
        for table_name in VALID_TABLES:
            expected_rows = [
                ['Just', 'some', 'data'],
                ['here', '', 'for'],
                ['', 'these', 'rows'],
            ]
            reader.feed('<table>')
            reader.feed(row_template % (table_name, 'Value 1', 'Value2'))
            for cells in expected_rows:
                reader.feed(row_template % tuple(cells))
            assert_equals(reader.state, reader.PROCESS)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)
            assert_equals(reader.populator.tables[table_name], expected_rows)
 def test_encoding_is_set_from_xml_preamble(self):
     # The encoding named in an <?xml ...?> preamble should end up in
     # _encoding, whether the value is quoted and placed after version or
     # unquoted and placed before it.
     html_reader = HtmlReader()
     preamble_and_encoding = (
         ('<?xml version="1.0" encoding="UTF-8"?>', 'UTF-8'),
         ('<?xml encoding=US-ASCII version="1.0"?>', 'US-ASCII'),
     )
     for preamble, declared in preamble_and_encoding:
         html_reader.feed(preamble)
         assert_equals(html_reader._encoding, declared)
Esempio n. 8
0
 def test_valid_http_equiv_is_required(self):
     # Neither a <meta> tag without http-equiv nor one with an unrecognised
     # http-equiv value may change the encoding: _encoding is expected to
     # remain 'ISO-8859-1' after each of them.
     parser = HtmlReader()
     unchanged = 'ISO-8859-1'
     for meta_tag in (
         '<meta content="text/html; charset=utf-8" />',
         '<meta http-equiv="Invalid" content="text/html; charset=utf-8" />',
     ):
         parser.feed(meta_tag)
         assert_equals(parser._encoding, unchanged)
Esempio n. 9
0
 def test_encoding_is_read_from_meta_tag(self):
     # A Content-Type <meta> tag should set _encoding to its charset value,
     # keeping the spelling used in the tag; a later tag replaces the value.
     parser = HtmlReader()
     expectations = [
         ('<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />', 'utf-8'),
         ('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">', 'UTF-8'),
     ]
     for markup, expected_encoding in expectations:
         parser.feed(markup)
         assert_equals(parser._encoding, expected_encoding)
Esempio n. 10
0
class TestHtmlReader(unittest.TestCase):
    """Exercise ``HtmlReader`` table handling against a ``PopulatorMock``.

    The tests feed HTML fragments to ``self.reader`` and inspect its
    ``state`` together with the mock populator's ``current`` and ``tables``.
    """

    def setUp(self):
        # Every test starts from a new reader that reports to a new mock.
        html_reader = HtmlReader()
        html_reader.populator = PopulatorMock()
        self.reader = html_reader

    def test_initial_state(self):
        # Opening a table moves IGNORE -> INITIAL; closing it returns to
        # IGNORE.
        rdr = self.reader
        rdr.state = rdr.IGNORE
        rdr.feed("<table>")
        assert_equals(rdr.state, rdr.INITIAL)
        rdr.feed("</table>")
        assert_equals(rdr.state, rdr.IGNORE)

    def test_start_valid_table(self):
        # After a header row naming a valid table the reader should be in
        # PROCESS with that table selected on the populator.
        rdr = self.reader
        for valid_name in VALID_TABLES:
            rdr.feed("<table>")
            first_row = ROW_TEMPLATE % (valid_name, "Value 1", "Value2")
            rdr.feed(first_row)
            assert_equals(rdr.state, rdr.PROCESS)
            assert_equals(rdr.populator.current, valid_name)
            rdr.feed("</table>")
            assert_equals(rdr.state, rdr.IGNORE)

    def test_process_invalid_table(self):
        # For header names that are presumably not in VALID_TABLES, the
        # reader is expected to remain in IGNORE throughout and the
        # populator to end up with no current table and no stored tables.
        rdr = self.reader
        for bad_name in ("Foo", "VaribleTable"):
            rdr.feed("<table>")
            first_row = ROW_TEMPLATE % (bad_name, "Value 1", "Value2")
            rdr.feed(first_row)
            assert_equals(rdr.state, rdr.IGNORE)
            assert_none(rdr.populator.current)
            skipped_row = ROW_TEMPLATE % ("This", "row", "is ignored")
            rdr.feed(skipped_row)
            assert_equals(rdr.state, rdr.IGNORE)
            table_count = len(rdr.populator.tables.values())
            assert_equals(table_count, 0)
            rdr.feed("</table>")
            assert_equals(rdr.state, rdr.IGNORE)

    def test_br(self):
        # Line-break tags inside cells should be stored as "\n".
        raw_cells = ("x<br>y", "1<br />2", "<br><br>")
        parsed_cells = ["x\ny", "1\n2", "\n\n"]
        rdr = self.reader
        for valid_name in VALID_TABLES:
            rdr.feed("<table>")
            rdr.feed(ROW_TEMPLATE % (valid_name, "Value 1", "Value2"))
            rdr.feed(ROW_TEMPLATE % raw_cells)
            rdr.feed("</table>")
            table_rows = rdr.populator.tables[valid_name]
            assert_equals(table_rows, [parsed_cells])

    def test_processing(self):
        # Well-formed rows from the standard template.
        self._row_processing(ROW_TEMPLATE)

    def test_missing_end_td(self):
        # First with one </td> missing, then with all of them missing.
        one_missing = "<tr><td>%s<td>%s</td><td>%s</td></tr>"
        all_missing = "<tr><td>%s<td>%s<td>%s</tr>"
        for template in (one_missing, all_missing):
            self._row_processing(template)

    def test_missing_end_tr(self):
        # No </tr> at the end of the row.
        unterminated_row = "<tr><td>%s<td>%s</td><td>%s</td>"
        self._row_processing(unterminated_row)

    def test_extra_end_tr(self):
        # One </tr> too many at the end of the row.
        doubly_closed_row = "<tr><td>%s<td>%s</td><td>%s</td></tr></tr>"
        self._row_processing(doubly_closed_row)

    def test_missing_start_tr(self):
        # No opening <tr>; the row still ends with two </tr> tags.
        unopened_row = "<td>%s<td>%s</td><td>%s</td></tr></tr>"
        self._row_processing(unopened_row)

    def _row_processing(self, row_template):
        # Common scenario used by the row tests. For each valid table name:
        # feed <table>, a header row and three data rows rendered through
        # row_template; expect PROCESS before </table>, IGNORE after it, and
        # the populator's table for that name to equal the data rows.
        rdr = self.reader
        for valid_name in VALID_TABLES:
            data_rows = [
                ["Just", "some", "data"],
                ["here", "", "for"],
                ["", "these", "rows"],
            ]
            rdr.feed("<table>")
            rdr.feed(row_template % (valid_name, "Value 1", "Value2"))
            for cells in data_rows:
                rdr.feed(row_template % tuple(cells))
            assert_equals(rdr.state, rdr.PROCESS)
            rdr.feed("</table>")
            assert_equals(rdr.state, rdr.IGNORE)
            assert_equals(rdr.populator.tables[valid_name], data_rows)