def test_valid_http_equiv_is_required(self):
    # A <meta> tag with no http-equiv attribute, or with an unrecognized
    # one, must leave the reader's _encoding equal to 'ISO-8859-1'.
    html_reader = HtmlReader()
    ignored_tags = (
        '<meta content="text/html; charset=utf-8" />',
        '<meta http-equiv="Invalid" content="text/html; charset=utf-8" />',
    )
    for tag in ignored_tags:
        html_reader.feed(tag)
        assert_equals(html_reader._encoding, 'ISO-8859-1')
def test_encoding_is_read_from_meta_tag(self):
    # A Content-Type <meta> tag sets _encoding to the charset exactly as
    # written (case preserved), for both the self-closed and open tag form.
    html_reader = HtmlReader()
    cases = (
        ('<meta http-equiv="Content-Type" '
         'content="text/html; charset=utf-8" />', 'utf-8'),
        ('<meta http-equiv="Content-Type" '
         'content="text/html; charset=UTF-8">', 'UTF-8'),
    )
    for tag, charset in cases:
        html_reader.feed(tag)
        assert_equals(html_reader._encoding, charset)
def test_encoding_and_entityrefs(self):
    # Verify that an HTML entity reference inside a table cell is decoded
    # to the matching unicode character and joined with the surrounding
    # text when the row is handed to the populator.
    reader = HtmlReader()
    reader.populator = PopulatorMock()
    reader.feed('<meta content="text/html; charset=utf-8" />')
    reader.feed('<table><tr><td>Setting</td></tr>')
    # NOTE: the cell must contain the entity reference (ampersand, "auml",
    # semicolon), not the literal character. The two-fragment current_cell
    # expected below reflects the entity being reported separately from the
    # text that follows it (presumably via the parser's entity-reference
    # callback -- confirm against HtmlReader).
    reader.feed('<tr><td>&auml;iti')
    assert_equals(reader.current_cell, [u'\xe4', u'iti'])
    reader.feed('</tr>')
    # Once the row is closed, the fragments are stored as one cell value.
    assert_equals(reader.populator.tables['Setting'][0], [u'\xe4iti'])
def test_encoding_and_entityrefs(self):
    # Verify that an HTML entity reference inside a table cell is decoded
    # to the matching unicode character and joined with the surrounding
    # text when the row is handed to the populator.
    reader = HtmlReader()
    reader.populator = PopulatorMock()
    reader.feed('<meta content="text/html; charset=utf-8" />')
    reader.feed("<table><tr><td>Setting</td></tr>")
    # NOTE: the cell must contain the entity reference (ampersand, "auml",
    # semicolon), not the literal character. The two-fragment current_cell
    # expected below reflects the entity being reported separately from the
    # text that follows it (presumably via the parser's entity-reference
    # callback -- confirm against HtmlReader).
    reader.feed("<tr><td>&auml;iti")
    assert_equals(reader.current_cell, [u"\xe4", u"iti"])
    reader.feed("</tr>")
    # Once the row is closed, the fragments are stored as one cell value.
    assert_equals(reader.populator.tables["Setting"][0], [u"\xe4iti"])
class TestHtmlReader(unittest.TestCase):
    """Exercise HtmlReader's table state transitions and row parsing.

    Each test runs against a fresh HtmlReader whose populator has been
    replaced with a PopulatorMock.
    """

    def setUp(self):
        html_reader = HtmlReader()
        html_reader.populator = PopulatorMock()
        self.reader = html_reader

    def test_initial_state(self):
        # <table> switches the state to INITIAL; </table> back to IGNORE.
        reader = self.reader
        reader.state = reader.IGNORE
        reader.feed('<table>')
        state_after_open = reader.state
        assert_equals(state_after_open, reader.INITIAL)
        reader.feed('</table>')
        state_after_close = reader.state
        assert_equals(state_after_close, reader.IGNORE)

    def test_start_valid_table(self):
        # A header row naming a valid table starts processing of that table.
        reader = self.reader
        for table_name in VALID_TABLES:
            header_row = ROW_TEMPLATE % (table_name, 'Value 1', 'Value2')
            reader.feed('<table>')
            reader.feed(header_row)
            assert_equals(reader.state, reader.PROCESS)
            assert_equals(reader.populator.current, table_name)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)

    def test_process_invalid_table(self):
        # Tables with unrecognized names are ignored entirely: no current
        # table is set and no rows are stored.
        reader = self.reader
        for table_name in ("Foo", "VaribleTable"):
            header_row = ROW_TEMPLATE % (table_name, 'Value 1', 'Value2')
            ignored_row = ROW_TEMPLATE % ('This', 'row', 'is ignored')
            reader.feed('<table>')
            reader.feed(header_row)
            assert_equals(reader.state, reader.IGNORE)
            assert_none(reader.populator.current)
            reader.feed(ignored_row)
            assert_equals(reader.state, reader.IGNORE)
            stored_tables = reader.populator.tables.values()
            assert_equals(len(stored_tables), 0)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)

    def test_br(self):
        # Every <br> variant inside a cell becomes a newline in the data.
        reader = self.reader
        cells = ('x<br>y', '1<br />2', '<br><br>')
        expected_row = ['x\ny', '1\n2', '\n\n']
        for table_name in VALID_TABLES:
            reader.feed('<table>')
            reader.feed(ROW_TEMPLATE % (table_name, 'Value 1', 'Value2'))
            reader.feed(ROW_TEMPLATE % cells)
            reader.feed('</table>')
            assert_equals(reader.populator.tables[table_name], [expected_row])

    def test_processing(self):
        self._row_processing(ROW_TEMPLATE)

    def test_missing_end_td(self):
        for template in ('<tr><td>%s<td>%s</td><td>%s</td></tr>',
                         '<tr><td>%s<td>%s<td>%s</tr>'):
            self._row_processing(template)

    def test_missing_end_tr(self):
        self._row_processing('<tr><td>%s<td>%s</td><td>%s</td>')

    def test_extra_end_tr(self):
        self._row_processing('<tr><td>%s<td>%s</td><td>%s</td></tr></tr>')

    def test_missing_start_tr(self):
        self._row_processing('<td>%s<td>%s</td><td>%s</td></tr></tr>')

    def _row_processing(self, row_template):
        # Feed a header row plus three data rows rendered with
        # `row_template` into each valid table, then check the reader's
        # state and the rows recorded by the populator.
        reader = self.reader
        for table_name in VALID_TABLES:
            reader.feed('<table>')
            reader.feed(row_template % (table_name, 'Value 1', 'Value2'))
            expected_rows = [
                ['Just', 'some', 'data'],
                ['here', '', 'for'],
                ['', 'these', 'rows'],
            ]
            for cells in expected_rows:
                reader.feed(row_template % tuple(cells))
            assert_equals(reader.state, reader.PROCESS)
            reader.feed('</table>')
            assert_equals(reader.state, reader.IGNORE)
            assert_equals(reader.populator.tables[table_name], expected_rows)
def test_encoding_is_set_from_xml_preamble(self):
    # The encoding attribute of an XML declaration sets _encoding, whether
    # it is quoted or bare and wherever it appears among the attributes.
    html_reader = HtmlReader()
    cases = (
        ('<?xml version="1.0" encoding="UTF-8"?>', 'UTF-8'),
        ('<?xml encoding=US-ASCII version="1.0"?>', 'US-ASCII'),
    )
    for preamble, charset in cases:
        html_reader.feed(preamble)
        assert_equals(html_reader._encoding, charset)
def test_valid_http_equiv_is_required(self):
    # Meta tags without a recognized http-equiv value must not change the
    # reader's _encoding: it stays 'ISO-8859-1' after each of them.
    expected_encoding = 'ISO-8859-1'
    html_reader = HtmlReader()

    html_reader.feed('<meta content="text/html; charset=utf-8" />')
    encoding_without_http_equiv = html_reader._encoding
    assert_equals(encoding_without_http_equiv, expected_encoding)

    html_reader.feed(
        '<meta http-equiv="Invalid" content="text/html; charset=utf-8" />'
    )
    encoding_with_invalid_http_equiv = html_reader._encoding
    assert_equals(encoding_with_invalid_http_equiv, expected_encoding)
def test_encoding_is_read_from_meta_tag(self):
    # Feeding a Content-Type <meta> tag updates _encoding to the charset
    # given in its content attribute, with the original letter case.
    html_reader = HtmlReader()

    self_closed_tag = (
        '<meta http-equiv="Content-Type" '
        'content="text/html; charset=utf-8" />'
    )
    html_reader.feed(self_closed_tag)
    assert_equals(html_reader._encoding, 'utf-8')

    open_tag = (
        '<meta http-equiv="Content-Type" '
        'content="text/html; charset=UTF-8">'
    )
    html_reader.feed(open_tag)
    assert_equals(html_reader._encoding, 'UTF-8')
class TestHtmlReader(unittest.TestCase):
    """Tests for HtmlReader's table detection and row handling.

    setUp gives every test its own HtmlReader with a PopulatorMock
    installed as the populator.
    """

    def setUp(self):
        html_reader = HtmlReader()
        html_reader.populator = PopulatorMock()
        self.reader = html_reader

    def _start_table(self, table_name, row_template):
        # Open a table and feed its first row, whose first cell is
        # `table_name`, rendered with `row_template`.
        self.reader.feed("<table>")
        self.reader.feed(row_template % (table_name, "Value 1", "Value2"))

    def _end_table(self):
        # Close the current table and check the reader returns to IGNORE.
        self.reader.feed("</table>")
        assert_equals(self.reader.state, self.reader.IGNORE)

    def test_initial_state(self):
        # Starting from IGNORE, <table> moves the reader to INITIAL and
        # </table> moves it back to IGNORE.
        self.reader.state = self.reader.IGNORE
        self.reader.feed("<table>")
        assert_equals(self.reader.state, self.reader.INITIAL)
        self._end_table()

    def test_start_valid_table(self):
        # Each valid table name puts the reader into PROCESS and becomes
        # the populator's current table.
        for table_name in VALID_TABLES:
            self._start_table(table_name, ROW_TEMPLATE)
            assert_equals(self.reader.state, self.reader.PROCESS)
            assert_equals(self.reader.populator.current, table_name)
            self._end_table()

    def test_process_invalid_table(self):
        # Unknown table names keep the reader in IGNORE; nothing is stored.
        invalid_names = ["Foo", "VaribleTable"]
        for table_name in invalid_names:
            self._start_table(table_name, ROW_TEMPLATE)
            assert_equals(self.reader.state, self.reader.IGNORE)
            assert_none(self.reader.populator.current)
            self.reader.feed(ROW_TEMPLATE % ("This", "row", "is ignored"))
            assert_equals(self.reader.state, self.reader.IGNORE)
            assert_equals(len(self.reader.populator.tables.values()), 0)
            self._end_table()

    def test_br(self):
        # <br> and <br /> inside cells are turned into newline characters.
        cells_with_breaks = ("x<br>y", "1<br />2", "<br><br>")
        expected_cells = ["x\ny", "1\n2", "\n\n"]
        for table_name in VALID_TABLES:
            self._start_table(table_name, ROW_TEMPLATE)
            self.reader.feed(ROW_TEMPLATE % cells_with_breaks)
            self.reader.feed("</table>")
            stored_rows = self.reader.populator.tables[table_name]
            assert_equals(stored_rows, [expected_cells])

    def test_processing(self):
        self._row_processing(ROW_TEMPLATE)

    def test_missing_end_td(self):
        self._row_processing("<tr><td>%s<td>%s</td><td>%s</td></tr>")
        self._row_processing("<tr><td>%s<td>%s<td>%s</tr>")

    def test_missing_end_tr(self):
        self._row_processing("<tr><td>%s<td>%s</td><td>%s</td>")

    def test_extra_end_tr(self):
        self._row_processing("<tr><td>%s<td>%s</td><td>%s</td></tr></tr>")

    def test_missing_start_tr(self):
        self._row_processing("<td>%s<td>%s</td><td>%s</td></tr></tr>")

    def _row_processing(self, row_template):
        # For every valid table: feed a header row and three data rows
        # rendered with `row_template`, then verify the reader's state
        # before and after </table> and the rows the populator received.
        for table_name in VALID_TABLES:
            self._start_table(table_name, row_template)
            row_data = [
                ["Just", "some", "data"],
                ["here", "", "for"],
                ["", "these", "rows"],
            ]
            for row in row_data:
                self.reader.feed(row_template % tuple(row))
            assert_equals(self.reader.state, self.reader.PROCESS)
            self._end_table()
            assert_equals(self.reader.populator.tables[table_name], row_data)