def test_htmlsplitter():
    """
    Check that HTMLSplitter accepts SoupString lines and yields the
    text of every header and data cell, one list per row.
    """

    def first_row(markup):
        # Parse the markup and wrap its first <tr> element in a SoupString
        return html.SoupString(BeautifulSoup(markup, 'html.parser').tr)

    split = html.HTMLSplitter()
    soup_rows = [
        first_row('<table><tr><th>Col 1</th><th>Col 2</th></tr></table>'),
        first_row('<table><tr><td>Data 1</td><td>Data 2</td></tr></table>'),
    ]

    cells = list(split(soup_rows))
    assert cells == [['Col 1', 'Col 2'], ['Data 1', 'Data 2']]

    # A plain string mixed in with the SoupStrings must raise a TypeError
    soup_rows.append('<tr><td>Data 3</td><td>Data 4</td></tr>')
    with pytest.raises(TypeError):
        list(split(soup_rows))

    # An empty list of lines must raise an error as well
    with pytest.raises(core.InconsistentTableError):
        list(split([]))
def test_htmldata():
    """
    Check that the start_line and end_line methods of HTMLData locate
    the first and last rows of table data.
    Reads data/html.html for sample input.
    """
    with open('data/html.html') as handle:
        document = handle.read()

    reader = html.HTMLInputter()
    reader.html = {}
    data = html.HTMLData()

    # (table_id, first data row, last data row); a table_id of None means
    # no 'table_id' entry is set on the inputter for that case.
    cases = [
        (None,
         '<tr><td>1</td><td>a</td><td>1.05</td></tr>',
         '<tr><td>3</td><td>c</td><td>-1.25</td></tr>'),
        ('second',
         '<tr><td>4</td><td>d</td><td>10.5</td></tr>',
         '<tr><td>6</td><td>f</td><td>-12.5</td></tr>'),
        (3,
         '<tr><td>7</td><td>g</td><td>105.0</td></tr>',
         '<tr><td>9</td><td>i</td><td>-125.0</td></tr>'),
    ]
    for table_id, first_row, last_row in cases:
        if table_id is not None:
            reader.html['table_id'] = table_id
        rows = reader.get_lines(document)
        assert str(rows[data.start_line(rows)]) == first_row
        # end_line gives the index of the last data row plus one
        assert str(rows[data.end_line(rows) - 1]) == last_row

    # Without any table data, start_line raises and end_line returns None
    rows = [
        html.SoupString(BeautifulSoup('<div></div>', 'html.parser').div),
        html.SoupString(BeautifulSoup('<p>Text</p>', 'html.parser').p),
    ]
    with pytest.raises(core.InconsistentTableError):
        data.start_line(rows)
    assert data.end_line(rows) is None

    # Both methods must reject a line that is not a SoupString
    rows.append('<tr><td>Data</td></tr>')
    for locate in (data.start_line, data.end_line):
        with pytest.raises(TypeError):
            locate(rows)
def process_lines(self, lines):
    """
    Convert the given input into a list of SoupString rows
    for further processing.

    Parameters
    ----------
    lines : iterable of str
        Lines of HTML; they are joined with newlines before parsing.

    Returns
    -------
    list of html.SoupString
        One entry per ``<tr>`` element found in the selected table.

    Raises
    ------
    core.OptionalTableImportError
        If BeautifulSoup (``bs4``) cannot be imported.
    core.InconsistentTableError
        If none of the ``<table>`` elements is accepted by
        ``html.identify_table``.

    Notes
    -----
    As a side effect, the attributes of the selected table are stored
    in ``self.html['attrs']``.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        raise core.OptionalTableImportError('BeautifulSoup must be '
                                            'installed to read HTML tables')

    markup = '\n'.join(lines)
    if 'parser' not in self.html:
        # No parser configured: let BeautifulSoup pick its own backend
        soup = BeautifulSoup(markup)
    else:
        # use a custom backend parser
        soup = BeautifulSoup(markup, self.html['parser'])

    tables = soup.find_all('table')
    # Candidate tables are numbered from 1 when offered to identify_table
    for i, possible_table in enumerate(tables):
        if html.identify_table(possible_table, self.html, i + 1):
            table = possible_table  # Find the correct table
            break
    else:
        # 'table_id' may be absent (self.html can be an empty dict), so
        # look it up defensively instead of failing with a KeyError while
        # building the real error message.
        # NOTE(review): assumes a missing 'table_id' selects table
        # number 1 -- confirm against html.identify_table.
        table_id = self.html.get('table_id', 1)
        if isinstance(table_id, int):
            err_descr = 'number {0}'.format(table_id)
        else:
            err_descr = "id '{0}'".format(table_id)
        raise core.InconsistentTableError(
            'ERROR: HTML table {0} not found'.format(err_descr))

    # Remember the table's attributes on the shared html dict
    self.html['attrs'] = table.attrs

    # Get all table rows
    soup_list = [html.SoupString(x) for x in table.find_all('tr')]
    return soup_list
def test_soupstring():
    """
    Test to make sure the class SoupString behaves properly.
    """
    markup = '<html><head></head><body><p>foo</p></body></html>'
    # Name the parser explicitly: without one, BeautifulSoup picks whichever
    # parser happens to be installed (and warns about it), which makes the
    # string comparison below environment-dependent.
    soup = BeautifulSoup(markup, 'html.parser')
    soup_str = html.SoupString(soup)

    # SoupString must behave as a str while keeping hold of the soup object
    assert isinstance(soup_str, str)
    assert isinstance(soup_str, html.SoupString)
    assert soup_str == markup
    assert soup_str.soup is soup
def test_htmlheader_start():
    """
    Check that the start_line method of HTMLHeader locates the first
    line of header data.
    Reads data/html.html for sample input.
    """
    with open('data/html.html') as handle:
        document = handle.read()

    reader = html.HTMLInputter()
    reader.html = {}
    header = html.HTMLHeader()

    # (table_id, expected header row); a table_id of None means no
    # 'table_id' entry is set on the inputter for that case.
    cases = [
        (None,
         '<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>'),
        ('second',
         '<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>'),
        (3,
         '<tr><th>C1</th><th>C2</th><th>C3</th></tr>'),
    ]
    for table_id, header_row in cases:
        if table_id is not None:
            reader.html['table_id'] = table_id
        rows = reader.get_lines(document)
        assert str(rows[header.start_line(rows)]) == header_row

    # With no valid header row present, start_line returns None
    rows = [
        html.SoupString(
            BeautifulSoup('<table><tr><td>Data</td></tr></table>',
                          'html.parser').tr),
        html.SoupString(BeautifulSoup('<p>Text</p>', 'html.parser').p),
    ]
    assert header.start_line(rows) is None

    # A line that is not a SoupString must raise a TypeError
    rows.append('<tr><th>Header</th></tr>')
    with pytest.raises(TypeError):
        header.start_line(rows)