コード例 #1
0
def test_htmlsplitter():
    """
    Check that HTMLSplitter turns SoupString rows into lists of cell text.

    One header row (``<th>`` cells) and one data row (``<td>`` cells) are
    passed through the splitter and the values it yields are compared with
    the expected cell contents. The test then checks that a plain ``str``
    row is rejected with TypeError and that an empty list of rows raises
    InconsistentTableError.
    """

    splitter = html.HTMLSplitter()

    header_markup = '<table><tr><th>Col 1</th><th>Col 2</th></tr></table>'
    data_markup = '<table><tr><td>Data 1</td><td>Data 2</td></tr></table>'

    # Each row handed to the splitter wraps the parsed <tr> element
    header_row = html.SoupString(
        BeautifulSoup(header_markup, 'html.parser').tr)
    data_row = html.SoupString(
        BeautifulSoup(data_markup, 'html.parser').tr)
    lines = [header_row, data_row]

    split_rows = list(splitter(lines))
    assert split_rows == [['Col 1', 'Col 2'], ['Data 1', 'Data 2']]

    # A raw string mixed in with the SoupString rows must be rejected
    lines.append('<tr><td>Data 3</td><td>Data 4</td></tr>')
    with pytest.raises(TypeError):
        list(splitter(lines))

    # Splitting nothing at all is reported as an inconsistent table
    with pytest.raises(core.InconsistentTableError):
        list(splitter([]))
コード例 #2
0
def test_htmldata():
    """
    Check HTMLData.start_line and HTMLData.end_line on sample input.

    The rows come from ``data/html.html``, read through
    HTMLInputter.get_lines three times: with no ``table_id`` set, with the
    string id ``'second'``, and with the integer ``3``. For each selection
    the first and last data rows located by start_line/end_line are compared
    with the expected markup. The test finishes with input that holds no
    table data and input that contains a non-SoupString entry.
    """

    with open('data/html.html') as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}
    data = html.HTMLData()

    # (table_id, first data row, last data row); None leaves table_id unset
    cases = [
        (None,
         '<tr><td>1</td><td>a</td><td>1.05</td></tr>',
         '<tr><td>3</td><td>c</td><td>-1.25</td></tr>'),
        ('second',
         '<tr><td>4</td><td>d</td><td>10.5</td></tr>',
         '<tr><td>6</td><td>f</td><td>-12.5</td></tr>'),
        (3,
         '<tr><td>7</td><td>g</td><td>105.0</td></tr>',
         '<tr><td>9</td><td>i</td><td>-125.0</td></tr>'),
    ]
    for table_id, first_row, last_row in cases:
        if table_id is not None:
            inputter.html['table_id'] = table_id
        lines = inputter.get_lines(table)
        assert str(lines[data.start_line(lines)]) == first_row
        # end_line returns the index of the last data element + 1
        assert str(lines[data.end_line(lines) - 1]) == last_row

    # Rows without any table data: start_line raises, end_line gives None
    div_row = html.SoupString(BeautifulSoup('<div></div>', 'html.parser').div)
    p_row = html.SoupString(BeautifulSoup('<p>Text</p>', 'html.parser').p)
    lines = [div_row, p_row]
    with pytest.raises(core.InconsistentTableError):
        data.start_line(lines)
    assert data.end_line(lines) is None

    # A plain string among the rows makes both methods raise TypeError
    lines.append('<tr><td>Data</td></tr>')
    with pytest.raises(TypeError):
        data.start_line(lines)
    with pytest.raises(TypeError):
        data.end_line(lines)
コード例 #3
0
    def process_lines(self, lines):
        """
        Convert the given input into a list of SoupString rows
        for further processing.

        The lines are joined with newlines and parsed with BeautifulSoup,
        using ``self.html['parser']`` as the backend parser when that key is
        present. The first ``<table>`` accepted by ``html.identify_table`` is
        selected, its attributes are stored in ``self.html['attrs']``, and
        each of its ``<tr>`` elements is wrapped in ``html.SoupString``.

        Parameters
        ----------
        lines : iterable of str
            Lines of the HTML document.

        Returns
        -------
        soup_list : list
            One ``html.SoupString`` per ``<tr>`` element of the selected
            table.

        Raises
        ------
        core.OptionalTableImportError
            If BeautifulSoup (``bs4``) cannot be imported.
        core.InconsistentTableError
            If no ``<table>`` in the input is accepted by
            ``html.identify_table``.
        """

        try:
            from bs4 import BeautifulSoup
        except ImportError:
            raise core.OptionalTableImportError(
                'BeautifulSoup must be '
                'installed to read HTML tables')

        markup = '\n'.join(lines)
        if 'parser' not in self.html:
            soup = BeautifulSoup(markup)
        else:  # use a custom backend parser
            soup = BeautifulSoup(markup, self.html['parser'])
        tables = soup.find_all('table')
        for i, possible_table in enumerate(tables):
            # Tables are numbered from 1 when offered to identify_table
            if html.identify_table(possible_table, self.html, i + 1):
                table = possible_table  # Find the correct table
                break
        else:
            # 'table_id' may be absent from self.html; indexing it directly
            # would surface a KeyError here instead of the intended
            # InconsistentTableError.
            # NOTE(review): reporting an unset table_id as table number 1
            # assumes identify_table falls back to the first table - confirm.
            table_id = self.html.get('table_id', 1)
            if isinstance(table_id, int):
                err_descr = 'number {0}'.format(table_id)
            else:
                err_descr = "id '{0}'".format(table_id)
            raise core.InconsistentTableError(
                'ERROR: HTML table {0} not found'.format(err_descr))

        self.html['attrs'] = table.attrs
        # Get all table rows
        soup_list = [html.SoupString(x) for x in table.find_all('tr')]

        return soup_list
コード例 #4
0
ファイル: test_html.py プロジェクト: kgc85/astropy
def test_soupstring():
    """
    Test to make sure the class SoupString behaves properly.

    A SoupString built from a parsed document must be a ``str`` subclass
    instance, compare equal to the document's markup, and keep a reference
    to the original soup object in its ``soup`` attribute.
    """

    # Name the parser explicitly, as the other tests in this file do:
    # without it BeautifulSoup picks whichever parser is installed and warns
    # about the guess, so the result would depend on the environment.
    soup = BeautifulSoup('<html><head></head><body><p>foo</p></body></html>',
                         'html.parser')
    soup_str = html.SoupString(soup)
    assert isinstance(soup_str, str)
    assert isinstance(soup_str, html.SoupString)
    assert soup_str == '<html><head></head><body><p>foo</p></body></html>'
    assert soup_str.soup is soup
コード例 #5
0
def test_htmlheader_start():
    """
    Check HTMLHeader.start_line on sample input.

    The rows come from ``data/html.html``, read through
    HTMLInputter.get_lines three times: with no ``table_id`` set, with the
    string id ``'second'``, and with the integer ``3``. For each selection
    the row located by start_line is compared with the expected header
    markup. The test finishes with rows that contain no header and with
    input that contains a non-SoupString entry.
    """

    with open('data/html.html') as fd:
        table = fd.read()

    inputter = html.HTMLInputter()
    inputter.html = {}
    header = html.HTMLHeader()

    # (table_id, expected header row); None leaves table_id unset
    cases = [
        (None,
         '<tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>'),
        ('second',
         '<tr><th>Column A</th><th>Column B</th><th>Column C</th></tr>'),
        (3,
         '<tr><th>C1</th><th>C2</th><th>C3</th></tr>'),
    ]
    for table_id, header_row in cases:
        if table_id is not None:
            inputter.html['table_id'] = table_id
        lines = inputter.get_lines(table)
        assert str(lines[header.start_line(lines)]) == header_row

    # With no <th> row among the lines, start_line reports None
    data_only_row = html.SoupString(
        BeautifulSoup('<table><tr><td>Data</td></tr></table>',
                      'html.parser').tr)
    p_row = html.SoupString(BeautifulSoup('<p>Text</p>', 'html.parser').p)
    lines = [data_only_row, p_row]
    assert header.start_line(lines) is None

    # A plain string among the rows makes start_line raise TypeError
    lines.append('<tr><th>Header</th></tr>')
    with pytest.raises(TypeError):
        header.start_line(lines)