Esempio n. 1
0
 def test_invisible_text_html(self):
     # The first table parsed from invisible_text.html must iterate to
     # exactly four rows.
     first_table = HTMLTableSet(horror_fobj('invisible_text.html')).tables[0]
     all_rows = list(first_table)
     assert_equal(4, len(all_rows))
     # The sixth cell of the second sampled row holds a date string once
     # surrounding whitespace is stripped.
     sampled_rows = list(first_table.sample)
     date_cell = sampled_rows[1][5]
     assert_equal(date_cell.value.strip(), '1 July 1879')
Esempio n. 2
0
 def test_read_real_html(self):
     # The first table parsed from html.html must iterate to 200 rows.
     first_table = HTMLTableSet(horror_fobj('html.html')).tables[0]
     assert_equal(200, len(list(first_table)))
     # Spot-check selected cells of the first sampled row, comparing the
     # whitespace-stripped cell value with the expected text.
     header_row = list(first_table.sample)[0]
     expected_cells = ((0, 'HDI Rank'), (1, 'Country'), (4, '2010'))
     for column, text in expected_cells:
         assert_equal(header_row[column].value.strip(), text)
Esempio n. 3
0
def any_tableset(fileobj, mimetype=None, extension=None):
    """Return the TableSet that best matches a file's declared or detected type.

    The type is chosen from ``mimetype`` (e.g. ``'text/csv'``) or from
    ``extension`` (e.g. ``'tsv'``, matched case-insensitively). Only when
    the caller supplies neither is the type auto-detected, by handing the
    first 1024 bytes of the stream to the ``magic`` library -- a guess
    that is often wrong. Consult the source for the recognized MIME types
    and file extensions.

    :param fileobj: file-like object holding the table data.
    :param mimetype: MIME type of the data, or None if unknown.
    :param extension: file extension without the leading dot, or None.
    :returns: a ZIPTableSet, CSVTableSet, XLSTableSet, XLSXTableSet or
        HTMLTableSet constructed from ``fileobj``.
    :raises ValueError: if neither the MIME type nor the extension is
        recognized. Errors raised by the TableSet constructors themselves
        are not caught here.
    """
    # Auto-detect only if the caller has offered no clue at all, because
    # the auto-detection routine is pretty poor.
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek at the start of the stream, make sure we
        # can seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    # Normalise the extension once; an empty or missing extension never
    # matches any of the known extensions below.
    ext = extension.lower() if extension else None

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or ext == 'zip'):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or ext == 'csv':
        return CSVTableSet(fileobj)  # guess delimiter
    if mimetype in ('text/tsv', 'text/tab-separated-values') or ext == 'tsv':
        return CSVTableSet(fileobj, delimiter='\t')
    if (mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                     'application/xls')
            or ext == 'xls'):
        return XLSTableSet(fileobj)
    if (mimetype in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or ext == 'xlsx'):
        return XLSXTableSet(fileobj)
    if mimetype in ('text/html',) or ext in ('htm', 'html'):
        return HTMLTableSet(fileobj)

    # Nothing matched: report the most specific clue we were given.
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        # Built from adjacent literals so the message stays on one line
        # and carries no source indentation.
        raise ValueError(
            "Could not determine MIME type and "
            "unrecognized extension: {extension}".format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
Esempio n. 4
0
    def test_that_inner_table_contains_data(self):
        # Index every table parsed from complex.html by its name.
        source = horror_fobj('complex.html')
        by_name = {tbl.name: tbl for tbl in HTMLTableSet(source).tables}

        # Flatten the table named 'Table 1 of 2' into a list of cell
        # values, row by row, and compare with the expected sequence.
        inner = by_name['Table 1 of 2']
        flattened = [cell.value for row in inner for cell in row]
        assert_equal(['head', 'body', 'foot'], flattened)
Esempio n. 5
0
    def test_that_outer_table_contains_nothing(self):
        # Index every table parsed from complex.html by its name.
        source = horror_fobj('complex.html')
        by_name = {tbl.name: tbl for tbl in HTMLTableSet(source).tables}

        # The table named 'Table 2 of 2' should contain no meaningful
        # data: a single row holding a single cell.
        rows = list(by_name['Table 2 of 2'])
        assert_equal(len(rows), 1)
        only_row = rows[0]
        assert_equal(len(only_row), 1)
        # Compare that cell's text with spaces and newlines removed.
        raw_text = only_row[0].value
        squashed = raw_text.replace(" ", "").replace("\n", "")
        assert_equal(squashed, "headfootbody")
Esempio n. 6
0
    def test_read_span_html(self):
        first_table = HTMLTableSet(horror_fobj('rowcolspan.html')).tables[0]

        # Map each (column, row) coordinate to the value of the cell
        # found there while iterating the table.
        grid = {
            (x, y): cell.value
            for y, row in enumerate(first_table)
            for x, cell in enumerate(row)
        }

        # Expected values at selected (column, row) coordinates.
        expected_at = {
            (0, 0): '05',
            (0, 2): '25',
            (0, 3): '',
            (1, 3): '36',
            (1, 6): '66',
            (4, 7): '79',
            (4, 8): '89',
        }

        for coord, expected in expected_at.items():
            assert_equal(grid[coord], expected)
Esempio n. 7
0
 def test_html_table_name(self):
     # The first three tables parsed from html.html are expected to be
     # named by their position out of three.
     table_set = HTMLTableSet(horror_fobj('html.html'))
     expected_names = ('Table 1 of 3', 'Table 2 of 3', 'Table 3 of 3')
     for index, expected in enumerate(expected_names):
         assert_equal(expected, table_set.tables[index].name)