Beispiel #1
0
def any_tableset(fileobj, mimetype=None, extension=None):
    """Reads any supported table type according to a specified
    MIME type or file extension or automatically detecting the
    type.

    Best matching TableSet loaded with the fileobject is returned.
    Matching is done by looking at the type (e.g mimetype='text/csv')
    or file extension (e.g. extension='tsv'), or otherwise autodetecting
    the file format by using the magic library which looks at the first few
    bytes of the file BUT is often wrong. Consult the source for recognized
    MIME types and file extensions.

    On error it raises messytables.ReadError
    """
    # Auto-detect if the caller has offered no clue. (Because the
    # auto-detection routine is pretty poor.)
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or (extension and extension.lower() in ('zip',))):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if (mimetype in ('text/csv', 'text/comma-separated-values') or
            (extension and extension.lower() in ('csv',))):
        return CSVTableSet(fileobj)  # guess delimiter
    if (mimetype in ('text/tsv', 'text/tab-separated-values') or
            (extension and extension.lower() in ('tsv',))):
        return CSVTableSet(fileobj, delimiter='\t')
    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or (extension and extension.lower() in
                                           ('xls',)):
        return XLSTableSet(fileobj)
    if (mimetype in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or (extension and extension.lower() in ('xlsx',))):
        return XLSXTableSet(fileobj)
    if (mimetype in ('text/html',)
            or (extension and extension.lower() in ('htm', 'html',))):
        return HTMLTableSet(fileobj)
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError('''Could not determine MIME type and
         unrecognized extension: {extension}'''.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
Beispiel #2
0
 def test_strict_type_guessing_with_large_file(self):
     fh = horror_fobj('211.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 96)
     assert_equal(guessed_types, [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()])
Beispiel #3
0
  def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on a best-guess
    of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data formats in
    general
    
    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        (List[str], List[str]) -- tuple of list of header names and list of column types
    """
    table_set = CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logging.info(headers)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    logging.info(types)

    return (headers, types)
Beispiel #4
0
 def test_read_encoded_csv(self):
     fh = horror_fobj('utf-16le_encoded.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     assert_equal(328, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(row[1].value, 'Organisation_name')
Beispiel #5
0
 def test_file_with_few_strings_among_integers(self):
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     print guessed_types
     assert_equal(guessed_types, [
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType()
     ])
Beispiel #6
0
    def test_null_process(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(null_processor(['null']))
        data = list(row_set)

        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, False, False])

        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            IntegerType(),
            IntegerType(),
            IntegerType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))

        # after applying the types, '' should become None for int columns
        data = list(row_set)
        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, True, True])
Beispiel #7
0
    def test_apply_null_values(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            StringType(),
            IntegerType(),
            StringType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat null as non empty text and 0 as non empty integer
        assert [x.empty for x in data[0]] == [False, False, False, False]
        assert [x.empty for x in data[1]] == [False, False, False, False]
        assert [x.empty for x in data[2]] == [False, False, True, True]
        assert [x.empty for x in data[3]] == [False, False, False, False]
        assert [x.empty for x in data[4]] == [False, False, False, True]
        assert [x.empty for x in data[5]] == [False, False, False, True]

        # we expect None for Integers and "" for empty strings in CSV
        assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
Beispiel #8
0
 def test_overriding_sniffed(self):
     # semicolon separated values
     fh = horror_fobj('simple.csv')
     table_set = CSVTableSet(fh, delimiter=";")
     row_set = table_set.tables[0]
     assert_equal(7, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(len(row), 1)
Beispiel #9
0
 def test_rowset_as_schema(self):
     from StringIO import StringIO as sio
     ts = CSVTableSet(sio('''name,dob\nmk,2012-01-02\n'''))
     rs = ts.tables[0]
     jts = rowset_as_jts(rs).as_dict()
     assert_equal(jts['fields'], [
         {'type': 'string', 'id': u'name', 'label': u'name'},
         {'type': 'date', 'id': u'dob', 'label': u'dob'}])
Beispiel #10
0
    def test_guess_headers(self):
        fh = horror_fobj('weird_head_padding.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        data = list(row_set)
        assert 'Frauenheilkunde' in data[9][0].value, data[9][0].value

        fh = horror_fobj('weird_head_padding.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(headers_processor(['foo', 'bar']))
        data = list(row_set)
        assert 'foo' in data[12][0].column, data[12][0]
        assert 'Chirurgie' in data[12][0].value, data[12][0].value
Beispiel #11
0
 def test_non_strict_guessing_handles_padding(self):
     csv_file = StringIO.StringIO('''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=False)
     assert_equal(len(guessed_types), 3)
     assert_equal(guessed_types, [IntegerType(), StringType(), DecimalType()])
Beispiel #12
0
 def test_read_head_offset_csv(self):
     fh = horror_fobj('simple.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert_equal(offset, 0)
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     assert_equal(int(data[0][1].value), 1)
     data = list(row_set)
     assert_equal(int(data[0][1].value), 1)
 def test_strict_guessing_handles_padding(self):
     csv_file = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(len(guessed_types), 3)
     assert_equal(guessed_types,
                  [StringType(), StringType(),
                   DecimalType()])
Beispiel #14
0
 def test_read_simple_tsv(self):
     fh = horror_fobj('example.tsv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     assert_equal(141, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(row[0].value, 'hour')
     assert_equal(row[1].value, 'expr1_0_imp')
     for row in list(row_set):
         assert_equal(17, len(row))
         assert_equal(row[0].type, StringType())
Beispiel #15
0
 def test_http_csv_encoding(self):
     url = 'http://www.messytables.org/static/utf-16le_encoded.csv'
     httpretty.register_uri(httpretty.GET,
                            url,
                            body=horror_fobj('utf-16le_encoded.csv').read(),
                            content_type="application/csv")
     fh = urlopen(url)
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(328, len(data))
Beispiel #16
0
 def test_read_encoded_characters_csv(self):
     fh = horror_fobj('characters.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     row_set.register_processor(headers_processor(headers))
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set)
     assert_equal(382, len(data))
     assert_equal(data[0][2].value, u'雲嘉南濱海國家風景區管理處')
     assert_equal(data[-1][2].value, u'沈光文紀念廳')
Beispiel #17
0
    def test_read_simple_csv(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(7, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'temperature')

        for row in list(row_set):
            assert_equal(3, len(row))
            assert_equal(row[0].type, StringType())
Beispiel #18
0
    def connect(self,
                host=None,
                port=None,
                database=None,
                username=None,
                password=None,
                file=None):
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        conn_string = ''
        if self.engine == 'psycopg2':
            if database:
                conn_string += "dbname='%s' " % database
            if username:
                conn_string += "user='******' " % username
            if host:
                conn_string += "host='%s' " % host
            if port:
                conn_string += "port='%s' " % port
            if password:
                conn_string += "password='******' " % password
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host,
                                        username,
                                        password,
                                        database,
                                        port=port,
                                        as_dict=True,
                                        charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)
            #dialect = csv.Sniffer().sniff(f.read(1024))
            #f.seek(0)
            #self.conn = csv.DictReader(f, dialect=dialect)
            #fh = open('messy.csv', 'rb')

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn
Beispiel #19
0
    def test_guessing_uses_first_in_case_of_tie(self):
        csv_file = StringIO.StringIO('''
            2
            1.1
            1500''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(
            rows.sample, types=[DecimalType, IntegerType], strict=False)
        assert_equal(guessed_types, [DecimalType()])

        guessed_types = type_guess(
            rows.sample, types=[IntegerType, DecimalType], strict=False)
        assert_equal(guessed_types, [IntegerType()])
Beispiel #20
0
    def test_read_simple_psv(self):
        # pipe/vertical bar ("|") separated values
        fh = horror_fobj('simple.psv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(29, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'Year')
        assert_equal(row[1].value, 'New dwellings')

        for row in list(row_set):
            assert_equal(6, len(row))
            assert_equal(row[0].type, StringType())
Beispiel #21
0
 def test_http_csv_requests(self):
     url = 'http://www.messytables.org/static/long.csv'
     httpretty.register_uri(httpretty.GET,
                            url,
                            body=horror_fobj('long.csv').read(),
                            content_type="application/csv")
     r = requests.get(url, stream=True)
     # no full support for non blocking version yet, use urllib2
     fh = io.BytesIO(r.raw.read())
     table_set = CSVTableSet(fh, encoding='utf-8')
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(4000, len(data))
Beispiel #22
0
    def test_type_guess(self):
        csv_file = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011,  yes,   1
            2,   2012/2/12, 2,   02 October 2011,  true,  1
            2.4, 2012/2/12, 1,   1 May 2011,       no,    0
            foo, bar,       1000, ,                false, 0
            4.3, ,          42,  24 October 2012,,
             ,   2012/2/12, 21,  24 December 2013, true,  1''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample)

        assert_equal(guessed_types, [
            DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
            DateType('%d %B %Y'), BoolType(), BoolType()])
Beispiel #23
0
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types
Beispiel #24
0
    def test_read_type_guess_simple(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        header_types = [c.type for c in data[0]]
        assert_equal(header_types, [StringType()] * 3)
        row_types = [c.type for c in data[2]]
        assert_equal(expected_types, row_types)
Beispiel #25
0
 def test_read_head_padding_csv(self):
     fh = horror_fobj('weird_head_padding.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert 11 == len(headers), headers
     assert_equal('1985', headers[1].strip())
     row_set.register_processor(headers_processor(headers))
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     for row in row_set:
         assert_equal(11, len(row))
     value = data[1][0].value.strip()
     assert value == u'Gefäßchirurgie', value
Beispiel #26
0
    def test_read_complex_csv(self):
        fh = horror_fobj('complex.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(4, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'another date')
        assert_equal(row[2].value, 'temperature')
        assert_equal(row[3].value, 'place')

        for row in list(row_set):
            assert_equal(4, len(row))
            assert_equal(row[0].type, StringType())
Beispiel #27
0
 def test_type_guess_strict(self):
     import locale
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     csv_file = StringIO.StringIO('''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(guessed_types, [
         StringType(), StringType(),
         DecimalType(), IntegerType(), DateType('%d %B %Y'),
         DecimalType()])
Beispiel #28
0
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
Beispiel #29
0
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the csv file and tries to guess the the type of each column using messytables library.
        The type can be 'Integer', 'Decimal', 'String' or 'Bool'
        :param csv_file_path: path to the csv file with content in it
        :return: a Zip object where each tuple has two elements: the first is the column name and the second is the type
        """
        with gzip.open(csv_file_path, 'rb') as f:
            table_set = CSVTableSet(f)

            row_set = table_set.tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))

            row_set.register_processor(offset_processor(offset + 1))

            types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)))
            return zip(headers, types)
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d