Python CSVTableSet.CSVTableSet Beispiele, messytables.CSVTableSet.CSVTableSet Python Beispiele

Beispiel #1

0

Datei anzeigen

def any_tableset(fileobj, mimetype=None, extension=None):
    """Reads any supported table type according to a specified
    MIME type or file extension or automatically detecting the
    type.

    Best matching TableSet loaded with the fileobject is returned.
    Matching is done by looking at the type (e.g mimetype='text/csv')
    or file extension (e.g. extension='tsv'), or otherwise autodetecting
    the file format by using the magic library which looks at the first few
    bytes of the file BUT is often wrong. Consult the source for recognized
    MIME types and file extensions.

    On error it raises messytables.ReadError
    """
    # Auto-detect if the caller has offered no clue. (Because the
    # auto-detection routine is pretty poor.)
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or (extension and extension.lower() in ('zip',))):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if (mimetype in ('text/csv', 'text/comma-separated-values') or
            (extension and extension.lower() in ('csv',))):
        return CSVTableSet(fileobj)  # guess delimiter
    if (mimetype in ('text/tsv', 'text/tab-separated-values') or
            (extension and extension.lower() in ('tsv',))):
        return CSVTableSet(fileobj, delimiter='\t')
    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or (extension and extension.lower() in
                                           ('xls',)):
        return XLSTableSet(fileobj)
    if (mimetype in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or (extension and extension.lower() in ('xlsx',))):
        return XLSXTableSet(fileobj)
    if (mimetype in ('text/html',)
            or (extension and extension.lower() in ('htm', 'html',))):
        return HTMLTableSet(fileobj)
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError('''Could not determine MIME type and
         unrecognized extension: {extension}'''.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")

Beispiel #2

0

Datei anzeigen

 def test_strict_type_guessing_with_large_file(self):
     fh = horror_fobj('211.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 96)
     assert_equal(guessed_types, [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()])

Beispiel #3

0

Datei anzeigen

  def get_column_types(data: io.BytesIO) -> Tuple[List[str], List[str]]:
    """derive the column types

    Using messytables' CSV API, attempt to derive the column types based on a best-guess
    of a sample of the rows.

    This is still a WIP due to the parlous state of the DV360/CM CSV data formats in
    general
    
    Arguments:
        data {io.BytesIO} -- sample of the CSV file

    Returns:
        (List[str], List[str]) -- tuple of list of header names and list of column types
    """
    table_set = CSVTableSet(data)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logging.info(headers)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    logging.info(types)

    return (headers, types)

Beispiel #4

0

Datei anzeigen

 def test_read_encoded_csv(self):
     fh = horror_fobj('utf-16le_encoded.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     assert_equal(328, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(row[1].value, 'Organisation_name')

Beispiel #5

0

Datei anzeigen

 def test_file_with_few_strings_among_integers(self):
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     print guessed_types
     assert_equal(guessed_types, [
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType()
     ])

Beispiel #6

0

Datei anzeigen

    def test_null_process(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(null_processor(['null']))
        data = list(row_set)

        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, False, False])

        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            IntegerType(),
            IntegerType(),
            IntegerType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))

        # after applying the types, '' should become None for int columns
        data = list(row_set)
        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, True, True])

Beispiel #7

0

Datei anzeigen

    def test_apply_null_values(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            StringType(),
            IntegerType(),
            StringType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat null as non empty text and 0 as non empty integer
        assert [x.empty for x in data[0]] == [False, False, False, False]
        assert [x.empty for x in data[1]] == [False, False, False, False]
        assert [x.empty for x in data[2]] == [False, False, True, True]
        assert [x.empty for x in data[3]] == [False, False, False, False]
        assert [x.empty for x in data[4]] == [False, False, False, True]
        assert [x.empty for x in data[5]] == [False, False, False, True]

        # we expect None for Integers and "" for empty strings in CSV
        assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]

Beispiel #8

0

Datei anzeigen

 def test_overriding_sniffed(self):
     # semicolon separated values
     fh = horror_fobj('simple.csv')
     table_set = CSVTableSet(fh, delimiter=";")
     row_set = table_set.tables[0]
     assert_equal(7, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(len(row), 1)

Beispiel #9

0

Datei anzeigen

Datei: test_read.py Projekt: scraperdragon/messytables

 def test_rowset_as_schema(self):
     from StringIO import StringIO as sio
     ts = CSVTableSet(sio('''name,dob\nmk,2012-01-02\n'''))
     rs = ts.tables[0]
     jts = rowset_as_jts(rs).as_dict()
     assert_equal(jts['fields'], [
         {'type': 'string', 'id': u'name', 'label': u'name'},
         {'type': 'date', 'id': u'dob', 'label': u'dob'}])

Beispiel #10

0

Datei anzeigen

    def test_guess_headers(self):
        fh = horror_fobj('weird_head_padding.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        data = list(row_set)
        assert 'Frauenheilkunde' in data[9][0].value, data[9][0].value

        fh = horror_fobj('weird_head_padding.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(headers_processor(['foo', 'bar']))
        data = list(row_set)
        assert 'foo' in data[12][0].column, data[12][0]
        assert 'Chirurgie' in data[12][0].value, data[12][0].value

Beispiel #11

0

Datei anzeigen

 def test_non_strict_guessing_handles_padding(self):
     csv_file = StringIO.StringIO('''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=False)
     assert_equal(len(guessed_types), 3)
     assert_equal(guessed_types, [IntegerType(), StringType(), DecimalType()])

Beispiel #12

0

Datei anzeigen

 def test_read_head_offset_csv(self):
     fh = horror_fobj('simple.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert_equal(offset, 0)
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     assert_equal(int(data[0][1].value), 1)
     data = list(row_set)
     assert_equal(int(data[0][1].value), 1)

Beispiel #13

0

Datei anzeigen

Datei: test_guessing.py Projekt: tusharbihani/messytables

 def test_strict_guessing_handles_padding(self):
     csv_file = io.BytesIO(b'''
         1,   , 2
         2,   , 1.1
         foo, , 1500''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(len(guessed_types), 3)
     assert_equal(guessed_types,
                  [StringType(), StringType(),
                   DecimalType()])

Beispiel #14

0

Datei anzeigen

 def test_read_simple_tsv(self):
     fh = horror_fobj('example.tsv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     assert_equal(141, len(list(row_set)))
     row = list(row_set.sample)[0]
     assert_equal(row[0].value, 'hour')
     assert_equal(row[1].value, 'expr1_0_imp')
     for row in list(row_set):
         assert_equal(17, len(row))
         assert_equal(row[0].type, StringType())

Beispiel #15

0

Datei anzeigen

Datei: test_stream.py Projekt: tusharbihani/messytables

 def test_http_csv_encoding(self):
     url = 'http://www.messytables.org/static/utf-16le_encoded.csv'
     httpretty.register_uri(httpretty.GET,
                            url,
                            body=horror_fobj('utf-16le_encoded.csv').read(),
                            content_type="application/csv")
     fh = urlopen(url)
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(328, len(data))

Beispiel #16

0

Datei anzeigen

 def test_read_encoded_characters_csv(self):
     fh = horror_fobj('characters.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     row_set.register_processor(headers_processor(headers))
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set)
     assert_equal(382, len(data))
     assert_equal(data[0][2].value, u'雲嘉南濱海國家風景區管理處')
     assert_equal(data[-1][2].value, u'沈光文紀念廳')

Beispiel #17

0

Datei anzeigen

    def test_read_simple_csv(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(7, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'temperature')

        for row in list(row_set):
            assert_equal(3, len(row))
            assert_equal(row[0].type, StringType())

Beispiel #18

0

Datei anzeigen

    def connect(self,
                host=None,
                port=None,
                database=None,
                username=None,
                password=None,
                file=None):
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        conn_string = ''
        if self.engine == 'psycopg2':
            if database:
                conn_string += "dbname='%s' " % database
            if username:
                conn_string += "user='******' " % username
            if host:
                conn_string += "host='%s' " % host
            if port:
                conn_string += "port='%s' " % port
            if password:
                conn_string += "password='******' " % password
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host,
                                        username,
                                        password,
                                        database,
                                        port=port,
                                        as_dict=True,
                                        charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)
            #dialect = csv.Sniffer().sniff(f.read(1024))
            #f.seek(0)
            #self.conn = csv.DictReader(f, dialect=dialect)
            #fh = open('messy.csv', 'rb')

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn

Beispiel #19

0

Datei anzeigen

    def test_guessing_uses_first_in_case_of_tie(self):
        csv_file = StringIO.StringIO('''
            2
            1.1
            1500''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(
            rows.sample, types=[DecimalType, IntegerType], strict=False)
        assert_equal(guessed_types, [DecimalType()])

        guessed_types = type_guess(
            rows.sample, types=[IntegerType, DecimalType], strict=False)
        assert_equal(guessed_types, [IntegerType()])

Beispiel #20

0

Datei anzeigen

    def test_read_simple_psv(self):
        # pipe/vertical bar ("|") separated values
        fh = horror_fobj('simple.psv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(29, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'Year')
        assert_equal(row[1].value, 'New dwellings')

        for row in list(row_set):
            assert_equal(6, len(row))
            assert_equal(row[0].type, StringType())

Beispiel #21

0

Datei anzeigen

Datei: test_stream.py Projekt: tusharbihani/messytables

 def test_http_csv_requests(self):
     url = 'http://www.messytables.org/static/long.csv'
     httpretty.register_uri(httpretty.GET,
                            url,
                            body=horror_fobj('long.csv').read(),
                            content_type="application/csv")
     r = requests.get(url, stream=True)
     # no full support for non blocking version yet, use urllib2
     fh = io.BytesIO(r.raw.read())
     table_set = CSVTableSet(fh, encoding='utf-8')
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(4000, len(data))

Beispiel #22

0

Datei anzeigen

    def test_type_guess(self):
        csv_file = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011,  yes,   1
            2,   2012/2/12, 2,   02 October 2011,  true,  1
            2.4, 2012/2/12, 1,   1 May 2011,       no,    0
            foo, bar,       1000, ,                false, 0
            4.3, ,          42,  24 October 2012,,
             ,   2012/2/12, 21,  24 December 2013, true,  1''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample)

        assert_equal(guessed_types, [
            DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
            DateType('%d %B %Y'), BoolType(), BoolType()])

Beispiel #23

0

Datei anzeigen

Datei: import.py Projekt: conryf/csv-to-mysql

def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set, headers, offset, types

Beispiel #24

0

Datei anzeigen

    def test_read_type_guess_simple(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        header_types = [c.type for c in data[0]]
        assert_equal(header_types, [StringType()] * 3)
        row_types = [c.type for c in data[2]]
        assert_equal(expected_types, row_types)

Beispiel #25

0

Datei anzeigen

 def test_read_head_padding_csv(self):
     fh = horror_fobj('weird_head_padding.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert 11 == len(headers), headers
     assert_equal('1985', headers[1].strip())
     row_set.register_processor(headers_processor(headers))
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     for row in row_set:
         assert_equal(11, len(row))
     value = data[1][0].value.strip()
     assert value == u'Gefäßchirurgie', value

Beispiel #26

0

Datei anzeigen

    def test_read_complex_csv(self):
        fh = horror_fobj('complex.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(4, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'another date')
        assert_equal(row[2].value, 'temperature')
        assert_equal(row[3].value, 'place')

        for row in list(row_set):
            assert_equal(4, len(row))
            assert_equal(row[0].type, StringType())

Beispiel #27

0

Datei anzeigen

 def test_type_guess_strict(self):
     import locale
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     csv_file = StringIO.StringIO('''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(guessed_types, [
         StringType(), StringType(),
         DecimalType(), IntegerType(), DateType('%d %B %Y'),
         DecimalType()])

Beispiel #28

0

Datei anzeigen

Datei: helpers.py Projekt: trickvi/spendb

def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows

Beispiel #29

0

Datei anzeigen

Datei: tap_s3_csv.py Projekt: stish834/pipelinewise

    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the csv file and tries to guess the the type of each column using messytables library.
        The type can be 'Integer', 'Decimal', 'String' or 'Bool'
        :param csv_file_path: path to the csv file with content in it
        :return: a Zip object where each tuple has two elements: the first is the column name and the second is the type
        """
        with gzip.open(csv_file_path, 'rb') as f:
            table_set = CSVTableSet(f)

            row_set = table_set.tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))

            row_set.register_processor(offset_processor(offset + 1))

            types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)))
            return zip(headers, types)

Beispiel #30

0

Datei anzeigen

Datei: csv2athena.py Projekt: yuokada/heroku-csv2athena_schema

def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d