Example no. 1
 def test_rowset_as_schema(self):
     from io import BytesIO as sio
     ts = CSVTableSet(sio(b'''name,dob\nmk,2012-01-02\n'''))
     rs = ts.tables[0]
     jts = rowset_as_jts(rs).as_dict()
     assert_equal(jts['fields'], [{
         'type': 'string',
         'id': 'name',
         'label': 'name'
     }, {
         'type': 'date',
         'id': 'dob',
         'label': 'dob'
     }])
Example no. 2
 def test_strict_type_guessing_with_large_file(self):
     fh = horror_fobj('211.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 96)
     assert_equal(guessed_types, [
         IntegerType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), IntegerType(), StringType(), DecimalType(),
         DecimalType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         IntegerType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), DateUtilType(),
         DateUtilType(), DateUtilType(), DateUtilType(), StringType(),
         StringType(), StringType()])
Example no. 3
    def convert(self):

        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id'] for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # build a fresh dict per row; reusing a single dict would leave
            # result holding many references to the same (last) row
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            result.append(data_row)
        return fields, result
Example no. 4
 def test_file_with_few_strings_among_integers(self):
     fh = horror_fobj('mixedGLB.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 19)
     print(guessed_types)
     assert_equal(guessed_types, [
         IntegerType(), IntegerType(),
         IntegerType(), IntegerType(), IntegerType(), IntegerType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), StringType(), StringType(),
         StringType(), StringType(), IntegerType(), StringType(),
         StringType()])
Example no. 5
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if mimetype in ('application/x-zip-compressed', 'application/zip') \
                or (extension and extension.lower() in ('zip',)):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if mimetype in ('text/csv', 'text/comma-separated-values') or \
                (extension and extension.lower() in ('csv',)):
            return CSVTableSet.from_fileobj(fileobj)  # guess delimiter
        if mimetype in ('text/tsv', 'text/tab-separated-values') or \
                (extension and extension.lower() in ('tsv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls') or (extension and extension.lower() in \
                    ('xls',)):
            return XLSTableSet.from_fileobj(fileobj)
        if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
                or (extension and extension.lower() in ('xlsx',)):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Example no. 6
    def test_type_guess(self):
        csv_file = StringIO.StringIO('''
            1,   2012/2/12, 2,   02 October 2011
            2,   2012/2/12, 2,   02 October 2011
            2.4, 2012/2/12, 1,   1 May 2011
            foo, bar,       1000,
            4.3, ,          42,  24 October 2012
             ,   2012/2/12, 21,  24 December 2013''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample)

        assert_equal(guessed_types, [
            DecimalType(), DateType('%Y/%m/%d'),
            IntegerType(), DateType('%d %B %Y')])
Example no. 7
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types, strict=False))
    return row_set, headers, offset, types
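A hedged usage sketch for csvParse above; 'input.csv' is a placeholder path, not a file from the original project:

row_set, headers, offset, types = csvParse('input.csv')
print(headers, types)
# rows come back header-keyed and typed thanks to the registered processors
for row in row_set:
    print({cell.column: cell.value for cell in row})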
Example no. 8
 def test_read_head_padding_csv(self):
     fh = horror_fobj('weird_head_padding.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert 11 == len(headers), headers
     assert_equal(u'1985', headers[1].strip())
     row_set.register_processor(headers_processor(headers))
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     for row in row_set:
         assert_equal(11, len(row))
     value = data[1][0].value.strip()
     assert value == u'Gefäßchirurgie', value
Example no. 9
    def test_read_type_guess_simple(self):
        fh = horror_fobj('simple.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample)
        expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        header_types = [c.type for c in data[0]]
        assert_equal(header_types, [StringType()] * 3)
        row_types = [c.type for c in data[2]]
        assert_equal(expected_types, row_types)
Example no. 10
    def test_read_complex_csv(self):
        fh = horror_fobj('complex.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        assert_equal(4, len(list(row_set)))
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'another date')
        assert_equal(row[2].value, 'temperature')
        assert_equal(row[3].value, 'place')

        for row in list(row_set):
            assert_equal(4, len(row))
            assert_equal(row[0].type, StringType())
Example no. 11
    def test_guessing_uses_first_in_case_of_tie(self):
        csv_file = StringIO.StringIO('''
            2
            1.1
            1500''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample,
                                   types=[DecimalType, IntegerType],
                                   strict=False)
        assert_equal(guessed_types, [DecimalType()])

        guessed_types = type_guess(rows.sample,
                                   types=[IntegerType, DecimalType],
                                   strict=False)
        assert_equal(guessed_types, [IntegerType()])
Example no. 12
 def test_type_guess_strict(self):
     import locale
     locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
     csv_file = StringIO.StringIO('''
         1,   2012/2/12, 2,      2,02 October 2011,"100.234354"
         2,   2012/2/12, 1.1,    0,1 May 2011,"100,000,000.12"
         foo, bar,       1500,   0,,"NaN"
         4,   2012/2/12, 42,"-2,000",24 October 2012,"42"
         ,,,,,''')
     rows = CSVTableSet(csv_file).tables[0]
     guessed_types = type_guess(rows.sample, strict=True)
     assert_equal(guessed_types, [
         StringType(), StringType(),
         DecimalType(), IntegerType(), DateType('%d %B %Y'),
         DecimalType()])
Example no. 13
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
Example no. 14
def load_data(config):
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)

    fh.close()
Example no. 15
    def test_type_guess(self):
        csv_file = io.BytesIO(b'''
            1,   2012/2/12, 2,   02 October 2011,  yes,   1
            2,   2012/2/12, 2,   02 October 2011,  true,  1
            2.4, 2012/2/12, 1,   1 May 2011,       no,    0
            foo, bar,       1000, ,                false, 0
            4.3, ,          42,  24 October 2012,,
             ,   2012/2/12, 21,  24 December 2013, true,  1''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample)

        assert_equal(guessed_types, [
            DecimalType(),
            DateType('%Y/%m/%d'),
            IntegerType(),
            DateType('%d %B %Y'),
            BoolType(),
            BoolType()
        ])
Example no. 16
def load_data(config):
    if 'url' not in config:
        yield {
            config.get('field'): config.get('value')
            }
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)

    fh.close()
Example no. 17
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the csv file and tries to guess the the type of each column using messytables library.
        The type can be 'Integer', 'Decimal', 'String' or 'Bool'
        :param csv_file_path: path to the csv file with content in it
        :return: a Zip object where each tuple has two elements: the first is the column name and the second is the type
        """
        with gzip.open(csv_file_path, 'rb') as f:
            table_set = CSVTableSet(f)

            row_set = table_set.tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))

            row_set.register_processor(offset_processor(offset + 1))

            types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)))
            return zip(headers, types)
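For orientation, the same guessing steps as a standalone sketch outside the class (no gzip); the sample bytes are assumptions, and messytables.jts requires the jsontableschema dependency:

import io
import messytables.jts as jts
from messytables import (CSVTableSet, type_guess, headers_guess,
                         headers_processor, offset_processor)

f = io.BytesIO(b"id,price\n1,9.99\n2,10.50\n")
row_set = CSVTableSet(f).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
types = list(map(jts.celltype_as_string, type_guess(row_set.sample, strict=True)))
# e.g. {'id': 'integer', 'price': 'number'}; the exact strings depend on
# messytables' jts mapping
print(dict(zip(headers, types)))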
Example no. 18
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]

    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header) for header in (h for h in headers if h)]

    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))

    DateType.formats = create_date_formats(day_first=False)

    # We are never wanting boolean types, so remove that from the default list
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)

    row_set.register_processor(types_processor(types))

    return row_set
Example no. 19
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)

    return fields, rows
Example no. 20
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))

    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
Example no. 21
    def update(self, config):

        # Update the mime, sha1 of the files
        for i in range(len(config['files'])):
            filename = config['files'][i]['filename']
            if os.path.exists(filename):

                u = {
                    'mimetype': mimetypes.guess_type(filename)[0],
                    'sha1': compute_sha1(filename)
                }

                if filename.lower().endswith('sv'):  # csv/tsv
                    # open the file being inspected; `csv_file` was
                    # previously undefined in this scope
                    with open(filename, 'rb') as csv_file:
                        rows = CSVTableSet(csv_file).tables[0]
                        guessed_types = type_guess(rows.sample)
                        u['schema'] = guessed_types

                config['files'][i].update(u)


        return config
Example no. 22
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess columns types from the given samples and build json schema
    :param samples: List of dictionaries containing samples data from csv file(s)
    :param table_spec: table/stream specs given in the tap definition
    :return: dictionary where the keys are the headers and values are the guessed types - compatible with json schema
    """
    schema = {}

    table_set = CSVTableSet(_csv2bytesio(samples))

    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    types = type_guess(row_set.sample, strict=True)

    date_overrides = set(table_spec.get('date_overrides', []))

    for header, header_type in zip(headers, types):
        if header in date_overrides:
            schema[header] = {'type': ['null', 'string'], 'format': 'date-time'}
        else:
            if isinstance(header_type, IntegerType):
                schema[header] = {
                    'type': ['null', 'integer']
                }
            elif isinstance(header_type, DecimalType):
                schema[header] = {
                    'type': ['null', 'number']
                }
            else:
                schema[header] = {
                    'type': ['null', 'string']
                }

    return schema
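A hypothetical call for orientation; the sample dicts are invented, _csv2bytesio comes from the surrounding tap code, and the output shape is an assumption based on the branches above:

samples = [{'id': '1', 'amount': '9.99', 'created_at': '2020-01-02'},
           {'id': '2', 'amount': '10.50', 'created_at': '2020-01-03'}]
schema = generate_schema(samples, {'date_overrides': ['created_at']})
# roughly:
# {'id': {'type': ['null', 'integer']},
#  'amount': {'type': ['null', 'number']},
#  'created_at': {'type': ['null', 'string'], 'format': 'date-time'}}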
Example no. 23
    def connect(self, host=None, port=None, database=None, username=None, password=None, file=None):
        # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
        self.database = database
        conn_string = ''
        if self.engine == 'psycopg2':
            if database:
                conn_string += "dbname='%s' " % database
            if username:
                conn_string += "user='******' " % username
            if host:
                conn_string += "host='%s' " % host
            if port:
                conn_string += "port='%s' " % port
            if password:
                conn_string += "password='******' " % password
            self.conn = psycopg2.connect(conn_string)

        elif self.engine == 'pymssql':
            self.conn = pymssql.connect(host, username, password, database, port=port, as_dict=True, charset='LATIN1')

        elif self.engine == 'csv':
            # https://messytables.readthedocs.io/en/latest/
            fh = StringIO.StringIO(self.data)
            #dialect = csv.Sniffer().sniff(f.read(1024))
            #f.seek(0)
            #self.conn = csv.DictReader(f, dialect=dialect)
            #fh = open('messy.csv', 'rb')

            # Load a file object:
            table_set = CSVTableSet(fh)
            row_set = table_set.tables[0]
            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))
            row_set.register_processor(offset_processor(offset + 1))
            types = type_guess(row_set.sample, strict=True)
            row_set.register_processor(types_processor(types))

            self.conn = row_set

        return self.conn
Example no. 24
    def _get_table_columns(self, csv_file_path: str) -> zip:
        """
        Read the csv file and tries to guess the the type of each column using messytables library.
        The type can be 'Integer', 'Decimal', 'String' or 'Bool'
        :param csv_file_path: path to the csv file with content in it
        :return: a Zip object where each tuple has two elements: the first is the column name and the second is the type
        """
        with gzip.open(csv_file_path, 'rb') as csvfile:
            table_set = CSVTableSet(csvfile, window=1)

            row_set = table_set.tables[0]

            offset, headers = headers_guess(row_set.sample)
            row_set.register_processor(headers_processor(headers))

            row_set.register_processor(offset_processor(offset + 1))

            types = [
                'integer' if header == S3Helper.SDC_SOURCE_LINENO_COLUMN else 'string'
                for header in headers
            ]
            return zip(headers, types)
Example no. 25
    def test_null_process(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        row_set.register_processor(null_processor(['null']))
        data = list(row_set)

        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, False, False])

        types = type_guess(row_set.sample, strict=True)
        expected_types = [IntegerType(), BoolType(), BoolType(), BoolType()]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))

        # after applying the types, '' should become None for int columns
        data = list(row_set)
        nones = [[x.value is None for x in row] for row in data]
        assert_equal(nones[0], [False, True, False, False])
        assert_equal(nones[1], [False, False, False, True])
        assert_equal(nones[2], [False, True, True, True])
Example no. 26
    def test_apply_null_values(self):
        fh = horror_fobj('null.csv')
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        types = type_guess(row_set.sample, strict=True)
        expected_types = [
            IntegerType(),
            StringType(),
            BoolType(), StringType()
        ]
        assert_equal(types, expected_types)

        row_set.register_processor(types_processor(types))
        data = list(row_set)
        # treat null as non empty text and 0 as non empty integer
        assert [x.empty for x in data[0]] == [False, False, False, False]
        assert [x.empty for x in data[1]] == [False, False, False, False]
        assert [x.empty for x in data[2]] == [False, False, True, True]
        assert [x.empty for x in data[3]] == [False, False, False, False]
        assert [x.empty for x in data[4]] == [False, False, False, True]
        assert [x.empty for x in data[5]] == [False, False, False, True]

        # we expect None for Integers and "" for empty strings in CSV
        assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
Example no. 27
 def test_small_csv(self):
     fh = horror_fobj('small.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(1, len(data))
Example no. 28
 def test_long_csv(self):
     fh = horror_fobj('long.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(4000, len(data))
Example no. 29
def webstorer_upload(context, data):

    context = json.loads(context)
    resource = json.loads(data)
    
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    
    for row in row_set.dicts():
        rows.append(row)


    webstore_url = context.get('webstore_url').rstrip('/')
    
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url+'.json')
    check_response_and_retry(webstore_response, webstore_request_url+'.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url+'/data',
                             data = json.dumps(rows),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url+'.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s' %
                             (response.status_code, response.content)
                            )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url+'/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Example no. 30
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

#######################################################

########### Load in the data ##################

fh = open('sales_activity_report.csv', 'rb')

# Load a file object:
table_set = CSVTableSet(fh)

# A table set is a collection of tables:
row_set = table_set.tables[0]

# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)

# import the table of values from the summary report into a Pandas dataframe
df = pd.read_csv('sales_activity_report.csv',
                 header=offset - 1,
                 skipfooter=2,
                 engine='python')
em = pd.read_csv('emojis.csv')

################################################
Example no. 31
def TABTableSet(fileobj):
    return CSVTableSet(fileobj, delimiter='\t')
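A quick sketch of the wrapper in use; the in-memory TSV bytes are an assumption:

import io

tsv = io.BytesIO(b"name\tage\nalice\t34\n")
row_set = TABTableSet(tsv).tables[0]
print([cell.value for cell in list(row_set)[0]])  # ['name', 'age']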
Example no. 32
def _datastorer_upload(context, resource, logger):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
Example no. 33
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)


    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
            quotechar=quotechar, encoding=encoding, window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row
    return row_iterator(), {'fields': fields}
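A hedged usage sketch for parse() above (run under Python 2, as the function is written); the TSV bytes are illustrative and the 'type' strings come from str() of the guessed messytables types:

import io

rows, metadata = parse(io.BytesIO(b"a\tb\n1\tx\n2\ty\n"), delimiter='\t')
print(metadata['fields'])  # e.g. [{'id': 'a', 'type': 'Integer'}, {'id': 'b', 'type': 'String'}]
for row in rows:  # an iterator of dicts keyed by column id
    print(row)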
Example no. 34
def main():

    pgpass = os.getenv('PGPASSFILE', os.path.expanduser('~/.pgpass'))
    if not os.path.exists(pgpass):
        print(
            'You need to create file ~/.pgpass that contains at least one line:'
        )
        print('hostname:port:database:username:password')
        print('and has restricted permissions: chmod 600 ~/.pgpass')
        print(
            'Also please check https://www.postgresql.org/docs/current/static/libpq-pgpass.html'
        )
        return False

    #args.dsn == 'postgresql://dirk@mydb:32063/petersen'
    #args.csvfile = '/home/petersen/sc/data/slurm_jobs.csv'

    if not args.dsn.startswith('postgresql://'):
        dl = args.dsn.split(':')
        args.dsn = 'postgresql://%s@%s:%s/%s' % (dl[3], dl[0], dl[1], dl[2])

    try:
        conn = psycopg2.connect(args.dsn)
        cur = conn.cursor()
    except (Exception, psycopg2.DatabaseError) as error:
        print('Database error:', error)
        return False

    with open(args.csvfile, 'rb') as fh:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        #print row_set.sample.next()
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + rowtocheck))
        types = type_guess(row_set.sample, strict=True)

    myd = dict(zip(headers, types))
    print("\nDetected columns & types:\n", myd, '\n')

    table = os.path.splitext(os.path.basename(args.csvfile))[0]
    table = table.replace('-', '_')
    table = table.replace(' ', '_')
    create_sql = "CREATE TABLE %s (" % table
    idh = 0
    for h in headers:
        myt = "TEXT"
        if str(types[idh]) == 'Integer':
            myt = 'BIGINT'
        elif str(types[idh]) == 'Bool':
            myt = 'BIGINT'
        elif str(types[idh]) == 'Decimal':
            myt = 'DECIMAL'

        create_sql += "%s %s, " % (h, myt)
        idh += 1
    create_sql = create_sql[:-2] + ');'

    print("\ncreating postgres table '%s':\n" % table, create_sql, '\n')
    try:
        if args.overwrite:
            drop_sql = 'DROP TABLE IF EXISTS %s' % table
            cur.execute(drop_sql)
            conn.commit()
        cur.execute(create_sql)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print('Database error:', error)
        sys.exit()

    print("\nloading data .... ")

    with open(args.csvfile, 'rb') as fh:
        sample_text = ''.join(str(fh.readline()) for x in range(3))
        try:
            dialect = csv.Sniffer().sniff(sample_text)
            if dialect.delimiter == 't':
                delim = '\t'
            else:
                delim = dialect.delimiter
        except:
            delim = ","
        copy_sql = "COPY %s FROM stdin WITH CSV HEADER DELIMITER as '%s'" % \
                   (table, delim)
        try:
            cur.copy_expert(sql=copy_sql, file=fh)
            conn.commit()
            cur.close()
        except (Exception, psycopg2.DatabaseError) as error:
            print('Database error:', error)

    print('Done !')
Example no. 35
def _datastorer_upload(context, resource):

    excel_types = ['xls', 'application/ms-excel', 'application/xls', 'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))


    ckan_url = context['site_url'].rstrip('/')
    
    webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                resource['id']
                                                )

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data = "%s%s" % ("\n".join(data), "\n"),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )

    data = []
    for count,dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count+1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url+'_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url+'_mapping')


    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Example no. 36
 def rows(skip_policy):
     fh = horror_fobj('skip_initials.csv')
     table_set = CSVTableSet(fh, skipinitialspace=skip_policy)
     row_set = table_set.tables[0]
     return row_set
Example no. 37
def parse_csv(filename, cfg_in):
    """
    Guess csv structure

    :param filename:
    :param cfg_in:
    :param known_structure: list of strings formats in order of columns, from start
    but may be not all (next is auto treeted)
    :return: lst_types, offset, headers 


    * quotechar - specifies a one-character string to use as the
        quoting character.  It defaults to '"'.
    * delimiter - specifies a one-character string to use as the
        field separator.  It defaults to ','.
    * skipinitialspace - specifies how to interpret whitespace which
        immediately follows a delimiter.  It defaults to False, which
        means that whitespace immediately following a delimiter is part
        of the following field.
    * lineterminator -  specifies the character sequence which should
        terminate rows.
    * quoting - controls when quotes should be generated by the writer.
        It can take on any of the following module constants:

        csv.QUOTE_MINIMAL means only when required, for example, when a
            field contains either the quotechar or the delimiter
        csv.QUOTE_ALL means that quotes are always placed around fields.
        csv.QUOTE_NONNUMERIC means that quotes are always placed around
            fields which do not parse as integers or floating point
            numbers.
        csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape
        the delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields.  When
        True, two consecutive quotes are interpreted as one during read,
        and when writing, each quote character embedded in the data is
        written as two quotes
    Example:
    parse_csv(filename, ['%H:%M:%S'])
    """
    set_field_if_no(cfg_in, 'types', [])
    set_field_if_no(cfg_in, 'delimiter')
    with open(filename, 'rb') as fh:
        ext = os_path.splitext(filename)[1]
        # Load a file object:
        try:
            # If you are sure that file is csv use CSVTableSet(fh)
            from magic import MagicException  # because any_tableset uses libmagic
            table_set = any_tableset(fh,
                                     mimetype=None,
                                     extension=ext,
                                     delimiter=cfg_in['delimiter'])
        except (ImportError, MagicException) as e:
            print('There was an error: ', standard_error_info(e),
                  '\n=> Loading file as csv without trying other formats')
            table_set = CSVTableSet(fh, delimiter=cfg_in['delimiter'])

        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # A row set is an iterator over the table, but it can only
        # be run once. To peek, a sample is provided:

        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)  # tolerance=1
        row_set.register_processor(headers_processor(headers))
        # add one to begin with content, not the header:
        row_set.register_processor(offset_processor(offset + 1))
        # guess column types:
        lst_types = type_guess(row_set.sample, strict=True)
        row_sample = next(row_set.sample)

        # check not detected types
        def formats2types(formats_str):
            for f in formats_str:
                if f:
                    if is_date_format(f):
                        yield (types.DateType(f))
                    else:
                        yield (TimeType())
                else:
                    yield (None)

        known_types = formats2types(cfg_in['types'])

        for n, (t, s, kt) in enumerate(zip(lst_types, row_sample,
                                           known_types)):
            if t.result_type == types.StringType.result_type:
                # not auto detected? -> check known_types
                if kt.test(s.value):
                    lst_types[n] = kt
                else:  # the provided type does not fit this element
                    print(
                        'column #{:d} value "{}" does not match the provided '
                        'type {}'.format(n, s.value, type(kt)))
                    # kt = types.DateType('mm/dd/yyyy')
                    # kt.test('0'+s.value)
                    # detect?
            else:
                pass
        # not works for time type:
        # print(jts.headers_and_typed_as_jts(headers,
        #       list(map(jts.celltype_as_string, lst_types))).as_json())
        return lst_types, offset, headers
Example no. 38
 def test_utf8bom_lost(self):
     fh = horror_fobj('utf8bom.csv')
     table_set = CSVTableSet(fh)
     row_set = table_set.tables[0]
     row = list(row_set)[0]
     assert_equal(row[0].value, "kitten")
Example no. 39
date_format = args.date_format
folder, filename = s3path.rsplit('/', 1)
print("file path:", s3path)
print("delimiter:", delimiter)
print("date_format:", date_format)

# getting data
generator = (i for i in smart_open.smart_open(s3path))
head_lst = list(next(generator) for _ in range(100))
head_str = map(
    lambda x: str(x).strip("b").strip("\"").strip('\\n').strip("\'"), head_lst)
head_join = "\n".join(head_str)
f = io.BytesIO(bytes(head_join, 'utf-8'))

# guessing
row_set = CSVTableSet(f, delimiter=delimiter).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
types = type_guess(row_set.sample)
#types = type_guess(row_set.sample, strict=True)
print('guessed types:', types)

# constructing ddl
cols = []
for indx, typ in enumerate(types):
    if typ == StringType():
        cols.append("  `a%s` string" % (indx))
    elif typ == DateType(date_format):
        cols.append("  `a%s` date" % (indx))
    elif typ == DecimalType():
Example no. 40
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)


    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet(stream,
                            delimiter=delimiter,
                            quotechar=quotechar,
                            encoding=encoding,
                            window=window)
    strict_type_guess = metadata.get('strict_type_guess', False)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [
            StringType, IntegerType, FloatType, DecimalType, DateUtilType
        ]
        sample = row_set.sample
        for _ in range(offset + 1):
            sample.next()
        row_types = type_guess(sample,
                               guessable_types,
                               strict=strict_type_guess)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(
            types_processor(row_types, strict=strict_type_guess))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
Example no. 41
 def test_strict_type_guessing_with_large_file(self):
     fh = horror_fobj('211.csv')
     rows = CSVTableSet(fh).tables[0]
     offset, headers = headers_guess(rows.sample)
     rows.register_processor(offset_processor(offset + 1))
     types = [StringType, IntegerType, DecimalType, DateUtilType]
     guessed_types = type_guess(rows.sample, types, True)
     assert_equal(len(guessed_types), 96)
     assert_equal(guessed_types, [
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         DecimalType(),
         DecimalType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         IntegerType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         StringType(),
         DateUtilType(),
         DateUtilType(),
         DateUtilType(),
         DateUtilType(),
         StringType(),
         StringType(),
         StringType()
     ])