def test_rowset_as_schema(self):
    from io import BytesIO as sio
    ts = CSVTableSet(sio(b'''name,dob\nmk,2012-01-02\n'''))
    rs = ts.tables[0]
    jts = rowset_as_jts(rs).as_dict()
    assert_equal(jts['fields'], [
        {'type': 'string', 'id': 'name', 'label': 'name'},
        {'type': 'date', 'id': 'dob', 'label': 'dob'}])
def test_strict_type_guessing_with_large_file(self):
    fh = horror_fobj('211.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 96)
    # The expected 96 column types, written as runs to keep the list readable.
    expected_types = (
        [IntegerType()] + [StringType()] * 5 +
        [IntegerType()] + [StringType()] * 20 +
        [IntegerType()] + [StringType()] + [DecimalType()] * 2 +
        [StringType()] * 23 +
        [IntegerType()] + [StringType()] * 15 +
        [IntegerType()] + [StringType()] * 18 +
        [DateUtilType()] * 4 + [StringType()] * 3)
    assert_equal(guessed_types, expected_types)
def convert(self):
    table_set = CSVTableSet.from_fileobj(self.stream)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    result = []
    for row in row_set:
        # build a fresh dict per row so rows do not overwrite each other
        data_row = {}
        for index, cell in enumerate(row):
            data_row[cell.column] = cell.value
        result.append(data_row)
    return fields, result
def test_file_with_few_strings_among_integers(self):
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))
    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)
    print(guessed_types)
    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        IntegerType(), StringType(), StringType()])
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """ Opens whatever sort of file is passed in, using the MIME type
    (e.g. mimetype='text/csv') or file extension (e.g. extension='tsv'),
    or otherwise autodetecting the file format. Consult the source for
    recognized MIME types and file extensions."""
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or \
            (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') or \
            (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or \
            (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
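# A minimal usage sketch for a dispatcher like the one above, assuming the
# messytables.any_tableset helper (used elsewhere in this collection); the
# function name load_any_table and the idea of deriving the extension from the
# path are illustrative assumptions, not part of the original code.
def load_any_table(path):
    from messytables import any_tableset
    with open(path, 'rb') as fh:
        extension = path.rsplit('.', 1)[-1]
        table_set = any_tableset(fh, extension=extension)
        # Each table/sheet becomes a list of rows of messytables Cell objects.
        return [list(row_set) for row_set in table_set.tables]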
def test_type_guess(self):
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 02 October 2011
        2, 2012/2/12, 2, 02 October 2011
        2.4, 2012/2/12, 1, 1 May 2011
        foo, bar, 1000,
        4.3, , 42, 24 October 2012
        , 2012/2/12, 21, 24 December 2013''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'),
        IntegerType(), DateType('%d %B %Y')])
def csvParse(csv_file_path):
    fh = open(csv_file_path, 'rb')
    # Load a file object:
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    # guess header names and the offset of the header:
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    # add one to begin with content, not the header:
    row_set.register_processor(offset_processor(offset + 1))
    # guess column types:
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types, strict=False))
    return row_set, headers, offset, types
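# A small illustration of consuming the row_set returned by csvParse above.
# The helper name csv_to_dicts is hypothetical; it relies only on messytables'
# Cell API (cell.column / cell.value) already used throughout these snippets.
def csv_to_dicts(csv_file_path):
    row_set, headers, offset, types = csvParse(csv_file_path)
    # Each row is a list of Cell objects carrying the column name and typed value.
    return [{cell.column: cell.value for cell in row} for row in row_set]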
def test_read_head_padding_csv(self):
    fh = horror_fobj('weird_head_padding.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert 11 == len(headers), headers
    assert_equal(u'1985', headers[1].strip())
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    data = list(row_set.sample)
    for row in row_set:
        assert_equal(11, len(row))
    value = data[1][0].value.strip()
    assert value == u'Gefäßchirurgie', value
def test_read_type_guess_simple(self):
    fh = horror_fobj('simple.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample)
    expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))
    data = list(row_set)
    header_types = [c.type for c in data[0]]
    assert_equal(header_types, [StringType()] * 3)
    row_types = [c.type for c in data[2]]
    assert_equal(expected_types, row_types)
def test_read_complex_csv(self):
    fh = horror_fobj('complex.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    assert_equal(4, len(list(row_set)))
    row = list(row_set.sample)[0]
    assert_equal(row[0].value, 'date')
    assert_equal(row[1].value, 'another date')
    assert_equal(row[2].value, 'temperature')
    assert_equal(row[3].value, 'place')
    for row in list(row_set):
        assert_equal(4, len(row))
        assert_equal(row[0].type, StringType())
def test_guessing_uses_first_in_case_of_tie(self):
    csv_file = StringIO.StringIO('''
        2
        1.1
        1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(
        rows.sample, types=[DecimalType, IntegerType], strict=False)
    assert_equal(guessed_types, [DecimalType()])
    guessed_types = type_guess(
        rows.sample, types=[IntegerType, DecimalType], strict=False)
    assert_equal(guessed_types, [IntegerType()])
def test_type_guess_strict(self):
    import locale
    locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
    csv_file = StringIO.StringIO('''
        1, 2012/2/12, 2, 2,02 October 2011,"100.234354"
        2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12"
        foo, bar, 1500, 0,,"NaN"
        4, 2012/2/12, 42,"-2,000",24 October 2012,"42"
        ,,,,,''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=True)
    assert_equal(guessed_types, [
        StringType(), StringType(), DecimalType(), IntegerType(),
        DateType('%d %B %Y'), DecimalType()])
def csvimport_table(name):
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    rows = []
    for num_rows, (fields, row, samples) in enumerate(parse_table(row_set)):
        rows.append(row)
    return fields, rows
def load_data(config):
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)
    fh.close()
def test_type_guess(self):
    csv_file = io.BytesIO(b'''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(), DateType('%Y/%m/%d'), IntegerType(),
        DateType('%d %B %Y'), BoolType(), BoolType()])
def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Read the csv file and try to guess the type of each column using the
    messytables library. The type can be 'Integer', 'Decimal', 'String' or 'Bool'.
    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements:
        the first is the column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as f:
        table_set = CSVTableSet(f)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = list(map(jts.celltype_as_string,
                         type_guess(row_set.sample, strict=True)))
        return zip(headers, types)
def prepare_csv_rows(csv_file):
    row_set = CSVTableSet(csv_file).tables[0]
    offset, headers = headers_guess(row_set.sample)
    headers = [convert_header_to_column_name(header)
               for header in (h for h in headers if h)]
    row_set.register_processor(headers_processor_remove_blank(headers))
    row_set.register_processor(offset_processor(offset + 1))
    DateType.formats = create_date_formats(day_first=False)
    # We never want boolean types, so remove BoolType from the default list.
    eligible_types = [StringType, DecimalType, IntegerType, DateType]
    types = type_guess(row_set.sample, types=eligible_types, strict=True)
    row_set.register_processor(types_processor(types))
    return row_set
def _guess_csv_datatype(fh):
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    logger.info("(offset, headers) = ({}, {})".format(offset, headers))
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    counter = 0
    for row in row_set:
        logger.info(row)
        counter += 1
        if counter >= 32:
            break

    d = {h: t for h, t in zip(headers, types)}
    logger.info(d)
    return d
def update(self, config):
    # Update the mime, sha1 of the files
    for i in range(len(config['files'])):
        filename = config['files'][i]['filename']
        if os.path.exists(filename):
            u = {
                'mimetype': mimetypes.guess_type(filename)[0],
                'sha1': compute_sha1(filename)
            }
            if filename.lower().endswith('sv'):  # csv/tsv
                with open(filename, 'rb') as csv_file:
                    rows = CSVTableSet(csv_file).tables[0]
                    guessed_types = type_guess(rows.sample)
                    u['schema'] = guessed_types
            config['files'][i].update(u)
    return config
def generate_schema(samples: List[Dict], table_spec: Dict) -> Dict:
    """
    Guess column types from the given samples and build a json schema
    :param samples: List of dictionaries containing sample data from csv file(s)
    :param table_spec: table/stream specs given in the tap definition
    :return: dictionary where the keys are the headers and the values are the
        guessed types - compatible with json schema
    """
    schema = {}

    table_set = CSVTableSet(_csv2bytesio(samples))
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)

    for header, header_type in zip(headers, types):
        date_overrides = set(table_spec.get('date_overrides', []))
        if header in date_overrides:
            schema[header] = {'type': ['null', 'string'], 'format': 'date-time'}
        else:
            if isinstance(header_type, IntegerType):
                schema[header] = {'type': ['null', 'integer']}
            elif isinstance(header_type, DecimalType):
                schema[header] = {'type': ['null', 'number']}
            else:
                schema[header] = {'type': ['null', 'string']}

    return schema
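# An illustrative call to the schema builder above, assuming its module-level
# helper _csv2bytesio is available; the sample records and the 'created_at'
# date override are invented purely for the example.
if __name__ == '__main__':
    example_samples = [
        {'id': '1', 'price': '9.99', 'created_at': '2020-01-01'},
        {'id': '2', 'price': '12.50', 'created_at': '2020-02-01'},
    ]
    example_spec = {'date_overrides': ['created_at']}
    print(generate_schema(example_samples, example_spec))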
def connect(self, host=None, port=None, database=None, username=None,
            password=None, file=None):
    # TODO: mysql, pymssql, csv, sqlite3, pymongo, cx_Oracle
    self.database = database
    conn_string = ''
    if self.engine == 'psycopg2':
        if database:
            conn_string += "dbname='%s' " % database
        if username:
            conn_string += "user='%s' " % username
        if host:
            conn_string += "host='%s' " % host
        if port:
            conn_string += "port='%s' " % port
        if password:
            conn_string += "password='%s' " % password
        self.conn = psycopg2.connect(conn_string)
    elif self.engine == 'pymssql':
        self.conn = pymssql.connect(host, username, password, database,
                                    port=port, as_dict=True, charset='LATIN1')
    elif self.engine == 'csv':
        # https://messytables.readthedocs.io/en/latest/
        fh = StringIO.StringIO(self.data)
        #dialect = csv.Sniffer().sniff(f.read(1024))
        #f.seek(0)
        #self.conn = csv.DictReader(f, dialect=dialect)
        #fh = open('messy.csv', 'rb')
        # Load a file object:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = type_guess(row_set.sample, strict=True)
        row_set.register_processor(types_processor(types))
        self.conn = row_set
    return self.conn
def _get_table_columns(self, csv_file_path: str) -> zip:
    """
    Read the csv file and try to guess the type of each column using the
    messytables library. The type can be 'Integer', 'Decimal', 'String' or 'Bool'.
    :param csv_file_path: path to the csv file with content in it
    :return: a zip object where each tuple has two elements:
        the first is the column name and the second is the type
    """
    with gzip.open(csv_file_path, 'rb') as csvfile:
        table_set = CSVTableSet(csvfile, window=1)
        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        types = [
            'integer' if header == S3Helper.SDC_SOURCE_LINENO_COLUMN else 'string'
            for header in headers
        ]
        return zip(headers, types)
def test_null_process(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row_set.register_processor(null_processor(['null']))
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, False, False])

    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), BoolType(), BoolType(), BoolType()]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))

    # after applying the types, '' should become None for int columns
    data = list(row_set)
    nones = [[x.value is None for x in row] for row in data]
    assert_equal(nones[0], [False, True, False, False])
    assert_equal(nones[1], [False, False, False, True])
    assert_equal(nones[2], [False, True, True, True])
def test_apply_null_values(self):
    fh = horror_fobj('null.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    types = type_guess(row_set.sample, strict=True)
    expected_types = [IntegerType(), StringType(), BoolType(), StringType()]
    assert_equal(types, expected_types)
    row_set.register_processor(types_processor(types))
    data = list(row_set)
    # treat null as non empty text and 0 as non empty integer
    assert [x.empty for x in data[0]] == [False, False, False, False]
    assert [x.empty for x in data[1]] == [False, False, False, False]
    assert [x.empty for x in data[2]] == [False, False, True, True]
    assert [x.empty for x in data[3]] == [False, False, False, False]
    assert [x.empty for x in data[4]] == [False, False, False, True]
    assert [x.empty for x in data[5]] == [False, False, False, True]
    # we expect None for Integers and "" for empty strings in CSV
    assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
def test_small_csv(self):
    fh = horror_fobj('small.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(1, len(data))
def test_long_csv(self):
    fh = horror_fobj('long.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    data = list(row_set)
    assert_equal(4000, len(data))
def webstorer_upload(context, data):
    context = json.loads(context)
    resource = json.loads(data)
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    for row in row_set.dicts():
        rows.append(row)

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id'])
    # check if resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response, webstore_request_url + '.json')

    # should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url + '/data',
                             data=json.dumps(rows),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s' %
                             (response.status_code, response.content))

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'
    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }
    response = requests.post(ckan_request_url,
                             data=json.dumps(ckan_resource_data),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})
    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

#######################################################
###########        Load in the data        ###########
fh = open('sales_activity_report.csv', 'rb')
# Load a file object:
table_set = CSVTableSet(fh)
# A table set is a collection of tables:
row_set = table_set.tables[0]
# guess header names and the offset of the header:
offset, headers = headers_guess(row_set.sample)

# import the table of values from the summary report into a Pandas dataframe
df = pd.read_csv('sales_activity_report.csv', header=offset - 1,
                 skipfooter=2, engine='python')
em = pd.read_csv('emojis.csv')
################################################
def TABTableSet(fileobj):
    return CSVTableSet(fileobj, delimiter='\t')
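# A brief usage sketch for the TSV wrapper above; the helper name print_tsv and
# the idea of printing each row's cell values are illustrative only.
def print_tsv(path):
    with open(path, 'rb') as fh:
        for row in TABTableSet(fh).tables[0]:
            print([cell.value for cell in row])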
def _datastorer_upload(context, resource, logger):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)
    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                                 data=json.dumps(request),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']})
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'
    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }
    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})
    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)

    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
                                         quotechar=quotechar,
                                         encoding=encoding,
                                         window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}
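# A possible way to drive the parse() helper above; the function name
# preview_csv, the path argument and the row limit are assumptions made only
# for the sake of the example.
def preview_csv(path, limit=5):
    with open(path, 'rb') as stream:
        row_iterator, metadata = parse(stream, guess_types=True)
        # metadata['fields'] carries the deduplicated column ids and guessed types
        print([field['id'] for field in metadata['fields']])
        for i, record in enumerate(row_iterator):
            if i >= limit:
                break
            print(record)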
def main():
    pgpass = os.getenv('PGPASSFILE', os.path.expanduser('~/.pgpass'))
    if not os.path.exists(pgpass):
        print('You need to create file ~/.pgpass that contains at least one line:')
        print('hostname:port:database:username:password')
        print('and has restricted permissions: chmod 600 ~/.pgpass')
        print('Also please check '
              'https://www.postgresql.org/docs/current/static/libpq-pgpass.html')
        return False

    #args.dsn == 'postgresql://dirk@mydb:32063/petersen'
    #args.csvfile = '/home/petersen/sc/data/slurm_jobs.csv'

    if not args.dsn.startswith('postgresql://'):
        dl = args.dsn.split(':')
        args.dsn = 'postgresql://%s@%s:%s/%s' % (dl[3], dl[0], dl[1], dl[2])

    try:
        conn = psycopg2.connect(args.dsn)
        cur = conn.cursor()
    except (Exception, psycopg2.DatabaseError) as error:
        print('Database error:', error)
        return False

    with open(args.csvfile, 'rb') as fh:
        table_set = CSVTableSet(fh)
        row_set = table_set.tables[0]
        #print row_set.sample.next()
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + rowtocheck))
        types = type_guess(row_set.sample, strict=True)

    myd = dict(zip(headers, types))
    print("\nDetected columns & types:\n", myd, '\n')

    table = os.path.splitext(os.path.basename(args.csvfile))[0]
    table = table.replace('-', '_')
    table = table.replace(' ', '_')

    create_sql = "CREATE TABLE %s (" % table
    idh = 0
    for h in headers:
        myt = "TEXT"
        if str(types[idh]) == 'Integer':
            myt = 'BIGINT'
        elif str(types[idh]) == 'Bool':
            myt = 'BIGINT'
        elif str(types[idh]) == 'Decimal':
            myt = 'DECIMAL'
        create_sql += "%s %s, " % (h, myt)
        idh += 1
    create_sql = create_sql[:-2] + ');'

    print("\ncreating postgres table '%s':\n" % table, create_sql, '\n')
    try:
        if args.overwrite:
            drop_sql = 'DROP TABLE IF EXISTS %s' % table
            cur.execute(drop_sql)
            conn.commit()
        cur.execute(create_sql)
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print('Database error:', error)
        sys.exit()

    print("\nloading data .... ")
    with open(args.csvfile, 'rb') as fh:
        sample_text = ''.join(str(fh.readline()) for x in range(3))
        try:
            dialect = csv.Sniffer().sniff(sample_text)
            if dialect.delimiter == 't':
                delim = '\t'
            else:
                delim = dialect.delimiter
        except:
            delim = ","
        fh.seek(0)  # rewind after sampling so COPY reads the whole file
        copy_sql = "COPY %s FROM stdin WITH CSV HEADER DELIMITER as '%s'" % \
            (table, delim)
        try:
            cur.copy_expert(sql=copy_sql, file=fh)
            conn.commit()
            cur.close()
        except (Exception, psycopg2.DatabaseError) as error:
            print('Database error:', error)
    print('Done !')
def _datastorer_upload(context, resource):
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    ckan_url = context['site_url'].rstrip('/')
    webstore_request_url = '%s/api/data/%s/' % (ckan_url, resource['id'])

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data="%s%s" % ("\n".join(data), "\n"),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})

    data = []
    for count, dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count + 1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url + '_mapping')
            data[:] = []
    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'
    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }
    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})
    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
def rows(skip_policy):
    fh = horror_fobj('skip_initials.csv')
    table_set = CSVTableSet(fh, skipinitialspace=skip_policy)
    row_set = table_set.tables[0]
    return row_set
def parse_csv(filename, cfg_in):
    """
    Guess csv structure

    :param filename:
    :param cfg_in:
    :param known_structure: list of string formats in order of columns, from the
        start but possibly not all (the rest is auto detected)
    :return: lst_types, offset, headers

    * quotechar - specifies a one-character string to use as the quoting
      character. It defaults to '"'.
    * delimiter - specifies a one-character string to use as the field
      separator. It defaults to ','.
    * skipinitialspace - specifies how to interpret whitespace which
      immediately follows a delimiter. It defaults to False, which means that
      whitespace immediately following a delimiter is part of the following
      field.
    * lineterminator - specifies the character sequence which should terminate
      rows.
    * quoting - controls when quotes should be generated by the writer. It can
      take on any of the following module constants:
      csv.QUOTE_MINIMAL means only when required, for example, when a field
      contains either the quotechar or the delimiter;
      csv.QUOTE_ALL means that quotes are always placed around fields;
      csv.QUOTE_NONNUMERIC means that quotes are always placed around fields
      which do not parse as integers or floating point numbers;
      csv.QUOTE_NONE means that quotes are never placed around fields.
    * escapechar - specifies a one-character string used to escape the
      delimiter when quoting is set to QUOTE_NONE.
    * doublequote - controls the handling of quotes inside fields. When True,
      two consecutive quotes are interpreted as one during read, and when
      writing, each quote character embedded in the data is written as two
      quotes.

    Example: parse_csv(filename, ['%H:%M:%S'])
    """
    set_field_if_no(cfg_in, 'types', [])
    set_field_if_no(cfg_in, 'delimiter')
    with open(filename, 'rb') as fh:
        ext = os_path.splitext(filename)[1]
        # Load a file object:
        try:
            # If you are sure that the file is csv, use CSVTableSet(fh)
            from magic import MagicException  # because any_tableset uses libmagic
            table_set = any_tableset(fh, mimetype=None, extension=ext,
                                     delimiter=cfg_in['delimiter'])
        except (ImportError, MagicException) as e:
            print('There was an error ', standard_error_info(e),
                  '\n=> Loading file as csv without trying other formats')
            table_set = CSVTableSet(fh, delimiter=cfg_in['delimiter'])

        # A table set is a collection of tables:
        row_set = table_set.tables[0]
        # A row set is an iterator over the table, but it can only
        # be run once. To peek, a sample is provided:

        # guess header names and the offset of the header:
        offset, headers = headers_guess(row_set.sample)  # tolerance=1
        row_set.register_processor(headers_processor(headers))
        # add one to begin with content, not the header:
        row_set.register_processor(offset_processor(offset + 1))
        # guess column types:
        lst_types = type_guess(row_set.sample, strict=True)
        row_sample = next(row_set.sample)

        # check types that were not detected
        def formats2types(formats_str):
            for f in formats_str:
                if f:
                    if is_date_format(f):
                        yield (types.DateType(f))
                    else:
                        yield (TimeType())
                else:
                    yield (None)

        known_types = formats2types(cfg_in['types'])

        for n, (t, s, kt) in enumerate(zip(lst_types, row_sample, known_types)):
            if t.result_type == types.StringType.result_type:
                # not auto detected? -> check known_types
                if kt.test(s.value):
                    lst_types[n] = kt  # t = kt
                else:  # known_types does not fit the element
                    print("col's#{:d} value \"{}\" type does not match the "
                          "provided type of {}".format(n, s.value, type(kt)))
                    # kt = types.DateType('mm/dd/yyyy')
                    # kt.test('0' + s.value)
                    # detect?
            else:
                pass
                # does not work for time type:
                # print(jts.headers_and_typed_as_jts(headers,
                #     list(map(jts.celltype_as_string, lst_types))).as_json())

    return lst_types, offset, headers
def test_utf8bom_lost(self):
    fh = horror_fobj('utf8bom.csv')
    table_set = CSVTableSet(fh)
    row_set = table_set.tables[0]
    row = list(row_set)[0]
    assert_equal(row[0].value, "kitten")
date_format = args.date_format
folder, filename = s3path.rsplit('/', 1)
print("file path:", s3path)
print("delimiter:", delimiter)
print("date_format:", date_format)

# getting data
generator = (i for i in smart_open.smart_open(s3path))
head_lst = list(next(generator) for _ in range(100))
head_str = map(
    lambda x: str(x).strip("b").strip("\"").strip('\\n').strip("\'"),
    head_lst)
head_join = "\n".join(head_str)
f = io.BytesIO(bytes(head_join, 'utf-8'))

# guessing
row_set = CSVTableSet(f, delimiter=delimiter).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
types = type_guess(row_set.sample)
#types = type_guess(row_set.sample, strict=True)
print('guessed types:', types)

# constructing ddl
cols = []
for indx, typ in enumerate(types):
    if typ == StringType():
        cols.append(" `a%s` string" % (indx))
    elif typ == DateType(date_format):
        cols.append(" `a%s` date" % (indx))
    elif typ == DecimalType():
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)

    You can process csv as well as tsv files using this function. For tsv just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet(stream, delimiter=delimiter,
                            quotechar=quotechar, encoding=encoding,
                            window=window)
    strict_type_guess = metadata.get('strict_type_guess', False)

    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        sample = row_set.sample
        for _ in range(offset + 1):
            sample.next()
        row_types = type_guess(sample, guessable_types,
                               strict=strict_type_guess)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(
            types_processor(row_types, strict=strict_type_guess))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}