def convert(self):
    table_set = CSVTableSet.from_fileobj(self.stream)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            # Synthesize a name for blank header cells.
            field = '_'.join(['column', str(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            # Disambiguate duplicate column names with a numeric suffix.
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))

    result = []
    for row in row_set:
        # Build a fresh dict per row; sharing one dict across iterations
        # would leave `result` holding N references to the last row.
        data_row = {}
        for index, cell in enumerate(row):
            data_row[cell.column] = cell.value
        result.append(data_row)
    return fields, result

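# A minimal, self-contained sketch of the same messytables pipeline that
# convert() above uses, runnable without the host class. The sample CSV and
# variable names are ours, not from the original source; it assumes the older
# messytables API (from_fileobj) used throughout these snippets.
from cStringIO import StringIO

from messytables import (CSVTableSet, headers_guess, headers_processor,
                         offset_processor)

sample = StringIO("name,age\nalice,30\nbob,25\n")
row_set = CSVTableSet.from_fileobj(sample).tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
for row in row_set:
    print dict((c.column, c.value) for c in row)
# Expected output, roughly:
#   {u'name': u'alice', u'age': u'30'}
#   {u'name': u'bob', u'age': u'25'}
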
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Opens whatever sort of file is passed in, using the MIME type
    (e.g. mimetype='text/csv') or file extension (e.g. extension='tsv'),
    or otherwise autodetecting the file format. Consult the source for
    recognized MIME types and file extensions."""
    if mimetype is None:
        import magic
        # Since we need to peek at the start of the stream, make sure we
        # can seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content type
        # of the inner files, so don't check them before we check for a
        # ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') \
            or (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")

def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Opens whatever sort of file is passed in, using the MIME type
    (e.g. mimetype='text/csv') or file extension (e.g. extension='tsv'),
    or otherwise autodetecting the file format. Consult the source for
    recognized MIME types and file extensions."""
    if mimetype is None:
        import magic
        # Since we need to peek at the start of the stream, make sure we
        # can seek back later. If not, slurp the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content type
        # of the inner files, so don't check them before we check for a
        # ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') \
            or (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and "
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")

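# A hedged usage sketch for the format dispatcher above. In messytables this
# classmethod lives on AnyTableSet (assumed here, from the older API);
# 'data.tsv' is a placeholder path.
from messytables import AnyTableSet

fh = open('data.tsv', 'rb')
table_set = AnyTableSet.from_fileobj(fh, extension='tsv')
row_set = table_set.tables[0]
for row in row_set:
    print [c.value for c in row]
fh.close()
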
def load_data(config):
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return

    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)
    fh.close()

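# Hypothetical driver for load_data() above; the URL is a placeholder, and
# the config keys ('url', or 'field'/'value') mirror what the function reads.
for record in load_data({'url': 'http://example.com/data.csv'}):
    print record
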
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not
        provided)

    You can process csv as well as tsv files using this function. For tsv
    just pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
                                         quotechar=quotechar,
                                         encoding=encoding, window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            # Synthesize a name for blank header cells.
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            # Disambiguate duplicate column names with a numeric suffix.
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)

    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row

    return row_iterator(), {'fields': fields}

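# Sketch of calling parse() above on an in-memory CSV; the sample data is
# ours, and it assumes the same module-level imports the function relies on
# (CSVTableSet, type_guess, the messytables cell types, and the processors).
# parse() returns (row_iterator, metadata), where metadata['fields'] carries
# the column ids and, when guess_types is on, the guessed type names.
from cStringIO import StringIO

rows, metadata = parse(StringIO("id,amount\n1,3.5\n2,4.25\n"))
print metadata['fields']  # e.g. [{'id': u'id', 'type': 'Integer'}, ...]
for row in rows:
    print row
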
def _datastorer_upload(context, resource):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    ckan_url = context['site_url'].rstrip('/')

    webstore_request_url = '%s/api/data/%s/' % (ckan_url, resource['id'])

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data="%s%s" % ("\n".join(data), "\n"),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})

    data = []
    for count, dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count + 1}}))
        data.append(json.dumps(dict_))
        # Flush periodically (enumerate starts at 0, so the first batch is
        # sent after a single record, then every 100th record thereafter).
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response,
                                     webstore_request_url + '_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))

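# The send_request helper above posts newline-delimited JSON in the
# Elasticsearch-style _bulk format: an action line, then a source line, per
# record. A stdlib-only sketch of what a two-record payload looks like (the
# sample records are ours):
import json

payload_lines = []
for i, record in enumerate([{'a': 1}, {'a': 2}]):
    payload_lines.append(json.dumps({"index": {"_id": i + 1}}))
    payload_lines.append(json.dumps(record))
body = "%s%s" % ("\n".join(payload_lines), "\n")
print body
# {"index": {"_id": 1}}
# {"a": 1}
# {"index": {"_id": 2}}
# {"a": 2}
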
def webstorer_upload(context, data):
    context = json.loads(context)
    resource = json.loads(data)

    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    for row in row_set.dicts():
        rows.append(row)

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id'])

    # Check if the resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response,
                             webstore_request_url + '.json')

    # Should be an empty list, as no tables should be there yet.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url + '/data',
                             data=json.dumps(rows),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']})
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). '
                             'Response was %s' %
                             (response.status_code, response.content))

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))

def _datastorer_upload(context, resource, logger):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    # only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename
                              in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                                 data=json.dumps(request),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']})
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # Generates chunks of data that can be loaded into CKAN;
    # n is the maximum size of a chunk.
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). '
                                  'Response was %s' %
                                  (response.status_code, response.content))

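# Standalone illustration of the chunky() batching helper used above: it
# yields lists of at most n items from any iterable (Python 2 stdlib only,
# hence itertools.imap; the sample generator is ours).
import itertools

def chunky(iterable, n):
    it = iter(iterable)
    while True:
        chunk = list(itertools.imap(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk

for batch in chunky(({'i': i} for i in range(5)), 2):
    print batch
# -> [{'i': 0}, {'i': 1}]
#    [{'i': 2}, {'i': 3}]
#    [{'i': 4}]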