            raise
    except Exception as e:
        if i < max_retries:
            logger.error("Error while performing download: %r. Retrying...", e)
            sleep(5 * i)
            continue
        else:
            raise
    else:
        break

content_type = result['headers'].get('content-type', '')\
    .split(';', 1)[0]  # remove parameters

f = open(result['saved_file'], 'rb')
table_sets = AnyTableSet.from_fileobj(f, mimetype=content_type,
                                      extension=resource['format'].lower())

##only first sheet in xls for time being
row_set = table_sets.tables[0]
offset, headers = headers_guess(row_set.sample)
row_set.register_processor(headers_processor(headers))
row_set.register_processor(offset_processor(offset + 1))
row_set.register_processor(datetime_procesor())

logger.info('Header offset: {0}.'.format(offset))

guessed_types = type_guess(
    row_set.sample,
    [
        messytables.types.StringType,
        messytables.types.IntegerType,
        messytables.types.FloatType,
        messytables.types.DecimalType,
        messytables.types.DateUtilType
    ],
    strict=True
)
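# The fragment above begins inside the exception handler of a download retry
# loop whose opening lines are not part of this excerpt. A minimal sketch of
# what the enclosing loop presumably looks like (the download() call signature,
# max_retries, and the linear 5-second backoff are assumptions, not the
# original implementation):
for i in range(1, max_retries + 1):
    try:
        result = download(context, resource, data_formats=DATA_FORMATS)
    except Exception as e:
        if i < max_retries:
            logger.error("Error while performing download: %r. Retrying...", e)
            sleep(5 * i)
            continue
        else:
            raise
    else:
        break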
def parse_resource(resource_file, mime_type, fileext, default_schema):
    # Given resource data, returns a tuple of
    #   * the schema used to load the file
    #   * the resource as a table, as given by messytables (an iterator over rows)

    # Schema defaults. We'll build up the schema with defaults
    # from the actual resource so that it is easy for the data
    # owner to customize the schema later.
    schema = {
        "_format": 1,
    }

    # Update defaults from the provided schema.
    if default_schema:
        schema.update(default_schema)
    else:
        schema["auto"] = True

    # Utility function that's like dict.get() but works on nested
    # dicts and takes a path through to dicts as arguments. Returns
    # None if no value is found.
    #   e.g. schema_get('format', 'name')
    # This returns schema["format"]["name"], or None if
    # "format" isn't in schema or "name" isn't in schema["format"].
    def schema_get(*path, **kwargs):
        if len(path) < 1:
            raise ValueError()
        v = schema
        for p in path:
            v = v.get(p, {})
        if v == {}:
            v = kwargs.get("default", None)
        return v

    # Utility function that sets a value in a set of nested dicts.
    # Takes a path plus a value as arguments.
    #   e.g. schema_set('format', 'name', 'zip')
    # This is equivalent to:
    #   schema["format"]["name"] = "zip"
    # but creating dicts as necessary along the way.
    def schema_set(*path_and_value):
        if len(path_and_value) < 2:
            raise ValueError()
        path = path_and_value[0:-2]
        field = path_and_value[-2]
        value = path_and_value[-1]
        container = schema
        for p in path:
            container = container.setdefault(p, {})
        container[field] = value

    # Parse the resource_file.

    # Get the table data format.
    if schema_get('format', 'name') is None:
        # Auto-detect format.
        from messytables import AnyTableSet as data_format
        data_format_args = {}

    elif schema_get('format', 'name') in ("csv", "tsv"):
        # "format" = {
        #   "name": "csv" | "tsv",
        #   "delimiter": ",",
        #   "quotechar": "\"",
        #   "encoding": "utf-8"
        # }

        # Load as CSV/TSV.
        from messytables import CSVTableSet as data_format

        # Default load parameters.
        data_format_args = {
            "delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
            "quotechar": '"',
            "encoding": None,
        }

        # Override parameters from the schema.
        for n in ("delimiter", "quotechar", "encoding"):
            v = schema_get("format", n)
            if v:
                data_format_args[n] = str(v)  # csv module requires str's not unicode

    else:
        raise UserError("Invalid format name in schema. Allowed values are: csv, tsv.")

    # If the user specifies a ZIP container, then parse the
    # resource_file as a ZIP file and pass the format parameters
    # into ZIPTableSet so it knows how to parse the inner files.
    if schema_get("container", "name") == "zip":
        # "container" = {
        #   "name": "zip"
        # }
        from messytables import ZIPTableSet
        table_set = ZIPTableSet.from_fileobj(resource_file,
                                             inner_data_format=data_format,
                                             inner_parser_args=data_format_args)

    elif schema_get("container", "name") is not None:
        raise UserError("Invalid container name in schema. Allowed values are: zip.")

    # If format parameters were given explicitly, do not use a container.
    # Just parse according to the specified format.
    elif schema_get('format', 'name') is not None:
        table_set = data_format.from_fileobj(resource_file, **data_format_args)

    # If no container and no format was specified, auto-detect everything.
    else:
        # Use the AnyTableSet to guess all parsing parameters.
        from messytables import AnyTableSet
        try:
            table_set = AnyTableSet.from_fileobj(resource_file,
                                                 mimetype=mime_type,
                                                 extension=fileext)
        except Exception as e:
            raise UserError("The file format could not be recognized: %s" % str(e))

        # Provide the container information that may have been guessed.
        if type(table_set).__name__ == "ZIPTableSet":
            schema_set("container", "name", "zip")

    table = table_set.tables[0]

    # Provide the CSV parser settings that may have been guessed.
    if type(table).__name__ == "CSVRowSet":
        schema_set("format", "name", "tsv" if table.delimiter == "\t" else "csv")
        schema_set("format", "delimiter", table.delimiter)
        schema_set("format", "quotechar", table.quotechar)
        schema_set("format", "encoding", table.encoding)

    # Get the column header names and the row offset to the data.
    # Start by guessing.
    from messytables import headers_guess, headers_processor
    offset, headers = headers_guess(table.sample)

    # Overwrite the has_headers and offset values with the schema, if present.
    has_headers = schema_get("header", "present", default=True)
    offset = schema_get("header", "offset", default=offset)

    # Set the header information back into the schema.
    schema_set("header", "present", True)
    schema_set("header", "offset", offset)

    # Override the header names with what is specified in the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            headers[cidx] = col.get("name", headers[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Since SQL column names are not case sensitive, prevent any
    # uniqueness clashes by converting all to lowercase. While
    # we're at it, also trim spaces.
    headers = [h.lower().strip() for h in headers]

    # Ensure the headers are valid SQL-ish & datastore column names:
    #   1st character: letter
    #   subsequent characters: letter, number, or underscore
    for i, header in enumerate(headers):
        # To play nice with international characters, convert to ASCII
        # by replacing extended characters with their closest ASCII
        # equivalent where possible.
        header = u"".join(c for c in unicodedata.normalize('NFKD', header)
                          if not unicodedata.combining(c))

        # Replace any invalid characters with "".
        header = re.sub("[^a-z0-9_]", "", header)

        # If there is an invalid 1st character, prepend a valid start.
        if not re.match("^[a-z]", header):
            header = "field_" + header

        # And force to an ASCII byte string, which should be possible by now.
        headers[i] = str(header)

    # TODO: Check that there is not an insane number of columns.
    # That could crash headers_make_unique.

    # Ensure the headers are unique, and not too long. Postgres
    # supports 63 (64?)-char-long col names, but that's ridiculous.
    from messytables import headers_make_unique
    headers = headers_make_unique(headers, max_length=24)

    # Skip the header row.
    # (Add one to begin with content, not the header.)
    from messytables import offset_processor
    table.register_processor(offset_processor(offset + 1))

    # Try to guess the datatypes.
    import messytables.types
    from messytables import type_guess, types_processor
    datatypes = type_guess(
        table.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateType
        ],
        strict=True
    )

    if len(datatypes) != len(headers):
        raise UserError("Could not guess data types. Column header count "
                        "does not match rows found during type guessing.")

    messytable_datastore_type_mapping = {
        messytables.types.StringType: lambda t: 'text',
        # 'int' may not be big enough, and type detection may not realize
        # it needs to be big.
        messytables.types.IntegerType: lambda t: 'bigint',
        messytables.types.FloatType: lambda t: 'float',
        messytables.types.DecimalType: lambda t: 'numeric',
        messytables.types.DateType: lambda t: 'timestamp:' + t.format,
    }
    datatypes = [messytable_datastore_type_mapping[type(t)](t)
                 for t in datatypes]  # convert objects to strings

    # Override the datatypes from the schema.
    for cidx, col in enumerate(schema_get("columns", default=[])):
        try:
            datatypes[cidx] = col.get("type", datatypes[cidx])
        except IndexError:
            pass  # ignore schema problems?

    # Provide the header names and types back to the user in the schema.
    schema["columns"] = []
    for i in xrange(len(headers)):
        schema["columns"].append({
            "name": headers[i],
            "type": datatypes[i],
        })

    # Validate that the datatypes are all legit.
    for dt in datatypes:
        if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool",
                                    "numeric", "date", "time", "timestamp", "json"):
            raise UserError("Invalid data type in schema: %s" % dt)

    return schema, table
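# A minimal usage sketch for parse_resource, assuming it is called with an
# open binary file object plus the reported MIME type and file extension.
# The file name and values below are hypothetical and error handling is
# omitted:
with open("example.csv", "rb") as fh:
    schema, table = parse_resource(fh, "text/csv", "csv", default_schema=None)
    print schema["columns"]    # e.g. [{"name": "field_a", "type": "text"}, ...]
    for row in table:          # messytables yields each row as a list of Cell objects
        print [cell.value for cell in row]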
def push_to_datastore(self, context, resource):
    try:
        result = download(context, resource, self.max_content_length,
                          DATA_FORMATS)
    except Exception as e:
        logger.exception(e)
        return

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = AnyTableSet.from_fileobj(f, mimetype=content_type,
                                          extension=resource['format'].lower())

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        data_dict = {
            'resource_id': resource['id'],
            'fields': [dict(id=name, type=typename)
                       for name, typename in zip(headers, guessed_type_names)],
            'records': data
        }
        response = logic.get_action('datastore_create')(context, data_dict)
        return response

    # Delete any existing data before proceeding. Otherwise
    # 'datastore_create' will append to the existing datastore. And if the
    # fields have significantly changed, it may also fail.
    logger.info('Deleting existing datastore (it may not exist): '
                '{0}.'.format(resource['id']))
    try:
        logic.get_action('datastore_delete')(context,
                                             {'resource_id': resource['id']})
    except Exception as e:
        logger.exception(e)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count,
        res_id=resource['id']
    ))

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })
    logic.get_action('resource_update')(context, resource)
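# Both push_to_datastore above and _datastorer_upload below index into a
# module-level TYPE_MAPPING that is not included in this excerpt. A plausible
# definition, mirroring the messytables-to-datastore mapping used in
# parse_resource (the exact datastore type names chosen here are an
# assumption):
TYPE_MAPPING = {
    messytables.types.StringType: 'text',
    messytables.types.IntegerType: 'numeric',   # or 'bigint', as in parse_resource
    messytables.types.FloatType: 'float',
    messytables.types.DecimalType: 'numeric',
    messytables.types.DateUtilType: 'timestamp',
}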
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = AnyTableSet.from_fileobj(f, mimetype=content_type,
                                          extension=resource['format'].lower())

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')
    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                                 data=json.dumps(request),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']})
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create'
    # will append to the existing datastore. And if the fields have
    # significantly changed, it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): '
                    '{0}.'.format(resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                                 data=json.dumps({'resource_id': resource['id']}),
                                 headers={'Content-Type': 'application/json',
                                          'Authorization': context['apikey']})
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(
                get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(itertools.imap(dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s'
                                  % (response.status_code, response.content))
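# A hypothetical invocation of _datastorer_upload. The context dict must carry
# at least 'site_url' and 'apikey', both of which are read above; every value
# shown here is a placeholder:
import logging

context = {
    'site_url': 'http://localhost:5000',
    'apikey': 'my-ckan-api-key',
}
resource = {
    'id': 'resource-uuid',
    'url': 'http://example.org/data.csv',
    'format': 'CSV',
}
_datastorer_upload(context, resource, logging.getLogger(__name__))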