Example #1
            raise
        except Exception as e:
            if i < max_retries:
                logger.error("Error while performing download: %r. Retrying...", e)
                sleep(5 * i)
                continue
            else:
                raise
        else:
            break

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = AnyTableSet.from_fileobj(f, mimetype=content_type, extension=resource['format'].lower())

    # only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
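
Example #1 opens with the tail of a retry loop wrapped around a download call. A minimal sketch of that pattern, under the assumption that the loop retries with a linearly growing pause (the names call_with_retries and fn are hypothetical), might look like:

import logging
from time import sleep

logger = logging.getLogger(__name__)

def call_with_retries(fn, max_retries=3):
    # Hypothetical helper mirroring the loop above: call fn(), and on failure
    # sleep 5 * attempt seconds before retrying, re-raising after the last attempt.
    for i in range(1, max_retries + 1):
        try:
            result = fn()
        except Exception as e:
            if i < max_retries:
                logger.error("Error while performing download: %r. Retrying...", e)
                sleep(5 * i)
                continue
            raise
        else:
            break
    return result
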
Example #2
def parse_resource(resource_file, mime_type, fileext, default_schema):
	# Given resource data, returns a tuple of
	#  * the schema used to load the file
	#  * the resource as a table, as given by messytables (an iterator over rows) 
	
	# Schema defaults. We'll build up the schema with defaults
	# from the actual resource so that it is easy for the data
	# owner to customize the schema later.
	schema = {
		"_format": 1,
	}
	
	# Update defaults from the provided schema.
	if default_schema:
		schema.update(default_schema)
	else:
		schema["auto"] = True
	
	# Utility function that's like dict.get() but works on nested
	# dicts and takes a path through to dicts as arguments. Returns
	# None if no value is found.
	#   e.g. schema_get('format', 'name')
	#        This returns schema["format"]["name"], or None if
	#        "format" isn't in schema or "name" isn't in schema["format"].
	def schema_get(*path, **kwargs):
		if len(path) < 1: raise ValueError()
		v = schema
		for p in path: v = v.get(p, {})
		if v == { }: v = kwargs.get("default", None)
		return v
	
	# Utility function that sets a value in a set of nested dicts.
	# Takes a path plus a value as arguments.
	#   e.g. schema_set('format', 'name', 'zip')
	#        This is equivalent to:
	#          schema["format"]["name"] = "zip"
	#        but creating dicts as necessary along the way.
	def schema_set(*path_and_value):
		if len(path_and_value) < 2: raise ValueError()
		path = path_and_value[0:-2]
		field = path_and_value[-2]
		value = path_and_value[-1]
		container = schema
		for p in path:
			container = container.setdefault(p, {})
		container[field] = value
	
	# Parse the resource_file.
	
	# Get the table data format.
	
	if schema_get('format', 'name') == None:
		# Auto-detect format.
		from messytables import AnyTableSet as data_format
		data_format_args = { }
		
	elif schema_get('format', 'name') in ("csv", "tsv"):
		# "format" = {
		#   "name": "csv" | "tsv",
		#   "delimiter": ",",
		#   "quotechar": "\"",
		#   "encoding": "utf-8"
		# }
		
		# Load as CSV/TSV.
		from messytables import CSVTableSet as data_format
		
		# Default load parameters.
		data_format_args = {
			"delimiter": "," if schema_get('format', 'name') == "csv" else "\t",
			"quotechar": '"',
			"encoding": None,
		}
		
		# Override parameters from the schema.
		for n in ("delimiter", "quotechar", "encoding"):
			v = schema_get("format", n)
			if v:
				data_format_args[n] = str(v) # csv module requires str's not unicode
		
	else:
		raise UserError("Invalid format name in schema. Allowed values are: csv, tsv.")
		
	# If the user specifies a ZIP container, then parse the
	# resource_file as a ZIP file and pass the format parameters
	# into ZIPTableSet so it knows how to parse the inner files.
	
	if schema_get("container", "name") == "zip":
		# "container = {
		#   "name": "zip"
		# }
		
		from messytables import ZIPTableSet
		table_set = ZIPTableSet.from_fileobj(resource_file,
			inner_data_format=data_format,
			inner_parser_args=data_format_args)

	elif schema_get("container", "name") != None:
		raise UserError("Invalid container name in schema. Allowed values are: zip.")

	# If format parameters were given explicitly, do not use a container.
	# Just parse according to the specified format.

	elif schema_get('format', 'name') != None:
		table_set = data_format.from_fileobj(resource_file, **data_format_args)
		
	# If no container and no format was specified, auto-detect everything.
	
	else:
		# Use the AnyTableSet to guess all parsing parameters.
		from messytables import AnyTableSet
		try:
			table_set = AnyTableSet.from_fileobj(resource_file, mimetype=mime_type, extension=fileext)
		except Exception as e:
			raise UserError("The file format could not be recognized: %s" % str(e))
		
		# Provide the container information that may have been guessed.
		if type(table_set).__name__ == "ZIPTableSet":
			schema_set("container", "name", "zip")
		
	table = table_set.tables[0]

	# Provide the CSV parser settings that may have been guessed.
	if type(table).__name__ == "CSVRowSet":
		schema_set("format", "name", "tsv" if table.delimiter == "\t" else "csv")
		schema_set("format", "delimiter", table.delimiter)
		schema_set("format", "quotechar", table.quotechar)
		schema_set("format", "encoding", table.encoding)
        
	# Get the column header names and the row offset to the data.
	
	# Start by guessing.
	from messytables import headers_guess, headers_processor
	offset, headers = headers_guess(table.sample)
	
	# Overwrite the has_headers and offset values with the schema, if present.
	has_headers = schema_get("header", "present", default=True)
	offset = schema_get("header", "offset", default=offset)
	
	# Set the header information back into the schema.
	schema_set("header", "present", True)
	schema_set("header", "offset", offset)
	
	# Override the header names with what is specified in the schema.
	for cidx, col in enumerate(schema_get("columns", default=[])):
		try:
			headers[cidx] = col.get("name", headers[cidx])
		except IndexError:
			pass # ignore schema problems?
	
	# Since SQL column names are not case sensitive, prevent any
	# uniqueness clashes by converting all to lowercase. While
	# we're at it, also trim spaces.
	headers = [h.lower().strip() for h in headers]
	
	# Ensure the headers are valid SQL-ish & datastore column names:
	#  1st character: letter
	#  subsequent characters: letter, number, or underscore
	for i, header in enumerate(headers):
		# To play nice with international characters, convert to ASCII
		# by replacing extended characters with their closest ASCII
		# equivalent where possible.
		header = u"".join(c for c in unicodedata.normalize('NFKD', header)
			if not unicodedata.combining(c))
		
		# Replace any invalid characters with "".
		header = re.sub("[^a-z0-9_]", "", header)
		
		# If there is an invalid 1st character, prepend a valid start.
		if not re.match("^[a-z]", header):
			header = "field_" + header
			
		# And force to an ASCII byte string, which should be possible by now.
		headers[i] = str(header)

	# TODO: Check that there is not an insane number of columns.
	# That could crash headers_make_unique. 

	# Ensure the headers are unique, and not too long. Postgres
	# supports 63-character column names, but that's ridiculous.
	from messytables import headers_make_unique
	headers = headers_make_unique(headers, max_length=24)
	
	# Skip the header row.
	# (Add one to begin with content, not the header.)
	from messytables import offset_processor
	table.register_processor(offset_processor(offset + 1))
	
	# Try to guess the datatypes.
	import messytables.types
	from messytables import type_guess, types_processor
	datatypes = type_guess(
		table.sample,
		[
			messytables.types.StringType,
			messytables.types.IntegerType,
			messytables.types.FloatType,
			messytables.types.DecimalType,
			messytables.types.DateType
		],
		strict=True
		)
	 
	if len(datatypes) != len(headers):
		raise UserError("Could not guess data types. Column header count does not match rows found during type guessing.")
	messytable_datastore_type_mapping = {
		messytables.types.StringType: lambda t : 'text',
		messytables.types.IntegerType: lambda t : 'bigint',  # 'int' may not be big enough,
						# and type detection may not realize it needs to be big
		messytables.types.FloatType: lambda t : 'float',
		messytables.types.DecimalType: lambda t : 'numeric',
		messytables.types.DateType: lambda t : 'timestamp:' + t.format,
	}
	datatypes = [messytable_datastore_type_mapping[type(t)](t) for t in datatypes] # convert objects to strings
	
	# Override the datatypes from the schema.
	for cidx, col in enumerate(schema_get("columns", default=[])):
		try:
			datatypes[cidx] = col.get("type", datatypes[cidx])
		except IndexError:
			pass # ignore schema problems?
	
	# Provide the header names and types back to the user in the schema.
	schema["columns"] = []
	for i in xrange(len(headers)):
		schema["columns"].append({
			"name": headers[i],
			"type": datatypes[i],
		})
		
	# Validate that the datatypes are all legit.
	for dt in datatypes:
		if dt.split(":")[0] not in ("text", "int", "bigint", "float", "bool", "numeric", "date", "time", "timestamp", "json"):
			raise UserError("Invalid data type in schema: %s" % dt)
			
	return schema, table
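
A minimal usage sketch for parse_resource, assuming a local CSV file and no pre-existing schema. The file name is made up, and the module-level dependencies the function relies on (re, unicodedata and a UserError exception class) are assumed to be defined next to it:

with open("example.csv", "rb") as resource_file:  # hypothetical input file
    schema, table = parse_resource(resource_file,
                                   mime_type="text/csv",
                                   fileext="csv",
                                   default_schema=None)

print(schema["columns"])  # e.g. [{"name": "year", "type": "bigint"}, ...]
for row in table:         # messytables yields each row as a list of Cell objects
    print([cell.value for cell in row])
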
Example #3
    def push_to_datastore(self, context, resource):
        try:
            result = download(
                context,
                resource,
                self.max_content_length,
                DATA_FORMATS
            )
        except Exception as e:
            logger.exception(e)
            return
        content_type = result['headers'].get('content-type', '')\
                                        .split(';', 1)[0]  # remove parameters

        f = open(result['saved_file'], 'rb')
        table_sets = AnyTableSet.from_fileobj(
            f,
            mimetype=content_type,
            extension=resource['format'].lower()
        )

        # only the first sheet in xls for the time being
        row_set = table_sets.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))
        row_set.register_processor(offset_processor(offset + 1))
        row_set.register_processor(datetime_procesor())

        logger.info('Header offset: {0}.'.format(offset))

        guessed_types = type_guess(
            row_set.sample,
            [
                messytables.types.StringType,
                messytables.types.IntegerType,
                messytables.types.FloatType,
                messytables.types.DecimalType,
                messytables.types.DateUtilType
            ],
            strict=True
        )
        logger.info('Guessed types: {0}'.format(guessed_types))
        row_set.register_processor(types_processor(guessed_types, strict=True))
        row_set.register_processor(stringify_processor())

        guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in
                              guessed_types]

        def send_request(data):
            data_dict = {
                'resource_id': resource['id'],
                'fields': [dict(id=name, type=typename) for name, typename
                           in zip(headers, guessed_type_names)],
                'records': data
            }
            response = logic.get_action('datastore_create')(
                context,
                data_dict
            )
            return response

        # Delete any existing data before proceeding. Otherwise
        # 'datastore_create' will append to the existing datastore. And if the
        # fields have significantly changed, it may also fail.
        logger.info('Deleting existing datastore (it may not exist): '
                    '{0}.'.format(resource['id']))
        try:
            logic.get_action('datastore_delete')(
                context,
                {'resource_id': resource['id']}
            )
        except Exception as e:
            logger.exception(e)

        logger.info('Creating: {0}.'.format(resource['id']))

        # generates chunks of data that can be loaded into ckan
        # n is the maximum size of a chunk
        def chunky(iterable, n):
            it = iter(iterable)
            while True:
                chunk = list(
                    itertools.imap(
                        dict, itertools.islice(it, n)))
                if not chunk:
                    return
                yield chunk

        count = 0
        for data in chunky(row_set.dicts(), 100):
            count += len(data)
            send_request(data)

        logger.info("There should be {n} entries in {res_id}.".format(
            n=count,
            res_id=resource['id']
        ))

        resource.update({
            'webstore_url': 'active',
            'webstore_last_updated': datetime.datetime.now().isoformat()
        })

        logic.get_action('resource_update')(context, resource)
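
The chunky helper above batches the row iterator into lists of at most n rows before each datastore_create call. A standalone sketch of the same batching idea, using plain map (which behaves like the itertools.imap call above for this purpose):

import itertools

def chunky(iterable, n):
    # Yield lists of at most n dicts built from the items of iterable.
    it = iter(iterable)
    while True:
        chunk = list(map(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk

# Five rows with a batch size of 2 produce batches of 2, 2 and 1.
rows = [[("value", i)] for i in range(5)]  # stand-in for row_set.dicts()
for batch in chunky(rows, 2):
    print(batch)  # [{'value': 0}, {'value': 1}], then [{'value': 2}, ...], ...
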
Example #4
def _datastorer_upload(context, resource, logger):
    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')
    table_sets = AnyTableSet.from_fileobj(f, mimetype=content_type, extension=resource['format'].lower())

    # only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    # Delete any existing data before proceeding. Otherwise 'datastore_create' will
    # append to the existing datastore. And if the fields have significantly changed,
    # it may also fail.
    try:
        logger.info('Deleting existing datastore (it may not exist): {0}.'.format(resource['id']))
        response = requests.post('%s/api/action/datastore_delete' % (ckan_url),
                        data=json.dumps({'resource_id': resource['id']}),
                        headers={'Content-Type': 'application/json',
                                'Authorization': context['apikey']}
                        )
        if not response.status_code or response.status_code not in (200, 404):
            # skips 200 (OK) or 404 (datastore does not exist, no need to delete it)
            logger.error('Deleting existing datastore failed: {0}'.format(get_response_error(response)))
            raise DatastorerException("Deleting existing datastore failed.")
    except requests.exceptions.RequestException as e:
        logger.error('Deleting existing datastore failed: {0}'.format(str(e)))
        raise DatastorerException("Deleting existing datastore failed.")

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    resource.update({
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    })

    response = requests.post(
        ckan_request_url,
        data=json.dumps(resource),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('CKAN bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))
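
For reference, send_request above posts a JSON body to datastore_create built from the resource id, the guessed headers and types, and one chunk of records. A sketch of that payload shape, with purely illustrative values:

import json

payload = {
    'resource_id': 'aaaa-bbbb-cccc',  # hypothetical resource id
    'fields': [{'id': 'year', 'type': 'bigint'},
               {'id': 'amount', 'type': 'numeric'}],
    'records': [{'year': 2011, 'amount': '12.50'},
                {'year': 2012, 'amount': '13.75'}],
}
print(json.dumps(payload, indent=2))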