def score_by_url_extension(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from the file
    extension found in its URL.

    Each step of the decision is explained by a translated string that is
    appended to score_reasons. The log argument is accepted but not used
    here.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) if the URL yields no extension, or if none of the
        extension variants corresponds to a known format
      * If the format is recognised but no (truthy) score is configured
        for it, the score defaults to 1
    '''
    candidates = extension_variants(resource.url.strip())
    if not candidates:
        score_reasons.append(
            _('Could not determine a file extension in the URL.'))
        return (None, None)

    for candidate in candidates:
        known_format = format_get(candidate)
        if not known_format:
            # Record the miss and try the next extension variant.
            score_reasons.append(
                _('URL extension "%s" is an unknown format.') % candidate)
            continue

        configured = lib.resource_format_scores().get(known_format)
        if configured:
            reason = _(
                'URL extension "%s" relates to format "%s" '
                'and receives score: %s.'
            ) % (candidate, known_format, configured)
            score_reasons.append(reason)
            return configured, known_format

        # Recognised format without a configured score: use the default.
        fallback = 1
        reason = _(
            'URL extension "%s" relates to format "%s" '
            'but a score for that format is not configured, '
            'so giving it default score %s.'
        ) % (candidate, known_format, fallback)
        score_reasons.append(reason)
        return fallback, known_format

    return (None, None)
def score_by_format_field(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from the value of its
    "format" field.

    A translated explanation of each decision is appended to
    score_reasons. The log argument is accepted but not used here.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) if the field is blank or does not match a known
        format
      * score is None if the matched format has no configured score
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # First try the lower-cased value as-is, then its canonical munged form.
    match = ckan_helpers.resource_formats().get(declared.lower())
    if not match:
        match = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))

    if not match:
        score_reasons.append(
            _('Format field "%s" does not correspond to a known format.')
            % declared)
        return (None, None)

    score = lib.resource_format_scores().get(match[1])
    score_reasons.append(
        _('Format field "%s" receives score: %s.') % (declared, score))
    return (score, match[1])
def score_by_url_extension(resource, score_reasons, log):
    '''
    Determine a resource's format and openness score from the extension
    of its URL.

    An explanation of every decision is appended to score_reasons. The
    log argument is accepted but not used here.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) when no extension can be found in the URL, or when no
        extension variant maps to a known format
      * A recognised format with no (truthy) configured score gets the
        default score of 1
    '''
    variants = extension_variants(resource.url.strip())
    if not variants:
        score_reasons.append(
            'Could not determine a file extension in the URL.')
        return (None, None)

    for ext in variants:
        fmt = format_get(ext)
        if not fmt:
            # Unknown extension: explain and fall through to the next one.
            score_reasons.append(
                'URL extension "%s" is an unknown format.' % ext)
            continue

        format_score = lib.resource_format_scores().get(fmt)
        if format_score:
            template = ('URL extension "%s" relates to format "%s" '
                        'and receives score: %s.')
        else:
            # No score configured for this format: apply the default.
            format_score = 1
            template = ('URL extension "%s" relates to format "%s" '
                        'but a score for that format is not configured, '
                        'so giving it default score %s.')
        score_reasons.append(template % (ext, fmt, format_score))
        return format_score, fmt

    return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Looks inside a data file to determine its format and openness score.

    It adds translated strings to the score_reasons list explaining how it
    came to its conclusion.

    Arguments:
      * archival - archival record for the resource (may be None); its
        cache_filepath, status_id, is_broken and reason attributes are read
      * resource - not used by this function
      * score_reasons - list that explanation strings are appended to
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    not_downloaded = _(
        'This file had not been downloaded at the time of scoring it.')

    if not archival:
        score_reasons.append(not_downloaded)
        return (None, None)

    filepath = archival.cache_filepath
    if not filepath:
        # There is an archival record but no cached copy. Explain why, when
        # the archival status tells us. This must be checked before the
        # file is looked for on disk, otherwise these explanations can
        # never be reached.
        if archival.status_id == Status.by_text('Chose not to download'):
            score_reasons.append(
                _('File was not downloaded deliberately') + '. ' +
                _('Reason') + ': %s. ' % archival.reason +
                _('Using other methods to determine file openness.'))
        elif archival.is_broken is None and archival.status_id:
            # i.e. 'Download failure' or 'System error during archival'
            score_reasons.append(
                _('A system error occurred during downloading this file') +
                '. ' + _('Reason') + ': %s. ' % archival.reason +
                _('Using other methods to determine file openness.'))
        else:
            score_reasons.append(not_downloaded)
        return (None, None)

    # Analyse the cached file
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)

    score = lib.resource_format_scores().get(sniffed_format['format'])
    score_reasons.append(
        _('Content of file appeared to be format "%s" '
          'which receives openness score: %s.')
        % (sniffed_format['format'], score))
    return score, sniffed_format['format']
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Looks inside a data file to determine its format and openness score.

    It adds strings to the score_reasons list explaining how it came to
    its conclusion.

    Arguments:
      * archival - archival record for the resource (may be None); its
        cache_filepath, status_id, is_broken and reason attributes are read
      * resource - not used by this function
      * score_reasons - list that explanation strings are appended to
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    """
    not_downloaded = "This file had not been downloaded at the time of scoring it."

    if not archival:
        score_reasons.append(not_downloaded)
        return (None, None)

    filepath = archival.cache_filepath
    if not filepath:
        # There is an archival record but no cached copy. Explain why, when
        # the archival status tells us. This must be checked before the
        # file is looked for on disk, otherwise these explanations can
        # never be reached.
        if archival.status_id == Status.by_text("Chose not to download"):
            score_reasons.append(
                "File was not downloaded deliberately. Reason: %s. Using other methods to determine file openness."
                % archival.reason
            )
        elif archival.is_broken is None and archival.status_id:
            # i.e. 'Download failure' or 'System error during archival'
            score_reasons.append(
                "A system error occurred during downloading this file. Reason: %s. Using other methods to determine file openness."
                % archival.reason
            )
        else:
            score_reasons.append(not_downloaded)
        return (None, None)

    # Analyse the cached file
    if not os.path.exists(filepath):
        score_reasons.append('Cache filepath does not exist: "%s".' % filepath)
        return (None, None)

    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append("The format of the file was not recognized from its contents.")
        return (None, None)

    score = lib.resource_format_scores().get(sniffed_format["format"])
    score_reasons.append(
        'Content of file appeared to be format "%s" which receives openness score: %s.'
        % (sniffed_format["format"], score)
    )
    return score, sniffed_format["format"]
def score_by_format_field(resource, score_reasons, log):
    '''
    Determine a resource's format and openness score from its "format"
    field.

    An explanation of each decision is appended to score_reasons. The log
    argument is accepted but not used here.

    Return values:
      * Always a tuple: (score, format_string)
      * (None, None) when the field is blank or matches no known format
      * score is None when the matched format has no configured score
    '''
    field_value = resource.format or ''
    if not field_value:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Look up the lower-cased value first; fall back to the canonical form.
    known = ckan_helpers.resource_formats().get(field_value.lower())
    if not known:
        known = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(field_value))

    if not known:
        score_reasons.append(
            'Format field "%s" does not correspond to a known format.'
            % field_value)
        return (None, None)

    score = lib.resource_format_scores().get(known[1])
    score_reasons.append(
        'Format field "%s" receives score: %s.' % (field_value, score))
    return (score, known[1])
# GTFS check - a GTFS is a zip which containing specific filenames filenames = set((os.path.basename(f) for f in filepaths)) if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt', 'calendar.txt')) - set(filenames)): log.info('GTFS detected') return {'format': 'GTFS'} top_score = 0 top_scoring_extension_counts = defaultdict( int) # extension: number_of_files for filepath in filepaths: extension = os.path.splitext(filepath)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0]
zip.close() except zipfile.BadZipfile, e: log.info('Zip file open raised error %s: %s', e, e.args) return except Exception, e: log.warning('Zip file open raised exception %s: %s', e, e.args) return top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filename in filenames: extension = os.path.splitext(filename)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0] log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
def validate_resource(self, id):
    '''
    Handle the "validate, then upload" workflow for a new resource of the
    dataset identified by ``id``.

    A non-POST request just renders the validation page. A POST request is
    handled in one of three ways, depending on the request parameters:

      * ``check_schema`` - build a schema from the submitted field names
        and types, validate the stored file against it and render the
        report together with the openness score of the sniffed format.
      * ``upload_data`` - read the resource metadata saved in Redis,
        activate the dataset, create the resource from the stored file,
        delete the stored file and redirect to the dataset page.
      * otherwise - store the uploaded file under ``ckan.storage_path``,
        read its headers, save the resource metadata in Redis for one day
        and render the page with the detected fields.

    Returns the rendered template, or the result of the redirect.
    '''
    template = 'tayside/package/validate_resource.html'

    if toolkit.request.method != 'POST':
        return toolkit.render(template, {'pkg_name': id})

    data = dict_fns.unflatten(
        tuplize_dict(parse_params(toolkit.request.POST)))
    check_schema = toolkit.request.params.get('check_schema')
    upload_data = toolkit.request.params.get('upload_data')

    # NOTE(review): file_path is taken from the request and is later
    # opened, validated and removed as-is - confirm that it is restricted
    # to files written by this action.
    file_path = data.get('file_path')

    # Logic for validating a resource against a specified schema
    if check_schema:
        field_names = data.get('field_name')
        field_types = data.get('field_type')

        # Schema is populated from data entered by the user
        schema = {
            'fields': [
                {'name': name, 'type': field_types[i]}
                for i, name in enumerate(field_names)
            ]
        }

        # File is validated with Goodtables
        report = validate(file_path, schema=schema)

        log = logging.getLogger('ckanext.tayside')

        # Score is calculated based on Sir Tim Berners-Lee's five star
        # of openness. An unrecognised format is left without a score
        # rather than failing the whole request.
        sniffed_format = sniff_file_format(file_path, log)
        score = None
        if sniffed_format:
            score = resource_format_scores().get(sniffed_format['format'])

        return toolkit.render(template, extra_vars={
            'report': report,
            'pkg_name': id,
            'stars': score,
            'file_path': file_path
        })

    # Handles creating a resource in CKAN
    if upload_data:
        upload_failed = {
            'upload_error': 'An error occured while creating the '
                            'resource.',
            'pkg_name': id
        }

        # Metadata for the resource is stored in Redis. The key expires,
        # so it may be gone by the time the user confirms the upload.
        r = redis.StrictRedis()
        stored = r.get(file_path)
        if stored is None:
            return toolkit.render(template, extra_vars=upload_failed)

        data = json.loads(stored)
        data['package_id'] = id

        # Dataset's state is changed from 'draft' to 'active'
        toolkit.get_action('package_patch')({}, {
            'id': id,
            'state': 'active'
        })

        # FieldStorage instance is created which is needed to upload
        # the file to Filestore and Datastore
        fs = cgi.FieldStorage()
        fs.file = fs.make_file()
        fs.filename = data.get('url')

        # NOTE(review): text mode is kept as-is; confirm it is right for
        # the binary formats (xls, xlsx, ods) on every platform.
        with open(file_path, 'r') as stored_file:
            fs.file.write(stored_file.read())
        fs.file.seek(0)

        data['upload'] = fs

        try:
            toolkit.get_action('resource_create')({}, data)
        except Exception:
            # Deliberately broad so the user always gets the error page,
            # but keep the traceback for whoever has to debug it.
            logging.getLogger('ckanext.tayside').exception(
                'Could not create resource for dataset %s', id)
            return toolkit.render(template, extra_vars=upload_failed)

        # File is uploaded on Filestore, and now it is safe to be
        # removed from the temporary location
        os.remove(file_path)

        return toolkit.redirect_to(controller='package', action='read',
                                   id=id)

    # First step: a file has been submitted for validation
    is_upload = isinstance(data.get('upload'), cgi.FieldStorage)
    supported_formats = ['csv', 'tsv', 'xls', 'xlsx', 'ods']
    upload_name = data.get('url') or ''
    current_format = upload_name.split('.')[-1]

    if not is_upload:
        return toolkit.render(template, extra_vars={
            'format_error': 'No file provided for validation.',
            'pkg_name': id
        })

    if current_format not in supported_formats:
        return toolkit.render(template, extra_vars={
            'format_error': 'Format not supported.',
            'pkg_name': id
        })

    # Logic for storing the file locally and extracting its headers
    # (fields). Only the final path component of the submitted name is
    # used, so the file cannot be written outside the storage directory.
    storage_path = config.get('ckan.storage_path')
    file_path = storage_path + '/' + os.path.basename(upload_name)

    # Read the uploaded file from the start
    uploaded = data.get('upload').file
    uploaded.seek(0)

    # Write the file locally
    with open(file_path, 'w') as local_copy:
        local_copy.write(uploaded.read())

    # Inspect the headers (fields) of the file
    with Stream(file_path, headers=1) as stream:
        fields = stream.headers

    if not is_redis_available():
        return toolkit.render(template,
                              {'redis_error': 'Redis not available'})

    # Store the metadata of the resource in Redis for later usage
    r = redis.StrictRedis()
    resource_data = {
        'name': data.get('name'),
        'description': data.get('description'),
        'format': data.get('format'),
        'url': data.get('url'),
    }
    r.set(file_path, json.dumps(resource_data))

    # Store it for 1 day
    r.expire(file_path, 86400)

    return toolkit.render(template, extra_vars={
        'fields': fields,
        'pkg_name': id,
        'file_path': file_path
    })
def custom_resource_score(self, resource, resource_score):
    '''
    Adjust the score computed by QA for formats that need custom scoring.

    Arguments:
      * resource - the resource being scored; its format, url, url_type
        and (when present) extras attributes are read
      * resource_score - dict holding QA's result; its 'format',
        'openness_score' and 'openness_score_reason' keys may be
        overwritten in place

    Adjustments made:
      * A CSV scoring 3 that has a schema and a successful validation
        status is raised to 4.
      * For any resource with a score above 0:
          - sniffed TIFF whose declared format is GEOTIFF gets the
            GEOTIFF score
          - sniffed ZIP whose declared format contains GDB becomes GDB
          - a declared format containing GPKG, confirmed by the URL
            extension, becomes GPKG

    A missing or None score or format is treated as 0 / empty string.

    Returns the same resource_score dict.
    '''
    def reason_for(format_name, score):
        # Look up the translation of the template first and substitute
        # afterwards; an already formatted string can never match a
        # message id in the translation catalogue.
        return toolkit._(
            'Content of file appeared to be format "{0}" '
            'which receives openness score: {1}.'
        ).format(format_name, score)

    resource_score_format = (resource_score.get('format') or '').upper()
    resource_format = (resource.format or '').upper()

    # If resource openness_score is 3 and format is CSV
    if ((resource_score.get('openness_score') or 0) == 3
            and resource_score_format == 'CSV'):
        # If resource has a JSON schema which validated successfully,
        # set score to 4
        extras = getattr(resource, 'extras', None)
        if (extras
                and extras.get('schema')
                and (extras.get('validation_status') or '').lower()
                == 'success'):
            resource_score['openness_score'] = 4
            resource_score['openness_score_reason'] = reason_for(
                resource_score_format, 4)

    if (resource_score.get('openness_score') or 0) > 0:
        # QA cannot determine file formats that are not part of its own
        # 'resource_format_openness_scores.json' file and CKAN
        # resource_formats.json file.
        # The below are dataqld specific file formats that are not part of
        # the default CKAN resource_formats.json file and need custom
        # scoring.

        # If QA believes the resource is a TIFF file, check the resource
        # format selected, if it's GEOTIFF apply custom score
        if resource_score_format == 'TIFF' and resource_format == 'GEOTIFF':
            geotiff_score = qa_lib.resource_format_scores().get(
                resource_format)
            resource_score['openness_score'] = geotiff_score
            resource_score['openness_score_reason'] = reason_for(
                resource_format, geotiff_score)

        # If QA believes the resource is a ZIP file, check the resource
        # format selected, if it's GDB apply custom score
        if resource_score_format == 'ZIP' and 'GDB' in resource_format:
            resource_score['format'] = 'GDB'
            gdb_score = qa_lib.resource_format_scores().get(
                resource_score['format'])
            resource_score['openness_score'] = gdb_score
            resource_score['openness_score_reason'] = reason_for(
                resource_format, gdb_score)

        # QA by default does not know how to handle GPKG formats, check
        # the resource format selected and extension, if it's GPKG apply
        # custom score
        if 'GPKG' in resource_format:
            if ((resource.url_type == 'upload'
                 and 'GPKG' in os.path.splitext(resource.url)[1].upper())
                    or (resource.url_type == 'url'
                        and 'GPKG' in (
                            ext.upper() for ext in
                            qa_tasks.extension_variants(resource.url)))):
                resource_score['format'] = 'GPKG'
                gpkg_score = qa_lib.resource_format_scores().get(
                    resource_score['format'])
                resource_score['openness_score'] = gpkg_score
                resource_score['openness_score_reason'] = reason_for(
                    resource_format, gpkg_score)

    return resource_score