def score_by_format_field(resource, score_reasons):
    '''
    Work out a resource's format and score from its declared format field.

    A message explaining each decision is appended to score_reasons.

    :param resource: object whose ``format`` attribute holds the declared
        format; an empty or None value is treated as blank.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when the field is blank or matches no known format.
        * score is None when the matched format has no entry in
          lib.resource_format_scores().
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Try the lower-cased field first; only when that finds nothing, retry
    # with the key produced by lib.munge_format_to_be_canonical().
    match = ckan_helpers.resource_formats().get(declared.lower())
    if not match:
        canonical_key = lib.munge_format_to_be_canonical(declared)
        match = ckan_helpers.resource_formats().get(canonical_key)
    if not match:
        reason = _('Format field "%s" does not correspond to a known format.') % declared
        score_reasons.append(reason)
        return (None, None)

    # Element 1 of the matched entry is what gets scored and returned.
    format_name = match[1]
    score = lib.resource_format_scores().get(format_name)
    reason = _('Format field "%s" receives score: %s.') % (declared, score)
    score_reasons.append(reason)
    return (score, format_name)
def score_by_url_extension(resource, score_reasons):
    '''
    Work out a resource's format and score from the extension in its URL.

    The candidates returned by extension_variants() for the stripped URL
    are tried in order and the first one that format_get() resolves to a
    format decides the result. A message explaining each decision is
    appended to score_reasons.

    :param resource: object whose ``url`` attribute is a string.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when no extension could be determined or none of
          the candidates resolves to a format.
        * When a format is found but lib.resource_format_scores() has no
          entry for it, the default score 1 is returned with the format.
    '''
    candidates = extension_variants(resource.url.strip())
    if not candidates:
        score_reasons.append(
            _('Could not determine a file extension in the URL.'))
        return (None, None)

    for extension in candidates:
        format_ = format_get(extension)
        if not format_:
            score_reasons.append(
                _('URL extension "%s" is an unknown format.') % extension)
            continue

        score = lib.resource_format_scores().get(format_)
        # Compare against None instead of relying on truthiness: a
        # configured score of 0 is a real score, not a missing one, and
        # must not be replaced by the default.
        if score is not None:
            score_reasons.append(
                _('URL extension "%s" relates to format "%s" and receives score: %s.')
                % (extension, format_, score))
            return score, format_

        score = 1
        score_reasons.append(
            _('URL extension "%s" relates to format "%s"'
              ' but a score for that format is not configured, so giving it default score %s.')
            % (extension, format_, score))
        return score, format_

    return (None, None)
def score_by_url_extension(resource, score_reasons):
    '''
    Work out a resource's format and score from the extension in its URL.

    The candidates returned by extension_variants() for the stripped URL
    are tried in order and the first one that format_get() resolves to a
    format decides the result. A message explaining each decision is
    appended to score_reasons.

    :param resource: object whose ``url`` attribute is a string.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when no extension could be determined or none of
          the candidates resolves to a format.
        * When a format is found but lib.resource_format_scores() has no
          entry for it, the default score 1 is returned with the format.
    '''
    candidates = extension_variants(resource.url.strip())
    if not candidates:
        score_reasons.append(
            _('Could not determine a file extension in the URL.'))
        return (None, None)

    for extension in candidates:
        format_ = format_get(extension)
        if not format_:
            score_reasons.append(
                _('URL extension "%s" is an unknown format.') % extension)
            continue

        score = lib.resource_format_scores().get(format_)
        # Compare against None instead of relying on truthiness: a
        # configured score of 0 is a real score, not a missing one, and
        # must not be replaced by the default.
        if score is not None:
            score_reasons.append(
                _('URL extension "%s" relates to format "%s" and receives score: %s.')
                % (extension, format_, score))
            return score, format_

        score = 1
        score_reasons.append(
            _('URL extension "%s" relates to format "%s"'
              ' but a score for that format is not configured, so giving it default score %s.')
            % (extension, format_, score))
        return score, format_

    return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Work out a resource's format and score from the contents of its
    cached file.

    A message explaining each decision is appended to score_reasons.

    :param archival: object with a ``cache_filepath`` attribute, or a
        falsy value when there is no archival record.
    :param resource: not used by this function; kept so the signature
        stays the same for callers.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when there is no cached file path, the path does
          not exist on disk, or sniff_file_format() recognises nothing.
        * score is None when the sniffed format has no entry in
          lib.resource_format_scores().
    '''
    # No archival record, or nothing was cached for it. This guard also
    # covers every "empty cache path" case, so no later check is needed.
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    # Analyse the cached file
    sniffed_format = sniff_file_format(filepath)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)

    format_name = sniffed_format['format']
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append(
        _('Content of file appeared to be format "%s" which receives openness score: %s.')
        % (format_name, score))
    return score, format_name
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Work out a resource's format and score from the contents of its
    cached file.

    A message explaining each decision is appended to score_reasons.

    :param archival: object with a ``cache_filepath`` attribute, or a
        falsy value when there is no archival record.
    :param resource: not used by this function; kept so the signature
        stays the same for callers.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when there is no cached file path, the path does
          not exist on disk, or sniff_file_format() recognises nothing.
        * score is None when the sniffed format has no entry in
          lib.resource_format_scores().
    '''
    # No archival record, or nothing was cached for it. This guard also
    # covers every "empty cache path" case, so no later check is needed.
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    # Analyse the cached file
    sniffed_format = sniff_file_format(filepath)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)

    format_name = sniffed_format['format']
    score = lib.resource_format_scores().get(format_name)
    score_reasons.append(
        _('Content of file appeared to be format "%s" which receives openness score: %s.')
        % (format_name, score))
    return score, format_name
def score_by_format_field(resource, score_reasons):
    '''
    Work out a resource's format and score from its declared format field.

    A message explaining each decision is appended to score_reasons.

    :param resource: object whose ``format`` attribute holds the declared
        format; an empty or None value is treated as blank.
    :param score_reasons: list that receives the explanation strings.
    :returns: a tuple (score, format_string).
        * (None, None) when the field is blank or matches no known format.
        * score is None when the matched format has no entry in
          lib.resource_format_scores().
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Try the lower-cased field first; only when that finds nothing, retry
    # with the key produced by lib.munge_format_to_be_canonical().
    match = ckan_helpers.resource_formats().get(declared.lower())
    if not match:
        canonical_key = lib.munge_format_to_be_canonical(declared)
        match = ckan_helpers.resource_formats().get(canonical_key)
    if not match:
        reason = _('Format field "%s" does not correspond to a known format.') % declared
        score_reasons.append(reason)
        return (None, None)

    # Element 1 of the matched entry is what gets scored and returned.
    format_name = match[1]
    score = lib.resource_format_scores().get(format_name)
    reason = _('Format field "%s" receives score: %s.') % (declared, score)
    score_reasons.append(reason)
    return (score, format_name)
# GTFS check - a GTFS is a zip which containing specific filenames filenames = set((os.path.basename(f) for f in filepaths)) if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt', 'calendar.txt')) - set(filenames)): log.info('GTFS detected') return {'format': 'GTFS'} top_score = 0 top_scoring_extension_counts = defaultdict( int) # extension: number_of_files for filepath in filepaths: extension = os.path.splitext(filepath)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0]
return {'format': 'SHP'} # GTFS check - a GTFS is a zip which containing specific filenames filenames = set((os.path.basename(f) for f in filepaths)) if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt', 'stop_times.txt', 'calendar.txt')) - set(filenames)): log.info('GTFS detected') return {'format': 'GTFS'} top_score = 0 top_scoring_extension_counts = defaultdict(int) # extension: number_of_files for filepath in filepaths: extension = os.path.splitext(filepath)[-1][1:].lower() format_tuple = ckan_helpers.resource_formats().get(extension) if format_tuple: score = lib.resource_format_scores().get(format_tuple[1]) if score is not None and score > top_score: top_score = score top_scoring_extension_counts = defaultdict(int) if score == top_score: top_scoring_extension_counts[extension] += 1 else: log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath) if not top_scoring_extension_counts: log.info('Zip has no known extensions: %s', filepath) return {'format': 'ZIP'} top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(), key=lambda x: x[1]) top_extension = top_scoring_extension_counts[-1][0]