Esempio n. 1
0
def score_by_format_field(resource, score_reasons):
    '''
    Derive a format and score from the resource's declared format field.

    The declared value is looked up in ckan_helpers.resource_formats(), first
    lower-cased and then, only if that finds nothing, after being passed
    through lib.munge_format_to_be_canonical(). A message explaining the
    outcome is appended to score_reasons.

    Return values:
      * It always returns a tuple: (score, format_string)
      * (None, None) if the field is blank or does not name a known format
      * Otherwise format_string is element 1 of the matched format tuple and
        score is the entry lib.resource_format_scores() holds for it (None
        if it holds no entry)
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Try the plain lower-cased value first; only munge it if that misses.
    match = ckan_helpers.resource_formats().get(declared.lower())
    if not match:
        match = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))
    if not match:
        reason = _('Format field "%s" does not correspond to a known format.')
        score_reasons.append(reason % declared)
        return (None, None)

    format_name = match[1]
    score = lib.resource_format_scores().get(format_name)
    reason = _('Format field "%s" receives score: %s.')
    score_reasons.append(reason % (declared, score))
    return (score, format_name)
Esempio n. 2
0
def score_by_url_extension(resource, score_reasons):
    '''
    Looks at the URL for a resource to determine its format and score.

    The candidate extensions returned by extension_variants() are tried in
    order; the first one that format_get() recognises decides the result.
    It adds strings to score_reasons list about how it came to the
    conclusion, including one per extension that was not recognised.

    Return values:
      * It returns a tuple: (score, format_string)
      * (None, None) if no extension could be determined, or none of the
        candidate extensions maps to a known format
      * If the format is known but lib.resource_format_scores() has no entry
        for it, the score defaults to 1
    '''
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append(_('Could not determine a file extension in the URL.'))
        return (None, None)
    for extension in extension_variants_:
        format_ = format_get(extension)
        if not format_:
            score_reasons.append(_('URL extension "%s" is an unknown format.') % extension)
            continue
        score = lib.resource_format_scores().get(format_)
        # Test against None, not truthiness: a configured score of 0 is a real
        # score and must not be replaced by the "not configured" default.
        if score is not None:
            score_reasons.append(_('URL extension "%s" relates to format "%s" and receives score: %s.')
                                 % (extension, format_, score))
            return score, format_
        score = 1
        score_reasons.append(_('URL extension "%s" relates to format "%s"'
                               ' but a score for that format is not configured, so giving it default score %s.')
                             % (extension, format_, score))
        return score, format_
    return (None, None)
Esempio n. 3
0
def score_by_url_extension(resource, score_reasons):
    '''
    Looks at the URL for a resource to determine its format and score.

    Every extension variant produced by extension_variants() for the stripped
    URL is passed to format_get() in turn, and the first recognised one
    settles the outcome. It adds strings to score_reasons list about how it
    came to the conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then both values are None
      * A known format without an entry in lib.resource_format_scores() is
        given the default score 1
    '''
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append(
            _('Could not determine a file extension in the URL.'))
        return (None, None)
    for extension in extension_variants_:
        format_ = format_get(extension)
        if format_:
            score = lib.resource_format_scores().get(format_)
            if score is None:
                # Only a missing entry triggers the default; a configured
                # score of 0 is falsy but is still a legitimate score.
                score = 1
                score_reasons.append(
                    _('URL extension "%s" relates to format "%s"'
                      ' but a score for that format is not configured, so giving it default score %s.'
                      ) % (extension, format_, score))
            else:
                score_reasons.append(
                    _('URL extension "%s" relates to format "%s" and receives score: %s.'
                      ) % (extension, format_, score))
            return score, format_
        score_reasons.append(
            _('URL extension "%s" is an unknown format.') % extension)
    return (None, None)
Esempio n. 4
0
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Looks inside a data file's contents to determine its format and score.

    The file inspected is the one at archival.cache_filepath; it is passed to
    sniff_file_format(). The resource argument is not used here.
    It adds strings to score_reasons list about how it came to the conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * (None, None) if there is no archival or cached file path, the cached
        file does not exist on disk, or the contents are not recognised
      * Otherwise format_string is the sniffed 'format' value and score is
        the entry lib.resource_format_scores() holds for it (None if it
        holds no entry)
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    # From here on cache_filepath is known to be truthy, so the only
    # remaining failure modes are a missing file or unrecognised contents.
    # NOTE(review): if reasons specific to the archival status (deliberately
    # not downloaded, system error) are wanted, they must be produced by the
    # guard above - confirm the intended behaviour with the archiver model.
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    # Analyse the cached file
    sniffed_format = sniff_file_format(filepath)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'
              ))
        return (None, None)

    format_ = sniffed_format['format']
    score = lib.resource_format_scores().get(format_)
    score_reasons.append(
        _('Content of file appeared to be format "%s" which receives openness score: %s.'
          ) % (format_, score))
    return score, format_
Esempio n. 5
0
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Looks inside a data file's contents to determine its format and score.

    Works on the cached copy referenced by archival.cache_filepath, which is
    handed to sniff_file_format(). The resource argument is accepted but not
    read. It adds strings to score_reasons list about how it came to the
    conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format (nothing cached, cached file missing
        on disk, or contents not recognised) then both values are None
      * If the format is sniffed but lib.resource_format_scores() has no
        entry for it, then score is None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    # The guard above guarantees a truthy cache path, so no further
    # "nothing was cached" handling is needed past this point.
    # NOTE(review): status-specific explanations (chose not to download,
    # system error during archival) would have to be emitted by that guard
    # if they are wanted - confirm against the archiver's Archival model.
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    # Analyse the cached file
    sniffed_format = sniff_file_format(filepath)
    if sniffed_format:
        format_ = sniffed_format['format']
        score = lib.resource_format_scores().get(format_)
        score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.')
                             % (format_, score))
        return score, format_

    score_reasons.append(_('The format of the file was not recognized from its contents.'))
    return (None, None)
Esempio n. 6
0
def score_by_format_field(resource, score_reasons):
    '''
    Work out a resource's format and score from its format field alone.

    The field value is matched against ckan_helpers.resource_formats(): the
    lower-cased value is tried first, and the value munged by
    lib.munge_format_to_be_canonical() is tried only when that fails.
    It adds a string to score_reasons list saying how it came to the
    conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If the field is blank or unknown then both values are None
      * If the format is known but has no entry in
        lib.resource_format_scores(), then score is None
    '''
    raw_format = resource.format or ''
    if raw_format:
        entry = (ckan_helpers.resource_formats().get(raw_format.lower())
                 or ckan_helpers.resource_formats().get(
                     lib.munge_format_to_be_canonical(raw_format)))
        if entry:
            # Element 1 of the tuple is the name the score table is keyed on.
            canonical = entry[1]
            score = lib.resource_format_scores().get(canonical)
            score_reasons.append(
                _('Format field "%s" receives score: %s.') % (raw_format, score))
            return (score, canonical)
        failure = _('Format field "%s" does not correspond to a known format.') % raw_format
    else:
        failure = _('Format field is blank.')

    # Both failure paths share the same "nothing determined" result.
    score_reasons.append(failure)
    return (None, None)
Esempio n. 7
0
    # GTFS check - a GTFS is a zip which containing specific filenames
    filenames = set((os.path.basename(f) for f in filepaths))
    if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt',
                 'stop_times.txt', 'calendar.txt')) - set(filenames)):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    for filepath in filepaths:
        extension = os.path.splitext(filepath)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
Esempio n. 8
0
        return {'format': 'SHP'}

    # GTFS check - a GTFS is a zip which containing specific filenames
    filenames = set((os.path.basename(f) for f in filepaths))
    if not (set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt',
                 'stop_times.txt', 'calendar.txt')) - set(filenames)):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    top_score = 0
    top_scoring_extension_counts = defaultdict(int)  # extension: number_of_files
    for filepath in filepaths:
        extension = os.path.splitext(filepath)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)',
                     extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]