Ejemplo n.º 1
0
    def _extract_file_format(self, url, headers):
        """
        Make a best guess at the format of the file behind ``url``.

        The file extensions found in the URL path are tried first.  They are
        joined with "." and lower-cased (e.g. "csv.zip") and looked up in
        CKAN's resource format registry; on a hit, element 1 of the registry
        tuple is returned.  On a miss, the upper-cased extensions are
        returned joined with " / ", e.g. for unregistered extensions:

            /path/to/a_file.csv      ->  "CSV"
            /path/to/a_file.csv.zip  ->  "CSV / ZIP"

        Only when the path carries no extension at all is the mimetype from
        the headers consulted: first against the registry, then through
        `mimetypes.guess_extension()`.

        :param url: URL of the resource
        :param headers: response headers, handed to `self._extract_mimetype()`
        :returns: the format string, or None if nothing could be deduced
        """
        # Peel the extensions off right-to-left, but store them in the order
        # they appear in the path.
        suffixes = []
        remainder, suffix = posixpath.splitext(urlparse.urlparse(url).path)
        while suffix:
            suffixes.insert(0, suffix[1:].upper())  # drop the leading '.'
            remainder, suffix = posixpath.splitext(remainder)

        if suffixes:
            registry_key = ".".join(suffixes).lower()
            known = ckan_helpers.resource_formats().get(registry_key)
            return known[1] if known else " / ".join(suffixes)

        # No extension in the path: fall back to the mimetype (sans charset).
        mimetype = self._extract_mimetype(headers)
        known = ckan_helpers.resource_formats().get(mimetype)
        if known:
            return known[1]

        guessed = mimetypes.guess_extension(mimetype)
        return guessed[1:].upper() if guessed else None
Ejemplo n.º 2
0
def score_by_format_field(resource, score_reasons):
    '''
    Derive a resource's format and score from its ``format`` field.

    A human-readable explanation of the outcome is appended to
    ``score_reasons``.

    The field is looked up in CKAN's resource format registry, first
    lower-cased and, failing that, after being passed through
    ``lib.munge_format_to_be_canonical``.

    :param resource: object with a ``format`` attribute
    :param score_reasons: list that receives the explanation strings
    :returns: tuple ``(score, format_string)``; both are None when the field
        is blank or unknown, and score is None when the format has no score
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    known = ckan_helpers.resource_formats().get(declared.lower())
    if not known:
        # Second chance: try the canonicalised spelling of the field.
        known = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(declared))
    if not known:
        unknown_message = _(
            'Format field "%s" does not correspond to a known format.')
        score_reasons.append(unknown_message % declared)
        return (None, None)

    canonical_name = known[1]
    score = lib.resource_format_scores().get(canonical_name)
    score_message = _('Format field "%s" receives score: %s.')
    score_reasons.append(score_message % (declared, score))
    return (score, canonical_name)
Ejemplo n.º 3
0
def _extract_file_format(url, headers):
    """
    Make a best guess at the format of the file behind ``url``.

    Extensions in the URL path take priority.  Joined with "." and
    lower-cased (e.g. "csv.zip") they are looked up in CKAN's resource format
    registry, and element 1 of the matching tuple is returned.  Without a
    match, the upper-cased extensions are returned joined with " / " (so an
    unregistered "a_file.csv.zip" yields "CSV / ZIP").

    If the path has no extension, the mimetype from the headers is looked up
    in the registry and, as a last resort, handed to
    `mimetypes.guess_extension()`.

    :param url: URL of the resource
    :param headers: response headers, handed to `_extract_mimetype()`
    :returns: the format string, or None if nothing could be deduced
    """
    url_path = urlparse.urlparse(url).path

    # splitext yields the last extension first, so collect and then flip to
    # get them in the order they appear in the path.
    extensions = []
    stem, ext = posixpath.splitext(url_path)
    while ext:
        extensions.append(ext[1:].upper())  # drop the leading '.'
        stem, ext = posixpath.splitext(stem)
    extensions.reverse()

    if extensions:
        registry_key = '.'.join(extensions).lower()
        registry_entry = ckan_helpers.resource_formats().get(registry_key)
        if registry_entry:
            return registry_entry[1]
        return ' / '.join(extensions)

    # No extension in the path: fall back to the mimetype (sans charset).
    mimetype = _extract_mimetype(headers)
    registry_entry = ckan_helpers.resource_formats().get(mimetype)
    if registry_entry:
        return registry_entry[1]

    guessed = mimetypes.guess_extension(mimetype)
    if not guessed:
        return None
    return guessed[1:].upper()
Ejemplo n.º 4
0
def set_sniffed_format(format_name):
    '''Set the module-level ``sniffed_format``.

    A falsy ``format_name`` resets it to None.  Otherwise the lower-cased
    name is looked up in CKAN's resource format registry and
    ``sniffed_format`` becomes ``{'format': <element 1 of the entry>}``.
    '''
    global sniffed_format
    if not format_name:
        sniffed_format = None
        return
    registry_entry = ckan_helpers.resource_formats().get(format_name.lower())
    sniffed_format = {'format': registry_entry[1]}
Ejemplo n.º 5
0
def set_sniffed_format(format_name):
    '''Update the module-level ``sniffed_format``.

    With a truthy ``format_name`` the value becomes a dict whose 'format'
    key holds element 1 of the resource format registry entry found under
    the lower-cased name; with a falsy one it becomes None.
    '''
    global sniffed_format
    new_value = None
    if format_name:
        entry = ckan_helpers.resource_formats().get(format_name.lower())
        new_value = {'format': entry[1]}
    sniffed_format = new_value
Ejemplo n.º 6
0
def run_bsd_file(filepath, log):
    '''Identify a file's type with the BSD command-line tool "file".

    The tool's output is searched for a "Name of Creating Application"
    field naming a known Microsoft Office application, and then for the
    ESRI Shapefile marker.

    :param filepath: path handed to the "file" command
    :param log: logger that receives the outcome
    :returns: a format dict such as ``{'format': 'SHP'}``, or None when the
        output matches neither check
    '''
    output = check_output(['file', filepath])

    creator = re.search('Name of Creating Application: ([^,]*),', output)
    if creator:
        # Creating application -> key in the resource format registry.
        office_extensions = {
            'Microsoft Office PowerPoint': 'ppt',
            'Microsoft PowerPoint': 'ppt',
            'Microsoft Excel': 'xls',
            'Microsoft Office Word': 'doc',
            'Microsoft Word 10.0': 'doc',
            'Microsoft Macintosh Word': 'doc',
        }
        registry_key = office_extensions.get(creator.group(1))
        if registry_key is not None:
            entry = ckan_helpers.resource_formats()[registry_key]
            log.info('"file" detected file format: %s', entry[2])
            return {'format': entry[1]}

    if re.search(': ESRI Shapefile', output):
        shapefile = {'format': 'SHP'}
        log.info('"file" detected file format: %s', shapefile['format'])
        return shapefile

    log.info('"file" could not determine file format of "%s": %s',
             filepath, output)
def run_bsd_file(filepath, log):
    '''Run the BSD "file" command on ``filepath`` and interpret its output.

    Recognises Microsoft Office documents (through the "Name of Creating
    Application" field) and ESRI Shapefiles.

    :param filepath: path handed to the "file" command
    :param log: logger that receives the outcome
    :returns: a dict with a 'format' key, or None if nothing was recognised
    '''
    file_output = check_output(['file', filepath])

    registry_key = None
    app_match = re.search('Name of Creating Application: ([^,]*),',
                          file_output)
    if app_match:
        # Map the creating application to a resource format registry key.
        registry_key = {
            'Microsoft Office PowerPoint': 'ppt',
            'Microsoft PowerPoint': 'ppt',
            'Microsoft Excel': 'xls',
            'Microsoft Office Word': 'doc',
            'Microsoft Word 10.0': 'doc',
            'Microsoft Macintosh Word': 'doc',
        }.get(app_match.group(1))
    if registry_key:
        registry_entry = ckan_helpers.resource_formats()[registry_key]
        log.info('"file" detected file format: %s', registry_entry[2])
        return {'format': registry_entry[1]}

    if re.search(': ESRI Shapefile', file_output):
        detected = {'format': 'SHP'}
        log.info('"file" detected file format: %s', detected['format'])
        return detected

    log.info('"file" could not determine file format of "%s": %s', filepath,
             file_output)
Ejemplo n.º 8
0
 def _clean_format(cls, format_string):
     '''Normalise a format string.

     Non-string values are returned untouched.  A string is lower-cased,
     stripped of surrounding spaces and dots and looked up in the resource
     format registry; on a hit element 1 of the entry is returned.
     Otherwise the original string is returned with everything matching
     ``cls._disallowed_characters`` removed and outer whitespace stripped.
     '''
     if not isinstance(format_string, basestring):
         return format_string

     registry_entry = helpers.resource_formats().get(
         format_string.lower().strip(' .'))
     if registry_entry:
         return registry_entry[1]

     without_disallowed = re.sub(cls._disallowed_characters, '', format_string)
     return without_disallowed.strip()
Ejemplo n.º 9
0
 def _clean_format(cls, format_string):
     '''Return a cleaned-up version of ``format_string``.

     Strings are matched (lower-cased, without surrounding spaces and dots)
     against the resource format registry and replaced by element 1 of the
     matching entry.  Unmatched strings have the characters matching
     ``cls._disallowed_characters`` removed and are then stripped.  Values
     that are not strings are passed through unchanged.
     '''
     if isinstance(format_string, basestring):
         known_formats = helpers.resource_formats()
         entry = known_formats.get(format_string.lower().strip(' .'))
         if not entry:
             cleaned = re.sub(cls._disallowed_characters, '', format_string)
             return cleaned.strip()
         return entry[1]
     return format_string
Ejemplo n.º 10
0
def format_get(key):
    '''Return the short name of a resource format, as defined in ckan.

    :param key: format extension / mimetype / title e.g. 'CSV',
                'application/msword', 'Word document'
    :type key: string
    :returns: the format's short name, or None if the key is not registered
    '''
    registry_entry = ckan_helpers.resource_formats().get(key.lower())
    # Element 1 of a registry entry is the short name.
    return registry_entry[1] if registry_entry else None
Ejemplo n.º 11
0
def format_get(key):
    '''Look up a resource format in ckan's registry.

    :param key: format extension / mimetype / title e.g. 'CSV',
                'application/msword', 'Word document'
    :type key: string
    :returns: short name of the format, or None for an unknown key
    '''
    known_formats = ckan_helpers.resource_formats()
    entry = known_formats.get(key.lower())
    if entry:
        return entry[1]  # short name
    return None
Ejemplo n.º 12
0
def score_by_format_field(resource, score_reasons):
    '''
    Work out a resource's format and score from its ``format`` field.

    Every outcome is explained by a string appended to ``score_reasons``.

    :param resource: object with a ``format`` attribute
    :param score_reasons: list that receives the explanation strings
    :returns: tuple ``(score, format_string)``.  format_string is None if
        the format cannot be worked out; score is None if it cannot be
        scored.
    '''
    field_value = resource.format or ''
    if not field_value:
        score_reasons.append(_('Format field is blank.'))
        return (None, None)

    # Try the lower-cased field first, then its canonicalised spelling.
    entry = ckan_helpers.resource_formats().get(field_value.lower())
    if not entry:
        entry = ckan_helpers.resource_formats().get(
            lib.munge_format_to_be_canonical(field_value))

    if entry:
        format_name = entry[1]
        score = lib.resource_format_scores().get(format_name)
        score_reasons.append(
            _('Format field "%s" receives score: %s.') % (field_value, score))
        return (score, format_name)

    score_reasons.append(
        _('Format field "%s" does not correspond to a known format.') %
        field_value)
    return (None, None)
Ejemplo n.º 13
0
def get_xml_variant_without_xml_declaration(buf):
    '''If this buffer is in a format based on XML, without any XML declaration
    or other boilerplate, return the format type.

    The name of the first (top-level) tag is extracted with expat, mapped to
    a key of ckan's resource format registry and looked up there.

    :param buf: the start of the file's content
    :returns: a format dict; ``{'format': 'XML'}`` when the buffer cannot be
        parsed, holds no tag, or the tag is not a registered format
    '''
    # Parse the XML to find the first tag name.
    # Using expat directly, rather than go through xml.sax, since using I
    # couldn't see how to give it a string, so used StringIO which failed
    # for some files curiously.
    import xml.parsers.expat

    class GotFirstTag(Exception):
        '''Raised from the start-element handler to stop parsing early.'''

    def start_element(name, attrs):
        raise GotFirstTag(name)

    top_level_tag_name = None
    p = xml.parsers.expat.ParserCreate()
    p.StartElementHandler = start_element
    try:
        p.Parse(buf)
    except GotFirstTag as e:
        top_level_tag_name = six.text_type(e).lower()
    except xml.parsers.expat.ExpatError as e:
        # expat reports malformed input with ExpatError (not a SAX
        # exception), so this is the error that has to be caught here.
        log.info('Sax parse error: %s %s', e, buf)
        return {'format': 'XML'}

    if top_level_tag_name is None:
        # The buffer parsed cleanly but contained no start tag.
        log.info('No top level tag found: %s', buf)
        return {'format': 'XML'}

    log.info('Top level tag detected as: %s', top_level_tag_name)
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities',
                                                    'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities',
                                                    'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace('wfs_capabilities',
                                                    'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')
    # A bare <Capabilities> root is shared by several OGC services; the
    # default namespace found in the buffer tells them apart.
    if top_level_tag_name.lower() == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
Ejemplo n.º 14
0
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML declaration
    or other boilerplate, return the format type.

    :param buf: the start of the file's content
    :param log: logger that receives the outcome
    :returns: a format dict (``{'format': 'XML'}`` when the top-level tag is
        not a registered format), or None if no tag is found at the start
        of the buffer
    '''
    # Raw string, so that "\s" reaches the regex engine instead of being an
    # invalid string escape.  The pattern tolerates up to three arbitrary
    # characters and any whitespace before the first "<", then captures the
    # tag name.
    xml_re = r'.{0,3}\s*<([^>\s]*)'
    match = re.match(xml_re, buf)
    if not match:
        log.debug('XML tags not found: %s', buf)
        return None

    top_level_tag_name = match.group(1).lower()
    # Map well-known root tags onto keys of the resource format registry.
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities', 'wms')  # WMS 1.1.1
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
def get_xml_variant_without_xml_declaration(buf, log):
    '''If this buffer is in a format based on XML, without any XML declaration
    or other boilerplate, return the format type.

    :param buf: the start of the file's content
    :param log: logger that receives the outcome
    :returns: a format dict (``{'format': 'XML'}`` when the top-level tag is
        not a registered format), or None if the buffer does not start with
        a tag
    '''
    # Raw string: in a plain literal "\s" is an invalid escape sequence.
    # Up to three leading characters and any whitespace are skipped before
    # the first "<"; the group captures the tag name.
    xml_re = r'.{0,3}\s*<([^>\s]*)'
    match = re.match(xml_re, buf)
    if match is None:
        log.debug('XML tags not found: %s', buf)
        return None

    top_level_tag_name = match.group(1).lower()
    # Translate well-known root tags into resource format registry keys.
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace(
        'wms_capabilities', 'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace(
        'wmt_ms_capabilities', 'wms')  # WMS 1.1.1
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}
Ejemplo n.º 16
0
def hdx_unified_resource_format(format):
    '''
    Translate a user-supplied format string to a standard format.

    Based on unified_resource_format() from ckan.lib.helpers: the
    lower-cased string is looked up in the resource formats configuration
    and, when found, replaced by element 1 of the matching entry.  Unlike
    the core function, a string that is not found is returned lower-cased.

    :param format: resource format as written by the user
    :type format: string
    :return: the standard format, or the lower-cased input if unknown
    :rtype: string
    '''
    known_formats = h.resource_formats()
    lowered = format.lower()
    if lowered not in known_formats:
        return lowered
    return known_formats[lowered][1]
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns a dict with format as a string, which is the format's canonical
    shortname (as defined by ckan's resource_formats.json) and a key that says
    if it is contained in a zip or something.

    e.g. {'format': 'CSV',
          'container': 'zip',
          }
    or None if it cannot tell what it is.

    The mime type reported by Magic drives the detection: some mime types
    get a dedicated check, the rest are looked up in the resource format
    registry, and TXT / HTML results are inspected further.

    :param filepath: path of the file to inspect
    :param log: logger that receives progress and warnings
    '''
    # NOTE(review): this function never sets a 'container' key itself;
    # presumably get_zipped_format() adds it - confirm.
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # Magic is given a UTF-8 encoded byte string when the path is unicode.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Step 1: mime types that need a closer look at the file itself.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type in ('application/msword', 'application/vnd.ms-office'):
            # In the past Magic gives the msword mime-type for Word and other
            # MS Office files too, so use BSD File to be sure which it is.
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = {'format': 'XLS'}
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = {'format': 'XLS'}
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                # NOTE(review): the return value of is_html() is used as the
                # format dict - presumably a dict or None; confirm.
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = {'format': 'IATI'}

        # A format found by the dedicated checks above is final: it skips
        # the registry lookup and the TXT / HTML refinements below.
        if format_:
            return format_

        # Step 2: translate the mime type through the format registry.
        format_tuple = ckan_helpers.resource_formats().get(mime_type)
        if format_tuple:
            format_ = {'format': format_tuple[1]}

        # Step 3: unregistered text/* mime types are sniffed by content.
        if not format_:
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        # Step 4: generic results (TXT, HTML) may hide something more
        # specific, so look at the content again.
        if format_:
            log.info('Mimetype translates to filetype: %s', format_['format'])

            if format_['format'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = {'format': 'TTL'}

            elif format_['format'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = {'format': 'RDFa'}

    else:
        # Magic reported no mime type at all.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = {'format': 'XLS'}
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities',
                                                    'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities',
                                                    'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace('wfs_capabilities',
                                                    'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')
    if top_level_tag_name.lower() == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}


def has_rdfa(buf, log):
    '''If the buffer HTML contains RDFa then this returns True'''
    # quick check for the key words
    if 'about=' not in buf or 'property=' not in buf:
        log.debug('Not RDFA')
        return False
Ejemplo n.º 19
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    Returns a dict with format as a string, which is the format's canonical
    shortname (as defined by ckan's resource_formats.json) and a key that says
    if it is contained in a zip or something.

    e.g. {'format': 'CSV',
          'container': 'zip',
          }
    or None if it cannot tell what it is.

    Note, log is a logger, either a Celery one or a standard Python logging
    one.

    Detection is driven by the mime type that Magic reports: a few mime
    types get a dedicated check, the others are translated through the
    resource format registry, and TXT / HTML results are examined further.

    :param filepath: path of the file to inspect
    :param log: logger that receives progress and warnings
    '''
    # NOTE(review): no 'container' key is set in this function itself;
    # presumably get_zipped_format() provides it - confirm.
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # A unicode path is UTF-8 encoded before it is handed to Magic.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
        else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # First pass: mime types for which the file content is checked.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type in ('application/msword', 'application/vnd.ms-office'):
            # In the past Magic gives the msword mime-type for Word and other
            # MS Office files too, so use BSD File to be sure which it is.
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = {'format': 'XLS'}
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = {'format': 'XLS'}
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                # NOTE(review): is_html()'s return value becomes the format
                # dict - presumably a dict or None; confirm.
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = {'format': 'IATI'}

        # Anything detected by the first pass is returned immediately; the
        # registry lookup and the TXT / HTML refinements are not applied.
        if format_:
            return format_

        # Second pass: look the mime type up in the format registry.
        format_tuple = ckan_helpers.resource_formats().get(mime_type)
        if format_tuple:
            format_ = {'format': format_tuple[1]}

        # Third pass: text/* mime types unknown to the registry are
        # sniffed by content.
        if not format_:
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        # Final pass: TXT and HTML are generic, so check whether the
        # content is really something more specific.
        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['format'])

            if format_['format'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = {'format': 'JSON'}
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = {'format': 'CSV'}
                elif is_psv(buf, log):
                    format_ = {'format': 'PSV'}
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = {'format': 'TTL'}

            elif format_['format'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = {'format': 'RDFa'}

    else:
        # Magic did not report a mime type.
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = {'format': 'XLS'}
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Ejemplo n.º 20
0
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s',
                    e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s',
                    e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(int) # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
Ejemplo n.º 21
0
    def _distribution_format(self, distribution, normalize_ckan_format=True):
        '''
        Returns the Internet Media Type and format label for a distribution

        Given a reference (URIRef or BNode) to a dcat:Distribution, it will
        try to extract the media type (previously known as MIME type), eg
        `text/csv`, and the format label, eg `CSV`

        Values for the media type will be checked in the following order:

        1. literal value of dcat:mediaType
        2. literal value of dct:format if it contains a '/' character
        3. value of dct:format if it is an instance of dct:IMT, eg:

            <dct:format>
                <dct:IMT rdf:value="text/html" rdfs:label="HTML"/>
            </dct:format>

        Values for the label will be checked in the following order:

        1. literal value of dct:format, unless it was used as the media
           type in step 2 above
        2. label of dct:format if it is an instance of dct:IMT (see above)

        If `normalize_ckan_format` is True and using CKAN>=2.3, the label will
        be tried to match against the standard list of formats that is included
        with CKAN core
        (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json)
        This allows for instance to populate the CKAN resource format field
        with a format that view plugins, etc will understand (`csv`, `xml`,
        etc.)

        :param distribution: reference to the dcat:Distribution node
        :param normalize_ckan_format: whether to map the result onto CKAN's
            standard format names
        :returns: a tuple with the media type and the label; a value that
            could not be found is left unset (None for the label)
        '''

        label = None

        imt = self._object_value(distribution, DCAT.mediaType)

        _format = self._object(distribution, DCT['format'])
        if isinstance(_format, Literal):
            if not imt and '/' in _format:
                imt = unicode(_format)
            else:
                label = unicode(_format)
        elif isinstance(_format, (BNode, URIRef)):
            if self._object(_format, RDF.type) == DCT.IMT:
                # Convert to unicode only when a value exists: unicode(None)
                # would produce the literal string u'None'.
                if not imt:
                    imt_value = self.g.value(_format, default=None)
                    if imt_value is not None:
                        imt = unicode(imt_value)
                imt_label = self.g.label(_format, default=None)
                if imt_label is not None:
                    label = unicode(imt_label)

        if ((imt or label) and normalize_ckan_format and
                toolkit.check_ckan_version(min_version='2.3')):
            # Imported here because resource_formats() is only used on
            # CKAN>=2.3 (see the version check above).
            from ckan.lib import helpers

            format_registry = helpers.resource_formats()

            # The media type takes precedence over the label as lookup key.
            if imt in format_registry:
                label = format_registry[imt][1]
            elif label in format_registry:
                label = format_registry[label][1]

        return imt, label
Ejemplo n.º 22
0
        return {'format': 'XML'}

    log.info('Top level tag detected as: %s', top_level_tag_name)
    top_level_tag_name = top_level_tag_name.replace('rdf:rdf', 'rdf')
    top_level_tag_name = top_level_tag_name.replace('wms_capabilities', 'wms')  # WMS 1.3
    top_level_tag_name = top_level_tag_name.replace('wmt_ms_capabilities', 'wms')  # WMS 1.1.1
    top_level_tag_name = re.sub('wfs:.*', 'wfs', top_level_tag_name)  # WFS 2.0
    top_level_tag_name = top_level_tag_name.replace('wfs_capabilities', 'wfs')  # WFS 1.0/1.1
    top_level_tag_name = top_level_tag_name.replace('feed', 'atom feed')
    if top_level_tag_name.lower() == 'capabilities' and \
            'xmlns="http://www.opengis.net/wmts/' in buf:
        top_level_tag_name = 'wmts'
    if top_level_tag_name.lower() in ('coveragedescriptions', 'capabilities') and \
            'xmlns="http://www.opengis.net/wcs/' in buf:
        top_level_tag_name = 'wcs'
    format_tuple = ckan_helpers.resource_formats().get(top_level_tag_name)
    if format_tuple:
        format_ = {'format': format_tuple[1]}
        log.info('XML variant detected: %s', format_tuple[2])
        return format_
    log.warning('Did not recognise XML format: %s', top_level_tag_name)
    return {'format': 'XML'}

def has_rdfa(buf, log):
    '''Check whether the HTML held in ``buf`` contains RDFa.

    :param buf: the HTML text to inspect
    :param log: logger that receives a debug message when the quick
        screen rules RDFa out
    :returns: False as soon as either of the attribute key words
        ``about=`` / ``property=`` is absent from ``buf``
    '''
    # Cheap substring screen first: both key words have to be present
    required_keywords = ('about=', 'property=')
    if not all(keyword in buf for keyword in required_keywords):
        log.debug('Not RDFA')
        return False

    # more rigorous check for them as tag attributes
    # NOTE(review): that rigorous check is not part of the visible code
Ejemplo n.º 23
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        :param harvest_object: harvest object whose ``content`` is parsed as
            an inventory XML document and whose ``guid`` is recorded in the
            ``harvest_source_reference`` extra
        :param package_dict_defaults: defaults that the harvested values are
            merged into (via its ``merge`` method)
        :param source_config: not used in the visible part of this method
        :param existing_dataset: the previously harvested dataset, or a falsy
            value; its resources are used to keep resource ids stable by URL

        NOTE(review): the visible part of this method ends after the theme
        categorisation step - confirm what follows before relying on the
        return value.
        '''
        import ckanext.dgu.lib.theme as dgutheme
        from ckan.lib.helpers import resource_formats
        from ckan import model
        from ckanext.harvest.model import (HarvestObjectExtra as HOExtra,
                                           HarvestGatherError)

        res_formats = resource_formats()

        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(title=inv_dataset['title'],
                   notes=inv_dataset['description'],
                   state='active' if inv_dataset['active'] else 'deleted',
                   resources=[],
                   extras={
                       self.IDENTIFIER_KEY: inv_dataset['identifier'],
                       'harvest_source_reference': harvest_object.guid
                   })
        # License
        rights = inv_dataset.get('rights')
        if rights:
            license_id, licence = \
                dgu_helpers.get_licence_fields_from_free_text(rights)
            pkg['license_id'] = license_id
            if licence:
                pkg['extras']['licence'] = licence
                log.info('Custom licence %r', rights)
        else:
            pkg['license_id'] = ''

        # Resources
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # Map of url -> resource id, so that a resource whose URL is unchanged
        # keeps the id it had in the existing dataset
        existing_resource_urls = dict((r.url, r.id)
                                      for r in existing_dataset.resources) \
                                 if existing_dataset else {}
        for inv_resource in inv_resources:
            # Look the mimetype up in the format registry; fall back to the
            # raw mimetype when it is not registered
            format_ = res_formats.get(inv_resource['mimetype'].lower().strip())
            if format_:
                format_ = format_[1]
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {
                'url': inv_resource['url'],
                'format': format_,
                'description': description,
                'resource_type': resource_type,
                'schema-url': schema_url,
                'schema-type': schema_type,
            }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self._gen_new_name('%s %s' %
                                             (pkg['title'], publisher_abbrev))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        try:
            themes = dgutheme.categorize_package(pkg)
            log.debug('%s given themes: %r', pkg['name'], themes)
        except ImportError as e:
            # "as" form is valid on Python 2.6+ and Python 3, unlike the
            # comma form
            log.debug('Theme cannot be given: %s', e)
            themes = []
Ejemplo n.º 24
0
def get_zipped_format(filepath):
    '''For a given zip file, return the format of file inside.
    For multiple files, choose by the most open, and then by the most
    popular extension.

    :param filepath: path of the zip archive to inspect
    :returns: None if the archive cannot be opened; otherwise a dict with a
        'format' key, plus 'container': 'ZIP' when the format was derived
        from the extensions of the archive members. A zip with no
        recognised extensions gives {'format': 'ZIP'}.
    '''
    # just check filename extension of each file inside
    try:
        # note: Cannot use "with" with a zipfile before python 2.7
        #       so we have to close it manually.
        archive = zipfile.ZipFile(filepath, 'r')
        try:
            member_paths = archive.namelist()
        finally:
            archive.close()
    except zipfile.BadZipfile as e:
        log.info('Zip file open raised error %s: %s', e, e.args)
        return
    except Exception as e:
        log.warning('Zip file open raised exception %s: %s', e, e.args)
        return

    # Shapefile check - a Shapefile is a zip containing specific files:
    # .shp, .dbf and .shx amongst others
    extensions = set(f.split('.')[-1].lower() for f in member_paths)
    if extensions.issuperset(('shp', 'dbf', 'shx')):
        log.info('Shapefile detected')
        return {'format': 'SHP'}

    # GTFS check - a GTFS is a zip which contains specific filenames
    gtfs_required = set(('agency.txt', 'stops.txt', 'routes.txt', 'trips.txt',
                         'stop_times.txt', 'calendar.txt'))
    filenames = set(os.path.basename(f) for f in member_paths)
    if gtfs_required.issubset(filenames):
        log.info('GTFS detected')
        return {'format': 'GTFS'}

    # Fetch the lookup tables once rather than once per archive member
    # NOTE(review): assumes both calls return the same mapping on every call
    known_formats = ckan_helpers.resource_formats()
    format_scores = lib.resource_format_scores()

    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    # The loop variable must not reuse the name "filepath": the archive path
    # is still needed for the log message after the loop.
    for member_path in member_paths:
        extension = os.path.splitext(member_path)[-1][1:].lower()
        format_tuple = known_formats.get(extension)
        if format_tuple:
            score = format_scores.get(format_tuple[1])
            if score is not None and score > top_score:
                # A better-scoring format supersedes everything counted so far
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     member_path)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    # Sort ascending by file count so the most popular extension is last
    ranked_extensions = sorted(top_scoring_extension_counts.items(),
                               key=lambda x: x[1])
    top_extension = ranked_extensions[-1][0]
    log.info('Zip file\'s most popular extension is "%s" (All extensions: %r)',
             top_extension, ranked_extensions)
    format_tuple = known_formats[top_extension]
    format_ = {'format': format_tuple[1], 'container': 'ZIP'}
    log.info('Zipped file format detected: %s', format_tuple[2])
    return format_
        try:
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s', e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s', e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        format_tuple = ckan_helpers.resource_formats().get(extension)
        if format_tuple:
            score = lib.resource_format_scores().get(format_tuple[1])
            if score is not None and score > top_score:
                top_score = score
                top_scoring_extension_counts = defaultdict(int)
            if score == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return {'format': 'ZIP'}

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
Ejemplo n.º 26
0
    def _distribution_format(self, distribution, normalize_ckan_format=True):
        '''
        Returns the Internet Media Type and format label for a distribution

        Given a reference (URIRef or BNode) to a dcat:Distribution, it will
        try to extract the media type (previously known as MIME type), eg
        `text/csv`, and the format label, eg `CSV`

        Values for the media type will be checked in the following order:

        1. literal value of dcat:mediaType
        2. literal value of dct:format if it contains a '/' character
        3. value of dct:format if it is an instance of dct:IMT, eg:

            <dct:format>
                <dct:IMT rdf:value="text/html" rdfs:label="HTML"/>
            </dct:format>

        Values for the label will be checked in the following order:

        1. literal value of dct:format if it does not contain a '/' character
        2. label of dct:format if it is an instance of dct:IMT (see above)

        If `normalize_ckan_format` is True and using CKAN>=2.3, the label will
        be tried to match against the standard list of formats that is included
        with CKAN core
        (https://github.com/ckan/ckan/blob/master/ckan/config/resource_formats.json)
        This allows for instance to populate the CKAN resource format field
        with a format that view plugins, etc will understand (`csv`, `xml`,
        etc.)

        Return a tuple with the media type and the label, both set to None if
        they couldn't be found.
        '''

        imt = self._object_value(distribution, DCAT.mediaType)
        label = None

        _format = self._object(distribution, DCT['format'])
        if isinstance(_format, Literal):
            if not imt and '/' in _format:
                imt = unicode(_format)
            else:
                label = unicode(_format)
        elif isinstance(_format, (BNode, URIRef)):
            if self._object(_format, RDF.type) == DCT.IMT:
                # Only stringify values that were actually found: calling
                # unicode() on a None default would yield the text u'None'
                # instead of leaving the result unset.
                if not imt:
                    imt_value = self.g.value(_format, default=None)
                    if imt_value is not None:
                        imt = unicode(imt_value)
                label_value = self.g.label(_format, default=None)
                if label_value is not None:
                    label = unicode(label_value)

        if ((imt or label) and normalize_ckan_format and
                toolkit.check_ckan_version(min_version='2.3')):
            # Imported here because the format registry is only looked up
            # once the CKAN version check above has passed.
            # NOTE(review): ckan.config is not referenced directly below;
            # the import is kept as it was.
            import ckan.config
            from ckan.lib import helpers

            format_registry = helpers.resource_formats()

            # Prefer a registry match on the media type over one on the label
            if imt in format_registry:
                label = format_registry[imt][1]
            elif label in format_registry:
                label = format_registry[label][1]

        return imt, label
Ejemplo n.º 27
0
    def format_mapping(self):
        '''Admin page for inspecting and renaming resource formats.

        Access requires passing the 'sysadmin' auth check; otherwise the
        request is aborted with a 403.

        On POST with both 'from' and 'to' supplied, every active resource
        whose format equals 'from' is updated to 'to', each affected package
        is passed to ``clear`` and ``rebuild`` (presumably the search index -
        verify against the imports), and a flash message reports the number
        of affected packages. Any POST then redirects back to this page.

        Otherwise renders 'admin/format_mapping.html' with:

        * defined - the format names known to ``h.resource_formats()``
        * undefined - formats used by active resources but not in `defined`
        * format_types - a 'Local' / 'External' / 'Partially external'
          classification for each format in use
        '''
        try:
            # The context key must be the string 'model', not the module
            tk.check_access('sysadmin', {'user': g.user, 'model': model})
        except tk.NotAuthorized:
            return tk.abort(403)
        if request.method == 'POST':
            old = request.POST.get('from')
            new = request.POST.get('to')
            if old and new:
                res_query = model.Session.query(model.Resource).filter_by(
                    format=old, state='active'
                )
                # Collect the affected packages before the bulk update, as
                # the filter no longer matches once the format is changed
                package_ids = {res.package_id for res in res_query}

                res_query.update({'format': new})
                model.Session.commit()
                for package_id in package_ids:
                    clear(package_id)
                    rebuild(package_id, defer_commit=True)
                commit()
                tk.h.flash_success(
                    'Updated. Records changed: {}'.format(len(package_ids))
                )
            return tk.redirect_to('format_mapping')

        # Each registry value is a 3-tuple; the middle item is the format
        # name. Unpacking in the comprehension target works on Python 2 and
        # 3, unlike tuple parameters in a lambda.
        defined = {fmt for (_1, fmt, _3) in h.resource_formats().values()}

        # Per format: number of active resources, and number of those whose
        # package carries a 'harvest_portal' extra
        db_formats = model.Session.query(
            model.Resource.format, func.count(model.Resource.id),
            func.count(model.PackageExtra.value)
        ).outerjoin(
            model.PackageExtra,
            (model.Resource.package_id == model.PackageExtra.package_id)
            & ((model.PackageExtra.key == 'harvest_portal')
               | (model.PackageExtra.key.is_(None)))
        ).group_by(model.Resource.format).filter(
            model.Resource.format != '', model.Resource.state == 'active'
        )
        db_formats = db_formats.all()

        format_types = {}
        for fmt, total, external in db_formats:
            # 'External' takes precedence over 'Local' when both conditions
            # hold, matching the original lookup order
            if total == external:
                origin = 'External'
            elif external == 0:
                origin = 'Local'
            else:
                origin = 'Partially external'
            format_types[fmt] = origin

        used = set(format_types)
        undefined = used - defined

        extra_vars = {
            'undefined': undefined,
            'defined': defined,
            'format_types': format_types
        }
        return tk.render('admin/format_mapping.html', extra_vars)