Beispiel #1
0
def score_by_url_extension(resource, score_reasons, log):
    '''
    Looks at the URL for a resource to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    The candidate extensions come from extension_variants() applied to the
    stripped URL; they are tried in the order given and the first one that
    is a known format extension (compared lower-cased) wins.

    The log argument is accepted but not used here.

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    # Fetch the lookup table once and reuse it for every candidate.
    formats_by_extension = Formats.by_extension()
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)
    for extension in extension_variants_:
        # Single lookup instead of a membership test followed by a get().
        format_ = formats_by_extension.get(extension.lower())
        if format_ is not None:
            score = format_['openness']
            score_reasons.append(
                'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
                % (extension, format_['display_name'], score))
            return score, format_['display_name']
        score_reasons.append('URL ekstenzija "%s" je nepoznat format.' %
                             extension)
    return (None, None)
Beispiel #2
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from its format field.

    Every step of the decision is recorded as a string appended to the
    score_reasons list.

    The field value is tried as a display name first, then (lower-cased) as
    an extension, and finally as a reduced name.

    Returns a tuple (score, format_display_name). Both items are None when
    the field is blank or does not match a known format. The log argument
    is accepted but not used here.
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try each lookup in turn, stopping at the first one that yields a
    # format.
    matched = Formats.by_display_name().get(declared)
    if not matched:
        matched = Formats.by_extension().get(declared.lower())
    if not matched:
        matched = Formats.by_reduced_name().get(Formats.reduce(declared))
    if not matched:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = matched['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, matched['display_name'])
Beispiel #3
0
def score_by_format_field(resource, score_reasons, log):
    '''
    Work out a resource's format and openness score from its format field.

    Every step of the decision is recorded as a string appended to the
    score_reasons list.

    The field value is tried as a display name first, then (lower-cased) as
    an extension, and finally as a reduced name.

    Returns a tuple (score, format_display_name). Both items are None when
    the field is blank or does not match a known format. The log argument
    is accepted but not used here.
    '''
    declared = resource.format or ''
    if not declared:
        score_reasons.append('Format field is blank.')
        return (None, None)

    # Try each lookup in turn, stopping at the first one that yields a
    # format.
    matched = Formats.by_display_name().get(declared)
    if not matched:
        matched = Formats.by_extension().get(declared.lower())
    if not matched:
        matched = Formats.by_reduced_name().get(Formats.reduce(declared))
    if not matched:
        score_reasons.append(
            'Polje formata "%s" ne odgovara ni jednom poznatom formatu.'
            % declared)
        return (None, None)

    openness = matched['openness']
    score_reasons.append(
        'Polje formata "%s" ima ocijenu otvorenosti: %s.'
        % (declared, openness))
    return (openness, matched['display_name'])
Beispiel #4
0
def run_bsd_file(filepath, log):
    '''Identify a file's format using the BSD command-line tool "file".

    The tool's output is searched for a "Name of Creating Application"
    entry naming a known Microsoft Office application, and failing that for
    an ESRI Shapefile marker.

    Returns the matching Format, or None when neither is found.
    '''
    # Creating-application names reported by "file" -> format extension.
    extension_by_application = {
        'Microsoft Office PowerPoint': 'ppt',
        'Microsoft PowerPoint': 'ppt',
        'Microsoft Excel': 'xls',
        'Microsoft Office Word': 'doc',
        'Microsoft Word 10.0': 'doc',
        'Microsoft Macintosh Word': 'doc',
    }
    output = check_output(['file', filepath])

    detected = None
    application = re.search('Name of Creating Application: ([^,]*),', output)
    if application and application.group(1) in extension_by_application:
        known_extension = extension_by_application[application.group(1)]
        detected = Formats.by_extension()[known_extension]
    elif re.search(': ESRI Shapefile', output):
        detected = Formats.by_extension()['shp']

    if detected is None:
        log.info('"file" could not determine file format of "%s": %s',
                 filepath, output)
        return None
    log.info('"file" detected file format: %s', detected['display_name'])
    return detected
Beispiel #5
0
 def test_match(self):
     """Formats.match should canonicalise raw format strings."""
     # raw input -> display name of the format it should match
     expectations = {
         "xls": "XLS",
         ".xls": "XLS",
         ".XLS": "XLS",
         "csv": "CSV",
         ".html": "HTML",
         "html": "HTML",
         "rdf/xml": "RDF",
         "rdf": "RDF",
         ".rdf": "RDF",
         ".RDF": "RDF",
         "pdf": "PDF",
         "PDF ": "PDF",
         "ppt": "PPT",
         "odp": "ODP",
         "shp": "SHP",
         "kml": "KML",
         "doc": "DOC",
         "json": "JSON",
     }
     for raw in expectations:
         # First make sure there is a match at all, naming the raw input
         # on failure, then check which format it resolved to.
         assert Formats.match(raw), raw
         display_name = Formats.match(raw)["display_name"]
         assert_equal(display_name, expectations[raw])
Beispiel #6
0
 def test_match(self):
     '''Formats.match should canonicalise raw format strings.'''
     # raw input -> display name of the format it should match
     expectations = {
         'xls': 'XLS',
         '.xls': 'XLS',
         '.XLS': 'XLS',
         'csv': 'CSV',
         '.html': 'HTML',
         'html': 'HTML',
         'rdf/xml': 'RDF',
         'rdf': 'RDF',
         '.rdf': 'RDF',
         '.RDF': 'RDF',
         'pdf': 'PDF',
         'PDF ': 'PDF',
         'ppt': 'PPT',
         'odp': 'ODP',
         'shp': 'SHP',
         'kml': 'KML',
         'doc': 'DOC',
         'json': 'JSON',
     }
     for raw in expectations:
         # First make sure there is a match at all, naming the raw input
         # on failure, then check which format it resolved to.
         assert Formats.match(raw), raw
         display_name = Formats.match(raw)['display_name']
         assert_equal(display_name, expectations[raw])
Beispiel #7
0
def run_bsd_file(filepath, log):
    '''Identify a file's format using the BSD command-line tool "file".

    The tool's output is searched for a "Name of Creating Application"
    entry naming a known Microsoft Office application, and failing that for
    an ESRI Shapefile marker.

    Returns the matching Format, or None when neither is found.
    '''
    # Creating-application names reported by "file" -> format extension.
    extension_by_application = {
        'Microsoft Office PowerPoint': 'ppt',
        'Microsoft PowerPoint': 'ppt',
        'Microsoft Excel': 'xls',
        'Microsoft Office Word': 'doc',
        'Microsoft Word 10.0': 'doc',
        'Microsoft Macintosh Word': 'doc',
    }
    output = check_output(['file', filepath])

    detected = None
    application = re.search('Name of Creating Application: ([^,]*),', output)
    if application and application.group(1) in extension_by_application:
        known_extension = extension_by_application[application.group(1)]
        detected = Formats.by_extension()[known_extension]
    elif re.search(': ESRI Shapefile', output):
        detected = Formats.by_extension()['shp']

    if detected is None:
        log.info('"file" could not determine file format of "%s": %s',
                 filepath, output)
        return None
    log.info('"file" detected file format: %s', detected['display_name'])
    return detected
Beispiel #8
0
def get_xml_variant_without_xml_declaration(buf, log):
    '''Work out which XML-based format a declaration-less buffer holds.

    The name of the first tag (after up to three leading characters and any
    whitespace) is lower-cased and looked up as a format extension, with
    "rdf:rdf" treated as "rdf" and "wms_capabilities" as "wms".

    Returns the matching format, the generic XML format when the tag name
    is not recognised, or None when no opening tag is found.
    '''
    first_tag = re.match(r'.{0,3}\s*<([^>\s]*)', buf)
    if not first_tag:
        log.debug('XML tags not found: %s', buf)
        return None

    tag_name = first_tag.groups()[-1].lower()
    # Map root element names onto the extensions the formats are keyed by.
    for element_name, extension in (('rdf:rdf', 'rdf'),
                                    ('wms_capabilities', 'wms')):
        tag_name = tag_name.replace(element_name, extension)

    if tag_name not in Formats.by_extension():
        log.warning('Did not recognise XML format: %s', tag_name)
        return Formats.by_extension()['xml']
    variant = Formats.by_extension()[tag_name]
    log.info('XML variant detected: %s', variant['display_name'])
    return variant
Beispiel #9
0
def is_html(buf, log):
    '''Return the HTML format if the buffer starts like HTML, else None.

    An <html> tag must open the buffer, optionally preceded by up to three
    arbitrary characters, whitespace, an XML declaration and a doctype.
    Matching is case-insensitive.
    '''
    html_opening = re.compile(
        r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>',
        re.IGNORECASE)
    if html_opening.match(buf) is None:
        log.debug('Not HTML')
        return None
    log.info('HTML tag detected')
    return Formats.by_extension()['html']
Beispiel #10
0
def is_iati(buf, log):
    '''If this buffer is IATI format, return that format type, else None.

    An <iati-activities> or <iati-organisations> tag must open the buffer,
    optionally preceded by up to three arbitrary characters, whitespace, an
    XML declaration and a doctype. Matching is case-insensitive.
    '''
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # The message needs a placeholder for buf: logging applies
    # "msg % args", which fails when an argument has nowhere to go.
    log.debug('Not IATI: %s', buf)
Beispiel #11
0
def get_xml_variant_without_xml_declaration(buf, log):
    '''Work out which XML-based format a declaration-less buffer holds.

    The name of the first tag (after up to three leading characters and any
    whitespace) is lower-cased and looked up as a format extension, with
    "rdf:rdf" treated as "rdf" and "wms_capabilities" as "wms".

    Returns the matching format, the generic XML format when the tag name
    is not recognised, or None when no opening tag is found.
    '''
    first_tag = re.match(r'.{0,3}\s*<([^>\s]*)', buf)
    if not first_tag:
        log.debug('XML tags not found: %s', buf)
        return None

    tag_name = first_tag.groups()[-1].lower()
    # Map root element names onto the extensions the formats are keyed by.
    for element_name, extension in (('rdf:rdf', 'rdf'),
                                    ('wms_capabilities', 'wms')):
        tag_name = tag_name.replace(element_name, extension)

    if tag_name not in Formats.by_extension():
        log.warning('Did not recognise XML format: %s', tag_name)
        return Formats.by_extension()['xml']
    variant = Formats.by_extension()[tag_name]
    log.info('XML variant detected: %s', variant['display_name'])
    return variant
Beispiel #12
0
 def _clean_format(cls, format_string):
     '''Normalise a format value.

     Non-string values are returned untouched. A string that matches a
     known format becomes that format's display name; any other string has
     the characters matched by cls._disallowed_characters removed and is
     then stripped of surrounding whitespace.
     '''
     if not isinstance(format_string, basestring):
         return format_string
     known_format = Formats.match(format_string)
     if known_format:
         return known_format['display_name']
     without_disallowed = re.sub(cls._disallowed_characters, '',
                                 format_string)
     return without_disallowed.strip()
Beispiel #13
0
def is_html(buf, log):
    '''Return the HTML format if the buffer starts like HTML, else None.

    An <html> tag must open the buffer, optionally preceded by up to three
    arbitrary characters, whitespace, an XML declaration and a doctype.
    Matching is case-insensitive.
    '''
    html_opening = re.compile(
        r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<html[^>]*>',
        re.IGNORECASE)
    if html_opening.match(buf) is None:
        log.debug('Not HTML')
        return None
    log.info('HTML tag detected')
    return Formats.by_extension()['html']
Beispiel #14
0
def is_iati(buf, log):
    '''If this buffer is IATI format, return that format type, else None.

    An <iati-activities> or <iati-organisations> tag must open the buffer,
    optionally preceded by up to three arbitrary characters, whitespace, an
    XML declaration and a doctype. Matching is case-insensitive.
    '''
    xml_re = r'.{0,3}\s*(<\?xml[^>]*>\s*)?(<!doctype[^>]*>\s*)?<iati-(activities|organisations)[^>]*>'
    match = re.match(xml_re, buf, re.IGNORECASE)
    if match:
        log.info('IATI tag detected')
        return Formats.by_extension()['iati']
    # The message needs a placeholder for buf: logging applies
    # "msg % args", which fails when an argument has nowhere to go.
    log.debug('Not IATI: %s', buf)
Beispiel #15
0
 def _clean_format(cls, format_string):
     '''Normalise a format value.

     Non-string values are returned untouched. A string that matches a
     known format becomes that format's display name; any other string has
     the characters matched by cls._disallowed_characters removed and is
     then stripped of surrounding whitespace.
     '''
     if not isinstance(format_string, basestring):
         return format_string
     known_format = Formats.match(format_string)
     if known_format:
         return known_format['display_name']
     without_disallowed = re.sub(cls._disallowed_characters, '',
                                 format_string)
     return without_disallowed.strip()
Beispiel #16
0
 def test_fugue_icons_exist(self):
     '''Every format that names an icon has a matching PNG on disk.'''
     # The icon folder is theme/public/images/fugue, three directory
     # levels above this file.
     base_dir = os.path.dirname(
         os.path.dirname(os.path.dirname(__file__)))
     icon_dir = os.path.join(base_dir, 'theme', 'public', 'images',
                             'fugue')
     assert os.path.isdir(icon_dir)
     available_icons = os.listdir(icon_dir)

     # document.png is required regardless of the format list.
     assert 'document.png' in available_icons, \
         'document.png not found in %s' % icon_dir
     for fmt in Formats.by_display_name().values():
         # Formats with a blank icon name have nothing to check.
         if fmt['icon'] != '':
             icon_name = fmt['icon'] + '.png'
             assert icon_name in available_icons, \
                 '%s not found in %s' % (icon_name, icon_dir)
Beispiel #17
0
def score_by_url_extension(resource, score_reasons, log):
    '''
    Looks at the URL for a resource to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    The candidate extensions come from extension_variants() applied to the
    stripped URL; they are tried in the order given and the first one that
    is a known format extension (compared lower-cased) wins.

    The log argument is accepted but not used here.

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    # Fetch the lookup table once and reuse it for every candidate.
    formats_by_extension = Formats.by_extension()
    extension_variants_ = extension_variants(resource.url.strip())
    if not extension_variants_:
        score_reasons.append('Nepoznata ekstenzija datoteke.')
        return (None, None)
    for extension in extension_variants_:
        # Single lookup instead of a membership test followed by a get().
        format_ = formats_by_extension.get(extension.lower())
        if format_ is not None:
            score = format_['openness']
            score_reasons.append(
                'URL ekstenzija "%s" je povezana s formatom "%s" i ima ocjenu: %s.'
                % (extension, format_['display_name'], score))
            return score, format_['display_name']
        score_reasons.append('URL ekstenzija "%s" je nepoznat format.' %
                             extension)
    return (None, None)
Beispiel #18
0
 def test_fugue_icons_exist(self):
     """Every format that names an icon has a matching PNG on disk."""
     # The icon folder is theme/public/images/fugue, three directory
     # levels above this file.
     base_dir = os.path.dirname(
         os.path.dirname(os.path.dirname(__file__)))
     icon_dir = os.path.join(base_dir, "theme", "public", "images",
                             "fugue")
     assert os.path.isdir(icon_dir)
     available_icons = os.listdir(icon_dir)

     # document.png is required regardless of the format list.
     assert "document.png" in available_icons, \
         "document.png not found in %s" % icon_dir
     for fmt in Formats.by_display_name().values():
         # Formats with a blank icon name have nothing to check.
         if fmt["icon"] != "":
             icon_name = fmt["icon"] + ".png"
             assert icon_name in available_icons, \
                 "%s not found in %s" % (icon_name, icon_dir)
Beispiel #19
0
 def test_by_mime_type(self):
     '''The text/x-json MIME type is indexed as the JSON format.'''
     json_format = Formats.by_mime_type()['text/x-json']
     assert_equal(json_format['display_name'], 'JSON')
Beispiel #20
0
 def test_reduce(self):
     '''Formats.reduce turns '.XLS ' into 'xls'.'''
     reduced = Formats.reduce('.XLS ')
     assert_equal(reduced, 'xls')
Beispiel #21
0
 def test_by_mime_type(self):
     """The text/x-json MIME type is indexed as the JSON format."""
     json_format = Formats.by_mime_type()["text/x-json"]
     assert_equal(json_format["display_name"], "JSON")
Beispiel #22
0
def set_sniffed_format(format_display_name):
    '''Set the module-level sniffed_format from a format display name.

    A falsy name resets sniffed_format to None; any other name is looked up
    in Formats.by_display_name().
    '''
    global sniffed_format
    sniffed_format = (Formats.by_display_name()[format_display_name]
                      if format_display_name else None)
Beispiel #23
0
 def test_by_display_name(self):
     """The JSON display name is indexed with extension "json"."""
     json_format = Formats.by_display_name()["JSON"]
     assert_equal(json_format["extension"], "json")
Beispiel #24
0
 def test_reduce(self):
     """Formats.reduce turns ".XLS " into "xls"."""
     reduced = Formats.reduce(".XLS ")
     assert_equal(reduced, "xls")
Beispiel #25
0
def sniff_file_format(filepath, log):
    '''For a given filepath, work out what file format it is.

    The MIME type reported by magic is the starting point. A handful of
    MIME types that magic reports ambiguously get a dedicated second look
    first; otherwise the MIME type is looked up in Formats.by_mime_type()
    and the result refined by inspecting the start of the file. When magic
    reports no MIME type at all, is_excel() and then the BSD "file" tool
    are tried instead.

    Returns a Format dict, e.g. {'display_name': 'CSV', ...}, or None if it
    can't tell what it is. The original note says the dict can carry a key
    such as 'container': 'zip' when the file is inside an archive; that key
    is not set in this function (presumably by get_zipped_format - TODO
    confirm).

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    format_ = None
    log.info('Sniffing file format of: %s', filepath)
    # magic is handed a UTF-8 byte string when filepath is unicode; the
    # open() calls below keep using filepath as given.
    filepath_utf8 = filepath.encode('utf8') if isinstance(filepath, unicode) \
                    else filepath
    mime_type = magic.from_file(filepath_utf8, mime=True)
    log.info('Magic detects file as: %s', mime_type)
    if mime_type:
        # Stage 1: MIME types that need a closer look before they can be
        # trusted.
        if mime_type == 'application/xml':
            with open(filepath) as f:
                buf = f.read(5000)
            format_ = get_xml_variant_including_xml_declaration(buf, log)
        elif mime_type == 'application/zip':
            format_ = get_zipped_format(filepath, log)
        elif mime_type == 'application/msword':
            # Magic gives this mime-type for other MS Office files too
            format_ = run_bsd_file(filepath, log)
            if not format_ and is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
        elif mime_type == 'application/octet-stream':
            # Excel files sometimes come up as this
            if is_excel(filepath, log):
                format_ = Formats.by_display_name()['XLS']
            else:
                # e.g. Shapefile
                format_ = run_bsd_file(filepath, log)
            if not format_:
                with open(filepath) as f:
                    buf = f.read(500)
                format_ = is_html(buf, log)
        elif mime_type == 'text/html':
            # Magic can mistake IATI for HTML
            with open(filepath) as f:
                buf = f.read(100)
            if is_iati(buf, log):
                format_ = Formats.by_display_name()['IATI']

        # A stage 1 result is final.
        if format_:
            return format_

        # Stage 2: plain lookup of the MIME type.
        format_ = Formats.by_mime_type().get(mime_type)

        if not format_:
            # Unknown text/* types are checked for JSON, CSV and PSV
            # content.
            if mime_type.startswith('text/'):
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']

        if not format_:
            log.warning('Mimetype not recognised by CKAN as a data format: %s',
                        mime_type)

        if format_:
            log.info('Mimetype translates to filetype: %s',
                     format_['display_name'])

            # Stage 3: TXT and HTML results are refined by looking at the
            # file content.
            if format_['display_name'] == 'TXT':
                # is it JSON?
                with open(filepath, 'rU') as f:
                    buf = f.read(10000)
                if is_json(buf, log):
                    format_ = Formats.by_extension()['json']
                # is it CSV?
                elif is_csv(buf, log):
                    format_ = Formats.by_extension()['csv']
                elif is_psv(buf, log):
                    format_ = Formats.by_extension()['psv']
                # XML files without the "<?xml ... ?>" tag end up here
                elif is_xml_but_without_declaration(buf, log):
                    format_ = get_xml_variant_without_xml_declaration(buf, log)
                elif is_ttl(buf, log):
                    format_ = Formats.by_extension()['ttl']

            elif format_['display_name'] == 'HTML':
                # maybe it has RDFa in it
                with open(filepath) as f:
                    buf = f.read(100000)
                if has_rdfa(buf, log):
                    format_ = Formats.by_display_name()['RDFa']

    else:
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            format_ = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not format_:
            format_ = run_bsd_file(filepath, log)

    if not format_:
        log.warning('Could not detect format of file: %s', filepath)
    return format_
Beispiel #26
0
 def test_by_extension(self):
     """The "json" extension is indexed as the JSON format."""
     json_format = Formats.by_extension()["json"]
     assert_equal(json_format["display_name"], "JSON")
Beispiel #27
0
        try:
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s', e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s', e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(
        int)  # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension,
                     filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
Beispiel #28
0
def set_sniffed_format(format_display_name):
    '''Set the module-level sniffed_format from a format display name.

    A falsy name resets sniffed_format to None; any other name is looked up
    in Formats.by_display_name().
    '''
    global sniffed_format
    sniffed_format = (Formats.by_display_name()[format_display_name]
                      if format_display_name else None)
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        The dataset is parsed from harvest_object.content. source_config is
        not used here. existing_dataset may be falsy; when given, the ids
        of its resources are reused for harvested resources with the same
        URL.

        Returns the merged package dict, with its extras converted by
        self.extras_from_dict().
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(
            title=inv_dataset['title'],
            notes=inv_dataset['description'],
            state='active' if inv_dataset['active'] else 'deleted',
            resources=[],
            extras={self.IDENTIFIER_KEY: inv_dataset['identifier'],
                    'harvest_source_reference': harvest_object.guid
                    }
            )
        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                for license_ in register.values():
                    if license_.url == rights:
                        pkg['license_id'] = license_.id
                        break
                else:
                    # No registered license has this URL, so save the
                    # harvested rights value as it is (not the register
                    # object it was compared against).
                    pkg['license_id'] = rights
                    log.info('Did not recognize license %r', rights)
        else:
            pkg['license_id'] = None

        # Resources
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # URL -> id of the resources already on the dataset, so that
        # re-harvested resources keep their ids.
        existing_resource_urls = dict((r.url, r.id)
                                      for r in existing_dataset.resources) \
                                 if existing_dataset else {}
        pkg['resources'] = []
        for inv_resource in inv_resources:
            # Use the display name of a known format, else fall back to
            # the raw mimetype.
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {'url': inv_resource['url'],
                   'format': format_,
                   'description': description,
                   'resource_type': resource_type,
                   'schema-url': schema_url,
                   'schema-type': schema_type,
                   }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(self.munge_title_to_name(
                '%s %s' % (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            # A secondary theme is only recorded when exactly two themes
            # come back.
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Beispiel #30
0
 def test_by_extension(self):
     '''The 'json' extension is indexed as the JSON format.'''
     json_format = Formats.by_extension()['json']
     assert_equal(json_format['display_name'], 'JSON')
Beispiel #31
0
    def get_pkg_dict(cls, resource, ldr):
        '''
        Build a package dict for a Linked Data Registry resource.

        An active package whose active 'registry_uri' extra equals the
        resource's URI decides between updating and creating: its id and
        name are reused when found, otherwise a new id and name are
        generated.

        The package gets two resources per serialisation (RDF ttl, RDF/XML,
        JSON-LD) - the item itself and the item with metadata - plus one
        documentation resource pointing at the URI itself.

        Returns a tuple (pkg_dict, action), where action is 'update' when a
        matching package exists and 'new' otherwise. The ldr argument is
        not used in this method.
        '''
        from ckan import model

        pkg_dict = OrderedDict()
        extras = OrderedDict()
        uri = str(resource.identifier)
        pkg_dict['title'] = unicode(resource[RDFS.label].next())
        extras['registry_uri'] = uri

        # Create or update?
        pkg = model.Session.query(model.Package) \
                .filter_by(state='active') \
                .join(model.PackageExtra) \
                .filter_by(state='active') \
                .filter_by(key='registry_uri') \
                .filter_by(value=uri).first()
        if pkg:
            pkg_dict['id'] = pkg.id
            pkg_dict['name'] = pkg.name
            action = 'update'
        else:
            pkg_dict['id'] = unicode(uuid.uuid4())
            pkg_dict['name'] = cls._gen_new_name(pkg_dict['title'])
            action = 'new'

        dgu_type = cls.get_dgu_type(resource)
        extras['data_standard_type'] = dgu_type

        resources = []
        # (label used in descriptions, value of the _format URL parameter,
        #  format name recorded on the resource)
        serialisations = (
            ('RDF ttl', 'ttl', 'RDF'),
            ('RDF/XML', 'rdf', 'RDF'),
            ('JSON-LD', 'jsonld', 'JSON'),
        )
        for format_display, format_extension, format_dgu in serialisations:
            url = uri + '?_format=%s' % format_extension
            # Sanity check that the hard-coded format names are known.
            assert format_dgu in Formats().by_display_name()
            resources.append({
                'description': '%s as %s' % (dgu_type, format_display),
                'url': url,
                'format': format_dgu,
                'resource_type': 'file',
            })
            resources.append({
                'description':
                '%s and metadata as %s' % (dgu_type, format_display),
                'url': url + METADATA_PARAM,
                'format': format_dgu,
                'resource_type': 'file',
            })

        pkg_dict['notes'] = unicode(resource[DCT.description].next())
        licence_url = str(resource[DCT.license].next())
        if 'open-government-licence' in licence_url:
            # CKAN's package field is spelt 'license_id'; the previous
            # 'licence_id' key did not set the package licence.
            pkg_dict['license_id'] = 'uk-ogl'
        else:
            extras['licence_url'] = licence_url
            # not sure how this will display as just as URL
        pkg_dict['owner_org'] = cls.get_publisher(resource).id
        resources.append({
            'description':
            'Web page for this %s on a Linked Data Registry' % dgu_type,
            'url': uri,
            'format': 'HTML',
            'resource_type': 'documentation',
        })
        metadata = cls.get_resource_metadata(uri)
        status = metadata[REG.status].next()
        # Keep only the part of the status after the last '#'.
        extras['status'] = str(status).split('#')[-1]
        extras['harvested_version'] = str(metadata[OWL.versionInfo].next())
        pkg_dict['type'] = 'data-standard'

        pkg_dict['extras'] = [{
            'key': k,
            'value': v
        } for k, v in extras.items()]
        pkg_dict['resources'] = resources
        return pkg_dict, action
Beispiel #32
0
 def test_by_display_name(self):
     '''The JSON display name is indexed with extension 'json'.'''
     json_format = Formats.by_display_name()['JSON']
     assert_equal(json_format['extension'], 'json')
Beispiel #33
0
            filenames = zip.namelist()
        finally:
            zip.close()
    except zipfile.BadZipfile, e:
        log.info('Zip file open raised error %s: %s',
                    e, e.args)
        return
    except Exception, e:
        log.warning('Zip file open raised exception %s: %s',
                    e, e.args)
        return
    top_score = 0
    top_scoring_extension_counts = defaultdict(int) # extension: number_of_files
    for filename in filenames:
        extension = os.path.splitext(filename)[-1][1:].lower()
        if extension in Formats.by_extension():
            format_ = Formats.by_extension()[extension]
            if format_['openness'] > top_score:
                top_score = format_['openness']
                top_scoring_extension_counts = defaultdict(int)
            if format_['openness'] == top_score:
                top_scoring_extension_counts[extension] += 1
        else:
            log.info('Zipped file of unknown extension: "%s" (%s)', extension, filepath)
    if not top_scoring_extension_counts:
        log.info('Zip has no known extensions: %s', filepath)
        return Formats.by_display_name()['Zip']

    top_scoring_extension_counts = sorted(top_scoring_extension_counts.items(),
                                          key=lambda x: x[1])
    top_extension = top_scoring_extension_counts[-1][0]
Beispiel #34
0
    def get_package_dict(self, harvest_object, package_dict_defaults,
                         source_config, existing_dataset):
        '''
        Constructs a package_dict suitable to be passed to package_create or
        package_update. See documentation on
        ckan.logic.action.create.package_create for more details

        The dataset is parsed from the inventory XML held in
        harvest_object.content.

        * name - a new package must have a unique name; if it had a name in the
          previous harvest, that will be in the package_dict_defaults.
        * resource.id - should be the same as the old object if updating a
          package
        * errors - call self._save_object_error() and return False
        * default values for name, owner_org, tags etc can be merged in using:
            package_dict = package_dict_defaults.merge(package_dict_harvested)

        Params:
          * harvest_object - its content, guid and job.source.publisher_id
            are read
          * package_dict_defaults - the harvested values are merged into it
            with its merge() method
          * source_config - not used by this method
          * existing_dataset - the dataset being updated, or a false value
            when there is none; its resources supply the ids that are reused
            for resources whose URL is unchanged

        Returns the package dict, with its extras converted by
        self.extras_from_dict().
        '''
        inv_dataset = InventoryDocument.dataset_to_dict(
            InventoryDocument.parse_xml_string(harvest_object.content))

        pkg = dict(title=inv_dataset['title'],
                   notes=inv_dataset['description'],
                   state='active' if inv_dataset['active'] else 'deleted',
                   resources=[],
                   extras={
                       self.IDENTIFIER_KEY: inv_dataset['identifier'],
                       'harvest_source_reference': harvest_object.guid
                   })

        # License
        rights = inv_dataset.get('rights')
        if rights:
            register = model.Package.get_license_register()
            if rights == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/':
                pkg['license_id'] = 'uk-ogl'
            else:
                # Look for a registered license with this URL
                for license_ in register.values():
                    if license_.url == rights:
                        pkg['license_id'] = license_.id
                        break
                else:
                    # No match - just save the rights value as it is.
                    # (Previously the whole license register object was
                    # stored and logged here instead of the rights value.)
                    pkg['license_id'] = rights
                    log.info('Did not recognize license %r', rights)
        else:
            pkg['license_id'] = None

        # Resources - only the active ones are harvested
        inv_resources = [r for r in inv_dataset['resources'] if r['active']]
        # Map of URL to id for the resources already on the dataset, so that
        # a resource with an unchanged URL keeps its id
        if existing_dataset:
            existing_resource_urls = dict((r.url, r.id)
                                          for r in existing_dataset.resources)
        else:
            existing_resource_urls = {}
        for inv_resource in inv_resources:
            # Use the format's display name if the mimetype is a known one,
            # otherwise fall back to the raw mimetype
            format_ = Formats.by_mime_type().get(inv_resource['mimetype'])
            if format_:
                format_ = format_['display_name']
            else:
                format_ = inv_resource['mimetype']
            description = inv_resource['title']
            if inv_resource['availability']:
                description += ' - %s' % inv_resource['availability']
            # if it is temporal, it should be a timeseries,
            # if it is not data, it should be an additional resource
            resource_type = 'file' if inv_resource['resource_type'] == 'Data' \
                else 'documentation'
            # Schema
            if inv_resource['conforms_to']:
                schema_url = inv_resource['conforms_to']
                schema_type = SCHEMA_TYPE_MAP.get(format_)
            else:
                schema_url = schema_type = ''
            res = {
                'url': inv_resource['url'],
                'format': format_,
                'description': description,
                'resource_type': resource_type,
                'schema-url': schema_url,
                'schema-type': schema_type,
            }
            if res['url'] in existing_resource_urls:
                res['id'] = existing_resource_urls[res['url']]
            pkg['resources'].append(res)

        # Local Authority Services and Functions
        if inv_dataset['services']:
            log.info('Local Authority Services: %r', inv_dataset['services'])
            # e.g. {http://id.esd.org.uk/service/190}
            pkg['extras']['la_service'] = ' '.join(inv_dataset['services'])
        else:
            pkg['extras']['la_service'] = ''
        if inv_dataset['functions']:
            log.info('Local Authority Functions %r', inv_dataset['functions'])
            pkg['extras']['la_function'] = ' '.join(inv_dataset['functions'])
        else:
            pkg['extras']['la_function'] = ''

        pkg = package_dict_defaults.merge(pkg)
        if not pkg.get('name'):
            # append the publisher name to differentiate similar titles better
            # than just a numbers suffix
            publisher = model.Group.get(harvest_object.job.source.publisher_id)
            publisher_abbrev = self._get_publisher_abbreviation(publisher)
            pkg['name'] = self.check_name(
                self.munge_title_to_name('%s %s' %
                                         (pkg['title'], publisher_abbrev)))

        # Themes based on services/functions
        if 'tags' not in pkg:
            pkg['tags'] = []
        themes = dgutheme.categorize_package(pkg)
        log.debug('%s given themes: %r', pkg['name'], themes)
        if themes:
            pkg['extras'][dgutheme.PRIMARY_THEME] = themes[0]
            # NOTE(review): a secondary theme is only stored when exactly two
            # themes come back - confirm categorize_package never returns more
            if len(themes) == 2:
                pkg['extras'][dgutheme.SECONDARY_THEMES] = '["%s"]' % themes[1]

        pkg['extras'] = self.extras_from_dict(pkg['extras'])
        return pkg
Beispiel #35
0
def sniff_file_format(filepath, log):
    '''Work out the file format of the file stored at the given filepath.

    Returns a Format dict with a key to say if it is contained
    in a zip or something.
    e.g. {'display_name': 'CSV',
          'container': 'zip',
           ...}
    or None if it can\'t tell what it is.

    The mime-type reported by Magic is the starting point. Certain
    mime-types get a closer look at the file first; otherwise the mime-type
    is looked up in Formats, and a result of TXT or HTML is refined by
    examining the start of the file.

    Note, log is a logger, either a Celery one or a standard
    Python logging one.
    '''
    def read_start(length, mode='r'):
        # The first `length` characters of the file
        with open(filepath, mode) as f:
            return f.read(length)

    def sniff_text_extension(text):
        # Extension key for text that looks like JSON, CSV or PSV (tried in
        # that order), or None if it looks like none of them
        if is_json(text, log):
            return 'json'
        if is_csv(text, log):
            return 'csv'
        if is_psv(text, log):
            return 'psv'
        return None

    log.info('Sniffing file format of: %s', filepath)
    if isinstance(filepath, unicode):
        magic_path = filepath.encode('utf8')
    else:
        magic_path = filepath
    mime_type = magic.from_file(magic_path, mime=True)
    log.info('Magic detects file as: %s', mime_type)

    if not mime_type:
        detected = None
        # Excel files sometimes not picked up by magic, so try alternative
        if is_excel(filepath, log):
            detected = Formats.by_display_name()['XLS']
        # BSD file picks up some files that Magic misses
        # e.g. some MS Word files
        if not detected:
            detected = run_bsd_file(filepath, log)
        if not detected:
            log.warning('Could not detect format of file: %s', filepath)
        return detected

    # Mime-types that need a closer look at the file itself
    detected = None
    if mime_type == 'application/xml':
        start = read_start(5000)
        detected = get_xml_variant_including_xml_declaration(start, log)
    elif mime_type == 'application/zip':
        detected = get_zipped_format(filepath, log)
    elif mime_type == 'application/msword':
        # Magic gives this mime-type for other MS Office files too
        detected = run_bsd_file(filepath, log)
        if not detected and is_excel(filepath, log):
            detected = Formats.by_display_name()['XLS']
    elif mime_type == 'application/octet-stream':
        # Excel files sometimes come up as this
        if is_excel(filepath, log):
            detected = Formats.by_display_name()['XLS']
        else:
            # e.g. Shapefile
            detected = run_bsd_file(filepath, log)
        if not detected:
            start = read_start(500)
            detected = is_html(start, log)
    elif mime_type == 'text/html':
        # Magic can mistake IATI for HTML
        start = read_start(100)
        if is_iati(start, log):
            detected = Formats.by_display_name()['IATI']

    if detected:
        return detected

    # Otherwise go by the mime-type, with a look at the content of text
    # files whose mime-type is not a known one
    detected = Formats.by_mime_type().get(mime_type)
    if not detected and mime_type.startswith('text/'):
        start = read_start(10000, 'rU')
        extension = sniff_text_extension(start)
        if extension is not None:
            detected = Formats.by_extension()[extension]

    if not detected:
        log.warning('Mimetype not recognised by CKAN as a data format: %s', mime_type)
        log.warning('Could not detect format of file: %s', filepath)
        return detected

    display_name = detected['display_name']
    log.info('Mimetype translates to filetype: %s', display_name)

    if display_name == 'TXT':
        # Plain text may really be a more specific format
        start = read_start(10000, 'rU')
        extension = sniff_text_extension(start)
        if extension is not None:
            detected = Formats.by_extension()[extension]
        # XML files without the "<?xml ... ?>" tag end up here
        elif is_xml_but_without_declaration(start, log):
            detected = get_xml_variant_without_xml_declaration(start, log)
        elif is_ttl(start, log):
            detected = Formats.by_extension()['ttl']
    elif display_name == 'HTML':
        # maybe it has RDFa in it
        start = read_start(100000)
        if has_rdfa(start, log):
            detected = Formats.by_display_name()['RDFa']

    # The refinement above may have replaced the format with a false value
    if not detected:
        log.warning('Could not detect format of file: %s', filepath)
    return detected