Ejemplo n.º 1
0
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from one harvested OAI-PMH record.

    Args:
        data: Mapping describing the harvested record. The keys read here
            are 'identifier', 'metadata' (which must contain an 'oai_dc'
            mapping), 'package_name', 'package_url', 'package_resource'
            and 'package_xml_save'.
        namespaces: Passed through unchanged to the ``_handle_*`` helpers.
        group: Object exposing ``add_package_by_name``; skipped when falsy.
        harvest_object: Harvest object to link to the package and mark as
            current; skipped when falsy.

    Returns:
        The id of the created or updated package.

    Raises:
        KeyError: If one of the required keys of ``data`` is missing.
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # The first title becomes pkg.title; all titles also stay in extras so
    # the UI has something to show in any case.
    title = titles.get('title_0', identifier)
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Resources are re-created below from the fresh metadata, so mark
        # every existing resource as deleted first.
        for old_resource in pkg.resources:
            old_resource.state = 'deleted'

    # The titles dict doubles as the extras dict (same object, not a copy).
    extras = titles

    url_prefixes = ('http://', 'https://')
    source_idx = 0
    for field in ('subject', 'type'):
        for raw_tag in metadata_oai_dc.get(field, []):
            # Turn each subject or type field into its own tag.
            tag_value = raw_tag.strip()
            if tag_value.startswith('http://www.yso.fi'):
                # YSO URIs are replaced by whatever label_list_yso returns;
                # the URI itself is kept in extras.
                tag_names = label_list_yso(tag_value)
                extras['tag_source_%i' % source_idx] = tag_value
                source_idx += 1
            elif tag_value.startswith(url_prefixes):
                extras['tag_source_%i' % source_idx] = tag_value
                source_idx += 1
                tag_names = []  # URL tags break links in UI.
            else:
                tag_names = [tag_value]
            for tag_name in tag_names:
                tag_name = tag_name[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tag_name)
                if not tag_obj:
                    tag_obj = model.Tag(name=tag_name)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    # Only link when missing, so repeated tags in the
                    # metadata do not create duplicate rows.
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()

    for author_idx, author in enumerate(metadata_oai_dc.get('creator', [])):
        extras['organization_%d' % author_idx] = ''
        extras['author_%d' % author_idx] = author

    extras.update(_handle_contributor(
        metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(
        metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs on the package itself, not in extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(
        metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # formatNode entries are deliberately not turned into resources: doing
    # so caused the commit to fail.

    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    # Read them once with a safe default; they are reused for the link
    # resources below, which previously raised KeyError when the record had
    # no 'identifier' field.
    identifiers = metadata_oai_dc.get('identifier') or []
    for id_idx, ident in enumerate(identifiers):
        extras['identifier_%i' % id_idx] = ident

    # Only accept a language whose first entry is longer than one character.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']

    # Every identifier that is a web link becomes an HTML resource.
    for ident in identifiers:
        if ident.startswith(url_prefixes):
            pkg.add_resource(ident, name=pkg.title, format='html')
    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)

    # The rest: copy the remaining fields of every metadata format into
    # extras, skipping the ones handled above. The description goes to
    # pkg.notes instead.
    # NOTE(review): these keys are written to the local dict after it was
    # assigned to pkg.extras; whether they reach the package depends on how
    # pkg.extras stores that assignment - confirm.
    handled_keys = ('titleNode', 'subject', 'type', 'rightsNode',
                    'publisherNode', 'creator', 'contributorNode',
                    'description', 'identifier', 'language', 'formatNode')
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in handled_keys:
                continue
            extras[key] = ' '.join(value)
        # With several metadata formats the last one iterated wins.
        notes = ' '.join(metadata.get('description', []))
        pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')

    # Store the XML saved for each metadata format and attach the matching
    # resource to the package.
    for mdp, resource in data['package_resource'].items():
        ofs = get_ofs()
        xml_save = data['package_xml_save'][mdp]
        ofs.put_stream(BUCKET, xml_save['label'], xml_save['xml'], {})
        pkg.add_resource(**resource)

    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    model.repo.commit()
    return pkg.id
Ejemplo n.º 2
0
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update a CKAN package from one harvested OAI-PMH record.

    Args:
        data: Mapping describing the harvested record. The keys read here
            are 'identifier', 'metadata', 'package_name' and 'package_url',
            plus the optional 'package_resource' and 'package_xml_save'.
        namespaces: Passed through unchanged to the ``_handle_*`` helpers.
        group: Object exposing ``add_package_by_name``; skipped when None.
        harvest_object: Harvest object to link to the package and mark as
            current; skipped when None.

    Returns:
        The id of the created or updated package.

    Raises:
        KeyError: If one of the required keys of ``data`` is missing.
    """
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Use the first title as pkg.title and fall back to the identifier when
    # the record has no title. Indexing the fallback itself would yield only
    # the first character of the identifier, and an empty title list would
    # raise IndexError.
    record_titles = metadata.get('title') or []
    title = record_titles[0] if record_titles else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Resources are re-created below from the fresh metadata, so mark
        # every existing resource as deleted first.
        for old_resource in pkg.resources:
            old_resource.state = 'deleted'

    extras = {}
    url_prefixes = ('http://', 'https://')
    source_idx = 0
    for field in ('subject', 'type'):
        for raw_tag in metadata.get(field, []):
            # Turn each subject or type field into its own tag.
            tag_value = raw_tag.strip()
            if tag_value.startswith(url_prefixes):
                extras['tag_source_%i' % source_idx] = tag_value
                source_idx += 1
                tag_names = []  # URL tags break links in UI.
            else:
                tag_names = [tag_value]
            for tag_name in tag_names:
                tag_name = tag_name[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tag_name)
                if not tag_obj:
                    tag_obj = model.Tag(name=tag_name)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                        model.PackageTag.tag_id == tag_obj.id
                    ).limit(1).first()
                if pkgtag is None:
                    # Only link when missing, so repeated tags in the
                    # metadata do not create duplicate rows.
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()

    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs on the package itself, not in extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']

    # Only accept a language whose first entry is longer than one character.
    lang = metadata.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]

    # The rest: copy the first value of every remaining field into extras.
    # The description goes to pkg.notes instead.
    skipped_keys = (
        'title',
        'description',
        'publisherNode',
        'contributorNode',
        'formatNode',
        'identifier',
        'source',
        'rightsNode',
    )
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in skipped_keys:
            continue
        extras[key] = value[0]
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        # The date is used as the package version and kept as 'modified'.
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']

    if 'package_resource' in data:
        # Best effort: store the saved XML and attach its resource. A
        # missing key skips the step, but is logged rather than hidden.
        try:
            ofs = get_ofs()
            xml_save = data['package_xml_save']
            ofs.put_stream(BUCKET, xml_save['label'], xml_save['xml'], {})
            pkg.add_resource(**(data['package_resource']))
        except KeyError as err:
            log.debug('Skipping XML resource for %s, missing key: %s'
                      % (name, err))

    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    # Metadata may have different identifiers; every web link among them
    # becomes a resource.
    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv',
                         'txt', 'xml', 'json', 'html']
    default_format = 'html'

    # A record without an 'identifier' field simply gets no link resources
    # instead of raising KeyError.
    for link in metadata.get('identifier') or []:
        if not link.startswith(url_prefixes):
            continue
        # The end of the URL must be the format, otherwise "html" is used.
        inferred_format = default_format
        for ext in available_formats:
            if link.endswith(ext):
                inferred_format = ext
        pkg.add_resource(link, name=pkg.title, format=inferred_format)

    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id