def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update the CKAN package for one harvested OAI-PMH record.

    The ``oai_dc`` metadata found in ``data`` is mapped onto a ``Package``:
    title, tags, extras, maintainer e-mail, license, language, version,
    URL, notes and resources. The work is done inside a new repository
    revision which is committed before returning.

    :param data: dict describing one record. Keys read here are
        ``identifier``, ``metadata`` (a dict of per-format metadata dicts
        that must contain ``oai_dc``), ``package_name``, ``package_url``,
        ``package_resource`` and ``package_xml_save`` (the latter indexed
        by the same keys as ``package_resource``).
    :param namespaces: passed through unchanged to the ``_handle_*``
        helpers.
    :param group: object providing ``add_package_by_name``; ignored when
        falsy.
    :param harvest_object: harvest object to attach to the package;
        ignored when falsy.
    :returns: the ``id`` of the created or updated package.
    """
    url_prefixes = ('http://', 'https://')
    # Keys that are mapped explicitly and must not be copied verbatim
    # into the extras by the generic loop near the end.
    handled_keys = (
        'titleNode', 'subject', 'type', 'rightsNode', 'publisherNode',
        'creator', 'contributorNode', 'description', 'identifier',
        'language', 'formatNode',
    )

    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store title in pkg.title and keep all in extras as well. That way
    # UI will work some way in any case.
    title = titles.get('title_0', identifier)
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')

    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s', name)
        # Old resources are replaced by the ones added below, so mark
        # every existing resource as deleted first.
        for old_resource in pkg.resources:
            old_resource.state = 'deleted'

    # The title entries double as the initial extras (same dict object).
    extras = titles

    # Turn each subject or type field into its own tag.
    source_idx = 0
    for field in ('subject', 'type'):
        # ``or []`` guards against a key that is present but None.
        for raw_tag in metadata_oai_dc.get(field) or []:
            tag_text = raw_tag.strip()
            if tag_text.startswith('http://www.yso.fi'):
                tag_names = label_list_yso(tag_text)
                extras['tag_source_%i' % source_idx] = tag_text
                source_idx += 1
            elif tag_text.startswith(url_prefixes):
                extras['tag_source_%i' % source_idx] = tag_text
                source_idx += 1
                tag_names = []  # URL tags break links in UI.
            else:
                tag_names = [tag_text]
            for tag_name in tag_names:
                tag_name = tag_name[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tag_name)
                if not tag_obj:
                    tag_obj = model.Tag(name=tag_name)
                    tag_obj.save()
                # Only link the tag when it is not linked already; this
                # avoids duplicates if the metadata repeats a tag.
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()

    for author_idx, author in enumerate(metadata_oai_dc.get('creator') or []):
        extras['organization_%d' % author_idx] = ''
        extras['author_%d' % author_idx] = author

    extras.update(_handle_contributor(
        metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(
        metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs on the package itself, not in the extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras.pop('package.maintainer_email')
    extras.update(_handle_rights(
        metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras.pop('package.license')

    # NOTE: ``formatNode`` entries are deliberately not added as
    # resources; doing so caused failures in commit.

    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    # A missing or None 'identifier' is treated as "no identifiers" for
    # both the extras and the resource links below.
    identifiers = metadata_oai_dc.get('identifier') or []
    for id_idx, ident in enumerate(identifiers):
        extras['identifier_%i' % id_idx] = ident

    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]

    if 'date' in extras:
        pkg.version = extras.pop('date')

    # NOTE(review): more keys are added to ``extras`` by the generic loop
    # further down; if assigning to ``pkg.extras`` copies the dict instead
    # of referencing it, those later keys are not stored -- confirm.
    pkg.extras = extras
    pkg.url = data['package_url']

    # Metadata may have different identifiers, pick link, if exists.
    for ident in identifiers:
        if ident.startswith(url_prefixes):
            pkg.add_resource(ident, name=pkg.title, format='html')

    # All belong to the main group even if they do not belong to any set.
    if group:
        group.add_package_by_name(pkg.name)

    # The rest. Description goes to pkg.notes below, so it is not copied
    # into the extras here.
    for metadata in data['metadata'].values():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in handled_keys:
                continue
            extras[key] = ' '.join(value)

    # NOTE(review): ``metadata`` is whichever format the loop above
    # visited last, so with several formats the source of the notes
    # depends on dict ordering -- confirm this is intended.
    notes = ' '.join(metadata.get('description') or [])
    # Flatten line breaks and collapse the double spaces they leave.
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')

    for mdp, resource in data['package_resource'].items():
        saved_xml = data['package_xml_save'][mdp]
        ofs = get_ofs()
        ofs.put_stream(BUCKET, saved_xml['label'], saved_xml['xml'], {})
        pkg.add_resource(**resource)

    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    model.repo.commit()
    return pkg.id
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    """Create or update the CKAN package for one harvested OAI-PMH record.

    The Dublin Core metadata in ``data['metadata']`` is mapped onto a
    ``Package``: title, tags, extras, maintainer e-mail, license,
    language, notes, version, URL and resources. The work is done inside
    a new repository revision which is committed before returning.

    :param data: dict describing one record. Keys read here are
        ``identifier``, ``metadata``, ``package_name``, ``package_url``
        and, optionally, ``package_resource`` together with
        ``package_xml_save`` (a dict with ``label`` and ``xml``).
    :param namespaces: passed through unchanged to the ``_handle_*``
        helpers.
    :param group: object providing ``add_package_by_name``; ignored when
        ``None``.
    :param harvest_object: harvest object to attach to the package;
        ignored when ``None``.
    :returns: the ``id`` of the created or updated package.
    """
    url_prefixes = ('http://', 'https://')
    # Keys that are mapped explicitly and must not be copied verbatim
    # into the extras by the generic loop.
    handled_keys = (
        'title', 'description', 'publisherNode', 'contributorNode',
        'formatNode', 'identifier', 'source', 'rightsNode',
    )

    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']

    # Use the first title when there is one; otherwise fall back to the
    # whole identifier. Indexing the fallback itself would yield only the
    # identifier's first character, and an empty or None title list would
    # raise.
    title_values = metadata.get('title') or []
    title = title_values[0] if title_values else identifier
    name = data['package_name']

    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s', name)
        # Old resources are replaced by the ones added below, so mark
        # every existing resource as deleted first.
        for old_resource in pkg.resources:
            old_resource.state = 'deleted'

    extras = {}

    # Turn each subject or type field into its own tag.
    source_idx = 0
    for field in ('subject', 'type'):
        # ``or []`` guards against a key that is present but None.
        for raw_tag in metadata.get(field) or []:
            tag_text = raw_tag.strip()
            if tag_text.startswith(url_prefixes):
                extras['tag_source_%i' % source_idx] = tag_text
                source_idx += 1
                continue  # URL tags break links in UI.
            tag_name = tag_text[:100]  # 100 char limit in DB.
            tag_obj = model.Tag.by_name(tag_name)
            if not tag_obj:
                tag_obj = model.Tag(name=tag_name)
                tag_obj.save()
            # Only link the tag when it is not linked already; this
            # avoids duplicates if the metadata repeats a tag.
            pkgtag = model.Session.query(model.PackageTag).filter(
                model.PackageTag.package_id == pkg.id).filter(
                model.PackageTag.tag_id == tag_obj.id).limit(1).first()
            if pkgtag is None:
                pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                pkgtag.save()

    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs on the package itself, not in the extras.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras.pop('package.maintainer_email')
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras.pop('package.license')

    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]

    # The rest: keep the first value of every remaining field as an
    # extra. Description goes to pkg.notes below, so it is skipped here.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in handled_keys:
            continue
        extras[key] = value[0]

    notes = ' '.join(metadata.get('description') or [])
    # Flatten line breaks and collapse the double spaces they leave.
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')

    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras.pop('date')

    pkg.extras = extras
    pkg.url = data['package_url']

    if 'package_resource' in data:
        # Best effort: without the saved XML there is nothing to store
        # and the resource is skipped. Only the lookups are guarded so
        # that a KeyError raised by the storage or the model is not
        # silently hidden.
        try:
            saved_xml = data['package_xml_save']
            label = saved_xml['label']
            xml = saved_xml['xml']
        except KeyError:
            log.debug('No saved XML for %s, package resource skipped', name)
        else:
            ofs = get_ofs()
            ofs.put_stream(BUCKET, label, xml, {})
            pkg.add_resource(**data['package_resource'])

    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()

    # Metadata may have different identifiers, pick link, if exists.
    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # "Data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (default format is "html"). For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv',
                         'txt', 'xml', 'json', 'html']
    default_format = 'html'
    # A missing or None 'identifier' is treated as "no identifiers".
    for ident in metadata.get('identifier') or []:
        if not ident.startswith(url_prefixes):
            continue
        # The end of the URL must be the format, otherwise "html" is used
        # by default. When several formats match, the last one listed
        # wins.
        matching = [ext for ext in available_formats if ident.endswith(ext)]
        infer_format = matching[-1] if matching else default_format
        pkg.add_resource(ident, name=pkg.title, format=infer_format)

    # All belong to the main group even if they do not belong to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)

    model.repo.commit()
    return pkg.id