def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata_oai_dc = data['metadata']['oai_dc']
    titles = _handle_title(metadata_oai_dc.get('titleNode', []), namespaces)
    # Store the title in pkg.title and keep all titles in extras as well.
    # That way the UI will work in some fashion in any case.
    title = titles.get('title_0', identifier)
    #title = metadata['title'][0] if len(metadata['title']) else identifier
    name = data['package_name']
    esc_identifier = identifier.replace('/', '-')
    pkg = Package.get(esc_identifier)
    if not pkg:
        pkg = Package(name=name, title=title, id=esc_identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Old resources are replaced by new ones if they are still
        # relevant, so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = titles
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata_oai_dc.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://www.yso.fi'):
                tags = label_list_yso(tagi)
                extras['tag_source_%i' % idx] = tagi
                idx += 1
            elif tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                #tagi = munge_tag(tagi[:100])  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    lastidx = 0
    for auth in metadata_oai_dc.get('creator', []):
        extras['organization_%d' % lastidx] = ''
        extras['author_%d' % lastidx] = auth
        lastidx += 1
    extras.update(_handle_contributor(
        metadata_oai_dc.get('contributorNode', []), namespaces))
    extras.update(_handle_publisher(
        metadata_oai_dc.get('publisherNode', []), namespaces))
    # This value belongs elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(
        metadata_oai_dc.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Causes failure in commit for some reason.
    #for f in _handle_format(metadata.get('formatNode', []), namespaces):
    #    pprint.pprint(f)
    #    pkg.add_resource(**f)
    # There may be multiple identifiers (URL, ISBN, ...) in the metadata.
    id_idx = 0
    for ident in metadata_oai_dc.get('identifier', []):
        extras['identifier_%i' % id_idx] = ident
        id_idx += 1
    # Check that we have a language.
    lang = metadata_oai_dc.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]
    if 'date' in extras:
        pkg.version = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    # Metadata may have several identifiers; pick the links, if any exist.
    for ids in metadata_oai_dc['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            pkg.add_resource(ids, name=pkg.title, format='html')
    # All packages belong to the main group even if they do not belong
    # to any set.
    if group:
        group.add_package_by_name(pkg.name)
    # The rest of the metadata. The description goes to pkg.notes below;
    # I think it should not be added here as well.
    for mdp, metadata in data['metadata'].items():
        for key, value in metadata.items():
            if value is None or len(value) == 0 or key in (
                    'titleNode', 'subject', 'type', 'rightsNode',
                    'publisherNode', 'creator', 'contributorNode',
                    'description', 'identifier', 'language', 'formatNode'):
                continue
            extras[key] = ' '.join(value)
    #description = metadata['description'][0] if len(metadata['description']) else ''
    # Note: `metadata` here is left over from the last loop iteration above.
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    for mdp, resource in data['package_resource'].items():
        ofs = get_ofs()
        ofs.put_stream(BUCKET,
                       data['package_xml_save'][mdp]['label'],
                       data['package_xml_save'][mdp]['xml'], {})
        pkg.add_resource(**resource)
    if harvest_object:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    model.repo.commit()
    return pkg.id
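# For reference, a minimal sketch of the `data` dict the function above
# expects. The keys are the ones it actually reads; the values are
# hypothetical, since the real dict is assembled by the harvester's fetch
# stage:
#
# data = {
#     'identifier': 'oai:example.org:12345',
#     'package_name': 'example-dataset',
#     'package_url': 'http://example.org/dataset/12345',
#     'metadata': {
#         'oai_dc': {'titleNode': [...], 'subject': [...], 'creator': [...],
#                    'identifier': ['http://example.org/dataset/12345']},
#     },
#     'package_resource': {'oai_dc': {'url': '...', 'format': 'xml'}},
#     'package_xml_save': {'oai_dc': {'label': '...', 'xml': '<record/>'}},
# }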
def _oai_dc2ckan(data, namespaces, group, harvest_object):
    model.repo.new_revision()
    identifier = data['identifier']
    metadata = data['metadata']
    # Store the title in pkg.title and keep all titles in extras as well.
    # That way the UI will work in some fashion in any case.
    # Fall back to the identifier when no title is present.
    title = metadata['title'][0] if metadata.get('title') else identifier
    name = data['package_name']
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name, title=title, id=identifier)
        pkg.save()
        setup_default_user_roles(pkg)
    else:
        log.debug('Updating: %s' % name)
        # Old resources are replaced by new ones if they are still
        # relevant, so "delete" all existing resources now.
        for r in pkg.resources:
            r.state = 'deleted'
    extras = {}
    idx = 0
    for s in ('subject', 'type'):
        for tag in metadata.get(s, []):
            # Turn each subject or type field into its own tag.
            tagi = tag.strip()
            if tagi.startswith('http://') or tagi.startswith('https://'):
                extras['tag_source_%i' % idx] = tagi
                idx += 1
                tags = []  # URL tags break links in UI.
            else:
                tags = [tagi]
            for tagi in tags:
                tagi = tagi[:100]  # 100 char limit in DB.
                tag_obj = model.Tag.by_name(tagi)
                if not tag_obj:
                    tag_obj = model.Tag(name=tagi)
                    tag_obj.save()
                pkgtag = model.Session.query(model.PackageTag).filter(
                    model.PackageTag.package_id == pkg.id).filter(
                    model.PackageTag.tag_id == tag_obj.id).limit(1).first()
                if pkgtag is None:
                    pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                    pkgtag.save()  # Avoids duplicates if tags have duplicates.
    extras.update(
        _handle_contributor(metadata.get('contributorNode', []), namespaces))
    extras.update(
        _handle_publisher(metadata.get('publisherNode', []), namespaces))
    # This value belongs elsewhere.
    if 'package.maintainer_email' in extras:
        pkg.maintainer_email = extras['package.maintainer_email']
        del extras['package.maintainer_email']
    extras.update(_handle_rights(metadata.get('rightsNode', []), namespaces))
    if 'package.license' in extras:
        pkg.license = extras['package.license']
        del extras['package.license']
    # Check that we have a language.
    lang = metadata.get('language', [])
    if lang and len(lang[0]) > 1:
        pkg.language = lang[0]
    # The rest of the metadata. The description goes to pkg.notes below;
    # I think it should not be added here as well.
    for key, value in metadata.items():
        if value is None or len(value) == 0 or key in (
                'title', 'description', 'publisherNode', 'contributorNode',
                'formatNode', 'identifier', 'source', 'rightsNode'):
            continue
        extras[key] = value[0]
    #description = metadata['description'][0] if len(metadata['description']) else ''
    notes = ' '.join(metadata.get('description', []))
    pkg.notes = notes.replace('\n', ' ').replace('  ', ' ')
    if 'date' in extras:
        pkg.version = extras['date']
        extras['modified'] = extras['date']
        del extras['date']
    pkg.extras = extras
    pkg.url = data['package_url']
    if 'package_resource' in data:
        try:
            ofs = get_ofs()
            ofs.put_stream(BUCKET,
                           data['package_xml_save']['label'],
                           data['package_xml_save']['xml'], {})
            pkg.add_resource(**data['package_resource'])
        except KeyError:
            pass
    if harvest_object is not None:
        harvest_object.package_id = pkg.id
        harvest_object.content = None
        harvest_object.current = True
        harvest_object.save()
    # Metadata may have several identifiers; pick the links, if any exist.
    # See: https://github.com/okfn/ckan/blob/master/ckan/public/base/images/sprite-resource-icons.png
    # The "data" format is used by CKAN to identify unknown resources.
    # You can use it if you want (the default format here is "html").
    # For example:
    # - http://my.data.com/my-generated-resource?data
    # - http://my.data.com/my-resource.data
    available_formats = ['data', 'rdf', 'pdf', 'api', 'zip', 'xls', 'csv',
                         'txt', 'xml', 'json', 'html']
    default_format = 'html'
    for ids in metadata['identifier']:
        if ids.startswith('http://') or ids.startswith('https://'):
            # The end of the URL must be one of the known formats;
            # otherwise "html" is used by default.
            infer_format = default_format
            for ext in available_formats:
                if ids.endswith(ext):
                    infer_format = ext
                    break
            pkg.add_resource(ids, name=pkg.title, format=infer_format)
    # All packages belong to the main group even if they do not belong
    # to any set.
    if group is not None:
        group.add_package_by_name(pkg.name)
    model.repo.commit()
    return pkg.id
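# The format inference above can be read as a small standalone helper; a
# hypothetical sketch of the same suffix matching (the name _infer_format
# is illustrative, not part of this module):
#
# def _infer_format(url, formats=('data', 'rdf', 'pdf', 'api', 'zip', 'xls',
#                                 'csv', 'txt', 'xml', 'json', 'html'),
#                   default='html'):
#     for ext in formats:
#         if url.endswith(ext):
#             return ext
#     return default
#
# _infer_format('http://my.data.com/my-resource.csv')   # -> 'csv'
# _infer_format('http://my.data.com/my-resource?data')  # -> 'data'
# _infer_format('http://my.data.com/some-page')         # -> 'html'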
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g.
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that
          may occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)
    if 'records' in master_data:
        records = master_data['records']
        set_name = master_data['set_name']
        for rec in records:
            identifier, metadata, _ = rec
            if metadata:
                name = metadata['title'][0] if len(metadata['title']) \
                    else identifier
                title = name
                # Normalize the title into an ASCII slug usable as a
                # package name.
                norm_title = unicodedata.normalize('NFKD', name) \
                    .encode('ASCII', 'ignore') \
                    .lower().replace(' ', '_')[:35]
                slug = ''.join(e for e in norm_title
                               if e in string.ascii_letters + '_')
                name = slug
                creator = metadata['creator'][0] \
                    if len(metadata['creator']) else ''
                description = metadata['description'][0] \
                    if len(metadata['description']) else ''
                pkg = Package.by_name(name)
                if not pkg:
                    pkg = Package(name=name, title=title)
                extras = {}
                for key, value in metadata.items():
                    if len(value) > 0:
                        if key == 'subject' or key == 'type':
                            for tag in value:
                                if tag:
                                    tag = munge_tag(tag[:100])
                                    tag_obj = model.Tag.by_name(tag)
                                    if not tag_obj:
                                        tag_obj = model.Tag(name=tag)
                                    if tag_obj:
                                        pkgtag = model.PackageTag(
                                            tag=tag_obj, package=pkg)
                                        Session.add(tag_obj)
                                        Session.add(pkgtag)
                        else:
                            extras[key] = ' '.join(value)
                pkg.author = creator
                pkg.author_email = creator
                pkg.title = title
                pkg.notes = description
                pkg.extras = extras
                pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" \
                    % (harvest_object.job.source.url, identifier)
                pkg.save()
                harvest_object.package_id = pkg.id
                Session.add(harvest_object)
                setup_default_user_roles(pkg)
                url = ''
                for ids in metadata['identifier']:
                    if ids.startswith('http://'):
                        url = ids
                title = metadata['title'][0] if len(metadata['title']) \
                    else ''
                description = metadata['description'][0] \
                    if len(metadata['description']) else ''
                pkg.add_resource(url, description=description, name=title)
                group.add_package_by_name(pkg.name)
                subg_name = "%s - %s" % (domain, set_name)
                subgroup = Group.by_name(subg_name)
                if not subgroup:
                    subgroup = Group(name=subg_name, description=subg_name)
                subgroup.add_package_by_name(pkg.name)
                Session.add(group)
                Session.add(subgroup)
                setup_default_user_roles(group)
                setup_default_user_roles(subgroup)
                model.repo.commit()
    else:
        self._save_object_error(
            'Could not receive any objects from fetch!',
            harvest_object, stage='Import')
        return False
    return True
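# The slug normalization above, traced on a hypothetical title (note that
# digits are dropped because only ASCII letters and '_' are kept):
#
# >>> name = u'V\xe4est\xf6laskenta 1950'
# >>> norm_title = unicodedata.normalize('NFKD', name) \
# ...     .encode('ASCII', 'ignore').lower().replace(' ', '_')[:35]
# >>> ''.join(e for e in norm_title if e in string.ascii_letters + '_')
# 'vaestolaskenta_'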
def import_stage(self, harvest_object):
    """Import the metadata received in the fetch stage to a dataset and
    create groups if any are defined. Fill in metadata from the study
    and document descriptions.
    """
    try:
        xml_dict = {}
        xml_dict["source"] = harvest_object.content
        udict = json.loads(harvest_object.content)
        if "url" in udict:
            f = urllib2.urlopen(udict["url"]).read()
            ddi_xml = BeautifulSoup(f, "xml")
        else:
            self._save_object_error("No url in content!", harvest_object)
            return False
    except urllib2.URLError:
        self._save_object_error("Could not fetch from url %s!" % udict["url"],
                                harvest_object)
        return False
    except etree.XMLSyntaxError:
        self._save_object_error("Unable to parse XML!", harvest_object)
        return False
    model.repo.new_revision()
    study_descr = ddi_xml.codeBook.stdyDscr
    document_info = ddi_xml.codeBook.docDscr.citation
    title = study_descr.citation.titlStmt.titl.string
    if not title:
        title = document_info.titlStmt.titl.string
    name = study_descr.citation.titlStmt.IDNo.string
    update = True
    pkg = Package.get(name)
    if not pkg:
        pkg = Package(name=name)
        update = False
    producer = study_descr.citation.prodStmt.producer
    if not producer:
        producer = study_descr.citation.rspStmt.AuthEnty
    if not producer:
        producer = study_descr.citation.rspStmt.othId
    pkg.author = producer.string
    pkg.maintainer = producer.string
    if study_descr.citation.distStmt.contact:
        pkg.maintainer = study_descr.citation.distStmt.contact.string
    if document_info.titlStmt.IDNo:
        pkg.id = document_info.titlStmt.IDNo.string
    keywords = study_descr.stdyInfo.subject(re.compile("keyword|topcClas"))
    keywords = list(set(keywords))
    for kw in keywords:
        if kw:
            vocab = None
            kw_str = ""
            if kw.string:
                kw_str = kw.string
            if "vocab" in kw.attrs:
                vocab = kw.attrs.get("vocab", None)
            if vocab and kw.string:
                kw_str = vocab + " " + kw.string
            pkg.add_tag_by_name(munge_tag(kw_str))
    if study_descr.stdyInfo.abstract:
        description_array = study_descr.stdyInfo.abstract("p")
    else:
        description_array = study_descr.citation.serStmt.serInfo("p")
    pkg.notes = "<br />".join([description.string
                               for description in description_array])
    pkg.title = title[:100]
    pkg.url = udict["url"]
    if not update:
        # Store the original metadata record in storage and link it as
        # a resource.
        ofs = get_ofs()
        nowstr = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
        idno = study_descr.citation.titlStmt.IDNo
        agencyxml = (idno["agency"] if "agency" in idno.attrs else "") \
            + idno.string
        label = "%s/%s.xml" % (nowstr, agencyxml)
        ofs.put_stream(BUCKET, label, f, {})
        fileurl = config.get("ckan.site_url") \
            + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl,
                         description="Original metadata record",
                         format="xml",
                         size=len(f))
    pkg.add_resource(url=document_info.holdings["URI"]
                         if "URI" in document_info.holdings else "",
                     description=title)
    # Collect all remaining elements from both descriptions as extras.
    metas = {}
    descendants = [desc for desc in document_info.descendants] \
        + [sdesc for sdesc in study_descr.descendants]
    for docextra in descendants:
        if isinstance(docextra, Tag):
            if docextra:
                if docextra.name == "p":
                    docextra.name = docextra.parent.name
                if docextra.name not in metas and docextra.string:
                    metas[docextra.name] = docextra.string \
                        if docextra.string else self._collect_attribs(docextra)
                else:
                    if docextra.string:
                        metas[docextra.name] += (" " + docextra.string
                            if docextra.string
                            else self._collect_attribs(docextra))
    if ddi_xml.codeBook.dataDscr and not update:
        # Convert the variable descriptions to CSV resources.
        vars = ddi_xml.codeBook.dataDscr("var")
        heads = self._get_headers()
        c_heads = ["ID", "catValu", "labl", "catStat"]
        f_var = StringIO.StringIO()
        c_var = StringIO.StringIO()
        varwriter = csv.DictWriter(f_var, heads)
        codewriter = csv.DictWriter(c_var, c_heads)
        heading_row = {}
        for head in heads:
            heading_row[head] = head
        c_heading_row = {}
        for head in c_heads:
            c_heading_row[head] = head
        varwriter.writerow(heading_row)
        codewriter.writerow(c_heading_row)
        for var in vars:
            try:
                varwriter.writerow(self._construct_csv(var, heads))
                codewriter.writerows(self._create_code_rows(var))
            except ValueError, e:
                raise IOError("Failed to import DDI to CSV! %s" % e)
        f_var.flush()
        label = "%s/%s_var.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, f_var, {})
        fileurl = config.get("ckan.site_url") \
            + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl,
                         description="Variable metadata",
                         format="csv",
                         size=f_var.len)
        label = "%s/%s_code.csv" % (nowstr, name)
        ofs.put_stream(BUCKET, label, c_var, {})
        fileurl = config.get("ckan.site_url") \
            + h.url_for("storage_file", label=label)
        pkg.add_resource(url=fileurl,
                         description="Variable code values",
                         format="csv",
                         size=c_var.len)
        f_var.seek(0)
        reader = csv.DictReader(f_var)
        for var in reader:
            metas[var["ID"]] = var["labl"] if "labl" in var else var["qstnLit"]
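# A minimal sketch of how the element access above behaves with
# BeautifulSoup's "xml" parser, on a hypothetical one-record document:
#
# from bs4 import BeautifulSoup
# soup = BeautifulSoup(
#     '<codeBook><stdyDscr><citation><titlStmt>'
#     '<titl>Study title</titl><IDNo agency="FSD">1234</IDNo>'
#     '</titlStmt></citation></stdyDscr></codeBook>', 'xml')
# soup.codeBook.stdyDscr.citation.titlStmt.titl.string     # u'Study title'
# soup.codeBook.stdyDscr.citation.titlStmt.IDNo['agency']  # u'FSD'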