def test_records(self):
    """ Test record fetching via http-request to prevent accidental changes to interface

    Exercises the OAI-PMH verbs ListSets, ListIdentifiers and GetRecord
    against two freshly created public packages: the single set must match
    the created organization, and every listed/fetched record must belong
    to one of the created packages.
    """
    model.User(name="test", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization', 'title': "Test organization"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    package_2_data = deepcopy(package_1_data)
    # Each package must carry its own unique PIDs.
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    for pid in package_2_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    packages = [get_action('package_create')({'user': '******'}, package_1_data),
                get_action('package_create')({'user': '******'}, package_2_data)]
    url = url_for('/oai')
    # ListSets: the only set should correspond to the organization.
    result = self.app.get(url, {'verb': 'ListSets'})
    root = lxml.etree.fromstring(result.body)
    request_set = self._get_single_result(root, "//o:set")
    set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
    set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
    self.assertEquals(organization['name'], set_spec)
    self.assertEquals(organization['title'], set_name)
    # ListIdentifiers: every header must reference a created package.
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': set_spec,
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    fail = True  # flips to False once at least one header is seen
    package_identifiers = [package['id'] for package in packages]
    package_org_names = [Group.get(package['owner_org']).name
                         for package in packages]
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        fail = False
        set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
        identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
        self.assertTrue(set_spec in package_org_names)
        self.assertTrue(identifier in package_identifiers)
        # GetRecord for each listed identifier must return a full record
        # (header plus metadata element).
        result = self.app.get(url, {'verb': 'GetRecord',
                                    'identifier': identifier,
                                    'metadataPrefix': 'oai_dc'})
        root = lxml.etree.fromstring(result.body)
        fail_record = True  # flips to False once a record element is seen
        for record_result in root.xpath("//o:record", namespaces=self._namespaces):
            fail_record = False
            header = self._get_single_result(record_result, 'o:header')
            self._get_single_result(record_result, 'o:metadata')
            self.assertTrue(header.xpath("string(o:identifier)",
                                         namespaces=self._namespaces) in package_identifiers)
            self.assertTrue(header.xpath("string(o:setSpec)",
                                         namespaces=self._namespaces) in package_org_names)
        self.assertFalse(fail_record, "No records received")
    self.assertFalse(fail, "No headers (packages) received")
def test_coverage_temporal_rdf(self):
    """ For some reason _get_results(... "...*") finds temporal nodes four times. """
    org = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage-rdf2',
         'title': "Test organization rdf 2"})

    # Build a public dataset owned by the organization, with fresh PIDs.
    dataset = deepcopy(TEST_DATADICT)
    dataset['owner_org'] = org['name']
    dataset['private'] = False
    for pid in dataset.get('pids', []):
        pid['id'] = utils.generate_pid()
    created = get_action('package_create')({'user': '******'}, dataset)

    # Fetch the record in RDF format and inspect the temporal coverage nodes.
    response = self.app.get(url_for('/oai'),
                            {'verb': 'GetRecord',
                             'identifier': created['name'],
                             'metadataPrefix': 'rdf'})
    tree = lxml.etree.fromstring(response.body)

    allowed = ['2003-07-10T06:36:27-12:00', '2010-04-15T03:24:47+12:45']
    hits = 0
    for node in self._get_results(tree, "//dct:temporal/dct:PeriodOfTime/*"):
        self.assertTrue(node.text in allowed)
        hits += 1
    # See docstring: the wildcard XPath matches each timestamp twice.
    self.assertEquals(4, hits, "Unexpected coverage results: {f}".format(f=hits))

    get_action('organization_delete')({'user': '******'}, {'id': org['id']})
def update_pid(key, data, errors, context):
    ''' Replace an empty unicode string with random PID.

    Validator-style signature ``(key, data, errors, context)`` as used by
    CKAN navl schemas; mutates ``data`` in place and returns nothing.

    :param key: flattened-schema key whose value may be replaced
    :param data: flattened data dict (modified in place)
    :param errors: validation errors dict (unused)
    :param context: validation context (unused)
    '''
    # isinstance() is the idiomatic type check (also accepts unicode
    # subclasses); an empty string is falsy, so "not data[key]" replaces
    # the previous len(...) == 0 comparison.
    if isinstance(data[key], unicode) and not data[key]:
        data[key] = utils.generate_pid()
def test_private_record(self):
    ''' Test that private packages are not listed but public packages are '''
    package_1_data = deepcopy(TEST_DATADICT)
    model.User(name="privateuser", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'private-organization', 'title': "Private organization"})
    # Private package: must never appear in OAI-PMH listings.
    package_1_data['private'] = True
    package_1_data['owner_org'] = organization['name']
    package_1_data['name'] = 'private-package'
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    package1 = get_action('package_create')({'user': '******'}, package_1_data)
    # Public package: created later; should be the only one listed.
    package_2_data = deepcopy(TEST_DATADICT)
    package_2_data['private'] = False
    package_2_data['owner_org'] = organization['name']
    package_2_data['name'] = 'public-package'
    for pid in package_2_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    url = url_for('/oai')
    # Only the private package exists so far: no headers expected.
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': 'private-organization',
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))
    now = datetime.datetime.isoformat(datetime.datetime.today())
    # ListRecords with an 'until' bound must not leak the private package either.
    result = self.app.get(url, {'verb': 'ListRecords',
                                'set': 'private-organization',
                                'metadataPrefix': 'rdf',
                                'until': now})
    root = lxml.etree.fromstring(result.body)
    self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))
    # After creating the public package it must be the only header listed.
    package2 = get_action('package_create')({'user': '******'}, package_2_data)
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': 'private-organization',
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
        print identifier  # NOTE(review): debug leftover -- consider removing
        self.assertTrue(identifier == package2['id'])
    # A set-less ListRecords should likewise only surface the public package.
    result = self.app.get(url, {'verb': 'ListRecords', 'metadataPrefix': 'rdf'})
    root = lxml.etree.fromstring(result.body)
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
        self.assertTrue(identifier == package2['id'])
    get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
def _handle_pids(context, data_dict):
    ''' Do some PID modifications to data_dict

    - Ensure a 'pids' list exists and drop entries with an empty/missing id.
    - Append a new version PID when 'generate_version_pid' is 'on'.
    - For a new dataset without a primary data PID, insert a generated one.

    :param context: action context providing "model" and "session"
    :param data_dict: dataset dict, modified in place
    '''
    if 'pids' not in data_dict:
        data_dict['pids'] = []
    else:
        # Clean up empty PIDs
        data_dict['pids'] = [pid for pid in data_dict['pids'] if pid.get('id')]

    if data_dict.get('generate_version_pid') == 'on':
        data_dict['pids'] += [{'id': utils.generate_pid(),
                               'type': 'version',
                               'provider': 'Etsin',
                               }]

    # If no primary data PID, generate one if this is a new dataset
    if not utils.get_pids_by_type('data', data_dict, primary=True):
        model = context["model"]
        session = context["session"]
        if data_dict.get('id'):
            # NOTE: 'id' contains the package *name* here, hence filter_by(name=...)
            query = session.query(model.Package.id).filter_by(name=data_dict['id'])
            if query.first():
                return  # Existing dataset, don't generate new data PID
        data_dict['pids'].insert(0, {'id': utils.generate_pid(),
                                     'type': 'data',
                                     'primary': 'True',
                                     'provider': 'Etsin',
                                     })
def test_coverage_temporal_rdf(self):
    """ For some reason _get_results(... "...*") finds temporal nodes four times. """
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage-rdf2',
         'title': "Test organization rdf 2"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    # Fresh PIDs so the create does not collide with other tests' data.
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    package = get_action('package_create')({'user': '******'}, package_1_data)
    package_name = package['name']
    url = url_for('/oai')
    result = self.app.get(url, {'verb': 'GetRecord',
                                'identifier': package_name,
                                'metadataPrefix': 'rdf'})
    root = lxml.etree.fromstring(result.body)
    # Start/end timestamps of the expected temporal coverage period.
    expected = ['2003-07-10T06:36:27-12:00', '2010-04-15T03:24:47+12:45']
    found = 0
    for temporal in self._get_results(root, "//dct:temporal/dct:PeriodOfTime/*"):
        self.assertTrue(temporal.text in expected)
        found += 1
    # See docstring: the wildcard XPath yields each timestamp twice (4 total).
    self.assertEquals(4, found, "Unexpected coverage results: {f}".format(f=found))
    get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
def test_coverage(self):
    """GetRecord with the oai_dc prefix exposes the dataset's spatial and
    temporal coverage values as dc:coverage elements."""
    model.User(name="test_coverage", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage',
         'title': "Test organization"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    package = get_action('package_create')({'user': '******'}, package_1_data)
    package_name = package['name']
    url = url_for('/oai')
    result = self.app.get(url, {'verb': 'GetRecord',
                                'identifier': package_name,
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    # Two spatial values plus one temporal range -- presumably coming from
    # TEST_DATADICT; verify against the fixture if these change.
    expected = ['Keilaniemi (populated place)',
                'Espoo (city)',
                '2003-07-10T06:36:27-12:00/2010-04-15T03:24:47+12:45']
    found = 0
    for coverage in self._get_results(root, "//dc:coverage"):
        self.assertTrue(coverage.text in expected)
        found += 1
    self.assertEquals(3, found, "Unexpected coverage results")
    get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
def test_coverage_spatial_rdf(self):
    """GetRecord with the rdf prefix exposes the spatial coverage as a
    single combined rdfs:label inside the dct:spatial_ref structure."""
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage-rdf',
         'title': "Test organization rdf"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    package = get_action('package_create')({'user': '******'}, package_1_data)
    package_name = package['name']
    url = url_for('/oai')
    result = self.app.get(url, {'verb': 'GetRecord',
                                'identifier': package_name,
                                'metadataPrefix': 'rdf'})
    root = lxml.etree.fromstring(result.body)
    # Both locations are joined into one comma-separated label --
    # presumably derived from TEST_DATADICT; verify against the fixture.
    expected = ['Keilaniemi (populated place),Espoo (city)']
    found = 0
    for spatial in self._get_results(
            root,
            "//dct:spatial_ref/rdf:Description/dct:Location/rdf:Description/rdfs:label"):
        self.assertTrue(spatial.text in expected)
        found += 1
    self.assertEquals(1, found, "Unexpected coverage results")
    get_action('organization_delete')({'user': '******'}, {'id': organization['id']})
def test_coverage_spatial_rdf(self):
    """The RDF record should carry exactly one combined spatial label."""
    org = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage-rdf',
         'title': "Test organization rdf"})

    data = deepcopy(TEST_DATADICT)
    data['owner_org'] = org['name']
    data['private'] = False
    for pid in data.get('pids', []):
        pid['id'] = utils.generate_pid()
    created = get_action('package_create')({'user': '******'}, data)

    response = self.app.get(url_for('/oai'),
                            {'verb': 'GetRecord',
                             'identifier': created['name'],
                             'metadataPrefix': 'rdf'})
    tree = lxml.etree.fromstring(response.body)

    label_xpath = ("//dct:spatial_ref/rdf:Description/dct:Location"
                   "/rdf:Description/rdfs:label")
    allowed = ['Keilaniemi (populated place),Espoo (city)']
    hits = 0
    for node in self._get_results(tree, label_xpath):
        self.assertTrue(node.text in allowed)
        hits += 1
    self.assertEquals(1, hits, "Unexpected coverage results")

    get_action('organization_delete')({'user': '******'}, {'id': org['id']})
def test_coverage(self):
    """Coverage values must surface as dc:coverage in oai_dc records."""
    model.User(name="test_coverage", sysadmin=True).save()
    org = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization-coverage',
         'title': "Test organization"})

    data = deepcopy(TEST_DATADICT)
    data['owner_org'] = org['name']
    data['private'] = False
    for pid in data.get('pids', []):
        pid['id'] = utils.generate_pid()
    created = get_action('package_create')({'user': '******'}, data)

    response = self.app.get(url_for('/oai'),
                            {'verb': 'GetRecord',
                             'identifier': created['name'],
                             'metadataPrefix': 'oai_dc'})
    tree = lxml.etree.fromstring(response.body)

    allowed = ['Keilaniemi (populated place)',
               'Espoo (city)',
               '2003-07-10T06:36:27-12:00/2010-04-15T03:24:47+12:45']
    hits = 0
    for node in self._get_results(tree, "//dc:coverage"):
        self.assertTrue(node.text in allowed)
        hits += 1
    self.assertEquals(3, hits, "Unexpected coverage results")

    get_action('organization_delete')({'user': '******'}, {'id': org['id']})
def test_records(self):
    """
    Test record fetching via http-request to prevent accidental changes to interface

    Walks ListSets -> ListIdentifiers -> GetRecord for two freshly created
    public packages, asserting every set/identifier traces back to the
    created organization and packages.
    """
    model.User(name="test", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization', 'title': "Test organization"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    package_2_data = deepcopy(package_1_data)
    # Each package must carry its own unique PIDs.
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    for pid in package_2_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    packages = [get_action('package_create')({'user': '******'}, package_1_data),
                get_action('package_create')({'user': '******'}, package_2_data)]
    url = url_for('/oai')
    # ListSets: the only set should correspond to the organization.
    result = self.app.get(url, {'verb': 'ListSets'})
    root = lxml.etree.fromstring(result.body)
    request_set = self._get_single_result(root, "//o:set")
    set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
    set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
    self.assertEquals(organization['name'], set_spec)
    self.assertEquals(organization['title'], set_name)
    # ListIdentifiers: every header must reference a created package.
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': set_spec,
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    fail = True  # flips to False once at least one header is seen
    package_identifiers = [package['id'] for package in packages]
    package_org_names = [Group.get(package['owner_org']).name
                         for package in packages]
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        fail = False
        set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
        identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
        self.assertTrue(set_spec in package_org_names)
        self.assertTrue(identifier in package_identifiers)
        # GetRecord for each listed identifier must return a full record
        # (header plus metadata element).
        result = self.app.get(url, {'verb': 'GetRecord',
                                    'identifier': identifier,
                                    'metadataPrefix': 'oai_dc'})
        root = lxml.etree.fromstring(result.body)
        fail_record = True  # flips to False once a record element is seen
        for record_result in root.xpath("//o:record", namespaces=self._namespaces):
            fail_record = False
            header = self._get_single_result(record_result, 'o:header')
            self._get_single_result(record_result, 'o:metadata')
            self.assertTrue(header.xpath("string(o:identifier)",
                                         namespaces=self._namespaces) in package_identifiers)
            self.assertTrue(header.xpath("string(o:setSpec)",
                                         namespaces=self._namespaces) in package_org_names)
        self.assertFalse(fail_record, "No records received")
    self.assertFalse(fail, "No headers (packages) received")
def _ddi2ckan(self, original_url, original_xml, harvest_object):
    '''Extract package values from bs4 object 'ddi_xml' parsed from xml

    Builds and returns a CKAN package dict from the harvested DDI document
    held in ``self.ddi_xml``.  Mandatory fields are read first (authors,
    availability, keywords, language, titles, license, contact, version,
    name/PIDs, owner), then optional ones (descriptions, discipline,
    events, coverage, citation); the remaining document is flattened into
    an ``xpaths`` mapping.

    :param original_url: source URL of the document; used to detect
        FSD-specific handling via _is_fsd()
    :param original_xml: raw XML of the document (appears unused here --
        TODO confirm)
    :param harvest_object: HarvestObject or None; when given, its owning
        organisation is looked up and its content is cleared at the end
    :returns: package dict ready for create/update
    '''
    # TODO: Use .extract() and .string.extract() function so handled elements are removed from ddi_xml.
    # String prefixes for _read_value() eval-style path lookups.
    doc_citation = "ddi_xml.codeBook.docDscr.citation"
    stdy_dscr = "ddi_xml.codeBook.stdyDscr"

    ####################################################################
    # Read mandatory metadata fields:                                  #
    ####################################################################
    # Authors & organizations
    authors = self.get_authors(self.ddi_xml.stdyDscr.citation, 'AuthEnty')
    agent = authors[:]  # copy so 'authors' stays untouched by later appends
    agent.extend(self.get_contributors(self.ddi_xml.stdyDscr.citation))

    # Availability
    availability = AVAILABILITY_DEFAULT
    if _access_request_URL_is_found():
        availability = 'direct_download'
    if _is_fsd(original_url):
        availability = AVAILABILITY_FSD

    # Keywords
    keywords = self.get_keywords(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Language
    # TODO: Where/how to extract multiple languages: 'language': u'eng, fin, swe' ?
    language = self.convert_language(
        self._read_value("ddi_xml.codeBook.get('xml:lang')"))

    # Titles: study-level titles preferred, document-level as fallback.
    titles = self._read_value(stdy_dscr + ".citation.titlStmt(['titl', 'parTitl'])") or \
        self._read_value(doc_citation + ".titlStmt(['titl', 'parTitl'])", mandatory_field=True)

    # langtitle=[dict(lang=self.convert_language(a.get('xml:lang', '')), value=a.text) for a in titles]
    # [{"lang":"fin", "value":"otsikko"}, {"lang:"en", "value":"title"}]

    # convert the titles to a JSON string of type {"fin":"otsikko", "eng","title"}
    transl_json = {}
    first_title = ""
    # default to finnish, since first title has no lang value, which causes the validator to whine
    # we might want to update the DDI harvester to accept a language configuration parameter, if
    # we decide to harvest DDI resources from other sources.
    default_lang = "fi"
    for title in titles:
        transl_json[self.convert_language(title.get('xml:lang', default_lang))] = title.text
        # we want to get save the first title for use lateron
        if not first_title:
            first_title = title.text

    title = json.dumps(transl_json)

    # License
    # TODO: Extract prettier output. Should we check that element contains something?
    # Should this be in optional section if not mandatory_field?
    license_url = self._read_value(stdy_dscr + ".dataAccs.useStmt.get_text(separator=u' ')", mandatory_field=False)
    if _is_fsd(original_url):
        license_id = LICENSE_ID_FSD
    else:
        license_id = LICENSE_ID_DEFAULT

    # Contact (package_extra.key: contact_[k]_name in database, contact in WUI)
    contact_name = self._read_value(stdy_dscr + ".citation.distStmt('contact')") or \
        self._read_value(stdy_dscr + ".citation.distStmt('distrbtr')") or \
        self._read_value(doc_citation + ".prodStmt('producer')", mandatory_field=True)
    # TODO: clean out (or ask FSD to clean) mid text newlines (eg. in FSD2482)
    if contact_name and contact_name[0].text:
        contact_name = contact_name[0].text
    else:
        # Fall back to the producer's affiliation attribute.
        contact_name = self._read_value(stdy_dscr + ".citation.prodStmt.producer.get('affiliation')", mandatory_field=True)
    if _is_fsd(original_url):
        contact_email = CONTACT_EMAIL_FSD
        # TODO: Allow trying other email also in FSD metadata
    else:
        contact_email = self._read_value(stdy_dscr + ".citation.distStmt.contact.get('email')", mandatory_field=True)

    # Modified date
    version = self.get_attr_optional(self.ddi_xml.stdyDscr.citation, 'prodDate', 'date') or \
        self.get_attr_mandatory(self.ddi_xml.stdyDscr.citation, 'version', 'date')

    # Name: built from the IDNo agency prefix plus the identifier itself.
    name_prefix = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.get('agency')", mandatory_field=False)
    name_id = self._read_value(stdy_dscr + ".citation.titlStmt.IDNo.text", mandatory_field=False)
    if not name_prefix:
        name_prefix = self._read_value(doc_citation + ".titlStmt.IDNo['agency']", mandatory_field=True)
    if not name_id:
        name_id = self._read_value(doc_citation + ".titlStmt.IDNo.text", mandatory_field=True)
    name = utils.datapid_to_name(name_prefix + name_id)
    pids = list()
    pids.append({'id': name, 'type': 'data', 'primary': 'True', 'provider': name_prefix})
    # Should we generate a version PID?
    # vpid = utils.generate_pid()
    # pids.append({'id': vpid, 'type': 'version', 'provider': 'kata'})

    # Original web page as resource
    # For FSD 'URI' leads to summary web page of data, hence format='html'
    orig_web_page = self._read_value(doc_citation + ".holdings.get('URI', '')")
    if orig_web_page:
        orig_web_page_resource = {'description': first_title,
                                  'format': u'html',
                                  'resource_type': 'documentation',
                                  'url': orig_web_page}
    else:
        orig_web_page_resource = {}

    # Owner
    owner = self._read_value(stdy_dscr + ".citation.prodStmt.producer.text") or \
        self._read_value(stdy_dscr + ".citation.rspStmt.AuthEnty.text") or \
        self._read_value(doc_citation + ".prodStmt.producer.string", mandatory_field=True)
    agent.append({'role': 'owner', 'name': owner})

    # Owner organisation: resolved from the harvest source's package.
    if harvest_object:
        hsid = harvest_object.harvest_source_id
        hsooid = model.Session.query(model.Package).filter(model.Package.id == hsid).one().owner_org
        owner_org = model.Session.query(model.Group).filter(model.Group.id == hsooid).one().name
    else:
        owner_org = u''

    # Distributor (Agent: distributor, the same is used as contact)
    agent.append({
        'role': 'distributor',
        'name': contact_name})

    ####################################################################
    # Read optional metadata fields:                                   #
    ####################################################################
    # Availability
    if _is_fsd(original_url):
        access_request_url = ACCESS_REQUEST_URL_FSD
    else:
        access_request_url = u''

    # Contact
    contact_phone = self._read_value(doc_citation + ".holdings.get('callno')") or \
        self._read_value(stdy_dscr + ".citation.holdings.get('callno')")
    # NOTE(review): because the conditional expression binds looser than
    # 'or', the whole or-chain becomes the "if" branch -- for non-FSD
    # sources contact_URL is always None even when a URI exists.  Verify
    # this is intended.
    contact_URL = self._read_value(
        stdy_dscr + ".dataAccs.setAvail.accsPlac.get('URI')") or \
        self._read_value(
            stdy_dscr + ".citation.distStmt.contact.get('URI')") or \
        self._read_value(
            stdy_dscr + ".citation.distStmt.distrbtr.get('URI')") or \
        CONTACT_URL_FSD if _is_fsd(original_url) else None

    # convert the descriptions to a JSON string of type {"fin":"aineiston kuvaus", "eng","dataset description"}
    descriptions = self._read_value(stdy_dscr + ".stdyInfo.abstract('p')")
    if not descriptions:
        descriptions = self._read_value(stdy_dscr + ".citation.serStmt.serInfo('p')")
    translated_notes = {}
    for des in descriptions:
        lang = self.convert_language(des.get('xml:lang', 'fi'))
        if lang in translated_notes:
            # Multiple paragraphs in the same language are concatenated.
            translated_notes[lang] += '\r\n\r\n' + des.text
        else:
            translated_notes[lang] = des.text
    notes = json.dumps(translated_notes)

    # Discipline
    discipline = self.get_discipline(self.ddi_xml.stdyDscr.stdyInfo.subject)

    # Dataset lifetime events
    events = self._get_events(stdy_dscr, authors)

    # Geographic coverage
    geo_cover = self.get_geo_coverage(self.ddi_xml)

    # Temporal coverage
    temp_start, temp_end = self.get_temporal_coverage(self.ddi_xml)

    # Citation
    citation = self._read_value(stdy_dscr + ".citation.biblCit.text", mandatory_field=False)

    ####################################################################
    # Flatten rest to 'XPath/path/to/element': 'value' pairs           #
    ####################################################################
    etree_xml = etree.fromstring(str(self.ddi_xml))
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}docDscr'))
    xpath_dict = flattened_ddi.getMap()
    flattened_ddi = importcore.generic_xml_metadata_reader(etree_xml.find('.//{*}stdyDscr'))
    xpath_dict.update(flattened_ddi.getMap())

    package_dict = dict(
        access_application_URL=u'',
        access_request_URL=unicode(access_request_url),
        agent=agent,
        algorithm=u'',  # To be implemented straight in 'resources'
        availability=unicode(availability),
        contact=[{'name': contact_name,
                  'email': contact_email,
                  'URL': contact_URL,
                  'phone': contact_phone}],
        direct_download_URL=u'',  # To be implemented straight in 'resources
        discipline=discipline,
        event=events,
        geographic_coverage=geo_cover,
        groups=[],
        id=self._get_id_by_name(name) or generate_pid(),
        # langtitle=langtitle,
        langdis=u'True',  # NOTE! (was: HUOMAA!)
        language=language,
        license_URL=license_url,
        license_id=license_id,
        mimetype=u'',  # To be implemented straight in 'resources
        name=name,
        notes=notes or u'',
        pids=pids,
        owner_org=owner_org,
        resources=[orig_web_page_resource],
        tag_string=keywords,
        temporal_coverage_begin=temp_start,
        temporal_coverage_end=temp_end,
        # title=langtitle[0].get('value'),
        title=title,  # Must exist in package dict
        type='dataset',
        version=version,
        version_PID='',
        citation=citation
    )
    package_dict['xpaths'] = xpath_dict
    # Above line creates:
    # package_dict = {
    #     'access_request_url': 'some_url',
    #     # ...
    #     'xpaths': {'stdyDscr/othrStdyMat.0/relPubl.34':
    #                'Uskon asia: nuorisobarometri 2006 (2006).'},
    #               {'stdyD...': 'Some value'}]
    # }
    #package_dict['extras'].update(_save_ddi_variables_to_csv(ddi_xml, somepkg))

    # Order of the old code (translated from Finnish: "Vanhojen koodien jarjestys"):
    #_save_original_xml_and_link_as_resources()
    #_save_ddi_variables_to_csv()
    #_create_group_based_on_organizations()
    #_last_statements_to_rewrite()

    # JuhoL: Set harvest object to some end state and commit
    if harvest_object is not None:
        harvest_object.content = None  # Should this be flushed?
        model.Session.flush()
        #model.repo.commit()
    return package_dict
def test_generate_pid2(self):
    """Two consecutively generated PIDs must never be equal."""
    generated = [utils.generate_pid() for _ in range(2)]
    assert generated[0] != generated[1]
def import_stage(self, harvest_object):
    ''' The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package should be added to the HarvestObject.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    if not harvest_object:
        log.error('No harvest object received')
        return False

    # Remote side reported deletion: delete our copy (if any) and finish.
    if harvest_object.report_status == "deleted":
        if harvest_object.package_id:
            get_action('package_delete')(
                {'model': model, 'session': model.Session, 'user': '******'},
                {'id': harvest_object.package_id})
            return True
        return True

    if not harvest_object.content:
        self._save_object_error(
            'Import: Empty content for object {id}'.format(id=harvest_object.id),
            harvest_object)
        return False

    content = json.loads(harvest_object.content)
    # import pprint; pprint.pprint(content)
    # 'unified' holds the normalized package fields; everything else
    # is kept as raw xpath data.
    package_dict = content.pop('unified')
    package_dict['xpaths'] = content

    # If package exists use old PID, otherwise create new
    pkg_id = ckanext.kata.utils.get_package_id_by_primary_pid(package_dict)
    pkg = Session.query(Package).filter(Package.id == pkg_id).first() if pkg_id else None
    log.debug('Package: "{pkg}"'.format(pkg=pkg))
    if pkg and not self._recreate(harvest_object):
        log.debug("Not re-creating package: %s", pkg_id)
        return True
    if not package_dict.get('id', None):
        package_dict['id'] = pkg.id if pkg else generate_pid()

    uploader = ''
    try:
        # Inherit the owner organisation from the harvest source package.
        package = model.Package.get(harvest_object.harvest_source_id)
        if package and package.owner_org:
            package_dict['owner_org'] = package.owner_org
        config = self._get_configuration(harvest_object)
        if config.get('type') == 'ida':
            if package_dict.get('owner_org', False):
                package_dict['private'] = "true"
        # NOTE(review): pop() without a default raises KeyError when
        # 'uploader' is absent (get() above suggests it may be) -- the
        # outer except catches it, but then the import is aborted; verify.
        uploader = package_dict.get('uploader', False)
        package_dict.pop('uploader')
        if config.get('type') == 'ida':
            package_dict['persist_schema'] = u'True'
        schema = self.get_schema(config, pkg)
        # schema['xpaths'] = [ignore_missing, ckanext.kata.converters.xpath_to_extras]
        result = self._create_or_update_package(
            package_dict,
            harvest_object,
            schema=schema,
            # s_schema=ckanext.kata.plugin.KataPlugin.show_package_schema()
        )

        # Optionally grant admin rights to the LDAP user that uploaded
        # the dataset (best-effort: all failures are swallowed).
        if uploader and asbool(c.get('kata.ldap.enabled', False)):
            try:
                usr = ld.get_user_from_ldap(uploader)
                if usr:
                    # by_openid leaves session hanging if usr is not set
                    usrname = model.User.by_openid(usr)
                    if usrname:
                        editor_dict = {"name": package_dict['name'],
                                       "role": "admin",
                                       "username": usrname.name}
                        context = {'model': model,
                                   'session': model.Session,
                                   'user': '******'}
                        try:
                            # if we fail the adding, no problem
                            ckanext.kata.actions.dataset_editor_add(context, editor_dict)
                        except ValidationError:
                            pass
                        except NotFound:
                            pass
                        except NotAuthorized:
                            pass
            # NOTE(review): bare except silently swallows *everything*,
            # including KeyboardInterrupt -- consider except Exception.
            except:
                pass
    except Exception as e:
        import traceback
        traceback.print_exc()
        self._save_object_error(
            'Import: Could not create {id}. {e}'.format(id=harvest_object.id, e=e),
            harvest_object)
        return False

    return result
def import_stage(self, harvest_object):
    ''' The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package should be added to the HarvestObject.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    if not harvest_object:
        log.error('No harvest object received')
        return False

    # Remote side reported deletion: delete our copy (if any) and finish.
    if harvest_object.report_status == "deleted":
        if harvest_object.package_id:
            get_action('package_delete')(
                {'model': model, 'session': model.Session, 'user': '******'},
                {'id': harvest_object.package_id})
            return True
        return True

    if not harvest_object.content:
        self._save_object_error(
            'Import: Empty content for object {id}'.format(id=harvest_object.id),
            harvest_object)
        return False

    content = json.loads(harvest_object.content)
    # import pprint; pprint.pprint(content)
    # 'unified' holds the normalized package fields; everything else
    # is kept as raw xpath data.
    package_dict = content.pop('unified')
    package_dict['xpaths'] = content

    # If package exists use old PID, otherwise create new
    pkg_id = ckanext.kata.utils.get_package_id_by_primary_pid(package_dict)
    pkg = Session.query(Package).filter(Package.id == pkg_id).first() if pkg_id else None
    log.debug('Package: "{pkg}"'.format(pkg=pkg))
    if pkg and not self._recreate(harvest_object):
        log.debug("Not re-creating package: %s", pkg_id)
        return True
    if not package_dict.get('id', None):
        package_dict['id'] = pkg.id if pkg else generate_pid()

    uploader = ''
    try:
        # Inherit the owner organisation from the harvest source package.
        package = model.Package.get(harvest_object.harvest_source_id)
        if package and package.owner_org:
            package_dict['owner_org'] = package.owner_org
        config = self._get_configuration(harvest_object)
        if config.get('type') == 'ida':
            if package_dict.get('owner_org', False):
                package_dict['private'] = "true"
        # NOTE(review): pop() without a default raises KeyError when
        # 'uploader' is absent (get() above suggests it may be) -- the
        # outer except catches it, but then the import is aborted; verify.
        uploader = package_dict.get('uploader', False)
        package_dict.pop('uploader')
        if config.get('type') == 'ida':
            package_dict['persist_schema'] = u'True'
        schema = self.get_schema(config, pkg)
        # schema['xpaths'] = [ignore_missing, ckanext.kata.converters.xpath_to_extras]
        result = self._create_or_update_package(
            package_dict,
            harvest_object,
            schema=schema,
            # s_schema=ckanext.kata.plugin.KataPlugin.show_package_schema()
        )

        # Optionally grant admin rights to the LDAP user that uploaded
        # the dataset (best-effort: all failures are swallowed).
        if uploader and asbool(c.get('kata.ldap.enabled', False)):
            try:
                usr = ld.get_user_from_ldap(uploader)
                if usr:
                    # by_openid leaves session hanging if usr is not set
                    usrname = model.User.by_openid(usr)
                    if usrname:
                        editor_dict = {"name": package_dict['name'],
                                       "role": "admin",
                                       "username": usrname.name}
                        context = {'model': model,
                                   'session': model.Session,
                                   'user': '******'}
                        try:
                            # if we fail the adding, no problem
                            ckanext.kata.actions.dataset_editor_add(context, editor_dict)
                        except ValidationError:
                            pass
                        except NotFound:
                            pass
                        except NotAuthorized:
                            pass
            # NOTE(review): bare except silently swallows *everything*,
            # including KeyboardInterrupt -- consider except Exception.
            except:
                pass
    except Exception as e:
        import traceback
        traceback.print_exc()
        self._save_object_error(
            'Import: Could not create {id}. {e}'.format(id=harvest_object.id, e=e),
            harvest_object)
        return False

    return result
def test_private_record(self):
    '''
    Test that private packages are not listed but public packages are
    '''
    # One private and one public package in the same organization.
    package_1_data = deepcopy(TEST_DATADICT)
    model.User(name="privateuser", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'private-organization', 'title': "Private organization"})
    package_1_data['private'] = True
    package_1_data['owner_org'] = organization['name']
    package_1_data['name'] = 'private-package'
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    # Created only for its side effect; the private package must stay hidden.
    get_action('package_create')({'user': '******'}, package_1_data)

    package_2_data = deepcopy(TEST_DATADICT)
    package_2_data['private'] = False
    package_2_data['owner_org'] = organization['name']
    package_2_data['name'] = 'public-package'
    for pid in package_2_data.get('pids', []):
        pid['id'] = utils.generate_pid()

    url = url_for('/oai')

    # Only the private package exists: ListIdentifiers must return no headers.
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': 'private-organization',
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

    # ... and neither must ListRecords, up to the current timestamp.
    now = datetime.datetime.isoformat(datetime.datetime.today())
    result = self.app.get(url, {'verb': 'ListRecords',
                                'set': 'private-organization',
                                'metadataPrefix': 'rdf',
                                'until': now})
    root = lxml.etree.fromstring(result.body)
    self.assertFalse(root.xpath("//o:header", namespaces=self._namespaces))

    # After creating a public package, it (and only it) must be listed.
    package2 = get_action('package_create')({'user': '******'}, package_2_data)
    result = self.app.get(url, {'verb': 'ListIdentifiers',
                                'set': 'private-organization',
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    headers = root.xpath("//o:header", namespaces=self._namespaces)
    # BUG FIX: the original loop passed vacuously when no headers came back;
    # assert the public package is actually listed.
    self.assertTrue(headers)
    for header in headers:
        identifier = header.xpath("string(o:identifier)",
                                  namespaces=self._namespaces)
        self.assertTrue(identifier == package2['id'])

    # ListRecords without a set filter must still expose only the public one.
    result = self.app.get(url, {'verb': 'ListRecords', 'metadataPrefix': 'rdf'})
    root = lxml.etree.fromstring(result.body)
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        identifier = header.xpath("string(o:identifier)",
                                  namespaces=self._namespaces)
        self.assertTrue(identifier == package2['id'])

    get_action('organization_delete')({'user': '******'},
                                      {'id': organization['id']})
def test_generate_pid(self): pid = utils.generate_pid() assert pid.startswith('urn') assert len(pid) >= 10
def _read(self):
    """Convert parsed Dublin Core metadata into the unified harvester dict.

    Reads DC elements through ``self.dc`` (presumably a BeautifulSoup-style
    parsed document -- it is called and attribute-accessed like one; TODO
    confirm) and assembles the internal "unified" package dict, assigning a
    fresh package id/name and guaranteeing exactly one primary PID.

    :returns: the unified metadata dict
    """
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())
    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    # NOTE(review): this initial empty list is immediately overwritten below;
    # kept only because the tag-resolution code is commented out.
    tags = []
    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    tags = [a.string for a in self.dc('subject', recursive=False)]

    # Collect titles per language into a JSON object keyed by language code.
    # NOTE(review): title.string may be None for an empty element, which
    # would make .strip() raise -- presumably titles are always non-empty;
    # verify against real harvested data.
    transl_json = {}
    for title in self.dc('title', recursive=False):
        lang = utils.convert_language(title.get('xml:lang', '').strip())
        transl_json[lang] = title.string.strip()
    title = json.dumps(transl_json)

    def _get_primary_pid(data_pids):
        # Pick the first IDA URN as the primary PID. Removal from data_pids
        # is deliberate: it keeps the chosen PID from also being emitted as
        # a 'relation' PID in the comprehension below.
        for dpid in data_pids:
            if dpid.startswith('urn:nbn:fi:csc-ida'):
                data_pids.remove(dpid)
                return [dpid]
        return []

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),
        access_application_URL=access_application_url or '',

        # Todo! Implement
        access_request_URL='',
        algorithm=first(_get_algorithm(self.dc)) or '',

        # TODO: Handle availabilities better
        availability=availability,

        checksum=_get_checksum(self.dc) or '',
        direct_download_URL=first(_get_download(self.dc)) or '',

        # Todo! Implement
        discipline='',

        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],

        # Todo! Implement
        geographic_coverage='',

        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
        title=title,

        language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),
        license_URL=license_url or '',
        license_id=license_id or 'notspecified',

        # Todo! Using only the first entry, for now
        contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                 for name, email, phone, url in self._get_maintainer_stuff()],

        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),

        notes=self._read_notes(),

        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

        # One optional primary PID followed by all remaining data / version /
        # metadata PIDs as general relations.
        pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
              for pid in _get_primary_pid(data_pids)] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in data_pids] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in self._get_version_pids()] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in _get_metadata_pid(self.dc)],

        # Authors, contributors, one funder entry and one owner entry.
        agent=[dict(role='author', name=orgauth.get('value', ''), id='',
                    organisation=orgauth.get('org', ''), URL='', fundingid='')
               for orgauth in _get_org_auth(self.dc)] +
              [dict(role='contributor', name=contributor.get('value', ''), id='',
                    organisation=contributor.get('org', ''), URL='', fundingid='')
               for contributor in _get_contributor(self.dc)] +
              [dict(role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',)] +
              [dict(role='owner',
                    name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or
                         first(_get_rightsholder(self.dc)) or '',
                    id='', organisation='', URL='', fundingid='')],

        tag_string=','.join(tags) or '',

        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',

        type='dataset',
        uploader=uploader,

        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',

        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #     partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )

    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
        unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #     unified['projdis'] = 'True'
    return unified
def get_unique_pids(self, ddict): for pid in ddict.get('pids', []): pid['id'] = utils.generate_pid() return ddict
def _read(self):
    """Convert parsed Dublin Core metadata into the unified harvester dict.

    Reads DC elements through ``self.dc`` (presumably a BeautifulSoup-style
    parsed document -- it is called and attribute-accessed like one; TODO
    confirm) and assembles the internal "unified" package dict, assigning a
    fresh package id/name and guaranteeing exactly one primary PID.

    :returns: the unified metadata dict
    """
    project_funder, project_funding, project_name, project_homepage = _get_project_stuff(self.dc) or ('', '', '', '')

    # Todo! This needs to be improved to use also simple-dc
    # dc(filter_tag_name_namespace('publisher', ns['dc']), recursive=False)
    availability, license_id, license_url, access_application_url = _get_rights(self.dc) or ('', '', '', '')
    if not availability:
        availability = first(self._get_availability())
    uploader = self._get_uploader()

    data_pids = list(_get_data_pids(self.dc))

    # NOTE(review): this initial empty list is immediately overwritten below;
    # kept only because the tag-resolution code is commented out.
    tags = []
    #for tag in sorted([a.string for a in self.dc('subject', recursive=False)]):
    #    tags.extend(self._resolve_tags(tag))
    tags = [a.string for a in self.dc('subject', recursive=False)]

    # Collect titles per language into a JSON object keyed by language code.
    # NOTE(review): title.string may be None for an empty element, which
    # would make .strip() raise -- presumably titles are always non-empty;
    # verify against real harvested data.
    transl_json = {}
    for title in self.dc('title', recursive=False):
        lang = utils.convert_language(title.get('xml:lang', '').strip())
        transl_json[lang] = title.string.strip()
    title = json.dumps(transl_json)

    def _get_primary_pid(data_pids):
        # Pick the first IDA URN as the primary PID. Removal from data_pids
        # is deliberate: it keeps the chosen PID from also being emitted as
        # a 'relation' PID in the comprehension below.
        for dpid in data_pids:
            if dpid.startswith('urn:nbn:fi:csc-ida'):
                data_pids.remove(dpid)
                return [dpid]
        return []

    # Create a unified internal harvester format dict
    unified = dict(
        # ?=dc('source', recursive=False),
        # ?=dc('relation', recursive=False),
        # ?=dc('type', recursive=False),
        access_application_URL=access_application_url or '',

        # Todo! Implement
        access_request_URL='',
        algorithm=first(_get_algorithm(self.dc)) or '',

        # TODO: Handle availabilities better
        availability=availability,

        checksum=_get_checksum(self.dc) or '',
        direct_download_URL=first(_get_download(self.dc)) or '',

        # Todo! Implement
        discipline='',

        # Todo! Should be possible to implement with QDC, but not with OAI_DC
        # evdescr=[],
        # evtype=[],
        # evwhen=[],
        # evwho=[],

        # Todo! Implement
        geographic_coverage='',

        #langtitle=[dict(lang=a.get('xml:lang', ''), value=a.string) for a in self.dc('title', recursive=False)],
        title=title,

        language=','.join(sorted([a.string for a in self.dc('language', recursive=False)])),
        license_URL=license_url or '',
        license_id=license_id or 'notspecified',

        # Todo! Using only the first entry, for now
        contact=[dict(name=name or "", email=email or "", URL=url or "", phone=phone or "")
                 for name, email, phone, url in self._get_maintainer_stuff()],

        # Todo! IDA currently doesn't produce this, maybe in future
        # dc('hasFormat', recursive=False)
        mimetype=self._get_mime_type(),

        notes=self._read_notes(),

        # Todo! Using only the first entry, for now
        # owner=first([a.get('resource') for a in dc('rightsHolder', recursive=False)]) or '',

        # One optional primary PID followed by all remaining data / version /
        # metadata PIDs as general relations.
        pids=[dict(id=pid, provider=_get_provider(self.bs), type=u'primary')
              for pid in _get_primary_pid(data_pids)] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in data_pids] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in self._get_version_pids()] +
             [dict(id=pid, provider=_get_provider(self.bs), type=u'relation', relation=u'generalRelation')
              for pid in _get_metadata_pid(self.dc)],

        # Authors, contributors, one funder entry and one owner entry.
        agent=[dict(role='author', name=orgauth.get('value', ''), id='',
                    organisation=orgauth.get('org', ''), URL='', fundingid='')
               for orgauth in _get_org_auth(self.dc)] +
              [dict(role='contributor', name=contributor.get('value', ''), id='',
                    organisation=contributor.get('org', ''), URL='', fundingid='')
               for contributor in _get_contributor(self.dc)] +
              [dict(role='funder',
                    name=first(project_name) or '',
                    id=first(project_name) or '',
                    organisation=first(project_funder) or "",
                    URL=first(project_homepage) or '',
                    fundingid=first(project_funding) or '',)] +
              [dict(role='owner',
                    name=first([a.get('resource') for a in self.dc('rightsHolder', recursive=False)]) or
                         first(_get_rightsholder(self.dc)) or '',
                    id='', organisation='', URL='', fundingid='')],

        tag_string=','.join(tags) or '',

        # Todo! Implement if possible
        temporal_coverage_begin='',
        temporal_coverage_end='',

        type='dataset',
        uploader=uploader,

        # Used in smear harvest code to extract variable, station and year values, but is not used when
        # creating the dataset via API.
        smear_url=first(_get_download(self.dc, False)) or '',

        # Todo! This should be more exactly picked
        version=(self.dc.modified or self.dc.date).string if (self.dc.modified or self.dc.date) else '',
        # version=dc(
        #     partial(filter_tag_name_namespace, 'modified', ns['dct']), recursive=False)[0].string or dc(
        #     partial(filter_tag_name_namespace, 'date', ns['dc']), recursive=False)[0].string,
    )

    if not unified['language']:
        unified['langdis'] = 'True'

    # Create id and name
    unified['id'] = generate_pid()
    unified['name'] = pid_to_name(unified['id'])

    # If primary pid is missing, set package id as primary pid
    if not any(pid.get('type', None) == u'primary' for pid in unified['pids']):
        unified['pids'].append(dict(id=unified['id'], type=u'primary', provider=None))

    # if not unified['project_name']:
    #     unified['projdis'] = 'True'
    return unified