def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later stage
          (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database, specifying
          the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that may
          occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error(
            'Could not gather anything from %s!' % harvest_job.source.url,
            harvest_job)
        return None
    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)
    query = self.config['query'] if 'query' in self.config else ''
    try:
        # Avoid shadowing the builtin `set`.
        for set_entry in client.listSets():
            identifier, name, _ = set_entry
            if 'query' in self.config:
                if query in name:
                    sets.append((identifier, name))
            else:
                sets.append((identifier, name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)
    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({
            'set': set_id,
            'set_name': set_name,
            'domain': domain,
        })
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    model.repo.commit()
    return harvest_objs
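# Illustrative sketch (not part of the harvester above): each HarvestObject
# created by gather_stage carries a small JSON payload describing one OAI-PMH
# set, which a later stage decodes. The function name `fetch_stage_sketch` and
# the example values are assumptions for illustration only.
import json

def fetch_stage_sketch(harvest_object):
    # e.g. {'set': '1', 'set_name': 'Default', 'domain': 'My Repository'}
    master_data = json.loads(harvest_object.content)
    set_spec = master_data['set']        # OAI-PMH setSpec, passed to listIdentifiers/listRecords
    set_name = master_data['set_name']   # human-readable name, later used for subgroup naming
    domain = master_data['domain']       # repositoryName(), used as the CKAN group name
    return set_spec, set_name, domain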
def _get_group(self, domain, in_revision=True):
    group = Group.by_name(domain)
    if not group:
        if not in_revision:
            model.repo.new_revision()
        group = Group(name=domain, description=domain)
        setup_default_user_roles(group)
        group.save()
        if not in_revision:
            model.repo.commit()
    return group
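# Hypothetical call site for the helper above (the surrounding harvester
# context is an assumption): ensure a CKAN group exists for the repository
# domain. Pass in_revision=False when no model revision is already open, so
# the helper opens and commits its own revision around the group creation.
#
#     group = self._get_group('My OAI Repository', in_revision=False)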
def membership_request(self, org_name):
    '''Request membership for an organization.'''
    if not toolkit.request.method == 'POST':
        toolkit.abort(400, 'Expected POST method')
    user = toolkit.c.userobj
    if not user:
        raise toolkit.NotAuthorized('Membership request requires a user')
    organization = Group.by_name(org_name)
    comment = toolkit.request.params.get('comment')
    membership_request = MembershipRequest(user, organization, comment)
    DB.add(membership_request)
    DB.commit()
    membership_request.notify_admins()
    return self.json_response({})
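# Minimal client-side sketch of exercising the controller action above. The
# route '/organization/<name>/membership_request', host, and cookie name are
# assumptions (the real URL mapping lives in the plugin's routing config).
import requests

resp = requests.post(
    'http://localhost:5000/organization/my-org/membership_request',
    data={'comment': 'Please add me to this organization'},
    cookies={'ckan': '<logged-in session cookie>'},  # the action requires an authenticated user
)
resp.raise_for_status()
print(resp.json())  # the action returns an empty JSON object on success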
def setUp(self):
    licenses = get_voc_file(LICENSES_FILE)
    load_licenses(load_graph(licenses))
    Session.flush()

    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name

    org = Group.by_name('dummy')
    if org:
        self.org = org.__dict__
    else:
        self.org = call_action('organization_create',
                               context={'user': user_name},
                               name='dummy',
                               identifier='aaaaaa')
def test_mapping(self):
    # multilang requires lang to be set
    from pylons.i18n.translation import set_lang, get_lang
    import pylons

    class dummyreq(object):
        class p(object):
            translator = object()
        environ = {'pylons.pylons': p()}

    pylons.request = dummyreq()
    pylons.translator.pylons_lang = ['en_GB']
    set_lang('en_GB')
    assert get_lang() == ['en_GB']

    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], \
        "No dcatapit_theme_group_mapper plugin in config"

    contents = self._get_file_contents('dataset.rdf')

    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    package_dict = datasets[0]

    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name

    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema

    _p = {'frequency': 'manual',
          'publisher_name': 'dummy',
          'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
          'groups': [],
          'title': 'dummy',
          'holder_name': 'dummy',
          'holder_identifier': 'dummy',
          'name': 'dummy',
          'notes': 'dummy',
          'owner_org': 'dummy',
          'modified': datetime.now(),
          'publisher_identifier': 'dummy',
          'metadata_created': datetime.now(),
          'metadata_modified': datetime.now(),
          'guid': unicode(uuid.uuid4),
          'identifier': 'dummy'}

    package_dict.update(_p)

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    package_data = call_action('package_create', context=context, **package_dict)

    p = Package.get(package_data['id'])

    # no groups should be assigned at this point (no map applied)
    assert {'theme': ['non-mappable', 'thememap1']} == p.extras, \
        '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), \
        'should be {}, got {}'.format([], p.get_groups(group_type='group'))

    package_data = call_action('package_show', context=context, id=package_data['id'])

    # use test mapping, which replaces thememap1 with thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__),
                                 '..', '..', '..', 'examples', 'test_map.ini')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file

    package_dict['theme'] = ['non-mappable', 'thememap1']

    expected_groups_existing = ['existing-group']
    expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
    expected_groups_multi = expected_groups_new + ['othergroup']

    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p

    package_data = call_action('package_update', context=context, **package_dict)

    #meta.Session.flush()
    #meta.Session.revision = repo.new_revision()

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

    package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
    assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

    package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()
    meta.Session.revision = repo.new_revision()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

    meta.Session.rollback()
def test_clean_tags(self):
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single',
        'owner_org': 'test-org',
        'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name

    org = Group.by_name('test-org')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='test-org')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******'}
    package_schema = default_update_package_schema()
    context['schema'] = package_schema

    package_dict = {'frequency': 'manual',
                    'publisher_name': 'dummy',
                    'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
                    'groups': [],
                    'title': 'fakename',
                    'holder_name': 'dummy',
                    'holder_identifier': 'dummy',
                    'name': 'fakename',
                    'notes': 'dummy',
                    'owner_org': 'test-org',
                    'modified': datetime.now(),
                    'publisher_identifier': 'dummy',
                    'metadata_created': datetime.now(),
                    'metadata_modified': datetime.now(),
                    'guid': unicode(uuid4()),
                    'identifier': 'dummy'}

    package_data = call_action('package_create', context=context, **package_dict)

    package = Package.get('fakename')
    source, job = self._create_source_and_job(source_fixture)
    job.package = package
    job.guid = uuid4()
    harvester = SpatialHarvester()

    with open(os.path.join('..', 'data', 'dataset.json')) as f:
        dataset = json.load(f)

    # long tags are invalid in all cases
    TAG_LONG_INVALID = 'abcdefghij' * 20
    # if clean_tags is not set to true, tags will be truncated to 50 chars
    TAG_LONG_VALID = TAG_LONG_INVALID[:50]
    # default truncate to 100
    TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

    assert len(TAG_LONG_VALID) == 50
    assert TAG_LONG_VALID[-1] == 'j'

    TAG_CHARS_INVALID = '[email protected]!'
    TAG_CHARS_VALID = 'pretty-invlidtag'

    dataset['tags'].append(TAG_LONG_INVALID)
    dataset['tags'].append(TAG_CHARS_INVALID)

    harvester.source_config = {'clean_tags': False}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']

    # no clean_tags, so invalid chars are kept,
    # but tags are still truncated to the default maximum length
    assert {'name': TAG_CHARS_VALID} not in tags
    assert {'name': TAG_CHARS_INVALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
    assert {'name': TAG_LONG_INVALID} not in tags

    harvester.source_config = {'clean_tags': True}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']
    assert {'name': TAG_CHARS_VALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
def _fetch_import_set(self, harvest_object, master_data, client, group):
    # Could be genuine fetch or retry of set insertions.
    if 'set' in master_data:
        # Fetch stage.
        args = {self.metadata_prefix_key: self.metadata_prefix_value,
                'set': master_data['set']}
        if 'from_' in master_data:
            args['from_'] = self._datetime_from_str(master_data['from_'])
        if 'until' in master_data:
            args['until'] = self._datetime_from_str(master_data['until'])
        ids = []
        try:
            for identity in client.listIdentifiers(**args):
                ids.append(identity.identifier())
        except NoRecordsMatchError:
            return False  # Ok, empty set. Nothing to do.
        except socket.error:
            errno, errstr = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr,),
                harvest_object, stage='Fetch')
            return False
        except httplib.BadStatusLine:
            self._save_object_error(
                'Bad HTTP response status line.',
                harvest_object, stage='Fetch')
            return False
        master_data['record_ids'] = ids
    else:
        log.debug('Reinsert: %s %i' % (master_data['set_name'],
                                       len(master_data['record_ids']),))
        # Do not save to DB because we can't.
    # Import stage.
    model.repo.new_revision()
    subg_name = '%s - %s' % (group.name, master_data['set_name'],)
    subgroup = Group.by_name(subg_name)
    if not subgroup:
        subgroup = Group(name=subg_name, description=subg_name)
        setup_default_user_roles(subgroup)
        subgroup.save()
    missed = []
    for ident in master_data['record_ids']:
        pkg_name = self._package_name_from_identifier(ident)
        # Package may have been omitted due to missing metadata.
        pkg = Package.get(pkg_name)
        if pkg:
            subgroup.add_package_by_name(pkg_name)
            subgroup.save()
            if 'set' not in master_data:
                log.debug('Inserted %s into %s' % (pkg_name, subg_name,))
        else:
            # Either omitted due to missing metadata or fetch error.
            # In the latter case, we want to add record later once the
            # fetch succeeds after retry.
            missed.append(ident)
            if 'set' not in master_data:
                log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
    if len(missed):
        # Store missing names for retry.
        master_data['record_ids'] = missed
        if 'set' in master_data:
            del master_data['set']  # Omit fetch later.
        harvest_object.content = json.dumps(master_data)
        log.debug('Missed %s %i' % (master_data['set_name'], len(missed),))
    else:
        harvest_object.content = None  # Clear data.
    model.repo.commit()
    return True
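# Illustrative sketch (values are made up) of the two shapes the master_data
# dict handled above can take: a first-pass fetch versus a retry after some
# packages could not yet be attached to the subgroup.

# First pass: gather_stage stored the set to fetch; record_ids is filled in here.
master_data_fetch = {
    'set': 'physics',            # OAI-PMH setSpec -> triggers listIdentifiers()
    'set_name': 'Physics',
    'domain': 'My Repository',
}

# Retry pass: 'set' was deleted, and record_ids holds only the identifiers that
# were missed (serialised back into harvest_object.content for the next run).
master_data_retry = {
    'set_name': 'Physics',
    'domain': 'My Repository',
    'record_ids': ['oai:repo:rec-17', 'oai:repo:rec-42'],
}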
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g.
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)
    if 'records' in master_data:
        records = master_data['records']
        set_name = master_data['set_name']
        for rec in records:
            identifier, metadata, _ = rec
            if metadata:
                name = metadata['title'][0] if len(metadata['title']) \
                    else identifier
                title = name
                norm_title = unicodedata.normalize('NFKD', name) \
                    .encode('ASCII', 'ignore') \
                    .lower().replace(' ', '_')[:35]
                slug = ''.join(e for e in norm_title
                               if e in string.ascii_letters + '_')
                name = slug
                creator = metadata['creator'][0] \
                    if len(metadata['creator']) else ''
                description = metadata['description'][0] \
                    if len(metadata['description']) else ''
                pkg = Package.by_name(name)
                if not pkg:
                    pkg = Package(name=name, title=title)
                extras = {}
                for key, value in metadata.items():
                    if len(value) > 0:
                        if key == 'subject' or key == 'type':
                            for tag in value:
                                if tag:
                                    tag = munge_tag(tag[:100])
                                    tag_obj = model.Tag.by_name(tag)
                                    if not tag_obj:
                                        tag_obj = model.Tag(name=tag)
                                    if tag_obj:
                                        pkgtag = model.PackageTag(
                                            tag=tag_obj, package=pkg)
                                        Session.add(tag_obj)
                                        Session.add(pkgtag)
                        else:
                            extras[key] = ' '.join(value)
                pkg.author = creator
                pkg.author_email = creator
                pkg.title = title
                pkg.notes = description
                pkg.extras = extras
                pkg.url = \
                    "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" \
                    % (harvest_object.job.source.url, identifier)
                pkg.save()
                harvest_object.package_id = pkg.id
                Session.add(harvest_object)
                setup_default_user_roles(pkg)
                url = ''
                for ids in metadata['identifier']:
                    if ids.startswith('http://'):
                        url = ids
                title = metadata['title'][0] if len(metadata['title']) \
                    else ''
                description = metadata['description'][0] \
                    if len(metadata['description']) else ''
                pkg.add_resource(url, description=description, name=title)
                group.add_package_by_name(pkg.name)
                subg_name = "%s - %s" % (domain, set_name)
                subgroup = Group.by_name(subg_name)
                if not subgroup:
                    subgroup = Group(name=subg_name, description=subg_name)
                subgroup.add_package_by_name(pkg.name)
                Session.add(group)
                Session.add(subgroup)
                setup_default_user_roles(group)
                setup_default_user_roles(subgroup)
        model.repo.commit()
    else:
        self._save_object_error('Could not receive any objects from fetch!',
                                harvest_object, stage='Import')
        return False
    return True
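# Illustrative (made-up) oai_dc record to make the field mapping above easier
# to follow. Every Dublin Core field arrives as a list of strings, which is
# why the code indexes [0] and space-joins values.
metadata = {
    'title': ['Sea Surface Temperature Atlas'],
    'creator': ['Jane Researcher'],
    'description': ['Monthly averaged SST grids.'],
    'subject': ['oceanography', 'temperature'],                       # becomes CKAN tags
    'identifier': ['http://repo.example.org/rec/42', 'oai:repo:42'],  # an http:// identifier becomes the resource URL
    'type': ['Dataset'],                                              # also mapped to tags
}
# Package fields derived by the loop above:
#   title  -> 'Sea Surface Temperature Atlas'
#   name   -> 'sea_surface_temperature_atlas' (NFKD-normalised, lowercased,
#             spaces to '_', truncated to 35 chars, letters and '_' only)
#   author -> 'Jane Researcher'
#   notes  -> 'Monthly averaged SST grids.'
#   extras -> every remaining field, values space-joined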
def test_holder(self):
    org = {'name': 'org-test',
           'title': 'Test org',
           'identifier': 'abc'}

    pkg1 = {
        # 'id': '2b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset-1',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'modified': '2016-11-29',
        'identifier': str(uuid.uuid4()),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '789789789',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ECON',)),
        'theme': json.dumps([theme_name_to_uri(name) for name in ('ECON',)]),
        'dataset_is_local': False,
        'language': '{DEU,ENG,ITA}',
    }

    pkg2 = {
        # 'id': 'eb6fe9ca-dc77-4cec-92a4-55c6624a5b00',
        'name': 'test-dataset-2',
        'title': 'Dataset di test DCAT_AP-IT 2',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'modified': '2016-11-29',
        'identifier': str(uuid.uuid4()),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '123123123123',
        FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ENVI',)),
        'theme': json.dumps([theme_name_to_uri(name) for name in ('ENVI',)]),
        'dataset_is_local': True,
        'language': '{DEU,ENG,ITA}',
        'owner_org': org['name'],
    }

    src_packages = [pkg1, pkg2]

    ctx = {'ignore_auth': True,
           'user': self._get_user()['name']}

    org_loaded = Group.by_name(org['name'])
    if org_loaded:
        org_dict = org_loaded.__dict__
    else:
        org_dict = helpers.call_action('organization_create', context=ctx, **org)

    pkg1['owner_org'] = org_dict['id']
    pkg2['owner_org'] = org_dict['id']

    created_packages = [
        helpers.call_action('package_create', context=ctx, **pkg)
        for pkg in src_packages
    ]

    for pkg in created_packages:
        s = RDFSerializer()
        g = s.g
        dataset_ref = s.graph_from_dataset(pkg)

        has_identifier = False
        rights_holders = list(g.objects(dataset_ref, DCT.rightsHolder))
        assert len(rights_holders), \
            'There should be one rights holder for\n {}:\n {}'.format(
                pkg, s.serialize_dataset(pkg))

        for holder_ref in rights_holders:
            _holder_names = list(g.objects(holder_ref, FOAF.name))
            _holder_ids = list(
                (str(ob) for ob in g.objects(holder_ref, DCT.identifier)))

            # local dataset will use organization name only
            # while remote will have at least two names - one with lang, one default without lang
            if pkg['dataset_is_local']:
                num_holder_names = 1
            else:
                num_holder_names = 2
            assert len(_holder_names) == num_holder_names, _holder_names
            assert len(_holder_ids) == 1

            test_id = pkg.get('holder_identifier') or org_dict['identifier']
            has_identifier = _holder_ids[0] == test_id
            assert has_identifier, \
                f'No identifier in {_holder_ids} (expected {test_id}) for\n {pkg}\n{s.serialize_dataset(pkg)}'
def test_theme_to_group_mapping(self):
    # multilang requires lang to be set
    # class dummyreq(object):
    #     class p(object):
    #         translator = object()
    #     environ = {'pylons.pylons': p()}
    # CKANRequest(dummyreq)
    # pylons.request = dummyreq()
    # pylons.translator.pylons_lang = ['en_GB']
    # set_lang('en_GB')
    # assert get_lang() == ['en_GB']

    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], \
        'No dcatapit_theme_group_mapper plugin in config'

    with open(get_example_file('dataset.rdf'), 'r') as f:
        contents = f.read()

    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    package_dict = datasets[0]

    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name

    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema

    _p = {
        'frequency': 'manual',
        'publisher_name': 'dummy',
        'extras': [{
            'key': 'theme',
            'value': ['non-mappable', 'thememap1']
        }],
        'groups': [],  # [{'name': existing_g.name}],
        'title': 'dummy',
        'holder_name': 'dummy',
        'holder_identifier': 'dummy',
        'name': 'dummy-' + uuid4().hex,
        'identifier': 'dummy' + uuid4().hex,
        'notes': 'dummy',
        'owner_org': 'dummy',
        'modified': datetime.now(),
        'publisher_identifier': 'dummy',
        'metadata_created': datetime.now(),
        'metadata_modified': datetime.now(),
        'guid': str(uuid.uuid4),
    }
    package_dict.update(_p)

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    package_data = call_action('package_create', context=context, **package_dict)

    p = Package.get(package_data['id'])

    # no groups should be assigned at this point (no map applied)
    assert {
        'theme': ['non-mappable', 'thememap1']
    } == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), \
        'should be {}, got {}'.format([], p.get_groups(group_type='group'))

    package_data = call_action('package_show', context=context, id=package_data['id'])

    # use test mapping, which replaces thememap1 with thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__),
                                 '..', '..', '..', 'examples', 'test_map.ini')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p

    package_data = call_action('package_update', context=context, **package_dict)

    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to the existing group and to 2 other
    # non-existing groups that will not be created
    expected_groups = ['existing-group']
    self.assertSetEqual(set(expected_groups), set(groups), 'Error in assigned groups')

    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()

    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # the map file maps ECON to the existing group and to 2 other groups
    # that have been automatically created
    expected_groups = expected_groups + ['somegroup1', 'somegroup2']
    self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

    # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
    aggr.append({'theme': 'thememap-multi', 'subthemes': []})
    package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()

    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    # added theme 'thememap-multi', which maps to 'othergroup' and other already existing groups
    expected_groups = expected_groups + ['othergroup']
    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups), 'New groups differ')

    package_data = call_action('package_update', context=context, **package_dict)

    meta.Session.flush()

    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]

    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups), 'New groups differ')

    meta.Session.rollback()