def _get_group(self, domain, in_revision=True):
    group = Group.by_name(domain)
    if not group:
        if not in_revision:
            model.repo.new_revision()
        group = Group(name=domain, description=domain)
        setup_default_user_roles(group)
        group.save()
        if not in_revision:
            model.repo.commit()
    return group
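# A minimal usage sketch (not from the original source) for _get_group above:
# a harvester ensuring one CKAN group exists per OAI-PMH repository domain.
# The harvester instance and the domain string are hypothetical. With
# in_revision=False the method opens and commits its own revision when it
# has to create the group.
def ensure_repository_group(harvester, domain='helda.helsinki.fi'):
    group = harvester._get_group(domain, in_revision=False)
    return group  # an existing group is returned unchanged on later calls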
def listIdentifiers(self, metadataPrefix, set=None, cursor=None, from_=None,
                    until=None, batch_size=None):
    '''List all identifiers for this repository.
    '''
    data = []
    packages = []
    group = None
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).filter(Package.type == 'dataset').\
                filter(Package.private != True).filter(Package.state == 'active').all()
        else:
            if from_ and not until:
                packages = Session.query(Package).filter(Package.type == 'dataset').\
                    filter(Package.private != True).\
                    filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
            if until and not from_:
                packages = Session.query(Package).filter(Package.type == 'dataset').\
                    filter(Package.private != True).\
                    filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
            if from_ and until:
                packages = Session.query(Package).filter(Package.type == 'dataset').\
                    filter(Package.private != True).\
                    filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
    else:
        group = Group.get(set)
        if group:
            packages = group.packages(return_query=True).filter(Package.type == 'dataset').\
                filter(Package.private != True).filter(Package.state == 'active')
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active')
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active')
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active')
            packages = packages.all()
    if cursor:
        packages = packages[cursor:]
    for package in packages:
        spec = package.name
        if group:
            spec = group.name
        else:
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    spec = group.name
            group = None
        data.append(common.Header('', package.id, package.metadata_created,
                                  [spec], False))
    return data
def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                until=None, batch_size=None):
    '''Show a selection of records, basically lists all datasets.
    '''
    data = []
    packages = []
    group = None
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).filter(Package.type == 'dataset').\
                filter(Package.private != True).\
                filter(Package.state == 'active').all()
        if from_ and not until:
            packages = Session.query(Package).filter(Package.type == 'dataset').\
                filter(Package.private != True).\
                filter(PackageRevision.revision_timestamp > from_).\
                filter(Package.name == PackageRevision.name).\
                filter(Package.state == 'active').all()
        if until and not from_:
            packages = Session.query(Package).filter(Package.type == 'dataset').\
                filter(Package.private != True).\
                filter(PackageRevision.revision_timestamp < until).\
                filter(Package.name == PackageRevision.name).\
                filter(Package.state == 'active').all()
        if from_ and until:
            packages = Session.query(Package).filter(Package.type == 'dataset').\
                filter(Package.private != True).\
                filter(between(PackageRevision.revision_timestamp, from_, until)).\
                filter(Package.name == PackageRevision.name).\
                filter(Package.state == 'active').all()
    else:
        group = Group.get(set)
        if group:
            packages = group.packages(return_query=True)
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.type == 'dataset').filter(Package.private != True).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.type == 'dataset').filter(Package.private != True).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.type == 'dataset').filter(Package.private != True).\
                    filter(Package.name == PackageRevision.name).\
                    filter(Package.state == 'active').all()
    if cursor:
        packages = packages[cursor:]
    for res in packages:
        spec = res.name
        if group:
            spec = group.name
        else:
            if res.owner_org:
                group = Group.get(res.owner_org)
                if group and group.name:
                    spec = group.name
            group = None
        data.append(self._record_for_dataset(res, spec))
    return data
def get_discipline(context, data_dict):
    model = context['model']
    terms = data_dict.get('query') or data_dict.get('q') or []
    if isinstance(terms, basestring):
        terms = [terms]
    terms = [t.strip() for t in terms if t.strip()]
    if 'fields' in data_dict:
        log.warning('"fields" parameter is deprecated. '
                    'Use the "query" parameter instead')
    offset = data_dict.get('offset')
    limit = data_dict.get('limit')
    # TODO: should we check for user authentication first?
    q = model.Session.query(model.Group)
    if not len(terms):
        return [], 0
    katagrp = Group.get('KATA')
    res = []
    for term in terms:
        escaped_term = misc.escape_sql_like_special_characters(term, escape='\\')
        for child in katagrp.get_children_groups():
            if escaped_term in child['name']:
                res.append(child)
    return res
def test_zaincremental_harvester(self):
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
    harv = OAIPMHHarvester()
    harvest_job = HarvestJob()
    harvest_job.source = HarvestSource()
    harvest_job.source.title = "Test"
    harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
    harvest_job.gather_started = datetime.now() + timedelta(days=1)
    harvest_job.source.config = '{"incremental":"True"}'
    harvest_job.source.type = "OAI-PMH"
    Session.add(harvest_job)
    rev = model.repo.new_revision()
    rev.timestamp = datetime.now() + timedelta(days=2)
    pkg = Package(name='footest', revision=rev)
    Session.add(pkg)
    pkg.save()
    roger = Group.get('roger')
    roger.add_package_by_name('footest')
    Session.add(roger)
    roger.save()
    gathered = harv.gather_stage(harvest_job)
    harvest_object = HarvestObject.get(gathered[0])
    harv.fetch_stage(harvest_object)
    harvobj = json.loads(harvest_object.content)
    self.assert_(harvobj['records'])
def add_to_group(key, data, errors, context):
    val = data.get(key)
    if val:
        repo.new_revision()
        grp = Group.get(val)
        grp.add_package_by_name(data[('name',)])
        grp.save()
def listIdentifiers(self, metadataPrefix=None, set=None, cursor=None,
                    from_=None, until=None, batch_size=None):
    '''List all identifiers for this repository.
    '''
    data = []
    packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
    for package in packages:
        spec = package.name
        if group:
            spec = group.name
        else:
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    spec = group.name
        data.append(common.Header('', package.id, package.metadata_created,
                                  [spec], False))
    return data
def listRecords(self, metadataPrefix=None, set=None, cursor=None,
                from_=None, until=None, batch_size=None):
    '''Show a selection of records, basically lists all datasets.
    '''
    data = []
    packages, setspc = self._filter_packages(set, cursor, from_, until, batch_size)
    for package in packages:
        set_spec = []
        if setspc:
            set_spec.append(setspc)
        if package.owner_org:
            group = Group.get(package.owner_org)
            if group and group.name:
                set_spec.append(group.name)
        if not set_spec:
            set_spec = [package.name]
        # Dispatch on the requested metadata format. This must be an if/elif
        # chain: the original used two separate ifs, so a 'rdf' request
        # appended both the DCAT record and the default record.
        if metadataPrefix == 'rdf':
            data.append(self._record_for_dataset_dcat(package, set_spec))
        elif metadataPrefix == 'oai_openaire':
            data.append(self._record_for_dataset_datacite(package, set_spec))
        else:
            data.append(self._record_for_dataset(package, set_spec))
    return data
def listRecords(self, metadataPrefix=None, set=None, cursor=None,
                from_=None, until=None, batch_size=None):
    '''Show a selection of records, basically lists all datasets.
    '''
    data = []
    packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
    for package in packages:
        spec = package.name
        if group:
            spec = group.name
        else:
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    spec = group.name
        if metadataPrefix == 'rdf':
            data.append(self._record_for_dataset_dcat(package, spec))
        else:
            data.append(self._record_for_dataset(package, spec))
    return data
def test_records(self):
    """ Test record fetching via http-request to prevent accidental changes to interface """
    model.User(name="test", sysadmin=True).save()
    organization = get_action('organization_create')(
        {'user': '******'},
        {'name': 'test-organization', 'title': "Test organization"})
    package_1_data = deepcopy(TEST_DATADICT)
    package_1_data['owner_org'] = organization['name']
    package_1_data['private'] = False
    package_2_data = deepcopy(package_1_data)
    for pid in package_1_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    for pid in package_2_data.get('pids', []):
        pid['id'] = utils.generate_pid()
    packages = [get_action('package_create')({'user': '******'}, package_1_data),
                get_action('package_create')({'user': '******'}, package_2_data)]
    url = url_for('/oai')
    result = self.app.get(url, {'verb': 'ListSets'})
    root = lxml.etree.fromstring(result.body)
    request_set = self._get_single_result(root, "//o:set")
    set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
    set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
    self.assertEquals(organization['name'], set_spec)
    self.assertEquals(organization['title'], set_name)
    result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': set_spec,
                                'metadataPrefix': 'oai_dc'})
    root = lxml.etree.fromstring(result.body)
    fail = True
    package_identifiers = [package['id'] for package in packages]
    package_org_names = [Group.get(package['owner_org']).name for package in packages]
    for header in root.xpath("//o:header", namespaces=self._namespaces):
        fail = False
        set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
        identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
        self.assertTrue(set_spec in package_org_names)
        self.assertTrue(identifier in package_identifiers)
        result = self.app.get(url, {'verb': 'GetRecord', 'identifier': identifier,
                                    'metadataPrefix': 'oai_dc'})
        root = lxml.etree.fromstring(result.body)
        fail_record = True
        for record_result in root.xpath("//o:record", namespaces=self._namespaces):
            fail_record = False
            header = self._get_single_result(record_result, 'o:header')
            self._get_single_result(record_result, 'o:metadata')
            self.assertTrue(header.xpath("string(o:identifier)",
                                         namespaces=self._namespaces) in package_identifiers)
            self.assertTrue(header.xpath("string(o:setSpec)",
                                         namespaces=self._namespaces) in package_org_names)
        self.assertFalse(fail_record, "No records received")
    self.assertFalse(fail, "No headers (packages) received")
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database, specifying
          the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that may
          occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error('Could not gather anything from %s!' %
                                harvest_job.source.url, harvest_job)
        return None
    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)
    query = self.config['query'] if 'query' in self.config else ''
    try:
        for set in client.listSets():
            identifier, name, _ = set
            if 'query' in self.config:
                if query in name:
                    sets.append((identifier, name))
            else:
                sets.append((identifier, name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)
    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({'set': set_id,
                                          'set_name': set_name,
                                          'domain': domain})
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    model.repo.commit()
    return harvest_objs
def add_to_group(key, data, errors, context):
    '''
    Add a new group if it doesn't yet exist.

    :param key: key
    :param data: data
    :param errors: validation errors
    :param context: context
    '''
    val = data.get(key)
    if val:
        repo.new_revision()
        grp = Group.get(val)
        # UI code needs group created if it does not match. Hence do so.
        if not grp:
            grp = Group(name=val, description=val, title=val)
            setup_default_user_roles(grp)
            grp.save()
            repo.commit()
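# A hedged sketch (not from the original source) of how a validator like
# add_to_group above is typically wired into a dataset schema from an
# IDatasetForm plugin. The field name 'discipline' is an assumption for
# illustration; ignore_missing is a standard CKAN navl validator.
from ckan.lib.navl.validators import ignore_missing

def create_package_schema(schema):
    # Run add_to_group after ignore_missing so the group is only created or
    # attached when the field actually carries a value.
    schema.update({'discipline': [ignore_missing, add_to_group]})
    return schema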
def getRecord(self, metadataPrefix, identifier):
    '''Simple getRecord for a dataset.
    '''
    package = Package.get(identifier)
    if not package:
        raise IdDoesNotExistError("No dataset with id %s" % identifier)
    spec = package.name
    if package.owner_org:
        group = Group.get(package.owner_org)
        if group and group.name:
            spec = group.name
    return self._record_for_dataset(package, spec)
def _filter_packages(set, cursor, from_, until, batch_size):
    '''Get a part of datasets for "listNN" verbs.
    '''
    packages = []
    setspc = None
    if not set:
        packages = Session.query(Package).filter(Package.type == 'dataset').\
            filter(Package.state == 'active').filter(Package.private != True)
        if from_ and not until:
            packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                filter(Package.name == PackageRevision.name)
        if until and not from_:
            packages = packages.filter(PackageRevision.revision_timestamp < until).\
                filter(Package.name == PackageRevision.name)
        if from_ and until:
            packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                filter(Package.name == PackageRevision.name)
        if batch_size:
            packages = packages.limit(batch_size)
        if cursor:
            packages = packages.offset(cursor)
        packages = packages.all()
    elif set == 'openaire_data':
        oa_tag = Session.query(Tag).filter(Tag.name == 'openaire_data').first()
        if oa_tag:
            packages = oa_tag.packages
        setspc = set
    else:
        group = Group.get(set)
        if group:
            # Note that group.packages never returns private datasets
            # regardless of 'with_private' parameter.
            packages = group.packages(return_query=True, with_private=False).\
                filter(Package.type == 'dataset').\
                filter(Package.state == 'active')
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name == PackageRevision.name)
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name == PackageRevision.name)
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name == PackageRevision.name)
            if batch_size:
                packages = packages.limit(batch_size)
            if cursor:
                packages = packages.offset(cursor)
            packages = packages.all()
    # if cursor is not None:
    #     cursor_end = cursor + batch_size if cursor + batch_size < len(packages) else len(packages)
    #     packages = packages[cursor:cursor_end]
    return packages, setspc
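# A hedged illustration (not from the original source) of the paging contract
# _filter_packages above implements: pyoai's BatchingServer invokes the list
# verbs repeatedly with an advancing cursor and a fixed batch_size, which the
# helper translates into SQL limit/offset. The driver below is hypothetical.
def page_through(filter_packages, set_spec=None, batch_size=50):
    cursor = 0
    while True:
        packages, _ = filter_packages(set_spec, cursor, None, None, batch_size)
        if not packages:
            break  # an empty window means the result set is exhausted
        for pkg in packages:
            yield pkg
        cursor += batch_size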
def harvest_source_dictize(source, context):
    out = source.as_dict()
    out['publisher_title'] = u''
    publisher_id = out.get('publisher_id')
    if publisher_id:
        group = Group.get(publisher_id)
        if group:
            out['publisher_title'] = group.title
    out['status'] = _get_source_status(source, context)
    return out
def getRecord(self, metadataPrefix, identifier):
    '''Simple getRecord for a dataset.
    '''
    package = Package.get(identifier)
    if not package:
        raise IdDoesNotExistError("No dataset with id %s" % identifier)
    spec = package.name
    if package.owner_org:
        group = Group.get(package.owner_org)
        if group and group.name:
            spec = group.name
    if metadataPrefix == 'rdf':
        return self._record_for_dataset_dcat(package, spec)
    return self._record_for_dataset(package, spec)
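# A hedged client-side sketch (not from the original source): fetching a
# single record from such a server with pyoai, the library these snippets
# already build on. The endpoint URL and identifier are placeholders.
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client('http://example.org/oai', registry)
# getRecord returns a (header, metadata, about) triple for one dataset.
header, md, about = client.getRecord(identifier='some-dataset-id',
                                     metadataPrefix='oai_dc')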
def listIdentifiers(self, metadataPrefix, set=None, cursor=None, from_=None,
                    until=None, batch_size=None):
    '''List all identifiers for this repository.
    '''
    data = []
    packages = []
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).all()
        else:
            if from_:
                packages = Session.query(Package).\
                    filter(PackageRevision.revision_timestamp > from_).all()
            if until:
                packages = Session.query(Package).\
                    filter(PackageRevision.revision_timestamp < until).all()
            if from_ and until:
                packages = Session.query(Package).\
                    filter(between(PackageRevision.revision_timestamp, from_, until)).all()
    else:
        group = Group.get(set)
        if group:
            packages = group.active_packages()
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_)
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until)
            if from_ and until:
                packages = packages.filter(
                    between(PackageRevision.revision_timestamp, from_, until))
            packages = packages.all()
    if cursor:
        # The cursor marks how many results have already been served, so skip
        # past it. The original sliced packages[:cursor], which would return
        # the same head of the list on every batch.
        packages = packages[cursor:]
    for package in packages:
        data.append(common.Header(package.id, package.metadata_created,
                                  [package.name], False))
    return data
def initdb(self):
    kata = Group.get('KATA')
    if not kata:
        repo.new_revision()
        kata = Group(name="KATA", title="Tieteenalat")
        kata.save()
        for tiede in tieteet.tieteet:
            t = Group(description=tiede['description'],
                      name=tiede['name'],
                      title=tiede['title'])
            t.save()
            m = Member(group=kata, table_id=t.id, table_name="group")
            m.save()
    setup()
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    # Do common tasks and then call different methods depending on what
    # kind of info the harvest object contains.
    self._set_config(harvest_object.job.source.config)
    ident = json.loads(harvest_object.content)
    registry = MetadataRegistry()
    if 'metadata_formats' in self.config:
        for mdp in self.config['metadata_formats']:
            registry.registerReader(mdp, kata_oai_dc_reader)
        if self.metadata_prefix_value not in self.config['metadata_formats']:
            registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
    else:
        registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
    client = oaipmh.client.Client(harvest_object.job.source.url, registry)
    client.updateGranularity()  # quickfix for granularity
    domain = ident['domain']
    group = Group.get(domain)  # Checked in gather_stage so exists.
    try:
        if ident['fetch_type'] == 'record':
            return self._fetch_import_record(harvest_object, ident, client, group)
        if ident['fetch_type'] == 'set':
            return self._fetch_import_set(harvest_object, ident, client, group)
        # This should not happen...
        log.error('Unknown fetch type: %s' % ident['fetch_type'])
    except Exception as e:
        # Guard against miscellaneous stuff. Probably plain bugs.
        # Also very rare exceptions we haven't seen yet.
        self._add_retry(harvest_object)
        log.debug(traceback.format_exc(e))
    return False
def harvest_source_dictize(source, context, last_job_status=False):
    out = source.as_dict()
    out['publisher_title'] = u''
    publisher_id = out.get('publisher_id')
    if publisher_id:
        group = Group.get(publisher_id)
        if group:
            out['publisher_title'] = group.title
    out['status'] = _get_source_status(source, context)
    if last_job_status:
        source_status = logic.get_action('harvest_source_show_status')(
            context, {'id': source.id})
        out['last_job_status'] = source_status.get('last_job', {})
    return out
def get_site_extra_statistics():
    orgs = Group.all("organization")
    org_data = {}
    all_assets = list(meta.Session.query(Package).all())
    for org in orgs:
        org_data[org.display_name] = {}
        # assets = [x for x in all_assets if (x.owner_org == org.id) and (x.state == 'active')]
        assets = meta.Session.query(Package).filter_by(owner_org=org.id,
                                                       state='active').all()
        asset_count = 0
        resource_count = 0
        for asset in assets:
            asset_count += 1
            resource_count += len(asset.resources)
        org_data[org.display_name] = (asset_count, resource_count)
    return org_data
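# A minimal usage sketch (not from the original source) for
# get_site_extra_statistics above; assumes it runs inside a configured CKAN
# process so the model session is available.
for org_title, (asset_count, resource_count) in get_site_extra_statistics().items():
    print('%s: %d datasets, %d resources' % (org_title, asset_count, resource_count))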
def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                until=None, batch_size=None):
    '''Show a selection of records, basically lists all datasets.
    '''
    data = []
    packages = []
    if not set:
        if not from_ and not until:
            packages = Session.query(Package).all()
        if from_:
            packages = Session.query(Package).\
                filter(PackageRevision.revision_timestamp > from_).all()
        if until:
            packages = Session.query(Package).\
                filter(PackageRevision.revision_timestamp < until).all()
        if from_ and until:
            packages = Session.query(Package).filter(
                between(PackageRevision.revision_timestamp, from_, until)).all()
    else:
        group = Group.get(set)
        if group:
            packages = group.active_packages()
            if from_ and not until:
                packages = packages.\
                    filter(PackageRevision.revision_timestamp > from_).all()
            if until and not from_:
                packages = packages.\
                    filter(PackageRevision.revision_timestamp < until).all()
            if from_ and until:
                packages = packages.filter(
                    between(PackageRevision.revision_timestamp, from_, until)).all()
    if cursor:
        # Skip the already-served head of the result list. The original
        # sliced packages[:cursor], which would re-serve the same records
        # on every batch.
        packages = packages[cursor:]
    for res in packages:
        data.append(self._record_for_dataset(res))
    return data
def membership_request(self, org_name):
    '''Request membership for an organization'''
    if not toolkit.request.method == 'POST':
        raise toolkit.abort(400, 'Expected POST method')
    user = toolkit.c.userobj
    if not user:
        raise toolkit.NotAuthorized('Membership request requires a user')
    organization = Group.by_name(org_name)
    comment = toolkit.request.params.get('comment')
    membership_request = MembershipRequest(user, organization, comment)
    DB.add(membership_request)
    DB.commit()
    membership_request.notify_admins()
    return self.json_response({})
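# A hedged sketch (not from the original source) of exercising
# membership_request above from a webtest-style test client, as used by the
# other tests in this collection. The route and fixture names are
# assumptions; the controller only accepts POST and requires a logged-in user.
def test_membership_request(app):
    response = app.post('/organization/test-org/membership',
                        params={'comment': 'please add me'},
                        extra_environ={'REMOTE_USER': 'tester'})
    assert response.status_int == 200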
def getRecord(self, metadataPrefix, identifier):
    '''Simple getRecord for a dataset.
    '''
    package = Package.get(identifier)
    if not package:
        raise IdDoesNotExistError("No dataset with id %s" % identifier)
    set_spec = []
    if package.owner_org:
        group = Group.get(package.owner_org)
        if group and group.name:
            set_spec.append(group.name)
    if 'openaire_data' in package.as_dict().get('tags'):
        set_spec.append('openaire_data')
    if not set_spec:
        set_spec = [package.name]
    if metadataPrefix == 'rdf':
        return self._record_for_dataset_dcat(package, set_spec)
    if metadataPrefix == 'oai_openaire':
        return self._record_for_dataset_datacite(package, set_spec)
    return self._record_for_dataset(package, set_spec)
def _filter_packages(set, cursor, from_, until, batch_size):
    '''Get a part of datasets for "listNN" verbs.
    '''
    packages = []
    group = None
    if not set:
        packages = Session.query(Package).filter(Package.type == 'dataset').\
            filter(Package.state == 'active').filter(Package.private != True)
        if from_ and not until:
            packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                filter(Package.name == PackageRevision.name)
        if until and not from_:
            packages = packages.filter(PackageRevision.revision_timestamp < until).\
                filter(Package.name == PackageRevision.name)
        if from_ and until:
            packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                filter(Package.name == PackageRevision.name)
        packages = packages.all()
    else:
        group = Group.get(set)
        if group:
            # Note that group.packages never returns private datasets
            # regardless of 'with_private' parameter.
            packages = group.packages(return_query=True, with_private=False).\
                filter(Package.type == 'dataset').\
                filter(Package.state == 'active')
            if from_ and not until:
                packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                    filter(Package.name == PackageRevision.name)
            if until and not from_:
                packages = packages.filter(PackageRevision.revision_timestamp < until).\
                    filter(Package.name == PackageRevision.name)
            if from_ and until:
                packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                    filter(Package.name == PackageRevision.name)
            packages = packages.all()
    if cursor is not None:
        cursor_end = cursor + batch_size if cursor + batch_size < len(packages) else len(packages)
        packages = packages[cursor:cursor_end]
    return packages, group
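# An equivalent, more compact form (not from the original source) of the
# cursor windowing done at the end of _filter_packages above, shown only to
# make the slice semantics explicit. Python slices clamp out-of-range ends,
# so the min() mirrors the original's explicit bound check.
def window(items, cursor, batch_size):
    if cursor is None:
        return items
    return items[cursor:min(cursor + batch_size, len(items))]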
def setUp(self):
    licenses = get_voc_file(LICENSES_FILE)
    load_licenses(load_graph(licenses))
    Session.flush()
    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org:
        self.org = org.__dict__
    else:
        self.org = call_action('organization_create',
                               context={'user': user_name},
                               name='dummy',
                               identifier='aaaaaa')
def test_mapping(self):
    # multilang requires lang to be set
    from pylons.i18n.translation import set_lang, get_lang
    import pylons

    class dummyreq(object):
        class p(object):
            translator = object()
        environ = {'pylons.pylons': p()}

    pylons.request = dummyreq()
    pylons.translator.pylons_lang = ['en_GB']
    set_lang('en_GB')
    assert get_lang() == ['en_GB']
    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], \
        "No dcatapit_theme_group_mapper plugin in config"
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    package_dict = datasets[0]
    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')
    context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema
    _p = {'frequency': 'manual',
          'publisher_name': 'dummy',
          'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
          'groups': [],
          'title': 'dummy',
          'holder_name': 'dummy',
          'holder_identifier': 'dummy',
          'name': 'dummy',
          'notes': 'dummy',
          'owner_org': 'dummy',
          'modified': datetime.now(),
          'publisher_identifier': 'dummy',
          'metadata_created': datetime.now(),
          'metadata_modified': datetime.now(),
          # uuid.uuid4 must be called; the original passed the function object.
          'guid': unicode(uuid.uuid4()),
          'identifier': 'dummy'}
    package_dict.update(_p)
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    package_data = call_action('package_create', context=context, **package_dict)
    p = Package.get(package_data['id'])
    # no groups should be assigned at this point (no map applied)
    assert {'theme': ['non-mappable', 'thememap1']} == p.extras, \
        '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), \
        'should be {}, got {}'.format([], p.get_groups(group_type='group'))
    package_data = call_action('package_show', context=context, id=package_data['id'])
    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                 'examples', 'test_map.ini')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    package_dict['theme'] = ['non-mappable', 'thememap1']
    expected_groups_existing = ['existing-group']
    expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
    expected_groups_multi = expected_groups_new + ['othergroup']
    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p
    package_data = call_action('package_update', context=context, **package_dict)
    # meta.Session.flush()
    # meta.Session.revision = repo.new_revision()
    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'
    package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()
    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
    assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)
    package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()
    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    meta.Session.revision = repo.new_revision()
    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
    assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)
    meta.Session.rollback()
def test_clean_tags(self):
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single',
        'owner_org': 'test-org',
        # The original used '%YYYY-%MM-%DD %HH:%MM:%s', which are not valid
        # strftime codes; '%Y-%m-%d %H:%M:%S' is the intended format.
        'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('test-org')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='test-org')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')
    context = {'user': '******'}
    package_schema = default_update_package_schema()
    context['schema'] = package_schema
    package_dict = {'frequency': 'manual',
                    'publisher_name': 'dummy',
                    'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
                    'groups': [],
                    'title': 'fakename',
                    'holder_name': 'dummy',
                    'holder_identifier': 'dummy',
                    'name': 'fakename',
                    'notes': 'dummy',
                    'owner_org': 'test-org',
                    'modified': datetime.now(),
                    'publisher_identifier': 'dummy',
                    'metadata_created': datetime.now(),
                    'metadata_modified': datetime.now(),
                    'guid': unicode(uuid4()),
                    'identifier': 'dummy'}
    package_data = call_action('package_create', context=context, **package_dict)
    package = Package.get('fakename')
    source, job = self._create_source_and_job(source_fixture)
    job.package = package
    job.guid = uuid4()
    harvester = SpatialHarvester()
    with open(os.path.join('..', 'data', 'dataset.json')) as f:
        dataset = json.load(f)
    # long tags are invalid in all cases
    TAG_LONG_INVALID = 'abcdefghij' * 20
    # if clean_tags is not set to true, tags will be truncated to 50 chars
    TAG_LONG_VALID = TAG_LONG_INVALID[:50]
    # default truncate to 100
    TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]
    assert len(TAG_LONG_VALID) == 50
    assert TAG_LONG_VALID[-1] == 'j'
    TAG_CHARS_INVALID = '[email protected]!'
    TAG_CHARS_VALID = 'pretty-invlidtag'
    dataset['tags'].append(TAG_LONG_INVALID)
    dataset['tags'].append(TAG_CHARS_INVALID)
    harvester.source_config = {'clean_tags': False}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']
    # no clean tags, so invalid chars are in
    # but tags are truncated to 50 chars
    assert {'name': TAG_CHARS_VALID} not in tags
    assert {'name': TAG_CHARS_INVALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
    assert {'name': TAG_LONG_INVALID} not in tags
    harvester.source_config = {'clean_tags': True}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']
    assert {'name': TAG_CHARS_VALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
def test_theme_to_group_mapping(self):
    # multilang requires lang to be set
    # class dummyreq(object):
    #     class p(object):
    #         translator = object()
    #     environ = {'pylons.pylons': p()}
    # CKANRequest(dummyreq)
    # pylons.request = dummyreq()
    # pylons.translator.pylons_lang = ['en_GB']
    # set_lang('en_GB')
    # assert get_lang() == ['en_GB']
    assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], \
        'No dcatapit_theme_group_mapper plugin in config'
    with open(get_example_file('dataset.rdf'), 'r') as f:
        contents = f.read()
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    package_dict = datasets[0]
    user = User.get('dummy')
    if not user:
        user = call_action('user_create',
                           name='dummy',
                           password='******',
                           email='*****@*****.**')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('dummy')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='dummy',
                          identifier='aaaaaa')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')
    context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
    package_schema = schema.default_create_package_schema()
    context['schema'] = package_schema
    _p = {
        'frequency': 'manual',
        'publisher_name': 'dummy',
        'extras': [{'key': 'theme', 'value': ['non-mappable', 'thememap1']}],
        'groups': [],  # [{'name': existing_g.name}],
        'title': 'dummy',
        'holder_name': 'dummy',
        'holder_identifier': 'dummy',
        'name': 'dummy-' + uuid4().hex,
        'identifier': 'dummy' + uuid4().hex,
        'notes': 'dummy',
        'owner_org': 'dummy',
        'modified': datetime.now(),
        'publisher_identifier': 'dummy',
        'metadata_created': datetime.now(),
        'metadata_modified': datetime.now(),
        # uuid.uuid4 must be called; the original passed the function object.
        'guid': str(uuid.uuid4()),
    }
    package_dict.update(_p)
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'
    package_data = call_action('package_create', context=context, **package_dict)
    p = Package.get(package_data['id'])
    # no groups should be assigned at this point (no map applied)
    assert {'theme': ['non-mappable', 'thememap1']} == p.extras, \
        '{} vs {}'.format(_p['extras'], p.extras)
    assert [] == p.get_groups(group_type='group'), \
        'should be {}, got {}'.format([], p.get_groups(group_type='group'))
    package_data = call_action('package_show', context=context, id=package_data['id'])
    # use test mapping, which replaces thememap1 to thememap2 and thememap3
    test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                 'examples', 'test_map.ini')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'
    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_dict.pop('extras', None)
    p = Package.get(package_data['id'])
    context['package'] = p
    package_data = call_action('package_update', context=context, **package_dict)
    # check - only existing group should be assigned
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    # the map file maps ECON to the existing group, and 2 other groups that
    # do not exist and will not be created
    expected_groups = ['existing-group']
    self.assertSetEqual(set(expected_groups), set(groups), 'Error in assigned groups')
    config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
    config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'
    # package_dict['theme'] = ['non-mappable', 'thememap1']
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    # recheck - this time, new groups should appear
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    # the map file maps ECON to the existing group and 2 other groups that
    # have been automatically created
    expected_groups = expected_groups + ['somegroup1', 'somegroup2']
    self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')
    # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
    aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
    aggr.append({'theme': 'thememap-multi', 'subthemes': []})
    package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    # recheck - there should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    # added theme 'thememap-multi', which maps to 'othergroup' and other
    # already existing groups
    expected_groups = expected_groups + ['othergroup']
    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups), 'New groups differ')
    package_data = call_action('package_update', context=context, **package_dict)
    meta.Session.flush()
    # recheck - there still should be no duplicates
    p = Package.get(package_data['id'])
    groups = [g.name for g in p.get_groups(group_type='group')]
    self.assertEqual(len(expected_groups), len(groups),
                     'New groups differ - there may be duplicated groups')
    self.assertSetEqual(set(expected_groups), set(groups), 'New groups differ')
    meta.Session.rollback()
def _fetch_import_set(self, harvest_object, master_data, client, group):
    # Could be genuine fetch or retry of set insertions.
    if 'set' in master_data:
        # Fetch stage.
        args = {self.metadata_prefix_key: self.metadata_prefix_value,
                'set': master_data['set']}
        if 'from_' in master_data:
            args['from_'] = self._datetime_from_str(master_data['from_'])
        if 'until' in master_data:
            args['until'] = self._datetime_from_str(master_data['until'])
        ids = []
        try:
            for identity in client.listIdentifiers(**args):
                ids.append(identity.identifier())
        except NoRecordsMatchError:
            return False  # Ok, empty set. Nothing to do.
        except socket.error:
            errno, errstr = sys.exc_info()[:2]
            self._save_object_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr,),
                harvest_object, stage='Fetch')
            return False
        except httplib.BadStatusLine:
            self._save_object_error('Bad HTTP response status line.',
                                    harvest_object, stage='Fetch')
            return False
        master_data['record_ids'] = ids
    else:
        log.debug('Reinsert: %s %i' % (master_data['set_name'],
                                       len(master_data['record_ids']),))
        # Do not save to DB because we can't.
    # Import stage.
    model.repo.new_revision()
    subg_name = '%s - %s' % (group.name, master_data['set_name'],)
    subgroup = Group.by_name(subg_name)
    if not subgroup:
        subgroup = Group(name=subg_name, description=subg_name)
        setup_default_user_roles(subgroup)
        subgroup.save()
    missed = []
    for ident in master_data['record_ids']:
        pkg_name = self._package_name_from_identifier(ident)
        # Package may have been omitted due to missing metadata.
        pkg = Package.get(pkg_name)
        if pkg:
            subgroup.add_package_by_name(pkg_name)
            subgroup.save()
            if 'set' not in master_data:
                log.debug('Inserted %s into %s' % (pkg_name, subg_name,))
        else:
            # Either omitted due to missing metadata or fetch error.
            # In the latter case, we want to add record later once the
            # fetch succeeds after retry.
            missed.append(ident)
            if 'set' not in master_data:
                log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
    if len(missed):
        # Store missing names for retry.
        master_data['record_ids'] = missed
        if 'set' in master_data:
            del master_data['set']  # Omit fetch later.
        harvest_object.content = json.dumps(master_data)
        log.debug('Missed %s %i' % (master_data['set_name'], len(missed),))
    else:
        harvest_object.content = None  # Clear data.
    model.repo.commit()
    return True
def import_stage(self, harvest_object):
    '''
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    '''
    model.repo.new_revision()
    master_data = json.loads(harvest_object.content)
    domain = master_data['domain']
    group = Group.get(domain)
    if not group:
        group = Group(name=domain, description=domain)
    if 'records' in master_data:
        records = master_data['records']
        set_name = master_data['set_name']
        for rec in records:
            identifier, metadata, _ = rec
            if metadata:
                name = metadata['title'][0] if len(metadata['title']) else identifier
                title = name
                norm_title = unicodedata.normalize('NFKD', name)\
                    .encode('ASCII', 'ignore')\
                    .lower().replace(' ', '_')[:35]
                slug = ''.join(e for e in norm_title
                               if e in string.ascii_letters + '_')
                name = slug
                creator = metadata['creator'][0] if len(metadata['creator']) else ''
                description = metadata['description'][0] if len(metadata['description']) else ''
                pkg = Package.by_name(name)
                if not pkg:
                    pkg = Package(name=name, title=title)
                extras = {}
                for met in metadata.items():
                    key, value = met
                    if len(value) > 0:
                        if key == 'subject' or key == 'type':
                            for tag in value:
                                if tag:
                                    tag = munge_tag(tag[:100])
                                    tag_obj = model.Tag.by_name(tag)
                                    if not tag_obj:
                                        tag_obj = model.Tag(name=tag)
                                    if tag_obj:
                                        pkgtag = model.PackageTag(tag=tag_obj,
                                                                  package=pkg)
                                        Session.add(tag_obj)
                                        Session.add(pkgtag)
                        else:
                            extras[key] = ' '.join(value)
                pkg.author = creator
                pkg.author_email = creator
                pkg.title = title
                pkg.notes = description
                pkg.extras = extras
                pkg.url = "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" \
                    % (harvest_object.job.source.url, identifier)
                pkg.save()
                harvest_object.package_id = pkg.id
                Session.add(harvest_object)
                setup_default_user_roles(pkg)
                url = ''
                for ids in metadata['identifier']:
                    if ids.startswith('http://'):
                        url = ids
                title = metadata['title'][0] if len(metadata['title']) else ''
                description = metadata['description'][0] if len(metadata['description']) else ''
                pkg.add_resource(url, description=description, name=title)
                group.add_package_by_name(pkg.name)
                subg_name = "%s - %s" % (domain, set_name)
                subgroup = Group.by_name(subg_name)
                if not subgroup:
                    subgroup = Group(name=subg_name, description=subg_name)
                subgroup.add_package_by_name(pkg.name)
                Session.add(group)
                Session.add(subgroup)
                setup_default_user_roles(group)
                setup_default_user_roles(subgroup)
        model.repo.commit()
    else:
        self._save_object_error('Could not receive any objects from fetch!',
                                harvest_object, stage='Import')
        return False
    return True
def test_holder(self):
    org = {'name': 'org-test', 'title': 'Test org', 'identifier': 'abc'}
    pkg1 = {
        # 'id': '2b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset-1',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'modified': '2016-11-29',
        'identifier': str(uuid.uuid4()),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '789789789',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ECON',)),
        'theme': json.dumps([theme_name_to_uri(name) for name in ('ECON',)]),
        'dataset_is_local': False,
        'language': '{DEU,ENG,ITA}',
    }
    pkg2 = {
        # 'id': 'eb6fe9ca-dc77-4cec-92a4-55c6624a5b00',
        'name': 'test-dataset-2',
        'title': 'Dataset di test DCAT_AP-IT 2',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'modified': '2016-11-29',
        'identifier': str(uuid.uuid4()),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '123123123123',
        FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ENVI',)),
        'theme': json.dumps([theme_name_to_uri(name) for name in ('ENVI',)]),
        'dataset_is_local': True,
        'language': '{DEU,ENG,ITA}',
        'owner_org': org['name'],
    }
    src_packages = [pkg1, pkg2]
    ctx = {'ignore_auth': True, 'user': self._get_user()['name']}
    org_loaded = Group.by_name(org['name'])
    if org_loaded:
        org_dict = org_loaded.__dict__
    else:
        org_dict = helpers.call_action('organization_create', context=ctx, **org)
    pkg1['owner_org'] = org_dict['id']
    pkg2['owner_org'] = org_dict['id']
    created_packages = [helpers.call_action('package_create', context=ctx, **pkg)
                        for pkg in src_packages]
    for pkg in created_packages:
        s = RDFSerializer()
        g = s.g
        dataset_ref = s.graph_from_dataset(pkg)
        has_identifier = False
        rights_holders = list(g.objects(dataset_ref, DCT.rightsHolder))
        assert len(rights_holders), \
            'There should be one rights holder for\n {}:\n {}'.format(
                pkg, s.serialize_dataset(pkg))
        for holder_ref in rights_holders:
            _holder_names = list(g.objects(holder_ref, FOAF.name))
            _holder_ids = list(str(ob) for ob in g.objects(holder_ref, DCT.identifier))
            # a local dataset will use the organization name only, while a
            # remote one will have at least two names - one with lang, one
            # default without lang
            if pkg['dataset_is_local']:
                num_holder_names = 1
            else:
                num_holder_names = 2
            assert len(_holder_names) == num_holder_names, _holder_names
            assert len(_holder_ids) == 1
            test_id = pkg.get('holder_identifier') or org_dict['identifier']
            has_identifier = _holder_ids[0] == test_id
            assert has_identifier, \
                f'No identifier in {_holder_ids} (expected {test_id}) for\n {pkg}\n{s.serialize_dataset(pkg)}'