def _get_group(self, domain, in_revision=True):
    group = Group.by_name(domain)
    if not group:
        if not in_revision:
            model.repo.new_revision()
        group = Group(name=domain, description=domain)
        setup_default_user_roles(group)
        group.save()
        if not in_revision:
            model.repo.commit()
    return group
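A hedged usage sketch (the call sites below are invented, not from the original source): `in_revision=True` signals that the caller already opened a CKAN revision, so the helper skips revision handling; otherwise it opens and commits its own.

# Caller already manages the revision:
group = self._get_group('Example repository', in_revision=True)

# Standalone call: the helper opens a new revision and commits it itself.
group = self._get_group('Example repository', in_revision=False)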
Example #2
    def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                        from_=None, until=None, batch_size=None):
        '''List all identifiers for this repository.
        '''
        data = []
        packages = []
        group = None
        if not set:
            if not from_ and not until:
                packages = Session.query(Package).filter(Package.type=='dataset').\
                    filter(Package.private!=True).filter(Package.state=='active').all()
            else:
                if from_ and not until:
                    packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                        filter(PackageRevision.revision_timestamp > from_).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
                if until and not from_:
                    packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                        filter(PackageRevision.revision_timestamp < until).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
                if from_ and until:
                    packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                        filter(between(PackageRevision.revision_timestamp, from_, until)).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
        else:
            group = Group.get(set)
            if group:
                packages = group.packages(return_query=True).filter(Package.type=='dataset').\
                    filter(Package.private!=True).filter(Package.state=='active')
                if from_ and not until:
                    packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active')
                if until and not from_:
                    packages = packages.filter(PackageRevision.revision_timestamp < until).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active')
                if from_ and until:
                    packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                        filter(Package.name==PackageRevision.name).filter(Package.state=='active')
                packages = packages.all()
        if cursor:
            packages = packages[cursor:]
        for package in packages:
            spec = package.name
            if group:
                spec = group.name
            else:
                if package.owner_org:
                    group = Group.get(package.owner_org)
                    if group and group.name:
                        spec = group.name
                    group = None
            data.append(common.Header('', package.id, package.metadata_created, [spec], False))

        return data
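The `packages[cursor:]` slice above implements OAI-PMH resumption: the cursor is the count of records already delivered, so the next response starts right after them. A self-contained sketch of that windowing (names invented):

def next_batch(items, cursor, batch_size):
    # cursor = number of items already delivered; return the next window.
    return items[cursor:cursor + batch_size]

identifiers = ['pkg-%d' % i for i in range(10)]
assert next_batch(identifiers, 0, 4) == ['pkg-0', 'pkg-1', 'pkg-2', 'pkg-3']
assert next_batch(identifiers, 8, 4) == ['pkg-8', 'pkg-9']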
Example #3
 def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Show a selection of records, basically lists all datasets.
     '''
     data = []
     packages = []
     group = None
     if not set:
         if not from_ and not until:
             packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                 filter(Package.state=='active').all()
         if from_ and not until:
             packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                 filter(PackageRevision.revision_timestamp > from_).filter(Package.name==PackageRevision.name).\
                 filter(Package.state=='active').all()
         if until and not from_:
             packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                 filter(PackageRevision.revision_timestamp < until).filter(Package.name==PackageRevision.name).\
                 filter(Package.state=='active').all()
         if from_ and until:
             packages = Session.query(Package).filter(Package.type=='dataset').filter(Package.private!=True).\
                 filter(between(PackageRevision.revision_timestamp, from_, until)).\
                 filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
     else:
         group = Group.get(set)
         if group:
             packages = group.packages(return_query=True)
             if from_ and not until:
                 packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                     filter(Package.type=='dataset').filter(Package.private!=True).\
                     filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
             if until and not from_:
                 packages = packages.filter(PackageRevision.revision_timestamp < until).\
                     filter(Package.type=='dataset').filter(Package.private!=True).\
                     filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
             if from_ and until:
                 packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                     filter(Package.type=='dataset').filter(Package.private!=True).\
                     filter(Package.name==PackageRevision.name).filter(Package.state=='active').all()
     if cursor:
         packages = packages[cursor:]
     for res in packages:
         spec = res.name
         if group:
             spec = group.name
         else:
             if res.owner_org:
                 group = Group.get(res.owner_org)
                 if group and group.name:
                     spec = group.name
                 group = None
         data.append(self._record_for_dataset(res, spec))
     return data
def get_discipline(context, data_dict):
    model = context['model']

    terms = data_dict.get('query') or data_dict.get('q') or []
    if isinstance(terms, basestring):
        terms = [terms]
    terms = [t.strip() for t in terms if t.strip()]

    if 'fields' in data_dict:
        log.warning('"fields" parameter is deprecated.  '
                    'Use the "query" parameter instead')

    offset = data_dict.get('offset')
    limit = data_dict.get('limit')

    # TODO: should we check for user authentication first?
    q = model.Session.query(model.Group)

    if not terms:
        return []
    katagrp = Group.get('KATA')
    res = []
    for term in terms:
        escaped_term = misc.escape_sql_like_special_characters(term, escape='\\')
        for child in katagrp.get_children_groups():
            if escaped_term in child['name']:
                res.append(child)
    return res
    def test_zaincremental_harvester(self):

        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        harvest_job.gather_started = datetime.now() + timedelta(days=1)
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)
        rev = model.repo.new_revision()
        rev.timestamp = datetime.now() + timedelta(days=2)
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()
        gathered = harv.gather_stage(harvest_job)
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assert_(harvobj['records'])
def add_to_group(key, data, errors, context):
    val = data.get(key)
    if val:
        repo.new_revision()
        grp = Group.get(val)
        grp.add_package_by_name(data[('name',)])
        grp.save()
Example #7
 def listIdentifiers(self,
                     metadataPrefix=None,
                     set=None,
                     cursor=None,
                     from_=None,
                     until=None,
                     batch_size=None):
     '''List all identifiers for this repository.
     '''
     data = []
     packages, group = self._filter_packages(set, cursor, from_, until,
                                             batch_size)
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         else:
             if package.owner_org:
                 group = Group.get(package.owner_org)
                 if group and group.name:
                     spec = group.name
         data.append(
             common.Header('', package.id, package.metadata_created, [spec],
                           False))
     return data
Example #8
    def listRecords(self,
                    metadataPrefix=None,
                    set=None,
                    cursor=None,
                    from_=None,
                    until=None,
                    batch_size=None):
        '''Show a selection of records, basically lists all datasets.
        '''
        data = []
        packages, setspc = self._filter_packages(set, cursor, from_, until,
                                                 batch_size)

        for package in packages:
            set_spec = []
            if setspc:
                set_spec.append(setspc)
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    set_spec.append(group.name)
            if not set_spec:
                set_spec = [package.name]
            if metadataPrefix == 'rdf':
                data.append(self._record_for_dataset_dcat(package, set_spec))
            elif metadataPrefix == 'oai_openaire':
                data.append(
                    self._record_for_dataset_datacite(package, set_spec))
            else:
                data.append(self._record_for_dataset(package, set_spec))
        return data
Example #9
 def listRecords(self,
                 metadataPrefix=None,
                 set=None,
                 cursor=None,
                 from_=None,
                 until=None,
                 batch_size=None):
     '''Show a selection of records, basically lists all datasets.
     '''
     data = []
     packages, group = self._filter_packages(set, cursor, from_, until,
                                             batch_size)
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         else:
             if package.owner_org:
                 group = Group.get(package.owner_org)
                 if group and group.name:
                     spec = group.name
         if metadataPrefix == 'rdf':
             data.append(self._record_for_dataset_dcat(package, spec))
         else:
             data.append(self._record_for_dataset(package, spec))
     return data
    def test_records(self):
        """ Test record fetching via http-request to prevent accidental changes to interface """
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization', 'title': "Test organization"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        packages = [get_action('package_create')({'user': '******'}, package_1_data),
                    get_action('package_create')({'user': '******'}, package_2_data)]

        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'ListSets'})

        root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(root, "//o:set")

        set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
        self.assertEquals(organization['name'], set_spec)
        self.assertEquals(organization['title'], set_name)

        result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': set_spec, 'metadataPrefix': 'oai_dc'})

        root = lxml.etree.fromstring(result.body)
        fail = True

        package_identifiers = [package['id'] for package in packages]
        package_org_names = [Group.get(package['owner_org']).name for package in packages]

        for header in root.xpath("//o:header", namespaces=self._namespaces):
            fail = False
            set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
            self.assertTrue(set_spec in package_org_names)
            self.assertTrue(identifier in package_identifiers)

            result = self.app.get(url, {'verb': 'GetRecord', 'identifier': identifier, 'metadataPrefix': 'oai_dc'})

            root = lxml.etree.fromstring(result.body)

            fail_record = True
            for record_result in root.xpath("//o:record", namespaces=self._namespaces):
                fail_record = False
                header = self._get_single_result(record_result, 'o:header')
                self._get_single_result(record_result, 'o:metadata')

                self.assertTrue(header.xpath("string(o:identifier)", namespaces=self._namespaces) in package_identifiers)
                self.assertTrue(header.xpath("string(o:setSpec)", namespaces=self._namespaces) in package_org_names)

            self.assertFalse(fail_record, "No records received")

        self.assertFalse(fail, "No headers (packages) received")
Example #11
    def gather_stage(self, harvest_job):
        '''
        The gather stage will receive a HarvestJob object and will be
        responsible for:
            - gathering all the necessary objects to fetch on a later
              stage (e.g. for a CSW server, perform a GetRecords request)
            - creating the necessary HarvestObjects in the database, specifying
              the guid and a reference to its source and job.
            - creating and storing any suitable HarvestGatherErrors that may
              occur.
            - returning a list with all the ids of the created HarvestObjects.

        :param harvest_job: HarvestJob object
        :returns: A list of HarvestObject ids
        '''
        self._set_config(harvest_job.source.config)
        sets = []
        harvest_objs = []
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = oaipmh.client.Client(harvest_job.source.url, registry)
        try:
            identifier = client.identify()
        except urllib2.URLError:
            self._save_gather_error('Could not gather anything from %s!' %
                                    harvest_job.source.url, harvest_job)
            return None
        domain = identifier.repositoryName()
        group = Group.by_name(domain)
        if not group:
            group = Group(name=domain, description=domain)
        query = self.config['query'] if 'query' in self.config else ''
        try:
            for set in client.listSets():
                identifier, name, _ = set
                if 'query' in self.config:
                    if query in name:
                        sets.append((identifier, name))
                else:
                    sets.append((identifier, name))
        except NoSetHierarchyError:
            sets.append(('1', 'Default'))
            self._save_gather_error('Could not fetch sets!', harvest_job)

        for set_id, set_name in sets:
            harvest_obj = HarvestObject(job=harvest_job)
            harvest_obj.content = json.dumps({
                'set': set_id,
                'set_name': set_name,
                'domain': domain
            })
            harvest_obj.save()
            harvest_objs.append(harvest_obj.id)
        model.repo.commit()
        return harvest_objs
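Each HarvestObject carries its per-set state as a JSON string that the fetch and import stages decode later. A minimal, self-contained sketch of that round-trip (values invented):

import json

content = json.dumps({'set': '1', 'set_name': 'Default', 'domain': 'Example repository'})
# ...stored on a HarvestObject, then decoded by a later stage:
ident = json.loads(content)
assert ident['domain'] == 'Example repository'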
Example #12
def add_to_group(key, data, errors, context):
    '''
    Add a new group if it doesn't yet exist.

    :param key: key
    :param data: data
    :param errors: validation errors
    :param context: context
    '''
    val = data.get(key)
    if val:
        repo.new_revision()
        grp = Group.get(val)
        # UI code needs group created if it does not match. Hence do so.
        if not grp:
            grp = Group(name=val, description=val, title=val)
            setup_default_user_roles(grp)
            grp.save()
        repo.commit()
Example #13
 def getRecord(self, metadataPrefix, identifier):
     '''Simple getRecord for a dataset.
     '''
     package = Package.get(identifier)
     if not package:
         raise IdDoesNotExistError("No dataset with id %s" % identifier)
     spec = package.name
     if package.owner_org:
         group = Group.get(package.owner_org)
         if group and group.name:
             spec = group.name
     return self._record_for_dataset(package, spec)
Example #14
 def _filter_packages(set, cursor, from_, until, batch_size):
     '''Get a part of datasets for "listNN" verbs.
     '''
     packages = []
     setspc = None
     if not set:
         packages = Session.query(Package).filter(Package.type=='dataset'). \
             filter(Package.state == 'active').filter(Package.private!=True)
         if from_ and not until:
             packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                 filter(Package.name==PackageRevision.name)
         if until and not from_:
             packages = packages.filter(PackageRevision.revision_timestamp < until).\
                 filter(Package.name==PackageRevision.name)
         if from_ and until:
             packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                 filter(Package.name==PackageRevision.name)
         if batch_size:
             packages = packages.limit(batch_size)
         if cursor:
             packages = packages.offset(cursor)
         packages = packages.all()
     elif set == 'openaire_data':
         oa_tag = Session.query(Tag).filter(
             Tag.name == 'openaire_data').first()
         if oa_tag:
             packages = oa_tag.packages
         setspc = set
     else:
         group = Group.get(set)
         if group:
             # Note that group.packages never returns private datasets regardless of 'with_private' parameter.
             packages = group.packages(return_query=True, with_private=False).filter(Package.type=='dataset'). \
                 filter(Package.state == 'active')
             if from_ and not until:
                 packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                     filter(Package.name==PackageRevision.name)
             if until and not from_:
                 packages = packages.filter(PackageRevision.revision_timestamp < until).\
                     filter(Package.name==PackageRevision.name)
             if from_ and until:
                 packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                     filter(Package.name==PackageRevision.name)
             if batch_size:
                 packages = packages.limit(batch_size)
             if cursor:
                 packages = packages.offset(cursor)
             packages = packages.all()
     # if cursor is not None:
     #     cursor_end = cursor + batch_size if cursor + batch_size < len(packages) else len(packages)
     #     packages = packages[cursor:cursor_end]
     return packages, setspc
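The date filtering above uses open-ended comparisons when only one bound is given and between() when both are present. A self-contained sketch of the same filter shape against an in-memory SQLite table (this is not CKAN's model, just an illustration; assumes SQLAlchemy 1.4+):

from datetime import datetime
from sqlalchemy import Column, DateTime, String, between, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Revision(Base):
    __tablename__ = 'revision'
    name = Column(String, primary_key=True)
    revision_timestamp = Column(DateTime)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([
        Revision(name='a', revision_timestamp=datetime(2020, 1, 1)),
        Revision(name='b', revision_timestamp=datetime(2020, 6, 1)),
        Revision(name='c', revision_timestamp=datetime(2021, 1, 1)),
    ])
    session.commit()
    # Both bounds present: filter with between(), as in the snippet above.
    q = session.query(Revision).filter(
        between(Revision.revision_timestamp,
                datetime(2020, 3, 1), datetime(2020, 12, 31)))
    assert [r.name for r in q] == ['b']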
def harvest_source_dictize(source, context):
    out = source.as_dict()

    out['publisher_title'] = u''

    publisher_id = out.get('publisher_id')
    if publisher_id:
        group = Group.get(publisher_id)
        if group:
            out['publisher_title'] = group.title

    out['status'] = _get_source_status(source, context)

    return out
Example #16
 def getRecord(self, metadataPrefix, identifier):
     '''Simple getRecord for a dataset.
     '''
     package = Package.get(identifier)
     if not package:
         raise IdDoesNotExistError("No dataset with id %s" % identifier)
     spec = package.name
     if package.owner_org:
         group = Group.get(package.owner_org)
         if group and group.name:
             spec = group.name
     if metadataPrefix == 'rdf':
         return self._record_for_dataset_dcat(package, spec)
     return self._record_for_dataset(package, spec)
def harvest_source_dictize(source, context):
    out = source.as_dict()

    out['publisher_title'] = u''

    publisher_id = out.get('publisher_id')
    if publisher_id:
        group = Group.get(publisher_id)
        if group:
            out['publisher_title'] = group.title

    out['status'] = _get_source_status(source, context)

    return out
Example #18
 def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                     from_=None, until=None, batch_size=None):
     '''List all identifiers for this repository.
     '''
     data = []
     packages = []
     if not set:
         if not from_ and not until:
             packages = Session.query(Package).all()
         else:
             if from_:
                 packages = Session.query(Package).\
                     filter(PackageRevision.revision_timestamp > from_).\
                     all()
             if until:
                 packages = Session.query(Package).\
                     filter(PackageRevision.revision_timestamp < until).\
                     all()
             if from_ and until:
                 packages = Session.query(Package).\
                     filter(between(PackageRevision.revision_timestamp,
                                    from_, until)).all()
     else:
         group = Group.get(set)
         if group:
             packages = group.active_packages()
             if from_ and not until:
                 packages = packages.\
                     filter(PackageRevision.revision_timestamp > from_)
             if until and not from_:
                 packages = packages.\
                     filter(PackageRevision.revision_timestamp < until)
             if from_ and until:
                 packages = packages.filter(
                     between(PackageRevision.revision_timestamp,
                             from_,
                             until))
             packages = packages.all()
     if cursor:
         # cursor counts records already delivered, so skip past them
         packages = packages[cursor:]
     for package in packages:
         data.append(common.Header(package.id,
                                   package.metadata_created,
                                   [package.name],
                                   False))
     return data
Example #19
 def initdb(self):
     kata = Group.get('KATA')
     if not kata:
         repo.new_revision()
         kata = Group(name="KATA", title="Tieteenalat")
         kata.save()
         for tiede in tieteet.tieteet:
             t = Group(description=tiede['description'],
                       name=tiede['name'],
                       title=tiede['title'])
             t.save()
             m = Member(group=kata, table_id=t.id, table_name="group")
             m.save()
     setup()
Example #20
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.
        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        # Do common tasks and then call different methods depending on what
        # kind of info the harvest object contains.
        self._set_config(harvest_object.job.source.config)
        ident = json.loads(harvest_object.content)
        
        registry = MetadataRegistry()
        if 'metadata_formats' in self.config:
            for mdp in self.config['metadata_formats']:
                registry.registerReader(mdp, kata_oai_dc_reader)
            if self.metadata_prefix_value not in self.config['metadata_formats']:
                registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
        else:
            registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)

        client = oaipmh.client.Client(harvest_object.job.source.url, registry)
        client.updateGranularity()  # quick fix for date granularity
        domain = ident['domain']
        group = Group.get(domain)  # Checked in gather_stage so exists.
        try:
            if ident['fetch_type'] == 'record':
                return self._fetch_import_record(harvest_object, ident, client, group)
            if ident['fetch_type'] == 'set':
                return self._fetch_import_set(harvest_object, ident, client, group)
            # This should not happen...
            log.error('Unknown fetch type: %s' % ident['fetch_type'])
        except Exception as e:
            # Guard against miscellaneous stuff. Probably plain bugs.
            # Also very rare exceptions we haven't seen yet.
            self._add_retry(harvest_object)
            log.debug(traceback.format_exc())
        return False
 def listIdentifiers(self, metadataPrefix=None, set=None, cursor=None,
                     from_=None, until=None, batch_size=None):
     '''List all identifiers for this repository.
     '''
     data = []
     packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         else:
             if package.owner_org:
                 group = Group.get(package.owner_org)
                 if group and group.name:
                     spec = group.name
         data.append(common.Header('', package.id, package.metadata_created, [spec], False))
     return data
Example #22
def harvest_source_dictize(source, context, last_job_status=False):
    out = source.as_dict()

    out['publisher_title'] = u''

    publisher_id = out.get('publisher_id')
    if publisher_id:
        group = Group.get(publisher_id)
        if group:
            out['publisher_title'] = group.title

    out['status'] = _get_source_status(source, context)

    if last_job_status:
        source_status = logic.get_action('harvest_source_show_status')(context, {'id': source.id})
        out['last_job_status'] = source_status.get('last_job', {})

    return out
Example #23
def get_site_extra_statistics():
    orgs = Group.all("organization")
    org_data = {}
    all_assets = list(meta.Session.query(Package).all())
    for org in orgs:
        org_data[org.display_name] = {}
        # assets = [x for x in all_assets if (x.owner_org == org.id) and (x.state == 'active')]
        assets = meta.Session.query(Package).filter_by(owner_org=org.id,
                                                       state='active').all()
        asset_count = 0
        resource_count = 0
        for asset in assets:
            asset_count += 1
            resource_count += len(asset.resources)

        org_data[org.display_name] = (asset_count, resource_count)

    return org_data
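For reference, the function returns a dict keyed by organization display name with (asset_count, resource_count) tuples; a short sketch with invented values:

stats = {'Test organization': (12, 87), 'Another organization': (3, 5)}
for org_name, (assets, resources) in stats.items():
    print('%s: %d assets, %d resources' % (org_name, assets, resources))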
Example #24
 def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Show a selection of records, basically lists all datasets.
     '''
     data = []
     packages = []
     if not set:
         if not from_ and not until:
             packages = Session.query(Package).all()
         if from_:
             packages = Session.query(Package).\
                 filter(PackageRevision.revision_timestamp > from_).all()
         if until:
             packages = Session.query(Package).\
                 filter(PackageRevision.revision_timestamp < until).all()
         if from_ and until:
             packages = Session.query(Package).filter(
                 between(PackageRevision.revision_timestamp,from_,until)).\
                 all()
     else:
         group = Group.get(set)
         if group:
             packages = group.active_packages()
             if from_ and not until:
                 packages = packages.\
                     filter(PackageRevision.revision_timestamp > from_).\
                     all()
             if until and not from_:
                 packages = packages.\
                     filter(PackageRevision.revision_timestamp < until).\
                     all()
             if from_ and until:
                 packages = packages.filter(
                     between(PackageRevision.revision_timestamp,
                             from_, until)).all()
     if cursor:
         # cursor counts records already delivered, so skip past them
         packages = packages[cursor:]
     for res in packages:
         data.append(self._record_for_dataset(res))
     return data
    def membership_request(self, org_name):
        '''Request membership for an organization'''
        if toolkit.request.method != 'POST':
            raise toolkit.abort(400, 'Expected POST method')

        user = toolkit.c.userobj
        if not user:
            raise toolkit.NotAuthorized('Membership request requires a user')

        organization = Group.by_name(org_name)

        comment = toolkit.request.params.get('comment')
        membership_request = MembershipRequest(user, organization, comment)

        DB.add(membership_request)
        DB.commit()

        membership_request.notify_admins()

        return self.json_response({})
 def listRecords(self, metadataPrefix=None, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Show a selection of records, basically lists all datasets.
     '''
     data = []
     packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         else:
             if package.owner_org:
                 group = Group.get(package.owner_org)
                 if group and group.name:
                     spec = group.name
         if metadataPrefix == 'rdf':
             data.append(self._record_for_dataset_dcat(package, spec))
         else:
             data.append(self._record_for_dataset(package, spec))
     return data
    def membership_request(self, org_name):
        '''Request membership for an organization'''
        if toolkit.request.method != 'POST':
            raise toolkit.abort(400, 'Expected POST method')

        user = toolkit.c.userobj
        if not user:
            raise toolkit.NotAuthorized('Membership request requires a user')

        organization = Group.by_name(org_name)

        comment = toolkit.request.params.get('comment')
        membership_request = MembershipRequest(user, organization, comment)

        DB.add(membership_request)
        DB.commit()

        membership_request.notify_admins()

        return self.json_response({})
Example #28
    def getRecord(self, metadataPrefix, identifier):
        '''Simple getRecord for a dataset.
        '''
        package = Package.get(identifier)
        if not package:
            raise IdDoesNotExistError("No dataset with id %s" % identifier)

        set_spec = []
        if package.owner_org:
            group = Group.get(package.owner_org)
            if group and group.name:
                set_spec.append(group.name)
        if 'openaire_data' in package.as_dict().get('tags', []):
            set_spec.append('openaire_data')
        if not set_spec:
            set_spec = [package.name]
        if metadataPrefix == 'rdf':
            return self._record_for_dataset_dcat(package, set_spec)
        if metadataPrefix == 'oai_openaire':
            return self._record_for_dataset_datacite(package, set_spec)
        return self._record_for_dataset(package, set_spec)
 def _filter_packages(set, cursor, from_, until, batch_size):
     '''Get a part of datasets for "listNN" verbs.
     '''
     packages = []
     group = None
     if not set:
         packages = Session.query(Package).filter(Package.type=='dataset'). \
             filter(Package.state == 'active').filter(Package.private!=True)
         if from_ and not until:
             packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                 filter(Package.name==PackageRevision.name)
         if until and not from_:
             packages = packages.filter(PackageRevision.revision_timestamp < until).\
                 filter(Package.name==PackageRevision.name)
         if from_ and until:
             packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                 filter(Package.name==PackageRevision.name)
         packages = packages.all()
     else:
         group = Group.get(set)
         if group:
             # Note that group.packages never returns private datasets regardless of 'with_private' parameter.
             packages = group.packages(return_query=True, with_private=False).filter(Package.type=='dataset'). \
                 filter(Package.state == 'active')
             if from_ and not until:
                 packages = packages.filter(PackageRevision.revision_timestamp > from_).\
                     filter(Package.name==PackageRevision.name)
             if until and not from_:
                 packages = packages.filter(PackageRevision.revision_timestamp < until).\
                     filter(Package.name==PackageRevision.name)
             if from_ and until:
                 packages = packages.filter(between(PackageRevision.revision_timestamp, from_, until)).\
                     filter(Package.name==PackageRevision.name)
             packages = packages.all()
     if cursor is not None:
         cursor_end = min(cursor + batch_size, len(packages))
         packages = packages[cursor:cursor_end]
     return packages, group
    def setUp(self):
        licenses = get_voc_file(LICENSES_FILE)
        load_licenses(load_graph(licenses))
        Session.flush()

        user = User.get('dummy')

        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org:
            self.org = org.__dict__
        else:
            self.org = call_action('organization_create',
                              context={'user': user_name},
                              name='dummy',
                              identifier='aaaaaa')
    def test_mapping(self):

        # multilang requires lang to be set
        from pylons.i18n.translation import set_lang, get_lang
        import pylons
        class dummyreq(object):
            class p(object):
                translator = object()
            environ = {'pylons.pylons': p()}
        pylons.request = dummyreq()
        pylons.translator.pylons_lang = ['en_GB']
        set_lang('en_GB')
        assert get_lang() == ['en_GB']

        assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], "No dcatapit_theme_group_mapper plugin in config"
        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        datasets = [d for d in p.datasets()]
        eq_(len(datasets), 1)
        package_dict = datasets[0]


        user = User.get('dummy')
        
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org  = call_action('organization_create',
                                context={'user': user_name},
                                name='dummy',
                                identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g  = call_action('group_create',
                                      context={'user': user_name},
                                      name='existing-group')

        context = {'user': '******',
                   'ignore_auth': True,
                   'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {'frequency': 'manual',
              'publisher_name': 'dummy',
              'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
              'groups': [],
              'title': 'dummy',
              'holder_name': 'dummy',
              'holder_identifier': 'dummy',
              'name': 'dummy',
              'notes': 'dummy',
              'owner_org': 'dummy',
              'modified': datetime.now(),
              'publisher_identifier': 'dummy',
              'metadata_created' : datetime.now(),
              'metadata_modified': datetime.now(),
              'guid': unicode(uuid.uuid4()),
              'identifier': 'dummy'}
        
        package_dict.update(_p)
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        package_data = call_action('package_create', context=context, **package_dict)

        p = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {'theme': ['non-mappable', 'thememap1']} == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
        assert [] == p.get_groups(group_type='group'), 'should be {}, got {}'.format([], p.get_groups(group_type='group'))

        package_data = call_action('package_show', context=context, id=package_data['id'])

        # use test mapping, which replaces thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples', 'test_map.ini')
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file

        package_dict['theme'] = ['non-mappable', 'thememap1']

        expected_groups_existing = ['existing-group']
        expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
        expected_groups_multi = expected_groups_new + ['othergroup']

        package_dict.pop('extras', None)
        p = Package.get(package_data['id'])
        context['package'] = p 

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)
        
        #meta.Session.flush()
        #meta.Session.revision = repo.new_revision()

        # check - only existing group should be assigned
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'


        package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update', context=context, **package_dict)


        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - this time, new groups should appear
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
        assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

        package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there still should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        meta.Session.rollback()
Example #32
    def test_clean_tags(self):
        
        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single',
            'owner_org': 'test-org',
            'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),

        }

        user = User.get('dummy')
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('test-org')
        if org is None:
            org  = call_action('organization_create',
                                context={'user': user_name},
                                name='test-org')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g  = call_action('group_create',
                                      context={'user': user_name},
                                      name='existing-group')

        context = {'user': '******'} 
        package_schema = default_update_package_schema()
        context['schema'] = package_schema
        package_dict = {'frequency': 'manual',
              'publisher_name': 'dummy',
              'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
              'groups': [],
              'title': 'fakename',
              'holder_name': 'dummy',
              'holder_identifier': 'dummy',
              'name': 'fakename',
              'notes': 'dummy',
              'owner_org': 'test-org',
              'modified': datetime.now(),
              'publisher_identifier': 'dummy',
              'metadata_created' : datetime.now(),
              'metadata_modified' : datetime.now(),
              'guid': unicode(uuid4()),
              'identifier': 'dummy'}
        
        package_data = call_action('package_create', context=context, **package_dict)

        package = Package.get('fakename')
        source, job = self._create_source_and_job(source_fixture)
        job.package = package
        job.guid = uuid4()
        harvester = SpatialHarvester()
        with open(os.path.join('..', 'data', 'dataset.json')) as f:
            dataset = json.load(f)

        # long tags are invalid in all cases
        TAG_LONG_INVALID = 'abcdefghij' * 20
        # a 50-char truncation, used only for the length sanity check below
        TAG_LONG_VALID = TAG_LONG_INVALID[:50]
        # tags are truncated to the default maximum of 100 chars
        TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

        assert len(TAG_LONG_VALID) == 50
        assert TAG_LONG_VALID[-1] == 'j'
        TAG_CHARS_INVALID = '[email protected]!'
        TAG_CHARS_VALID = 'pretty-invlidtag'

        dataset['tags'].append(TAG_LONG_INVALID)
        dataset['tags'].append(TAG_CHARS_INVALID)

        harvester.source_config = {'clean_tags': False}
        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']

        # no clean_tags, so invalid chars are kept,
        # but tags are still truncated to 100 chars
        assert {'name': TAG_CHARS_VALID} not in tags
        assert {'name': TAG_CHARS_INVALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
        assert {'name': TAG_LONG_INVALID} not in tags

        harvester.source_config = {'clean_tags': True}

        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']
        assert {'name': TAG_CHARS_VALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
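A rough, self-contained re-implementation of the behavior this test expects, not ckanext-spatial's actual code: tags are always truncated to a maximum length, and with clean_tags enabled, characters outside the allowed tag alphabet are stripped (the invalid example string below is invented, since the original was redacted):

import re

MAX_TAG_LENGTH = 100  # assumed default, matching the asserts above

def clean_tag(tag, clean_tags=False):
    if clean_tags:
        # Keep only alphanumerics, hyphens, underscores, dots and spaces.
        tag = re.sub(r'[^a-zA-Z0-9\-_. ]', '', tag)
    return tag[:MAX_TAG_LENGTH]

assert len(clean_tag('abcdefghij' * 20)) == 100
assert clean_tag('pretty-invlid@tag!', clean_tags=True) == 'pretty-invlidtag'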
    def test_records(self):
        """ Test record fetching via http-request to prevent accidental changes to interface """
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')(
            {
                'user': '******'
            }, {
                'name': 'test-organization',
                'title': "Test organization"
            })
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        packages = [
            get_action('package_create')({
                'user': '******'
            }, package_1_data),
            get_action('package_create')({
                'user': '******'
            }, package_2_data)
        ]

        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'ListSets'})

        root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(root, "//o:set")

        set_name = request_set.xpath("string(o:setName)",
                                     namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)",
                                     namespaces=self._namespaces)
        self.assertEquals(organization['name'], set_spec)
        self.assertEquals(organization['title'], set_name)

        result = self.app.get(url, {
            'verb': 'ListIdentifiers',
            'set': set_spec,
            'metadataPrefix': 'oai_dc'
        })

        root = lxml.etree.fromstring(result.body)
        fail = True

        package_identifiers = [package['id'] for package in packages]
        package_org_names = [
            Group.get(package['owner_org']).name for package in packages
        ]

        for header in root.xpath("//o:header", namespaces=self._namespaces):
            fail = False
            set_spec = header.xpath("string(o:setSpec)",
                                    namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)",
                                      namespaces=self._namespaces)
            self.assertTrue(set_spec in package_org_names)
            self.assertTrue(identifier in package_identifiers)

            result = self.app.get(
                url, {
                    'verb': 'GetRecord',
                    'identifier': identifier,
                    'metadataPrefix': 'oai_dc'
                })

            root = lxml.etree.fromstring(result.body)

            fail_record = True
            for record_result in root.xpath("//o:record",
                                            namespaces=self._namespaces):
                fail_record = False
                header = self._get_single_result(record_result, 'o:header')
                self._get_single_result(record_result, 'o:metadata')

                self.assertTrue(
                    header.xpath("string(o:identifier)",
                                 namespaces=self._namespaces) in
                    package_identifiers)
                self.assertTrue(
                    header.xpath("string(o:setSpec)",
                                 namespaces=self._namespaces) in
                    package_org_names)

            self.assertFalse(fail_record, "No records received")

        self.assertFalse(fail, "No headers (packages) received")
Example #34
    def test_theme_to_group_mapping(self):
        # multilang requires lang to be set
        # class dummyreq(object):
        #     class p(object):
        #         translator = object()
        #     environ = {'pylons.pylons': p()}

        # CKANRequest(dummyreq)
        # pylons.request = dummyreq()
        # pylons.translator.pylons_lang = ['en_GB']

        #set_lang('en_GB')
        #assert get_lang() == ['en_GB']
        assert 'dcatapit_theme_group_mapper' in config[
            'ckan.plugins'], 'No dcatapit_theme_group_mapper plugin in config'

        with open(get_example_file('dataset.rdf'), 'r') as f:
            contents = f.read()

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        datasets = [d for d in p.datasets()]
        self.assertEqual(len(datasets), 1)
        package_dict = datasets[0]

        user = User.get('dummy')

        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='dummy',
                              identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {
            'frequency': 'manual',
            'publisher_name': 'dummy',
            'extras': [{
                'key': 'theme',
                'value': ['non-mappable', 'thememap1']
            }],
            'groups': [],  #  [{'name':existing_g.name}],
            'title': 'dummy',
            'holder_name': 'dummy',
            'holder_identifier': 'dummy',
            'name': 'dummy-' + uuid4().hex,
            'identifier': 'dummy' + uuid4().hex,
            'notes': 'dummy',
            'owner_org': 'dummy',
            'modified': datetime.now(),
            'publisher_identifier': 'dummy',
            'metadata_created': datetime.now(),
            'metadata_modified': datetime.now(),
            'guid': str(uuid.uuid4()),
        }

        package_dict.update(_p)

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        package_data = call_action('package_create',
                                   context=context,
                                   **package_dict)

        p = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {
            'theme': ['non-mappable', 'thememap1']
        } == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
        assert [] == p.get_groups(
            group_type='group'), 'should be {}, got {}'.format(
                [], p.get_groups(group_type='group'))

        package_data = call_action('package_show',
                                   context=context,
                                   id=package_data['id'])

        # use test mapping, which replaces thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                     '..', 'examples', 'test_map.ini')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        # package_dict['theme'] = ['non-mappable', 'thememap1']

        package_dict.pop('extras', None)
        p = Package.get(package_data['id'])
        context['package'] = p

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        # check - only existing group should be assigned
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # the map file maps ECON to existing group, and 2 other unexisting groups that will not be created
        expected_groups = ['existing-group']
        self.assertSetEqual(set(expected_groups), set(groups),
                            'Error in assigned groups')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

        # package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - this time, new groups should appear
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # the map file maps ECON to existing group and 2 other groups that have been automatically created
        expected_groups = expected_groups + ['somegroup1', 'somegroup2']
        self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

        # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
        aggr.append({'theme': 'thememap-multi', 'subthemes': []})
        package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        # added theme 'thememap-multi', that maps to 'othergroup' and other already exisintg groups
        expected_groups = expected_groups + ['othergroup']
        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there still should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        meta.Session.rollback()
Exemple #35
0
    def test_clean_tags(self):

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single',
            'owner_org': 'test-org',
            'metadata_created': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            'metadata_modified': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        user = User.get('dummy')
        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('test-org')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='test-org')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******'}
        package_schema = default_update_package_schema()
        context['schema'] = package_schema
        package_dict = {'frequency': 'manual',
                        'publisher_name': 'dummy',
                        'extras': [{'key': 'theme',
                                    'value': ['non-mappable', 'thememap1']}],
                        'groups': [],
                        'title': 'fakename',
                        'holder_name': 'dummy',
                        'holder_identifier': 'dummy',
                        'name': 'fakename',
                        'notes': 'dummy',
                        'owner_org': 'test-org',
                        'modified': datetime.now(),
                        'publisher_identifier': 'dummy',
                        'metadata_created': datetime.now(),
                        'metadata_modified': datetime.now(),
                        'guid': unicode(uuid4()),
                        'identifier': 'dummy'}
        
        package_data = call_action('package_create', context=context, **package_dict)

        package = Package.get('fakename')
        source, job = self._create_source_and_job(source_fixture)
        job.package = package
        job.guid = uuid4()
        harvester = SpatialHarvester()
        with open(os.path.join('..', 'data', 'dataset.json')) as f:
            dataset = json.load(f)

        # tags longer than the maximum length are invalid in all cases
        TAG_LONG_INVALID = 'abcdefghij' * 20  # 200 chars
        # 50-char prefix, used only for the sanity checks just below
        TAG_LONG_VALID = TAG_LONG_INVALID[:50]
        # tags are truncated to the default maximum length of 100 chars
        TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

        assert len(TAG_LONG_VALID) == 50
        assert TAG_LONG_VALID[-1] == 'j'
        TAG_CHARS_INVALID = '[email protected]!'
        TAG_CHARS_VALID = 'pretty-invlidtag'
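        # A minimal sketch of the behaviour the assertions below encode,
        # assuming CKAN's munge_tag (from ckan.lib.munge import munge_tag):
        # tags are capped at 100 chars in both modes, and clean_tags
        # additionally strips invalid characters.
        def _expected_tag(tag, clean):
            truncated = tag[:100]
            return munge_tag(truncated) if clean else truncated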

        dataset['tags'].append(TAG_LONG_INVALID)
        dataset['tags'].append(TAG_CHARS_INVALID)

        harvester.source_config = {'clean_tags': False}
        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']

        # clean_tags is off, so invalid chars are kept,
        # but long tags are still truncated to 100 chars
        assert {'name': TAG_CHARS_VALID} not in tags
        assert {'name': TAG_CHARS_INVALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
        assert {'name': TAG_LONG_INVALID} not in tags

        harvester.source_config = {'clean_tags': True}

        out = harvester.get_package_dict(dataset, job)
        tags = out['tags']
        assert {'name': TAG_CHARS_VALID} in tags
        assert {'name': TAG_LONG_VALID_LONG} in tags
    def _fetch_import_set(self, harvest_object, master_data, client, group):
        # Could be genuine fetch or retry of set insertions.
        if 'set' in master_data:
            # Fetch stage.
            args = {self.metadata_prefix_key: self.metadata_prefix_value,
                    'set': master_data['set']}
            if 'from_' in master_data:
                args['from_'] = self._datetime_from_str(master_data['from_'])
            if 'until' in master_data:
                args['until'] = self._datetime_from_str(master_data['until'])
            ids = []
            try:
                for identity in client.listIdentifiers(**args):
                    ids.append(identity.identifier())
            except NoRecordsMatchError:
                return False  # Ok, empty set. Nothing to do.
            except socket.error:
                errno, errstr = sys.exc_info()[:2]
                self._save_object_error(
                    'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr,),
                    harvest_object, stage='Fetch')
                return False
            except httplib.BadStatusLine:
                self._save_object_error(
                    'Bad HTTP response status line.',
                    harvest_object, stage='Fetch')
                return False
            master_data['record_ids'] = ids
        else:
            log.debug('Reinsert: %s %i' % (master_data['set_name'], len(master_data['record_ids']),))
        # Do not save to DB because we can't.
        # Import stage.
        model.repo.new_revision()
        subg_name = '%s - %s' % (group.name, master_data['set_name'],)
        subgroup = Group.by_name(subg_name)
        if not subgroup:
            subgroup = Group(name=subg_name, description=subg_name)
            setup_default_user_roles(subgroup)
            subgroup.save()
        missed = []
        for ident in master_data['record_ids']:
            pkg_name = self._package_name_from_identifier(ident)
            # Package may have been omitted due to missing metadata.
            pkg = Package.get(pkg_name)
            if pkg:
                subgroup.add_package_by_name(pkg_name)
                subgroup.save()
                if 'set' not in master_data:
                    log.debug('Inserted %s into %s' % (pkg_name, subg_name,))
            else:
                # Either omitted due to missing metadata or fetch error.
                # In the latter case, we want to add the record later once
                # the fetch succeeds after retry.
                missed.append(ident)
                if 'set' not in master_data:
                    log.debug('Omitted %s from %s' % (pkg_name, subg_name,))
        if len(missed):
            # Store missing names for retry.
            master_data['record_ids'] = missed
            if 'set' in master_data:
                del master_data['set']  # Omit fetch later.
            harvest_object.content = json.dumps(master_data)
            log.debug('Missed %s %i' % (master_data['set_name'], len(missed),))
        else:
            harvest_object.content = None  # Clear data.
        model.repo.commit()
        return True
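    # For reference, a sketch of the master_data payload this method consumes
    # and rewrites on retry (keys taken from the code above, values
    # illustrative):
    #
    #   {'set': 'spec',            # present on a fresh fetch, dropped on retry
    #    'set_name': 'Some set',
    #    'from_': '...',           # optional datetime bounds, with 'until'
    #    'record_ids': [...]}      # filled here; shrinks to the missed ids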
Exemple #37
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        group = Group.get(domain)
        if not group:
            group = Group(name=domain, description=domain)
        if 'records' in master_data:
            records = master_data['records']
            set_name = master_data['set_name']
            for rec in records:
                identifier, metadata, _ = rec
                if metadata:
                    name = metadata['title'][0] if len(metadata['title'])\
                                                else identifier
                    title = name
                    norm_title = unicodedata.normalize('NFKD', name)\
                                 .encode('ASCII', 'ignore')\
                                 .lower().replace(' ', '_')[:35]
                    slug = ''.join(e for e in norm_title
                                    if e in string.ascii_letters + '_')
                    name = slug
                    creator = metadata['creator'][0]\
                                if len(metadata['creator']) else ''
                    description = metadata['description'][0]\
                                if len(metadata['description']) else ''
                    pkg = Package.by_name(name)
                    if not pkg:
                        pkg = Package(name=name, title=title)
                    extras = {}
                    for met in metadata.items():
                        key, value = met
                        if len(value) > 0:
                            if key == 'subject' or key == 'type':
                                for tag in value:
                                    if tag:
                                        tag = munge_tag(tag[:100])
                                        tag_obj = model.Tag.by_name(tag)
                                        if not tag_obj:
                                            tag_obj = model.Tag(name=tag)
                                        if tag_obj:
                                            pkgtag = model.PackageTag(
                                                                  tag=tag_obj,
                                                                  package=pkg)
                                            Session.add(tag_obj)
                                            Session.add(pkgtag)
                            else:
                                extras[key] = ' '.join(value)
                    pkg.author = creator
                    pkg.author_email = creator
                    pkg.title = title
                    pkg.notes = description
                    pkg.extras = extras
                    pkg.url = "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc" \
                              % (harvest_object.job.source.url, identifier)
                    pkg.save()
                    harvest_object.package_id = pkg.id
                    Session.add(harvest_object)
                    setup_default_user_roles(pkg)
                    url = ''
                    for ids in metadata['identifier']:
                        if ids.startswith('http://'):
                            url = ids
                    title = metadata['title'][0] if len(metadata['title'])\
                                                    else ''
                    description = metadata['description'][0]\
                                    if len(metadata['description']) else ''
                    pkg.add_resource(url, description=description, name=title)
                    group.add_package_by_name(pkg.name)
                    subg_name = "%s - %s" % (domain, set_name)
                    subgroup = Group.by_name(subg_name)
                    if not subgroup:
                        subgroup = Group(name=subg_name, description=subg_name)
                    subgroup.add_package_by_name(pkg.name)
                    Session.add(group)
                    Session.add(subgroup)
                    setup_default_user_roles(group)
                    setup_default_user_roles(subgroup)
            model.repo.commit()
        else:
            self._save_object_error('Could not receive any objects from fetch!',
                                    harvest_object, stage='Import')
            return False
        return True
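    # A quick illustration of the slug normalization above (input made up;
    # Python 2 string semantics, matching this snippet):
    #
    #   >>> name = u'Über Data'
    #   >>> norm = unicodedata.normalize('NFKD', name)\
    #   ...     .encode('ASCII', 'ignore').lower().replace(' ', '_')[:35]
    #   >>> ''.join(e for e in norm if e in string.ascii_letters + '_')
    #   'uber_data'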
Exemple #38
0
    def test_holder(self):
        org = {'name': 'org-test', 'title': 'Test org', 'identifier': 'abc'}

        pkg1 = {
            # 'id': '2b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
            'name': 'test-dataset-1',
            'title': 'Dataset di test DCAT_AP-IT',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'modified': '2016-11-29',
            'identifier': str(uuid.uuid4()),
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '789789789',
            'holder_name': 'bolzano',
            'holder_identifier': '234234234',
            FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ECON', )),
            'theme': json.dumps([theme_name_to_uri(name)
                                 for name in ('ECON', )]),
            'dataset_is_local': False,
            'language': '{DEU,ENG,ITA}',
        }

        pkg2 = {
            # 'id': 'eb6fe9ca-dc77-4cec-92a4-55c6624a5b00',
            'name': 'test-dataset-2',
            'title': 'Dataset di test DCAT_AP-IT 2',
            'notes': 'dcatapit dataset di test',
            'metadata_created': '2015-06-26T15:21:09.034694',
            'metadata_modified': '2015-06-26T15:21:09.075774',
            'modified': '2016-11-29',
            'identifier': str(uuid.uuid4()),
            'frequency': 'UPDATE_CONT',
            'publisher_name': 'bolzano',
            'publisher_identifier': '234234234',
            'creator_name': 'test',
            'creator_identifier': '123123123123',
            FIELD_THEMES_AGGREGATE: themes_to_aggr_json(('ENVI', )),
            'theme': json.dumps([theme_name_to_uri(name)
                                 for name in ('ENVI', )]),
            'dataset_is_local': True,
            'language': '{DEU,ENG,ITA}',
            'owner_org': org['name'],
        }

        src_packages = [pkg1, pkg2]
        ctx = {'ignore_auth': True, 'user': self._get_user()['name']}

        org_loaded = Group.by_name(org['name'])
        if org_loaded:
            org_dict = org_loaded.__dict__
        else:
            org_dict = helpers.call_action('organization_create',
                                           context=ctx,
                                           **org)
        pkg1['owner_org'] = org_dict['id']
        pkg2['owner_org'] = org_dict['id']

        created_packages = [
            helpers.call_action('package_create', context=ctx, **pkg)
            for pkg in src_packages
        ]

        for pkg in created_packages:
            s = RDFSerializer()
            g = s.g
            dataset_ref = s.graph_from_dataset(pkg)
            has_identifier = False
            rights_holders = list(g.objects(dataset_ref, DCT.rightsHolder))

            assert len(rights_holders), 'There should be at least one rights holder for\n {}:\n {}'.\
                format(pkg, s.serialize_dataset(pkg))

            for holder_ref in rights_holders:
                _holder_names = list(g.objects(holder_ref, FOAF.name))
                _holder_ids = list(
                    (str(ob) for ob in g.objects(holder_ref, DCT.identifier)))

                # a local dataset will use the organization name only, while
                # a remote one will have at least two names: one with a
                # language tag and one default without
                if pkg['dataset_is_local']:
                    num_holder_names = 1
                else:
                    num_holder_names = 2
                assert len(_holder_names) == num_holder_names, _holder_names
                assert len(_holder_ids) == 1

                test_id = pkg.get(
                    'holder_identifier') or org_dict['identifier']
                has_identifier = _holder_ids[0] == test_id
                assert has_identifier, \
                    f'No identifier in {_holder_ids} (expected {test_id}) for\n {pkg}\n{s.serialize_dataset(pkg)}'
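                # The triples asserted above correspond roughly to this
                # Turtle shape (URIs and literal values illustrative):
                #
                #   <dataset> dct:rightsHolder <holder> .
                #   <holder>  foaf:name "bolzano" ;
                #             dct:identifier "234234234" .
                # Remote datasets additionally carry a language-tagged name.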