Exemple #1
0
    def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                        from_=None, until=None, batch_size=None):
        '''Build the identifier headers for public, active datasets.

        :param metadataPrefix: accepted but not consulted here
        :param set: name or id of a group to restrict the listing to; when
            empty the whole repository is listed, and an unknown group gives
            an empty result
        :param cursor: number of leading results to skip
        :param from_: keep datasets with a revision newer than this
        :param until: keep datasets with a revision older than this; given
            together with ``from_`` the revision must lie between the two
        :param batch_size: accepted but not consulted here
        :returns: list of ``common.Header`` objects, one per dataset
        '''
        group = None
        query = None
        if set:
            group = Group.get(set)
            if group:
                query = group.packages(return_query=True)
        else:
            query = Session.query(Package)

        packages = []
        if query is not None:
            # "!= True" has to stay an operator: it builds an SQL expression.
            query = query.filter(Package.type == 'dataset').\
                filter(Package.private != True).\
                filter(Package.state == 'active')
            if from_ and until:
                window = between(PackageRevision.revision_timestamp,
                                 from_, until)
            elif from_:
                window = PackageRevision.revision_timestamp > from_
            elif until:
                window = PackageRevision.revision_timestamp < until
            else:
                window = None
            if window is not None:
                # Tie each revision row to its package before filtering on it.
                query = query.filter(window).\
                    filter(Package.name == PackageRevision.name)
            packages = query.all()

        if cursor:
            packages = packages[cursor:]

        headers = []
        for pkg in packages:
            if group:
                # A requested set names every header.
                spec = group.name
            else:
                # Otherwise prefer the owning organization, then the dataset.
                owner = Group.get(pkg.owner_org) if pkg.owner_org else None
                spec = owner.name if owner and owner.name else pkg.name
            headers.append(
                common.Header('', pkg.id, pkg.metadata_created, [spec], False))
        return headers
Exemple #2
0
 def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Return the records of the public, active datasets.

     The dataset type, privacy and state filters are applied to every
     request. Previously a request for a set without a date range skipped
     them and returned whatever the group query yielded.

     :param metadataPrefix: accepted but not consulted here
     :param set: name or id of a group to restrict the listing to; when
         empty the whole repository is listed, and an unknown group gives an
         empty result
     :param cursor: number of leading results to skip
     :param from_: keep datasets with a revision newer than this
     :param until: keep datasets with a revision older than this; given
         together with ``from_`` the revision must lie between the two
     :param batch_size: accepted but not consulted here
     :returns: list of records built by ``self._record_for_dataset``
     '''
     group = None
     query = None
     if set:
         group = Group.get(set)
         if group:
             query = group.packages(return_query=True)
     else:
         query = Session.query(Package)

     packages = []
     if query is not None:
         # "!= True" has to stay an operator: it builds an SQL expression.
         query = query.filter(Package.type == 'dataset').\
             filter(Package.private != True).\
             filter(Package.state == 'active')
         if from_ and until:
             window = between(PackageRevision.revision_timestamp,
                              from_, until)
         elif from_:
             window = PackageRevision.revision_timestamp > from_
         elif until:
             window = PackageRevision.revision_timestamp < until
         else:
             window = None
         if window is not None:
             # Tie each revision row to its package before filtering on it.
             query = query.filter(window).\
                 filter(Package.name == PackageRevision.name)
         packages = query.all()

     if cursor:
         packages = packages[cursor:]

     data = []
     for res in packages:
         spec = res.name
         if group:
             spec = group.name
         elif res.owner_org:
             # Looked up under its own name so the requested set (``group``)
             # is never overwritten while iterating.
             owner = Group.get(res.owner_org)
             if owner and owner.name:
                 spec = owner.name
         data.append(self._record_for_dataset(res, spec))
     return data
    def test_zaincremental_harvester(self):
        '''Run gather and fetch for an incremental harvest job and check
        that the fetched harvest object contains records.
        '''

        # Serve OAI-PMH from the local CKANServer instead of the network.
        client = CKANServer()
        metadata_registry = metadata.MetadataRegistry()
        metadata_registry.registerReader('oai_dc', oai_dc_reader)
        metadata_registry.registerWriter('oai_dc', oai_dc_writer)
        serv = BatchingServer(client, metadata_registry=metadata_registry)
        # NOTE(review): this replaces oaipmh.client.Client for the whole
        # process and is not restored in this test.
        oaipmh.client.Client = mock.Mock(return_value=ServerClient(serv, metadata_registry))
        harv = OAIPMHHarvester()
        harvest_job = HarvestJob()
        harvest_job.source = HarvestSource()
        harvest_job.source.title = "Test"
        harvest_job.source.url = "http://helda.helsinki.fi/oai/request"
        # The job is marked as started one day in the future ...
        harvest_job.gather_started = ((datetime.now() + timedelta(days=1)))
        harvest_job.source.config = '{"incremental":"True"}'
        harvest_job.source.type = "OAI-PMH"
        Session.add(harvest_job)
        # ... and the package revision is stamped two days in the future,
        # i.e. later than the job start.
        rev = model.repo.new_revision()
        rev.timestamp = ((datetime.now() + timedelta(days=2)))
        pkg = Package(name='footest', revision=rev)
        Session.add(pkg)
        pkg.save()
        # Assumes a group named 'roger' already exists (presumably created by
        # the test fixtures) -- TODO confirm.
        roger = Group.get('roger')
        roger.add_package_by_name('footest')
        Session.add(roger)
        roger.save()
        gathered = harv.gather_stage(harvest_job)
        # Only the first gathered object is fetched and inspected.
        harvest_object = HarvestObject.get(gathered[0])
        harv.fetch_stage(harvest_object)
        harvobj = json.loads(harvest_object.content)
        self.assert_(harvobj['records'])
def get_discipline(context, data_dict):
    '''Find the children of the ``KATA`` group whose name contains a term.

    :param context: action context; not consulted by this lookup
    :param data_dict: request parameters. Search terms are read from
        ``query`` (or ``q``) and may be one string or a list of strings;
        surrounding whitespace is stripped and empty terms are dropped. A
        ``fields`` entry is deprecated and only triggers a warning.
    :returns: ``([], 0)`` when no usable term is given; otherwise a list with
        one entry per (term, child) match, in term order. The list is empty
        when the ``KATA`` group does not exist.
    '''
    terms = data_dict.get('query') or data_dict.get('q') or []
    if isinstance(terms, basestring):
        terms = [terms]
    terms = [t.strip() for t in terms if t.strip()]

    if 'fields' in data_dict:
        log.warning('"fields" parameter is deprecated.  '
                    'Use the "query" parameter instead')

    if not terms:
        # NOTE(review): this tuple differs in shape from the plain list
        # returned below; kept as-is because callers may rely on it.
        return [], 0

    katagrp = Group.get('KATA')
    if not katagrp:
        # Without the root group there is nothing to search.
        return []

    # Fetched once; the children do not depend on the search term.
    children = list(katagrp.get_children_groups())
    res = []
    for term in terms:
        for child in children:
            # Plain substring test on the raw term. SQL LIKE escaping must
            # not be applied here: it inserts backslashes, so a term
            # containing "_" or "%" could never match a name.
            if term in child['name']:
                res.append(child)
    return res
Exemple #5
0
    def listRecords(self,
                    metadataPrefix=None,
                    set=None,
                    cursor=None,
                    from_=None,
                    until=None,
                    batch_size=None):
        '''Return one record per dataset selected by the request.

        :param metadataPrefix: ``'rdf'`` selects the DCAT record,
            ``'oai_openaire'`` the DataCite record, anything else the
            default record
        :param set: set restriction, passed to ``self._filter_packages``
        :param cursor: passed to ``self._filter_packages``
        :param from_: passed to ``self._filter_packages``
        :param until: passed to ``self._filter_packages``
        :param batch_size: passed to ``self._filter_packages``
        :returns: list of records, exactly one per dataset
        '''
        packages, setspc = self._filter_packages(set, cursor, from_, until,
                                                 batch_size)
        data = []
        for package in packages:
            # Set specs: the set reported by the filter, then the owning
            # organization; the dataset's own name only as a last resort.
            set_spec = []
            if setspc:
                set_spec.append(setspc)
            if package.owner_org:
                group = Group.get(package.owner_org)
                if group and group.name:
                    set_spec.append(group.name)
            if not set_spec:
                set_spec = [package.name]

            # One exclusive chain: with two independent ``if`` statements an
            # 'rdf' request also fell through to the default branch and got
            # every dataset twice.
            if metadataPrefix == 'rdf':
                record = self._record_for_dataset_dcat(package, set_spec)
            elif metadataPrefix == 'oai_openaire':
                record = self._record_for_dataset_datacite(package, set_spec)
            else:
                record = self._record_for_dataset(package, set_spec)
            data.append(record)
        return data
Exemple #6
0
 def listIdentifiers(self,
                     metadataPrefix=None,
                     set=None,
                     cursor=None,
                     from_=None,
                     until=None,
                     batch_size=None):
     '''Return one identifier header per dataset selected by the request.

     :param metadataPrefix: accepted but not consulted here
     :param set: set restriction, passed to ``self._filter_packages``
     :param cursor: passed to ``self._filter_packages``
     :param from_: passed to ``self._filter_packages``
     :param until: passed to ``self._filter_packages``
     :param batch_size: passed to ``self._filter_packages``
     :returns: list of ``common.Header`` objects
     '''
     packages, group = self._filter_packages(set, cursor, from_, until,
                                             batch_size)
     data = []
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         elif package.owner_org:
             # Kept in its own variable: rebinding ``group`` here made every
             # following dataset report this dataset's organization.
             owner = Group.get(package.owner_org)
             if owner and owner.name:
                 spec = owner.name
         data.append(
             common.Header('', package.id, package.metadata_created, [spec],
                           False))
     return data
def add_to_group(key, data, errors, context):
    '''Add the package named in ``data[('name',)]`` to the group in ``data[key]``.

    Nothing happens when ``data[key]`` is missing or empty. ``errors`` and
    ``context`` are accepted but not used.

    NOTE(review): the group is assumed to exist; a failed lookup is not
    handled here.
    '''
    group_ref = data.get(key)
    if not group_ref:
        return
    repo.new_revision()
    target = Group.get(group_ref)
    target.add_package_by_name(data[('name',)])
    target.save()
Exemple #8
0
 def listRecords(self,
                 metadataPrefix=None,
                 set=None,
                 cursor=None,
                 from_=None,
                 until=None,
                 batch_size=None):
     '''Return one record per dataset selected by the request.

     :param metadataPrefix: ``'rdf'`` selects the DCAT record, anything else
         the default record
     :param set: set restriction, passed to ``self._filter_packages``
     :param cursor: passed to ``self._filter_packages``
     :param from_: passed to ``self._filter_packages``
     :param until: passed to ``self._filter_packages``
     :param batch_size: passed to ``self._filter_packages``
     :returns: list of records
     '''
     packages, group = self._filter_packages(set, cursor, from_, until,
                                             batch_size)
     data = []
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         elif package.owner_org:
             # Kept in its own variable: rebinding ``group`` here made every
             # following dataset report this dataset's organization.
             owner = Group.get(package.owner_org)
             if owner and owner.name:
                 spec = owner.name
         if metadataPrefix == 'rdf':
             data.append(self._record_for_dataset_dcat(package, spec))
         else:
             data.append(self._record_for_dataset(package, spec))
     return data
    def test_records(self):
        """ Test record fetching via http-request to prevent accidental changes to interface """
        # Fixture: one organization owning two public datasets built from
        # the same template.
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')({'user': '******'}, {'name': 'test-organization', 'title': "Test organization"})
        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        # Give every pid entry of both datasets a freshly generated id.
        for pid in package_1_data.get('pids', []):
            pid['id'] = utils.generate_pid()
        for pid in package_2_data.get('pids', []):
            pid['id'] = utils.generate_pid()

        packages = [get_action('package_create')({'user': '******'}, package_1_data),
                    get_action('package_create')({'user': '******'}, package_2_data)]

        # ListSets must expose the organization as a single set.
        url = url_for('/oai')
        result = self.app.get(url, {'verb': 'ListSets'})

        root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(root, "//o:set")

        set_name = request_set.xpath("string(o:setName)", namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)", namespaces=self._namespaces)
        self.assertEquals(organization['name'], set_spec)
        self.assertEquals(organization['title'], set_name)

        # ListIdentifiers restricted to that set.
        result = self.app.get(url, {'verb': 'ListIdentifiers', 'set': set_spec, 'metadataPrefix': 'oai_dc'})

        root = lxml.etree.fromstring(result.body)
        # Stays True when the loop below never runs, i.e. no header came back.
        fail = True

        package_identifiers = [package['id'] for package in packages]
        package_org_names = [Group.get(package['owner_org']).name for package in packages]

        # The header list is produced by xpath() before the loop starts, so
        # rebinding ``root`` and ``header`` inside the body does not disturb
        # the iteration.
        for header in root.xpath("//o:header", namespaces=self._namespaces):
            fail = False
            set_spec = header.xpath("string(o:setSpec)", namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)", namespaces=self._namespaces)
            self.assertTrue(set_spec in package_org_names)
            self.assertTrue(identifier in package_identifiers)

            # Every listed identifier must also be retrievable via GetRecord.
            result = self.app.get(url, {'verb': 'GetRecord', 'identifier': identifier, 'metadataPrefix': 'oai_dc'})

            root = lxml.etree.fromstring(result.body)

            # Same sentinel pattern as ``fail``, per GetRecord response.
            fail_record = True
            for record_result in root.xpath("//o:record", namespaces=self._namespaces):
                fail_record = False
                header = self._get_single_result(record_result, 'o:header')
                # Called only for its check that a metadata element is
                # present; the return value is not used.
                self._get_single_result(record_result, 'o:metadata')

                self.assertTrue(header.xpath("string(o:identifier)", namespaces=self._namespaces) in package_identifiers)
                self.assertTrue(header.xpath("string(o:setSpec)", namespaces=self._namespaces) in package_org_names)

            self.assertFalse(fail_record, "No records received")

        self.assertFalse(fail, "No headers (packages) received")
Exemple #10
0
 def getRecord(self, metadataPrefix, identifier):
     '''Return the record of the dataset identified by ``identifier``.

     The set spec is the name of the owning organization when one can be
     resolved, otherwise the dataset's own name. ``metadataPrefix`` is
     accepted but not consulted here.

     :raises IdDoesNotExistError: when no such dataset exists
     '''
     dataset = Package.get(identifier)
     if not dataset:
         raise IdDoesNotExistError("No dataset with id %s" % identifier)
     owner = Group.get(dataset.owner_org) if dataset.owner_org else None
     set_spec = owner.name if owner and owner.name else dataset.name
     return self._record_for_dataset(dataset, set_spec)
Exemple #11
0
 def _filter_packages(set, cursor, from_, until, batch_size):
     '''Select the datasets for a "listNN" verb.

     NOTE(review): defined without ``self``; presumably wrapped as a static
     method outside this block -- confirm.

     :param set: empty for the whole repository, ``'openaire_data'`` for the
         datasets carrying that tag, otherwise the name or id of a group
     :param cursor: offset applied to the query (not applied to the
         ``'openaire_data'`` set)
     :param from_: keep datasets with a revision newer than this
     :param until: keep datasets with a revision older than this; given
         together with ``from_`` the revision must lie between the two
     :param batch_size: row limit applied to the query (not applied to the
         ``'openaire_data'`` set)
     :returns: ``(packages, setspc)`` where ``setspc`` is ``set`` for the
         ``'openaire_data'`` set and ``None`` otherwise
     '''
     if set == 'openaire_data':
         # Tag based set: the tag's packages are returned as they are.
         oa_tag = Session.query(Tag).filter(
             Tag.name == 'openaire_data').first()
         return (oa_tag.packages if oa_tag else []), set

     if not set:
         # "!= True" has to stay an operator: it builds an SQL expression.
         query = Session.query(Package).filter(Package.type == 'dataset').\
             filter(Package.state == 'active').\
             filter(Package.private != True)
     else:
         group = Group.get(set)
         if not group:
             return [], None
         # Note that group.packages never returns private datasets
         # regardless of 'with_private' parameter.
         query = group.packages(return_query=True, with_private=False).\
             filter(Package.type == 'dataset').\
             filter(Package.state == 'active')

     if from_ and until:
         window = between(PackageRevision.revision_timestamp, from_, until)
     elif from_:
         window = PackageRevision.revision_timestamp > from_
     elif until:
         window = PackageRevision.revision_timestamp < until
     else:
         window = None
     if window is not None:
         # Tie each revision row to its package before filtering on it.
         query = query.filter(window).\
             filter(Package.name == PackageRevision.name)

     if batch_size:
         query = query.limit(batch_size)
     if cursor:
         query = query.offset(cursor)
     return query.all(), None
Exemple #12
0
 def initdb(self):
     '''Create the ``KATA`` group and its child groups if missing, then run
     ``setup()``.

     One child group is created per entry of ``tieteet.tieteet`` and linked
     to ``KATA`` through a ``Member`` row. When ``KATA`` already exists only
     ``setup()`` is called.
     '''
     if not Group.get('KATA'):
         repo.new_revision()
         root = Group(name="KATA", title="Tieteenalat")
         root.save()
         for entry in tieteet.tieteet:
             child = Group(description=entry['description'],
                           name=entry['name'],
                           title=entry['title'])
             child.save()
             # Membership row that makes ``child`` a member of the root.
             Member(group=root, table_id=child.id, table_name="group").save()
     setup()
Exemple #13
0
 def getRecord(self, metadataPrefix, identifier):
     '''Return the record of the dataset identified by ``identifier``.

     The set spec is the name of the owning organization when one can be
     resolved, otherwise the dataset's own name. The ``'rdf'`` prefix
     yields the DCAT record, any other prefix the default record.

     :raises IdDoesNotExistError: when no such dataset exists
     '''
     dataset = Package.get(identifier)
     if not dataset:
         raise IdDoesNotExistError("No dataset with id %s" % identifier)
     owner = Group.get(dataset.owner_org) if dataset.owner_org else None
     set_spec = owner.name if owner and owner.name else dataset.name
     if metadataPrefix == 'rdf':
         build = self._record_for_dataset_dcat
     else:
         build = self._record_for_dataset
     return build(dataset, set_spec)
def harvest_source_dictize(source, context):
    '''Turn a harvest source into a dict with publisher title and status.

    ``publisher_title`` is the title of the group referenced by
    ``publisher_id``, or an empty string when there is no such group.
    ``status`` comes from ``_get_source_status``.
    '''
    result = source.as_dict()
    publisher_id = result.get('publisher_id')
    publisher = Group.get(publisher_id) if publisher_id else None
    result['publisher_title'] = publisher.title if publisher else u''
    result['status'] = _get_source_status(source, context)
    return result
def harvest_source_dictize(source, context):
    '''Return ``source.as_dict()`` extended with two keys.

    * ``publisher_title`` -- title of the group found for ``publisher_id``;
      an empty string when the id is missing or no group matches
    * ``status`` -- the value returned by ``_get_source_status``
    '''
    as_dict = source.as_dict()

    title = u''
    if as_dict.get('publisher_id'):
        publisher = Group.get(as_dict.get('publisher_id'))
        if publisher:
            title = publisher.title
    as_dict['publisher_title'] = title

    as_dict['status'] = _get_source_status(source, context)
    return as_dict
Exemple #16
0
 def listIdentifiers(self, metadataPrefix, set=None, cursor=None,
                     from_=None, until=None, batch_size=None):
     '''Return the identifier headers of the selected packages.

     :param metadataPrefix: accepted but not consulted here
     :param set: name or id of a group to restrict the listing to; when
         empty every package is listed, and an unknown group gives an empty
         result
     :param cursor: number of leading results to skip
     :param from_: keep packages with a revision newer than this
     :param until: keep packages with a revision older than this; given
         together with ``from_`` the revision must lie between the two
     :param batch_size: accepted but not consulted here
     :returns: list of ``common.Header`` objects
     '''
     if set:
         group = Group.get(set)
         query = group.active_packages() if group else None
     else:
         query = Session.query(Package)

     packages = []
     if query is not None:
         # Exactly one date filter is built, so only one query is executed
         # per request.
         if from_ and until:
             window = between(PackageRevision.revision_timestamp,
                              from_, until)
         elif from_:
             window = PackageRevision.revision_timestamp > from_
         elif until:
             window = PackageRevision.revision_timestamp < until
         else:
             window = None
         if window is not None:
             # The revision rows must be tied to their package; without this
             # condition a single matching revision of any package would
             # select every package.
             query = query.filter(window).\
                 filter(Package.name == PackageRevision.name)
         packages = query.all()

     if cursor:
         # Skip what was already delivered (the old ``[:cursor]`` slice
         # returned the head of the list again instead).
         packages = packages[cursor:]

     data = []
     for package in packages:
         data.append(common.Header(package.id,
                                   package.metadata_created,
                                   [package.name],
                                   False))
     return data
 def listIdentifiers(self, metadataPrefix=None, set=None, cursor=None,
                     from_=None, until=None, batch_size=None):
     '''Return one identifier header per dataset selected by the request.

     ``set``, ``cursor``, ``from_``, ``until`` and ``batch_size`` are passed
     to ``self._filter_packages``; ``metadataPrefix`` is accepted but not
     consulted here.

     :returns: list of ``common.Header`` objects
     '''
     packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
     data = []
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         elif package.owner_org:
             # Kept in its own variable: rebinding ``group`` here made every
             # following dataset report this dataset's organization.
             owner = Group.get(package.owner_org)
             if owner and owner.name:
                 spec = owner.name
         data.append(common.Header('', package.id, package.metadata_created, [spec], False))
     return data
Exemple #18
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        Here the work is delegated to ``_fetch_import_record`` or
        ``_fetch_import_set`` depending on the ``fetch_type`` stored in the
        harvest object's JSON content. An unknown fetch type is logged and
        reported as failure; any exception schedules a retry through
        ``_add_retry`` and is reported as failure as well.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        # Do common tasks and then call different methods depending on what
        # kind of info the harvest object contains.
        self._set_config(harvest_object.job.source.config)
        ident = json.loads(harvest_object.content)

        # Every configured metadata format, plus the default prefix, is read
        # with the same reader.
        prefixes = []
        if 'metadata_formats' in self.config:
            prefixes.extend(self.config['metadata_formats'])
        if self.metadata_prefix_value not in prefixes:
            prefixes.append(self.metadata_prefix_value)
        registry = MetadataRegistry()
        for prefix in prefixes:
            registry.registerReader(prefix, kata_oai_dc_reader)

        client = oaipmh.client.Client(harvest_object.job.source.url, registry)
        client.updateGranularity()  # quickfix for granularity
        domain = ident['domain']
        group = Group.get(domain)  # Checked in gather_stage so exists.
        try:
            if ident['fetch_type'] == 'record':
                return self._fetch_import_record(harvest_object, ident, client, group)
            if ident['fetch_type'] == 'set':
                return self._fetch_import_set(harvest_object, ident, client, group)
            # This should not happen...
            log.error('Unknown fetch type: %s' % ident['fetch_type'])
        except Exception:
            # Guard against miscellaneous stuff. Probably plain bugs.
            # Also very rare exceptions we haven't seen yet.
            self._add_retry(harvest_object)
            # format_exc() reads the active exception itself; its only
            # parameter is a frame limit, so the exception must not be
            # passed in.
            log.debug(traceback.format_exc())
        return False
Exemple #19
0
def harvest_source_dictize(source, context, last_job_status=False):
    '''Turn a harvest source into a dict with publisher title and status.

    :param source: object providing ``as_dict()`` and ``id``
    :param context: passed on to ``_get_source_status`` and to the
        ``harvest_source_show_status`` action
    :param last_job_status: when true, add a ``last_job_status`` key holding
        the ``last_job`` entry of the source status (``{}`` if absent)
    :returns: the dict, with ``publisher_title`` (empty string when no
        publisher group is found) and ``status`` added
    '''
    result = source.as_dict()
    publisher_id = result.get('publisher_id')
    publisher = Group.get(publisher_id) if publisher_id else None
    result['publisher_title'] = publisher.title if publisher else u''
    result['status'] = _get_source_status(source, context)

    if not last_job_status:
        return result

    show_status = logic.get_action('harvest_source_show_status')
    status = show_status(context, {'id': source.id})
    result['last_job_status'] = status.get('last_job', {})
    return result
Exemple #20
0
def add_to_group(key, data, errors, context):
    '''
    Make sure the group named in ``data[key]`` exists.

    When the value is set, a new revision is opened, the group is created
    (name, description and title all set to the value, with default user
    roles) if it cannot be found, and the revision is committed.

    :param key: key of the group value in ``data``
    :param data: data
    :param errors: validation errors (not used)
    :param context: context (not used)
    '''
    group_name = data.get(key)
    if not group_name:
        return
    repo.new_revision()
    # UI code needs group created if it does not match. Hence do so.
    if not Group.get(group_name):
        created = Group(name=group_name, description=group_name,
                        title=group_name)
        setup_default_user_roles(created)
        created.save()
    repo.commit()
Exemple #21
0
 def listRecords(self, metadataPrefix, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Return the records of the selected packages.

     :param metadataPrefix: accepted but not consulted here
     :param set: name or id of a group to restrict the listing to; when
         empty every package is listed, and an unknown group gives an empty
         result
     :param cursor: number of leading results to skip
     :param from_: keep packages with a revision newer than this
     :param until: keep packages with a revision older than this; given
         together with ``from_`` the revision must lie between the two
     :param batch_size: accepted but not consulted here
     :returns: list of records built by ``self._record_for_dataset``
     '''
     if set:
         group = Group.get(set)
         query = group.active_packages() if group else None
     else:
         query = Session.query(Package)

     packages = []
     if query is not None:
         # Exactly one date filter is built, so only one query is executed
         # per request.
         if from_ and until:
             window = between(PackageRevision.revision_timestamp,
                              from_, until)
         elif from_:
             window = PackageRevision.revision_timestamp > from_
         elif until:
             window = PackageRevision.revision_timestamp < until
         else:
             window = None
         if window is not None:
             # The revision rows must be tied to their package; without this
             # condition a single matching revision of any package would
             # select every package.
             query = query.filter(window).\
                 filter(Package.name == PackageRevision.name)
         packages = query.all()

     if cursor:
         # Skip what was already delivered (the old ``[:cursor]`` slice
         # returned the head of the list again instead).
         packages = packages[cursor:]

     data = []
     for res in packages:
         data.append(self._record_for_dataset(res))
     return data
 def listRecords(self, metadataPrefix=None, set=None, cursor=None, from_=None,
                 until=None, batch_size=None):
     '''Return one record per dataset selected by the request.

     ``set``, ``cursor``, ``from_``, ``until`` and ``batch_size`` are passed
     to ``self._filter_packages``. The ``'rdf'`` prefix yields DCAT records,
     any other prefix the default records.

     :returns: list of records
     '''
     packages, group = self._filter_packages(set, cursor, from_, until, batch_size)
     data = []
     for package in packages:
         spec = package.name
         if group:
             spec = group.name
         elif package.owner_org:
             # Kept in its own variable: rebinding ``group`` here made every
             # following dataset report this dataset's organization.
             owner = Group.get(package.owner_org)
             if owner and owner.name:
                 spec = owner.name
         if metadataPrefix == 'rdf':
             data.append(self._record_for_dataset_dcat(package, spec))
         else:
             data.append(self._record_for_dataset(package, spec))
     return data
Exemple #23
0
    def getRecord(self, metadataPrefix, identifier):
        '''Return the record of the dataset identified by ``identifier``.

        The set specs are the owning organization's name (when it can be
        resolved) and ``'openaire_data'`` (when the dataset's tags contain
        it); if neither applies the dataset's own name is used. ``'rdf'``
        selects the DCAT record, ``'oai_openaire'`` the DataCite record and
        any other prefix the default record.

        :raises IdDoesNotExistError: when no such dataset exists
        '''
        dataset = Package.get(identifier)
        if not dataset:
            raise IdDoesNotExistError("No dataset with id %s" % identifier)

        specs = []
        owner = Group.get(dataset.owner_org) if dataset.owner_org else None
        if owner and owner.name:
            specs.append(owner.name)
        if 'openaire_data' in dataset.as_dict().get('tags'):
            specs.append('openaire_data')
        if not specs:
            specs = [dataset.name]

        if metadataPrefix == 'rdf':
            build = self._record_for_dataset_dcat
        elif metadataPrefix == 'oai_openaire':
            build = self._record_for_dataset_datacite
        else:
            build = self._record_for_dataset
        return build(dataset, specs)
 def _filter_packages(set, cursor, from_, until, batch_size):
     '''Select the datasets for a "listNN" verb.

     NOTE(review): defined without ``self``; presumably wrapped as a static
     method outside this block -- confirm.

     :param set: name or id of a group to restrict the selection to; when
         empty all public, active datasets are considered, and an unknown
         group gives an empty selection
     :param cursor: index of the first result to return; ``None`` returns
         the results from the start
     :param from_: keep datasets with a revision newer than this
     :param until: keep datasets with a revision older than this; given
         together with ``from_`` the revision must lie between the two
     :param batch_size: maximum number of results returned after ``cursor``;
         ``None`` means no limit
     :returns: ``(packages, group)`` where ``group`` is the group found for
         ``set`` or ``None``
     '''
     group = None
     query = None
     if not set:
         # "!= True" has to stay an operator: it builds an SQL expression.
         query = Session.query(Package).filter(Package.type == 'dataset').\
             filter(Package.state == 'active').\
             filter(Package.private != True)
     else:
         group = Group.get(set)
         if group:
             # Note that group.packages never returns private datasets
             # regardless of 'with_private' parameter.
             query = group.packages(return_query=True, with_private=False).\
                 filter(Package.type == 'dataset').\
                 filter(Package.state == 'active')

     packages = []
     if query is not None:
         if from_ and until:
             window = between(PackageRevision.revision_timestamp,
                              from_, until)
         elif from_:
             window = PackageRevision.revision_timestamp > from_
         elif until:
             window = PackageRevision.revision_timestamp < until
         else:
             window = None
         if window is not None:
             # Tie each revision row to its package before filtering on it.
             query = query.filter(window).\
                 filter(Package.name == PackageRevision.name)
         packages = query.all()

     if cursor is not None:
         # Callers default ``batch_size`` to None; adding it to the cursor
         # used to raise TypeError. An open-ended slice handles that case,
         # and slicing already clamps an end beyond the list length.
         cursor_end = None if batch_size is None else cursor + batch_size
         packages = packages[cursor:cursor_end]
     return packages, group
    def test_records(self):
        """Test record fetching via http-request to prevent accidental changes to interface.

        Creates one organization with two public packages, then walks the
        OAI-PMH endpoint the way a harvester would:

        1. ``ListSets`` must expose exactly one set matching the organization.
        2. ``ListIdentifiers`` for that set must return at least one header,
           each referring to one of the created packages.
        3. ``GetRecord`` for every returned identifier must yield at least one
           record carrying both a header and a metadata element.
        """
        model.User(name="test", sysadmin=True).save()
        organization = get_action('organization_create')(
            {'user': '******'},
            {'name': 'test-organization', 'title': "Test organization"})

        package_1_data = deepcopy(TEST_DATADICT)
        package_1_data['owner_org'] = organization['name']
        package_1_data['private'] = False
        package_2_data = deepcopy(package_1_data)

        # Each package gets freshly generated PIDs so the two copies do not
        # share identifiers.
        for package_data in (package_1_data, package_2_data):
            for pid in package_data.get('pids', []):
                pid['id'] = utils.generate_pid()

        packages = [
            get_action('package_create')({'user': '******'}, package_1_data),
            get_action('package_create')({'user': '******'}, package_2_data),
        ]
        package_identifiers = [package['id'] for package in packages]
        package_org_names = [
            Group.get(package['owner_org']).name for package in packages
        ]

        url = url_for('/oai')

        # --- ListSets: the organization must be exposed as the single set.
        result = self.app.get(url, {'verb': 'ListSets'})
        sets_root = lxml.etree.fromstring(result.body)
        request_set = self._get_single_result(sets_root, "//o:set")

        set_name = request_set.xpath("string(o:setName)",
                                     namespaces=self._namespaces)
        set_spec = request_set.xpath("string(o:setSpec)",
                                     namespaces=self._namespaces)
        self.assertEqual(organization['name'], set_spec)
        self.assertEqual(organization['title'], set_name)

        # --- ListIdentifiers: every header must belong to a created package.
        result = self.app.get(url, {
            'verb': 'ListIdentifiers',
            'set': set_spec,
            'metadataPrefix': 'oai_dc'
        })
        identifiers_root = lxml.etree.fromstring(result.body)
        headers = identifiers_root.xpath("//o:header",
                                         namespaces=self._namespaces)
        self.assertTrue(headers, "No headers (packages) received")

        for header in headers:
            header_set_spec = header.xpath("string(o:setSpec)",
                                           namespaces=self._namespaces)
            identifier = header.xpath("string(o:identifier)",
                                      namespaces=self._namespaces)
            self.assertIn(header_set_spec, package_org_names)
            self.assertIn(identifier, package_identifiers)

            # --- GetRecord: the identifier must resolve to a full record.
            result = self.app.get(
                url, {
                    'verb': 'GetRecord',
                    'identifier': identifier,
                    'metadataPrefix': 'oai_dc'
                })
            record_root = lxml.etree.fromstring(result.body)
            records = record_root.xpath("//o:record",
                                        namespaces=self._namespaces)
            self.assertTrue(records, "No records received")

            for record_result in records:
                record_header = self._get_single_result(record_result,
                                                        'o:header')
                # Only the presence of exactly one metadata element is
                # checked; its content is not inspected here.
                self._get_single_result(record_result, 'o:metadata')

                self.assertIn(
                    record_header.xpath("string(o:identifier)",
                                        namespaces=self._namespaces),
                    package_identifiers)
                self.assertIn(
                    record_header.xpath("string(o:setSpec)",
                                        namespaces=self._namespaces),
                    package_org_names)
Exemple #26
0
    def import_stage(self, harvest_object):
        '''
        The import stage will receive a HarvestObject object and will be
        responsible for:
            - performing any necessary action with the fetched object (e.g
              create a CKAN package).
              Note: if this stage creates or updates a package, a reference
              to the package must be added to the HarvestObject.
              Additionally, the HarvestObject must be flagged as current.
            - creating the HarvestObject - Package relation (if necessary)
            - creating and storing any suitable HarvestObjectErrors that may
              occur.
            - returning True if everything went as expected, False otherwise.

        ``harvest_object.content`` is parsed as JSON and must contain a
        ``domain`` key. When it also contains ``records`` (a list of
        ``(identifier, metadata, _)`` triples) a ``set_name`` key is required
        as well. For every record with non-empty metadata a package is
        created or updated, tagged from the ``subject``/``type`` fields, and
        added both to the domain group and to the "<domain> - <set_name>"
        subgroup.

        :param harvest_object: HarvestObject object
        :returns: True if everything went right, False if errors were found
        '''
        model.repo.new_revision()
        master_data = json.loads(harvest_object.content)
        domain = master_data['domain']
        group = Group.get(domain)
        if not group:
            group = Group(name=domain, description=domain)

        if 'records' not in master_data:
            self._save_object_error('Could not receive any objects from fetch!'
                                    , harvest_object, stage='Import')
            return False

        set_name = master_data['set_name']
        # Both values are the same for every record, so build them once.
        subg_name = "%s - %s" % (domain, set_name)
        slug_chars = string.ascii_letters + '_'

        for identifier, metadata, _ in master_data['records']:
            if not metadata:
                continue

            titles = metadata['title']
            title = titles[0] if titles else identifier
            creator = metadata['creator'][0] if metadata['creator'] else ''
            description = metadata['description'][0]\
                if metadata['description'] else ''

            # Package name: ASCII-fold the title, lowercase it, turn spaces
            # into underscores, cap at 35 characters and keep only letters
            # and underscores.
            # NOTE(review): calling .replace(' ', '_') on the encoded value
            # relies on Python 2 byte-string semantics -- confirm before
            # porting to Python 3.
            norm_title = unicodedata.normalize('NFKD', title)\
                .encode('ASCII', 'ignore')\
                .lower().replace(' ', '_')[:35]
            name = ''.join(e for e in norm_title if e in slug_chars)

            pkg = Package.by_name(name)
            if not pkg:
                pkg = Package(name=name, title=title)

            # 'subject' and 'type' values become tags; every other non-empty
            # field is flattened into a space-joined package extra.
            extras = {}
            for key, value in metadata.items():
                if not value:
                    continue
                if key in ('subject', 'type'):
                    for tag in value:
                        if not tag:
                            continue
                        tag = munge_tag(tag[:100])
                        tag_obj = model.Tag.by_name(tag)
                        if not tag_obj:
                            tag_obj = model.Tag(name=tag)
                        pkgtag = model.PackageTag(tag=tag_obj, package=pkg)
                        Session.add(tag_obj)
                        Session.add(pkgtag)
                else:
                    extras[key] = ' '.join(value)

            pkg.author = creator
            pkg.author_email = creator
            pkg.title = title
            pkg.notes = description
            pkg.extras = extras
            pkg.url = \
                "%s?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc"\
                % (harvest_object.job.source.url, identifier)
            pkg.save()
            harvest_object.package_id = pkg.id
            Session.add(harvest_object)
            setup_default_user_roles(pkg)

            # The resource URL is the last identifier that looks like a web
            # address. Secure (https) links are accepted as well as plain
            # http ones, so such records do not end up with an empty URL.
            url = ''
            for ids in metadata['identifier']:
                if ids.startswith(('http://', 'https://')):
                    url = ids
            # Unlike the package title, the resource name does not fall back
            # to the identifier when the record has no title.
            resource_name = titles[0] if titles else ''
            pkg.add_resource(url, description=description,
                             name=resource_name)

            group.add_package_by_name(pkg.name)
            subgroup = Group.by_name(subg_name)
            if not subgroup:
                subgroup = Group(name=subg_name, description=subg_name)
            subgroup.add_package_by_name(pkg.name)
            Session.add(group)
            Session.add(subgroup)
            setup_default_user_roles(group)
            setup_default_user_roles(subgroup)

        model.repo.commit()
        return True