예제 #1
0
    def test_0_package_role(self):
        """Check PackageRole creation and role removal when the user is unset.

        Creates an ADMIN ``UserObjectRole`` and an ADMIN ``PackageRole`` for
        user ``tester`` (the latter on package ``test0``), verifies the
        package role can be queried back, then sets ``user`` to ``None`` on
        a role and commits to exercise the delete-orphan behaviour named in
        the inline comments below.
        """
        test0 = model.Package.by_name(u'test0')
        mradmin = model.User.by_name(u'tester')
        # A role for the user that is not tied to a package.
        uor = model.UserObjectRole(role=model.Role.ADMIN, user=mradmin)
        model.Session.add(uor)
        # A package-specific admin role for the same user.
        pr = model.PackageRole(role=model.Role.ADMIN,
                               package=test0,
                               user=mradmin)
        model.Session.add(pr)
        # The package is fetched again and the query is built before the
        # commit; the query itself is only executed further down.
        test0 = model.Package.by_name(u'test0')
        prs = model.Session.query(model.PackageRole).filter_by(
            role=model.Role.ADMIN, package=test0, user=mradmin)
        model.repo.commit_and_remove()

        # basic test of existence
        assert len(prs.all()) == 1, prs.all()
        pr = prs.first()
        assert pr.context == 'Package', pr.context

        # test delete-orphan
        q = model.Session.query(model.UserObjectRole)
        q = q.filter_by(user=mradmin)
        # Two rows are expected for this user -- presumably the two roles
        # created above (TODO confirm PackageRole rows are returned by a
        # UserObjectRole query).
        assert q.count() == 2, q.all()
        uow = q.filter_by(context=u'user_object').first()
        # Detach the user from the non-package role; after the commit only
        # one role is expected to remain for the user.
        uow.user = None
        model.repo.commit_and_remove()
        assert q.count() == 1, q.all()

        # now test delete-orphan on PackageRole
        prs = model.Session.query(model.PackageRole)
        pr = prs.first()
        pr.user = None
        model.repo.commit_and_remove()
        # NOTE(review): the query is rebuilt here but nothing is asserted on
        # it in the visible code -- confirm whether a final check is missing.
        prs = model.Session.query(model.PackageRole)
예제 #2
0
    def import_stage(self, harvest_object):
        """Import a single harvested dataset into CKAN.

        Decodes the JSON package dict stored on ``harvest_object``, assigns
        it groups and an owning organization, records ``license_url`` as an
        extra, creates or updates the package, submits the term translations
        and commits the session.

        :param harvest_object: object providing ``content`` (JSON text of
            the package dict) and ``guid`` (used as the package id).
        :returns: ``False`` if no harvest object was supplied; otherwise the
            method ends without an explicit return value (``None``).
        :raises Exception: anything raised while importing is logged with
            its traceback and re-raised unchanged.
        """
        log.debug('In ZhGisHarvester import_stage')

        if not harvest_object:
            log.error('No harvest object received')
            return False

        try:
            package_dict = json.loads(harvest_object.content)

            package_dict['id'] = harvest_object.guid
            user = model.User.get(self.HARVEST_USER)
            context = {
                'model': model,
                'session': Session,
                'user': self.HARVEST_USER,
            }

            # Find or create group the dataset should get assigned to
            package_dict['groups'] = self._find_or_create_groups(context)

            # Find or create the organization
            # the dataset should get assigned to
            package_dict['owner_org'] = self._find_or_create_organization(
                context)

            # Save license url in extras (any extras already present in the
            # harvested dict are replaced by this list)
            extras = []
            if 'license_url' in package_dict:
                extras.append(('license_url', package_dict['license_url']))
            package_dict['extras'] = extras

            # NOTE(review): the package is looked up before
            # _create_or_update_package() runs, and the new role object is
            # neither bound to a name nor explicitly added to the session --
            # confirm this grants the harvest user admin rights as intended.
            package = model.Package.get(package_dict['id'])
            model.PackageRole(package=package,
                              user=user,
                              role=model.Role.ADMIN)

            log.debug('Save or update package %s (%s)',
                      package_dict['name'], package_dict['id'])
            self._create_or_update_package(package_dict, harvest_object)

            log.debug('Save or update term translations')
            self._submit_term_translations(context, package_dict)
            Session.commit()

        except Exception as e:
            # Log with traceback, then let the caller see the failure.
            log.exception(e)
            raise
예제 #3
0
    def import_stage(self, harvest_object):
        """Import a single harvested dataset into CKAN.

        Decodes the JSON package dict stored on ``harvest_object``, makes
        sure every group it names and the configured organization exist
        (creating them when the lookup fails), records ``license_url`` as an
        extra, creates or updates the package, stores the term translations
        and commits the session.

        :param harvest_object: object providing ``content`` (JSON text of
            the package dict) and ``guid`` (used as the package id).
        :returns: ``False`` if no harvest object was supplied; otherwise the
            method ends without an explicit return value (``None``).
        :raises GroupNotFoundError: if the dataset lists an empty group name.
        :raises Exception: anything raised while importing is logged with
            its traceback and re-raised unchanged.
        """
        log.debug('In ZhstatHarvester import_stage')

        if not harvest_object:
            log.error('No harvest object received')
            return False

        try:
            package_dict = json.loads(harvest_object.content)

            package_dict['id'] = harvest_object.guid
            package_dict['name'] = self._gen_new_name(package_dict['title'],
                                                      package_dict['id'])

            user = model.User.get(self.HARVEST_USER)
            context = {
                'model': model,
                'session': Session,
                'user': self.HARVEST_USER,
            }

            # Find or create group the dataset should get assigned to
            for group_name in package_dict['groups']:
                if not group_name:
                    raise GroupNotFoundError(
                        'Group is not defined for dataset %s' %
                        package_dict['title'])
                data_dict = {
                    'id': group_name,
                    'name': munge_title_to_name(group_name),
                    'title': group_name,
                }
                try:
                    group = get_action('group_show')(context, data_dict)
                    log.info('found  group ' + group['id'])
                except Exception:
                    # Any lookup failure is treated as "group missing";
                    # KeyboardInterrupt/SystemExit are no longer swallowed.
                    group = get_action('group_create')(context, data_dict)
                    log.info('created the group ' + group['id'])

            # Find or create the organization
            # the dataset should get assigned to
            org_info = self.ORGANIZATION[u'de']
            org_name = munge_title_to_name(org_info['name'])
            data_dict = {
                'permission': 'edit_group',
                'id': org_name,
                'name': org_name,
                'title': org_info['name'],
                'description': org_info['description'],
                'extras': [{
                    'key': 'website',
                    'value': org_info['website'],
                }],
            }
            try:
                package_dict['owner_org'] = get_action('organization_show')(
                    context, data_dict)['id']
            except Exception:
                # Same best-effort fallback as for groups: create on failure.
                organization = get_action('organization_create')(context,
                                                                 data_dict)
                package_dict['owner_org'] = organization['id']

            # Save additional metadata in extras (any extras already present
            # in the harvested dict are replaced by this list)
            extras = []
            if 'license_url' in package_dict:
                extras.append(('license_url', package_dict['license_url']))
            package_dict['extras'] = extras
            log.debug('Extras %s', extras)

            # NOTE(review): the package is looked up before
            # _create_or_update_package() runs, and the new role object is
            # neither bound to a name nor explicitly added to the session --
            # confirm this grants the harvest user admin rights as intended.
            package = model.Package.get(package_dict['id'])
            model.PackageRole(package=package,
                              user=user,
                              role=model.Role.ADMIN)

            self._create_or_update_package(package_dict, harvest_object)

            # Add the translations to the term_translations table
            for translation in package_dict['translations']:
                action.update.term_translation_update(context, translation)
            Session.commit()

        except Exception as detail:
            # Log with traceback, then let the caller see the failure.
            log.exception(detail)
            raise
예제 #4
0
            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result is True and self.config.get('read_only', False) is True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            return result
        except ValidationError, e:
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception, e:
예제 #5
0
    def import_stage(self, harvest_object):
        """Import one harvested remote CKAN dataset into the local instance.

        The JSON package dict stored in ``harvest_object.content`` is
        adjusted according to the harvest source configuration (default
        tags, groups and extras; the ``remote_groups`` and ``remote_orgs``
        policies) and then handed to ``_create_or_update_package``. When the
        ``read_only`` option is set, the package roles are reset afterwards.

        :param harvest_object: harvest object to import.
        :returns: ``True`` when the dataset was processed, or skipped because
            it is itself a harvest source; ``False`` when the object or its
            content is missing. If an exception is raised during import it
            is recorded through ``_save_object_error`` and the method ends
            without an explicit return value (``None``).
        """
        log.debug("In CKANHarvester import_stage")

        context = {
            "model": model,
            "session": Session,
            "user": self._get_user_name(),
        }
        if not harvest_object:
            log.error("No harvest object received")
            return False

        if harvest_object.content is None:
            self._save_object_error(
                "Empty content for object %s" % harvest_object.id,
                harvest_object,
                "Import",
            )
            return False

        self._set_config(harvest_object.job.source.config)

        # Keys stripped from a remote group/organization dict before it is
        # used to create the local counterpart.
        group_drop_keys = (
            "packages",
            "created",
            "users",
            "groups",
            "tags",
            "extras",
            "display_name",
        )
        org_drop_keys = group_drop_keys + ("type",)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get("type") == "harvest":
                log.warning("Remote dataset is a harvest source, ignoring...")
                return True

            # Set default tags if needed
            default_tags = self.config.get("default_tags", [])
            if default_tags:
                if "tags" not in package_dict:
                    package_dict["tags"] = []
                package_dict["tags"].extend(
                    [t for t in default_tags if t not in package_dict["tags"]])

            remote_groups = self.config.get("remote_groups", None)
            if remote_groups not in ("only_local", "create"):
                # Ignore remote groups
                package_dict.pop("groups", None)
            else:
                if "groups" not in package_dict:
                    package_dict["groups"] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict["groups"]:
                    try:
                        data_dict = {"id": group_name}
                        group = get_action("group_show")(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])
                    except NotFound:
                        log.info("Group %s is not available" % group_name)
                        if remote_groups == "create":
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error("Could not get remote group %s" %
                                          group_name)
                                continue

                            for key in group_drop_keys:
                                group.pop(key, None)

                            get_action("group_create")(context, group)
                            log.info("Group %s has been newly created" %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group["name"])
                            else:
                                validated_groups.append(group["id"])

                package_dict["groups"] = validated_groups

            # Local harvest source organization
            source_dataset = get_action("package_show")(
                context, {"id": harvest_object.source.id})
            local_org = source_dataset.get("owner_org")

            remote_orgs = self.config.get("remote_orgs", None)

            if remote_orgs not in ("only_local", "create"):
                # Assign dataset to the source organization
                package_dict["owner_org"] = local_org
            else:
                if "owner_org" not in package_dict:
                    package_dict["owner_org"] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict["owner_org"]

                if remote_org:
                    try:
                        data_dict = {"id": remote_org}
                        org = get_action("organization_show")(context,
                                                              data_dict)
                        validated_org = org["id"]
                    except NotFound:
                        log.info("Organization %s is not available" %
                                 remote_org)
                        if remote_orgs == "create":
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes
                                    # organizations as groups; this
                                    # especially targets older versions of
                                    # CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in org_drop_keys:
                                    org.pop(key, None)
                                get_action("organization_create")(context, org)
                                log.info(
                                    "Organization %s has been newly created" %
                                    remote_org)
                                validated_org = org["id"]
                            except (RemoteResourceError, ValidationError):
                                log.error("Could not get remote org %s" %
                                          remote_org)

                # Fall back to the harvest source's own organization.
                package_dict["owner_org"] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get("default_groups", [])
            if default_groups:
                if "groups" not in package_dict:
                    package_dict["groups"] = []
                package_dict["groups"].extend([
                    g for g in default_groups
                    if g not in package_dict["groups"]
                ])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
            # A dataset without an "extras" key is tolerated here (the
            # default-extras step below handles that case as well) instead
            # of failing the whole import with a KeyError.
            extras = package_dict.get("extras", {})
            # Iterate over a snapshot of the keys: entries may be deleted.
            for key in list(extras):
                if not isinstance(extras[key], str):
                    try:
                        extras[key] = json.dumps(extras[key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del extras[key]

            # Set default extras if needed
            default_extras = self.config.get("default_extras", {})
            if default_extras:
                override_extras = self.config.get("override_extras", False)
                if "extras" not in package_dict:
                    package_dict["extras"] = {}
                source = harvest_object.job.source
                for key, value in default_extras.items():
                    if key not in package_dict["extras"] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, str):
                            value = value.format(
                                harvest_source_id=source.id,
                                harvest_source_url=source.url.strip("/"),
                                harvest_source_title=source.title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict["id"],
                            )

                        package_dict["extras"][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get("resources", []):
                resource.pop("url_type", None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get("read_only", False) == True:

                package = model.Package.get(package_dict["id"])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                # NOTE(review): the role objects below are only constructed,
                # never explicitly added to the session -- confirm the model
                # mapping persists them.
                user_name = self.config.get("user", "harvest")
                user = model.User.get(user_name)
                model.PackageRole(package=package,
                                  user=user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in ("visitor", "logged_in"):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package,
                                      user=user,
                                      role=model.Role.READER)

            return True
        except ValidationError as e:
            self._save_object_error(
                "Invalid package with GUID %s: %r" %
                (harvest_object.guid, e.error_dict),
                harvest_object,
                "Import",
            )
        except Exception as e:
            self._save_object_error("%r" % e, harvest_object, "Import")
예제 #6
0
    def import_stage(self, harvest_object):
        '''
        Imports each dataset from Socrata, into the CKAN server

        The harvested view is converted to a package dict with
        ``socrataAdaptor``, stored in (or refreshed in) ``socrata_db``,
        completed with the configured default tags and groups and handed to
        ``_create_or_update_package``. When the ``read_only`` option is set,
        the package roles are reset afterwards.

        :param harvest_object: harvest object to import.
        :returns: ``True`` when the dataset was processed; ``False`` when
            the object or its content is missing. After a
            ``ValidationError`` the error is recorded through
            ``_save_object_error`` and the method ends without an explicit
            return value (``None``).
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' %
                                    harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            d = socrataAdaptor()
            log.debug("Converting View")
            stripped_source = harvest_object.source.url.rstrip('/')
            package_dict = d.convertViewXml(harvest_object.id,
                                            stripped_source,
                                            harvest_object.content)

            package_dict["catalogue_url"] = str(stripped_source)
            package_dict["platform"] = "socrata"

            # Move the category into the extras.
            if 'category' in package_dict:
                package_dict['extras']['category'] = package_dict['category']
                del package_dict['category']

            log.debug(package_dict)
            # NOTE(review): ``ids`` and ``socrata_db`` are defined outside
            # this method; presumably ``ids`` holds the dataset ids already
            # stored in ``socrata_db`` -- confirm.
            if package_dict['id'] not in ids:
                metadata_created = datetime.datetime.now()
                package_dict["metadata_created"] = str(metadata_created)
                socrata_db.save(package_dict)
                log.info('Metadata saved succesfully to MongoDb.')
            else:
                # Keep the original creation timestamp and replace the
                # stored document with the refreshed one.
                document = socrata_db.find_one({"id": package_dict['id']})
                package_dict['metadata_created'] = document['metadata_created']
                package_dict['metadata_updated'] = str(datetime.datetime.now())
                package_dict['updated_dataset'] = True
                socrata_db.remove({"id": package_dict['id']})
                socrata_db.save(package_dict)
                log.info('Metadata updated succesfully to MongoDb.')

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend([t for t in default_tags
                                             if t not in package_dict['tags']])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend([g for g in default_groups if
                                              g not in package_dict['groups']])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                # NOTE(review): the role objects below are only constructed,
                # never explicitly added to the session -- confirm the model
                # mapping persists them.
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package,
                                      user=user,
                                      role=model.Role.READER)
            return True

        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' %
                                    (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            log.debug("Validation Error: %s", harvest_object.guid)
예제 #7
0
    def import_stage(self,harvest_object):
        log.debug('In NTPCHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)


            log.debug(package_dict)
            log.debug('=============================================')
            package_dict["id"] = harvest_object.guid
            # Set default tags if needed
            default_tags = self.config.get('default_tags',[])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            log.debug(remote_groups)
            log.debug('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {'model': model, 'session': Session, 'user': '******'}

                for group_name in package_dict['groups']:
                    log.debug(group_name)
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(harvest_object.source.url, group_name)
                            except:
                                log.error('Could not get remote group %s' % group_name)
                                continue

                            for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name']:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Ignore remote orgs for the time being
            package_dict.pop('owner_org', None)

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
            for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(
                                package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            # Set default extras if needed
            default_extras = self.config.get('default_extras',{})
            if default_extras:
                override_extras = self.config.get('override_extras',False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key,value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value,basestring):
                            value = value.format(harvest_source_id=harvest_object.job.source.id,
                                     harvest_source_url=harvest_object.job.source.url.strip('/'),
                                     harvest_source_title=harvest_object.job.source.title,
                                     harvest_job_id=harvest_object.job.id,
                                     harvest_object_id=harvest_object.id,
                                     dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            log.debug('_create_or_update_package')
            log.debug(package_dict)
            log.debug(harvest_object)
            result = self._create_or_update_package(package_dict,harvest_object)

            if result and self.config.get('read_only',False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user',u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor',u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)

            log.debug('import_stage return true')
            return True
    def import_stage(self, harvest_object):
        log.debug('In HTMLHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            harvest_object.content = harvest_object.content.replace("'", '"')

            #package_dict=harvest_object.content
            package_dict = json.loads(
                harvest_object.content.decode('utf-8', 'ignore'))

            ## handle notes validation errors as existance of:  " and /
            extrasjson = []
            try:
                extras = package_dict['extras']
            except:
                extras = ""
            j = 0
            ##transformations to json's extras
            if 'value' in str(extras) and 'key' in str(extras):

                extrasjson[:] = []
                extrasjson2 = ""
                while j < len(package_dict['extras']):
                    extra_key = package_dict['extras'][j]['key']
                    extra_value = package_dict['extras'][j]['value']
                    if len(extra_value) > 0:

                        c = 0
                        extra_value1 = ""

                        while c < len(extra_value):
                            extra_value1 = extra_value1 + extra_value[c]
                            c += 1

                        c = 0
                        extra_value = extra_value1
                    extra = '"' + str(
                        extra_key.encode('utf-8')) + '":' + '"' + str(
                            extra_value.encode('utf-8')) + '"'
                    extrasjson.append(extra)

                    j += 1

                k = 0
                extrasjson1 = ""

                while k < len(extrasjson):
                    extrasjson1 = extrasjson1 + extrasjson[k] + ","
                    k += 1

                k = 0
                j = 0

                extrasjson1 = "{" + extrasjson1.rstrip(',') + "}"

                try:
                    extrasjson2 = json.loads(extrasjson1)
                except:
                    errorscounter += 1

                if len(extrasjson) > 0:
                    package_dict.update({"extras": extrasjson2})

            try:

                tags = package_dict['tags']
                j = 0

                if 'name' in str(tags):

                    while j < len(package_dict['tags']):

                        tag = package_dict['tags'][j]['name']
                        tagsarray.append(tag)
                        j += 1

                if len(tagsarray) > 0:
                    package_dict.update({"tags": tagsarray})

                tagsarray[:] = []
                j = 0

            except:
                pass

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {
                    'model': model,
                    'session': Session,
                    'user': '******'
                }

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except:
                                log.error('Could not get remote group %s' %
                                          group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            context = {'model': model, 'session': Session, 'user': '******'}

            # Local harvest source organization
            #source_dataset = get_action('package_show')(context, {'id': harvest_object.source.id})
            #local_org = source_dataset.get('owner_org')

            #remote_orgs = self.config.get('remote_orgs', None)

            #if not remote_orgs in ('only_local', 'create'):
            ## Assign dataset to the source organization
            #package_dict['owner_org'] = local_org
            #else:
            #if not 'owner_org' in package_dict:
            #package_dict['owner_org'] = None

            ## check if remote org exist locally, otherwise remove
            #validated_org = None
            #remote_org = package_dict['owner_org']

            #if remote_org:
            #try:
            #data_dict = {'id': remote_org}
            #org = get_action('organization_show')(context, data_dict)
            #validated_org = org['id']
            #except NotFound, e:
            #log.info('Organization %s is not available' % remote_org)
            #if remote_orgs == 'create':
            #try:
            #org = self._get_group(harvest_object.source.url, remote_org)
            #for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']:
            #org.pop(key, None)
            #get_action('organization_create')(context, org)
            #log.info('Organization %s has been newly created' % remote_org)
            #validated_org = org['id']
            #except:
            #log.error('Could not get remote org %s' % remote_org)

            #package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                override_extras = self.config.get('override_extras', False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(
                                harvest_source_id=harvest_object.job.source.id,
                                harvest_source_url=harvest_object.job.source.
                                url.strip('/'),
                                harvest_source_title=harvest_object.job.source.
                                title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            return True
    def import_stage(self, harvest_object):
        '''
        Create or update the local dataset described by a harvest object.

        The JSON held in ``harvest_object.content`` is decoded into a package
        dict, any remote ``groups`` entry is discarded, and the
        ``default_tags``, ``default_groups`` and ``default_extras`` settings
        of the harvest source configuration are merged into it. The result
        is handed to ``self._create_or_update_package``. When that call
        returns a truthy value and the ``read_only`` setting equals True, the
        package's user roles are cleared and ``PackageRole`` objects are
        built for the configured harvest user (admin) and for ``visitor`` and
        ``logged_in`` (reader).

        A ``ValidationError`` raised along the way is recorded through
        ``self._save_object_error`` instead of propagating.

        :param harvest_object: object exposing ``content`` (JSON text),
            ``id``, ``guid`` and ``job.source``
        :returns: False when no harvest object is given or its content is
            None; otherwise the method falls through and returns None
        '''
        # NOTE(review): there is no explicit return value on the success
        # path or after a ValidationError -- confirm whether callers expect
        # True/False here.
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Ignore remote groups for the time being. pop() instead of del:
            # a remote dataset without a 'groups' key must not abort the
            # whole import with a KeyError.
            package_dict.pop('groups', None)

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                override_extras = self.config.get('override_extras', False)
                extras = package_dict.setdefault('extras', {})
                source = harvest_object.job.source
                for key, value in default_extras.iteritems():
                    # Existing extras win unless override_extras is set
                    if key in extras and not override_extras:
                        continue
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip('/'),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])
                    extras[key] = value

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

        except ValidationError as e:
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
예제 #10
0
class CKANSchemingHarvester(CKANHarvester):
    '''
    A Harvester for CKAN instances with custom scheming dataset
    '''
    def info(self):
        '''
        Return the dict describing this harvester: its ``name``, ``title``,
        ``description`` and ``form_config_interface``.
        '''
        return {
            'name': 'ckan-scheming',
            'title': 'CKAN-scheming',
            'description':
            'Harvests remote CKAN instances with ckanext-scheming',
            'form_config_interface': 'Text'
        }

    def import_stage(self, harvest_object):
        '''
        Create or update the local dataset described by a harvest object.

        The JSON held in ``harvest_object.content`` is decoded into a package
        dict, then:

        * ``default_tags`` and ``default_groups`` from the source config are
          merged in;
        * remote groups are dropped, or (``remote_groups`` set to
          ``only_local`` / ``create``) replaced by the groups that exist
          locally, creating the missing ones when the setting is ``create``;
        * ``owner_org`` is set to the harvest source's organization, or
          (``remote_orgs`` set to ``only_local`` / ``create``) to the remote
          organization when it exists or could be created locally;
        * ``url_type`` is removed from every resource.

        Extras are not processed here (see the FIXME in the body). The
        package dict is finally handed to ``self._create_or_update_package``;
        when that returns a truthy value and ``read_only`` equals True, the
        package's user roles are cleared and ``PackageRole`` objects are
        built for the harvest user, ``visitor`` and ``logged_in``.

        :param harvest_object: object exposing ``content`` (JSON text),
            ``id``, ``guid``, ``source`` and ``job.source``
        :returns: False when no harvest object is given, when its content is
            None, or after a ``ValidationError`` has been recorded; True
            when the remote dataset is itself a harvest source (skipped) or
            once ``_create_or_update_package`` has been called
        '''
        log.debug('In CKANSchemingHarvester import_stage')

        context = {
            'model': model,
            'session': Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict.setdefault('groups', []):
                    try:
                        group = get_action('group_show')(context,
                                                         {'id': group_name})
                    except NotFound:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups != 'create':
                            continue

                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except RemoteResourceError:
                            log.error('Could not get remote group %s' %
                                      group_name)
                            continue

                        # Strip the keys that are not passed on to
                        # group_create
                        for key in [
                                'packages', 'created', 'users', 'groups',
                                'tags', 'extras', 'display_name'
                        ]:
                            group.pop(key, None)

                        get_action('group_create')(context, group)
                        log.info('Group %s has been newly created' %
                                 group_name)

                    # api_version 1 keeps the group name, any other version
                    # keeps the group id
                    if self.api_version == 1:
                        validated_groups.append(group['name'])
                    else:
                        validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                context, {
                    'id': harvest_object.source.id
                })
            local_org = source_dataset.get('owner_org')

            remote_orgs = self.config.get('remote_orgs', None)

            if remote_orgs not in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict.get('owner_org')

                if remote_org:
                    try:
                        org = get_action('organization_show')(
                            context, {'id': remote_org})
                        validated_org = org['id']
                    except NotFound:
                        log.info('Organization %s is not available' %
                                 remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes organizations as groups
                                    # this especially targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in [
                                        'packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name', 'type'
                                ]:
                                    org.pop(key, None)
                                get_action('organization_create')(context, org)
                                log.info(
                                    'Organization %s has been newly created' %
                                    remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s' %
                                          remote_org)

                # Fall back to the source organization when the remote one
                # could not be validated
                package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            # FIXME: enable only if not using ckanext-scheming dataset schemas
            # handle extras in harvested schema
            #
            # # Find any extras whose values are not strings and try to convert
            # # them to strings, as non-string extras are not allowed anymore in
            # # CKAN 2.0.
            # for key in package_dict['extras'].keys():
            #     if not isinstance(package_dict['extras'][key], basestring):
            #         try:
            #             package_dict['extras'][key] = json.dumps(
            #                     package_dict['extras'][key])
            #         except TypeError:
            #             # If converting to a string fails, just delete it.
            #             del package_dict['extras'][key]
            #
            # # Set default extras if needed
            # default_extras = self.config.get('default_extras',{})
            # if default_extras:
            #     override_extras = self.config.get('override_extras',False)
            #     if not 'extras' in package_dict:
            #         package_dict['extras'] = {}
            #     for key,value in default_extras.iteritems():
            #         if not key in package_dict['extras'] or override_extras:
            #             # Look for replacement strings
            #             if isinstance(value,basestring):
            #                 value = value.format(harvest_source_id=harvest_object.job.source.id,
            #                          harvest_source_url=harvest_object.job.source.url.strip('/'),
            #                          harvest_source_title=harvest_object.job.source.title,
            #                          harvest_job_id=harvest_object.job.id,
            #                          harvest_object_id=harvest_object.id,
            #                          dataset_id=package_dict['id'])
            #
            #             package_dict['extras'][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            return True
        except ValidationError as e:
            # The try block needs a handler to be valid Python; record the
            # validation failure against the harvest object and report the
            # import as failed.
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
            return False
예제 #11
0
    def import_stage(self, harvest_object):
        '''
        Convert one harvested Socrata view into a package dict and pass it to
        ``self._create_or_update_package``.

        The view held in ``harvest_object.content`` is converted with
        ``socrataAdaptor.convertViewXml``; the ``default_tags`` and
        ``default_groups`` settings of the harvest source configuration are
        then merged into the resulting package dict. When the create/update
        call returns a truthy value and the ``read_only`` setting equals
        True, the package's user roles are cleared and ``PackageRole``
        objects are built for the configured harvest user (admin) and for
        ``visitor`` and ``logged_in`` (reader).

        A ``ValidationError`` is recorded through ``self._save_object_error``
        instead of propagating.

        :param harvest_object: object exposing ``content``, ``id``, ``guid``,
            ``source.url`` and ``job.source.config``
        :returns: False when no harvest object is given or its content is
            None; otherwise the method falls through and returns None
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            message = 'Empty content for object %s' % harvest_object.id
            self._save_object_error(message, harvest_object, 'Import')
            return False

        source_config = harvest_object.job.source.config
        self._set_config(source_config)
        log.debug(source_config)

        try:
            adaptor = socrataAdaptor()
            log.debug("Converting View")
            package_dict = adaptor.convertViewXml(
                harvest_object.id,
                harvest_object.source.url.rstrip('/'),
                harvest_object.content)
            log.debug(package_dict)

            # Merge the configured default tags, then the default groups,
            # into the package without repeating values already present
            merge_plan = (('tags', 'default_tags'),
                          ('groups', 'default_groups'))
            for field, setting in merge_plan:
                defaults = self.config.get(setting, [])
                if not defaults:
                    continue
                if field not in package_dict:
                    package_dict[field] = []
                additions = [
                    item for item in defaults
                    if item not in package_dict[field]
                ]
                package_dict[field].extend(additions)

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result:
                if self.config.get('read_only', False) == True:
                    package = model.Package.get(package_dict['id'])

                    # Drop whatever roles the package got by default
                    model.clear_user_roles(package)

                    # The harvest user becomes admin of the package
                    admin_name = self.config.get('user', u'harvest')
                    admin = model.User.get(admin_name)
                    role_entry = model.PackageRole(package=package,
                                                   user=admin,
                                                   role=model.Role.ADMIN)

                    # Everybody else is limited to reading
                    for reader_name in (u'visitor', u'logged_in'):
                        reader = model.User.get(reader_name)
                        role_entry = model.PackageRole(
                            package=package,
                            user=reader,
                            role=model.Role.READER)

        except ValidationError as error:
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, error.error_dict),
                harvest_object, 'Import')