harvest_object_id=harvest_object.id,
                                     dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            print 'before create_or_update'
            print new_dict
            result = self._create_or_update_package(new_dict,harvest_object)
            #result = self.create_or_update_api(context, package_dict)

            if result and self.config.get('read_only',False) == True:

                package = model.Package.get(new_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user',u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor',u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)

            return True
        except ValidationError,e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                    harvest_object, 'Import')
Example #2
0
                resource.pop('url_type', None)

                # Clear revision_id as the revision won't exist on this CKAN
                # and saving it will cause an IntegrityError with the foreign
                # key.
                resource.pop('revision_id', None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result is True and self.config.get('read_only', False) is True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)
Example #3
0
    def setup_class(self):
        """Create the users, packages and role assignments used by the tests.

        Starts from ``CreateTestData.create()``, then adds extra users, two
        extra packages and explicit role assignments, committing between
        steps. Finally the users and packages are looked up again by name
        and stored on ``self``.
        """
        model.Session.remove()
        CreateTestData.create()
        model.Session.remove()
        self.authorizer = authz.Authorizer()

        self.admin_role, self.editor_role, self.reader_role = (
            model.Role.ADMIN, model.Role.EDITOR, model.Role.READER)

        model.Session.add(model.User(name=u'john'))

        # annakarenina: clear its roles, then apply the default roles with a
        # dedicated creator user
        anna_pkg = model.Package.by_name(u'annakarenina')
        model.clear_user_roles(anna_pkg)
        creator = model.User(name=u'annakarenina_creator')
        model.Session.add(creator)
        model.repo.commit_and_remove()
        model.setup_default_user_roles(anna_pkg, [creator])
        model.repo.commit_and_remove()

        # warandpeace: no roles
        model.clear_user_roles(model.Package.by_name(u'warandpeace'))

        # restricted packages - visitors can't change them
        self.mrsysadmin = u'mrsysadmin'
        new_objects = [
            model.Package(name=u'restricted'),
            model.Package(name=u'vrestricted'),
            model.User(name=u'mreditor'),
            model.User(name=u'mrreader'),
            model.User(name=self.mrsysadmin),
        ]
        model.repo.new_revision()
        model.Session.add_all(new_objects)
        model.repo.commit_and_remove()

        # 'restricted' is set up with EDITOR and READER in the logged-in
        # list; 'vrestricted' gets empty lists for both
        visitor_grants = []
        member_grants = [model.Role.EDITOR, model.Role.READER]
        member_grants_v = []
        restricted_pkg = model.Package.by_name(u'restricted')
        vrestricted_pkg = model.Package.by_name(u'vrestricted')
        model.setup_user_roles(restricted_pkg, visitor_grants, member_grants)
        model.setup_user_roles(vrestricted_pkg, visitor_grants,
                               member_grants_v)
        model.repo.commit_and_remove()

        model.add_user_to_role(model.User.by_name(u'mreditor'),
                               model.Role.EDITOR,
                               restricted_pkg)
        model.add_user_to_role(model.User.by_name(u'mrsysadmin'),
                               model.Role.ADMIN,
                               model.System())
        model.repo.commit_and_remove()

        # Look everything up again so the tests get objects loaded after the
        # last commit
        for attr, user_name in (
                ('mreditor', u'mreditor'),
                ('mrreader', u'mrreader'),
                ('annakarenina_creator', u'annakarenina_creator'),
                ('logged_in', model.PSEUDO_USER__LOGGED_IN),
                ('visitor', model.PSEUDO_USER__VISITOR),
                ('john', u'john')):
            setattr(self, attr, model.User.by_name(user_name))

        for attr, package_name in (
                ('war', u'warandpeace'),
                ('anna', u'annakarenina'),
                ('restricted', u'restricted'),
                ('vrestricted', u'vrestricted')):
            setattr(self, attr, model.Package.by_name(package_name))
Example #4
0
    def import_stage(self, harvest_object):
        """Create or update the local dataset described by ``harvest_object``.

        The JSON in ``harvest_object.content`` is loaded as a package dict and
        adjusted according to the harvest source configuration
        (``default_tags``, ``remote_groups``, ``remote_orgs``,
        ``default_groups``, ``default_extras`` / ``override_extras`` and
        ``read_only``) before it is handed to
        ``self._create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
            JSON-serialized remote dataset
        :returns: ``True`` when the object was processed (a remote dataset of
            type ``harvest`` is skipped and also reported as ``True``),
            ``False`` when there was nothing to import or an error was
            recorded through ``self._save_object_error``
        """
        log.debug("In CKANHarvester import_stage")

        context = {
            "model": model,
            "session": Session,
            "user": self._get_user_name(),
        }
        if not harvest_object:
            log.error("No harvest object received")
            return False

        if harvest_object.content is None:
            self._save_object_error(
                "Empty content for object %s" % harvest_object.id,
                harvest_object,
                "Import",
            )
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get("type") == "harvest":
                # Logger.warn is a deprecated alias (removed in Python 3.13)
                log.warning("Remote dataset is a harvest source, ignoring...")
                return True

            # Set default tags if needed
            default_tags = self.config.get("default_tags", [])
            if default_tags:
                tags = package_dict.setdefault("tags", [])
                tags.extend([t for t in default_tags if t not in tags])

            remote_groups = self.config.get("remote_groups", None)
            if remote_groups not in ("only_local", "create"):
                # Ignore remote groups
                package_dict.pop("groups", None)
            else:
                # Keep only the remote groups that exist locally; with
                # "create", missing ones are fetched and created first
                validated_groups = []

                for group_name in package_dict.get("groups", []):
                    try:
                        group = get_action("group_show")(
                            context, {"id": group_name})
                    except NotFound:
                        log.info("Group %s is not available", group_name)
                        if remote_groups != "create":
                            continue
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except RemoteResourceError:
                            log.error("Could not get remote group %s",
                                      group_name)
                            continue

                        # Drop these keys before creating the group locally
                        for key in (
                                "packages",
                                "created",
                                "users",
                                "groups",
                                "tags",
                                "extras",
                                "display_name",
                        ):
                            group.pop(key, None)

                        get_action("group_create")(context, group)
                        log.info("Group %s has been newly created",
                                 group_name)

                    # API version 1 refers to groups by name, others by id
                    if self.api_version == 1:
                        validated_groups.append(group["name"])
                    else:
                        validated_groups.append(group["id"])

                package_dict["groups"] = validated_groups

            # Local harvest source organization
            source_dataset = get_action("package_show")(
                context, {"id": harvest_object.source.id})
            local_org = source_dataset.get("owner_org")

            remote_orgs = self.config.get("remote_orgs", None)

            if remote_orgs not in ("only_local", "create"):
                # Assign dataset to the source organization
                package_dict["owner_org"] = local_org
            else:
                # Use the remote org only if it exists locally (or can be
                # created); otherwise fall back to the source organization
                validated_org = None
                remote_org = package_dict.get("owner_org")

                if remote_org:
                    try:
                        org = get_action("organization_show")(
                            context, {"id": remote_org})
                        validated_org = org["id"]
                    except NotFound:
                        log.info("Organization %s is not available",
                                 remote_org)
                        if remote_orgs == "create":
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes
                                    # organizations as groups; this
                                    # especially targets older versions of
                                    # CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                for key in (
                                        "packages",
                                        "created",
                                        "users",
                                        "groups",
                                        "tags",
                                        "extras",
                                        "display_name",
                                        "type",
                                ):
                                    org.pop(key, None)
                                get_action("organization_create")(context,
                                                                  org)
                                log.info(
                                    "Organization %s has been newly created",
                                    remote_org)
                                validated_org = org["id"]
                            except (RemoteResourceError, ValidationError):
                                log.error("Could not get remote org %s",
                                          remote_org)

                package_dict["owner_org"] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get("default_groups", [])
            if default_groups:
                groups = package_dict.setdefault("groups", [])
                groups.extend(
                    [g for g in default_groups if g not in groups])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0. A dataset without an "extras" key is left untouched
            # instead of failing the whole import with a KeyError.
            extras = package_dict.get("extras", {})
            for key in list(extras):
                if not isinstance(extras[key], str):
                    try:
                        extras[key] = json.dumps(extras[key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del extras[key]

            # Set default extras if needed
            default_extras = self.config.get("default_extras", {})
            if default_extras:
                override_extras = self.config.get("override_extras", False)
                extras = package_dict.setdefault("extras", {})
                for key, value in default_extras.items():
                    if key not in extras or override_extras:
                        # Look for replacement strings
                        if isinstance(value, str):
                            source = harvest_object.job.source
                            value = value.format(
                                harvest_source_id=source.id,
                                harvest_source_url=source.url.strip("/"),
                                harvest_source_title=source.title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict["id"],
                            )

                        extras[key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get("resources", []):
                resource.pop("url_type", None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get("read_only", False) == True:

                package = model.Package.get(package_dict["id"])

                # Clear default permissions
                model.clear_user_roles(package)

                # NOTE(review): the role objects below are only constructed;
                # nothing in this method adds them to a session or commits.
                # Confirm that the model layer persists them.

                # Setup harvest user as admin
                user_name = self.config.get("user", "harvest")
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in ("visitor", "logged_in"):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)

            return True
        except ValidationError as e:
            self._save_object_error(
                "Invalid package with GUID %s: %r" %
                (harvest_object.guid, e.error_dict),
                harvest_object,
                "Import",
            )
            return False
        except Exception as e:
            self._save_object_error("%r" % e, harvest_object, "Import")
            return False
    def import_stage(self, harvest_object):
        '''
        Import one harvested dataset into MongoDB and into the CKAN server.

        The JSON in ``harvest_object.content`` is tagged with the catalogue
        URL, the ``genericapi`` platform and a name derived from its id, then
        saved to ``custom_db`` (as a new document, or over the existing one
        for the same id and catalogue). The new/updated counters are
        refreshed, the configured default tags and groups are applied, and
        the result is passed to ``self._create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
            JSON-serialized dataset
        :returns: ``True`` once the dataset has been handed over, ``False``
            if there was nothing to import or validation failed
        '''
        log.debug('In CustomHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        package_dict = json.loads(harvest_object.content)

        log.debug(harvest_object.job.source.config)
        try:
            mainurl = str(harvest_object.source.url)
            package_dict.update({
                "catalogue_url": mainurl,
                "platform": "genericapi",
                "name": package_dict['id'].lower().strip(),
            })

            document = custom_db.find_one({
                "catalogue_url": harvest_object.source.url,
                'id': package_dict['id'],
            })
            if document is None:
                # First time this dataset is seen for this catalogue
                package_dict['metadata_created'] = str(
                    datetime.datetime.now())
                custom_db.save(package_dict)
                log.info('Metadata saved succesfully to MongoDb.')
                self._record_fetch_counters(mainurl, 'new')
            else:
                # Known dataset: keep its creation date, copy marker and
                # Mongo _id so that the save overwrites the same document
                if 'copied' in document:
                    package_dict['copied'] = document['copied']
                package_dict.update({
                    'metadata_created': document['metadata_created'],
                    'metadata_updated': str(datetime.datetime.now()),
                    'updated_dataset': True,
                    '_id': document['_id'],
                })
                custom_db.save(package_dict)
                log.info('Metadata updated succesfully to MongoDb.')
                self._record_fetch_counters(mainurl, 'updated')

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)
            return True

        except ValidationError as e:
            self._save_object_error(
                'Invalid package with GUID %s: %r' % (harvest_object.guid,
                                                      e.error_dict),
                harvest_object, 'Import')
            return False

    def _record_fetch_counters(self, catalogue_url, counter):
        '''
        Count one more ``counter`` ('new' or 'updated') dataset for a catalogue.

        ``db_fetch_temp`` holds a single running-totals document. While the
        catalogue stays the same the requested counter is incremented. When
        the catalogue changes, the previous totals are copied to that
        catalogue's document in ``db_jobs`` (keeping the former values as
        ``last_new`` / ``last_updated``) and the running totals restart at 1
        for ``counter`` and 0 for the other one.

        :param catalogue_url: URL of the catalogue currently being imported
        :param counter: ``'new'`` or ``'updated'``
        '''
        other = 'updated' if counter == 'new' else 'new'
        fetch_document = db_fetch_temp.find_one()

        if fetch_document is not None and \
                catalogue_url == fetch_document['cat_url']:
            # Still on the same catalogue: just bump the counter
            fetch_document[counter] = fetch_document[counter] + 1
            db_fetch_temp.save(fetch_document)
            return

        if fetch_document is None:
            fetch_document = {}
        else:
            # Another catalogue was being counted: move its totals over to
            # its job document before starting again
            previous_url = fetch_document['cat_url']
            job_document = db_jobs.find_one({'cat_url': previous_url})
            if job_document is None:
                log.warning('No job document found for %s, '
                            'its counters were not stored', previous_url)
            else:
                for key in ('new', 'updated'):
                    if key in fetch_document:
                        if key in job_document:
                            job_document['last_' + key] = job_document[key]
                        job_document[key] = fetch_document[key]
                        db_jobs.save(job_document)

        fetch_document.update({
            'cat_url': catalogue_url,
            counter: 1,
            other: 0,
        })
        db_fetch_temp.save(fetch_document)
Example #6
0
    def setup_class(self):
        """Create the users, packages and role assignments used by the tests.

        Starts from ``CreateTestData.create()``, then adds extra users, two
        extra packages and explicit role assignments, committing between
        steps. Finally the users and packages are looked up again by name
        and stored on ``self``.
        """
        model.Session.remove()
        CreateTestData.create()
        model.Session.remove()
        self.authorizer = authz.Authorizer()

        self.admin_role, self.editor_role, self.reader_role = (
            model.Role.ADMIN, model.Role.EDITOR, model.Role.READER)

        model.Session.add(model.User(name=u'john'))

        # annakarenina: clear its roles, then apply the default roles with a
        # dedicated creator user
        anna_pkg = model.Package.by_name(u'annakarenina')
        model.clear_user_roles(anna_pkg)
        creator = model.User(name=u'annakarenina_creator')
        model.Session.add(creator)
        model.repo.commit_and_remove()
        model.setup_default_user_roles(anna_pkg, [creator])
        model.repo.commit_and_remove()

        # warandpeace: no roles
        model.clear_user_roles(model.Package.by_name(u'warandpeace'))

        # restricted packages - visitors can't change them
        self.mrsysadmin = u'mrsysadmin'
        new_objects = [
            model.Package(name=u'restricted'),
            model.Package(name=u'vrestricted'),
            model.User(name=u'mreditor'),
            model.User(name=u'mrreader'),
            model.User(name=self.mrsysadmin),
        ]
        model.repo.new_revision()
        model.Session.add_all(new_objects)
        model.repo.commit_and_remove()

        # 'restricted' is set up with EDITOR and READER in the logged-in
        # list; 'vrestricted' gets empty lists for both
        visitor_grants = []
        member_grants = [model.Role.EDITOR, model.Role.READER]
        member_grants_v = []
        restricted_pkg = model.Package.by_name(u'restricted')
        vrestricted_pkg = model.Package.by_name(u'vrestricted')
        model.setup_user_roles(restricted_pkg, visitor_grants, member_grants)
        model.setup_user_roles(vrestricted_pkg, visitor_grants,
                               member_grants_v)
        model.repo.commit_and_remove()

        model.add_user_to_role(model.User.by_name(u'mreditor'),
                               model.Role.EDITOR,
                               restricted_pkg)
        model.add_user_to_role(model.User.by_name(u'mrsysadmin'),
                               model.Role.ADMIN,
                               model.System())
        model.repo.commit_and_remove()

        # Look everything up again so the tests get objects loaded after the
        # last commit
        for attr, user_name in (
                ('mreditor', u'mreditor'),
                ('mrreader', u'mrreader'),
                ('annakarenina_creator', u'annakarenina_creator'),
                ('logged_in', model.PSEUDO_USER__LOGGED_IN),
                ('visitor', model.PSEUDO_USER__VISITOR),
                ('john', u'john')):
            setattr(self, attr, model.User.by_name(user_name))

        for attr, package_name in (
                ('war', u'warandpeace'),
                ('anna', u'annakarenina'),
                ('restricted', u'restricted'),
                ('vrestricted', u'vrestricted')):
            setattr(self, attr, model.Package.by_name(package_name))
Example #7
0
    def import_stage(self, harvest_object):
        """Map a harvested OGD Wien dataset to a package and import it.

        The JSON in ``harvest_object.content`` is translated into a new
        package dict (id, name, resources, tags, groups and a set of extras),
        the configured default tags and groups are added, and the result is
        passed to ``self._create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
            JSON-serialized remote dataset
        :returns: the result of ``self._create_or_update_package``, or
            ``False`` when there was nothing to import or an error was
            recorded through ``self._save_object_error``
        """
        omit_tags = ['ogd', 'None']
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)
        # The config may be empty or None; use one mapping for every lookup
        # so the read_only check cannot fail after the package was created.
        config = self.config or {}

        try:
            old_content = json.loads(harvest_object.content)
            # Missing sections are treated as empty rather than as errors
            old_extras = old_content.get('extras') or {}

            new_content = {}
            new_content['id'] = harvest_object.guid

            # Common extras
            new_content['extras'] = {}

            new_content['maintainer'] = old_content.get('maintainer')
            new_content['maintainer_email'] = ''
            new_content['author'] = old_content.get('maintainer')

            resources = old_content.get('resources') or []
            for resource in resources:
                # NOTE(review): this replaces every non-empty (or missing)
                # name with the description and keeps empty names as they
                # are - confirm that the condition is not inverted.
                if resource.get('name') != "":
                    resource['name'] = resource.get('description')
            new_content['resources'] = resources

            if old_content.get('tags'):
                new_content['tags'] = [
                    tag for tag in old_content.get('tags')
                    if tag not in omit_tags
                ]

            new_content['name'] = 'ogdwien_' + self._gen_new_name(
                old_content.get('title'))
            if len(new_content['name']) > 100:
                new_content['name'] = new_content['name'][:99]

            new_content['license_id'] = 'cc-by'
            new_content['url'] = old_extras.get('harvest_dataset_url')
            new_content['notes'] = old_content.get('notes')
            new_content['title'] = old_content.get('title')

            frq = self.map_frequency(old_extras.get('temporal_granularity'))
            if frq != '':
                new_content['extras']['update_frequency'] = frq
                log.info("update_frequency: %s" % frq)

            # Only keep the temporal coverage when it contains at least two
            # consecutive digits
            temp = old_extras.get('temporal_coverage')
            if temp and re.search("[0-9][0-9]+", temp):
                new_content['extras']['begin_datetime'] = temp

            new_content['extras']['attribute_description'] = old_extras.get(
                'attributes')
            new_content['extras']['geographic_toponym'] = old_extras.get(
                'geographic_coverage')
            new_content['extras']['remote_id'] = harvest_object.guid
            new_content['extras']['remote_guid'] = old_content.get('title')
            new_content['extras']['publisher'] = 'OGD Wien'
            new_content['extras']['publisher_email'] = '*****@*****.**'

            new_content['groups'] = []
            for cat in old_extras.get('categories') or []:
                ng = self.map_category(cat)
                if ng != '':
                    new_content['groups'].append(ng)
                log.info('Group: %s - %s' % (cat, ng))

            # Set default tags if needed
            default_tags = config.get('default_tags', [])
            if default_tags:
                tags = new_content.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = config.get('default_groups', [])
            if default_groups:
                groups = new_content['groups']
                groups.extend([g for g in default_groups if g not in groups])

            result = self._create_or_update_package(new_content,
                                                    harvest_object)

            if result and config.get('read_only', False) == True:

                package = model.Package.get(new_content['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = config.get('user', u'harvest')
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)

            return result

        except Exception as e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')
            return False
    def import_stage(self,harvest_object):
        # Import stage of the NTPC harvester: parses the JSON held in
        # harvest_object.content into a package dict, applies the configured
        # defaults (tags, groups, extras) and hands the result to
        # self._create_or_update_package().
        #
        # Returns False when no harvest object or no content was supplied, and
        # True once the body below has run to its end.
        #
        # NOTE(review): the `try` opened below has no `except`/`finally` clause
        # in this excerpt -- the handlers presumably follow in the full file;
        # confirm before relying on the error behaviour.
        log.debug('In NTPCHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)


            log.debug(package_dict)
            log.debug('=============================================')
            # The package id is always overwritten with the harvest object's GUID.
            package_dict["id"] = harvest_object.guid
            # Set default tags if needed
            default_tags = self.config.get('default_tags',[])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])

            # 'remote_groups' selects how groups named by the remote dataset are
            # treated: any value other than 'only_local' / 'create' drops them.
            remote_groups = self.config.get('remote_groups', None)
            log.debug(remote_groups)
            log.debug('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {'model': model, 'session': Session, 'user': '******'}

                for group_name in package_dict['groups']:
                    log.debug(group_name)
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        # API version 1 references groups by name, other
                        # versions by id.
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            # Fetch the group definition from the remote source
                            # and recreate it locally; skip it if the fetch fails.
                            try:
                                group = self._get_group(harvest_object.source.url, group_name)
                            except:
                                log.error('Could not get remote group %s' % group_name)
                                continue

                            # Strip the listed keys before passing the remote
                            # group dict to group_create.
                            for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name']:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Ignore remote orgs for the time being
            package_dict.pop('owner_org', None)

            # Set default groups if needed
            # NOTE(review): when remote groups were ignored above, 'groups' has
            # been popped from package_dict, so the lookup below raises KeyError
            # as soon as default_groups is non-empty -- confirm and guard.
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
            # NOTE(review): this assumes package_dict['extras'] exists and is a
            # dict; the "create 'extras' if missing" check only happens further
            # down. Deleting while looping relies on .keys() returning a list
            # (Python 2 behaviour).
            for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(
                                package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            # Set default extras if needed
            default_extras = self.config.get('default_extras',{})
            if default_extras:
                # With 'override_extras' set, defaults replace extras that the
                # remote dataset already defines.
                override_extras = self.config.get('override_extras',False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key,value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value,basestring):
                            value = value.format(harvest_source_id=harvest_object.job.source.id,
                                     harvest_source_url=harvest_object.job.source.url.strip('/'),
                                     harvest_source_title=harvest_object.job.source.title,
                                     harvest_job_id=harvest_object.job.id,
                                     harvest_object_id=harvest_object.id,
                                     dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            log.debug('_create_or_update_package')
            log.debug(package_dict)
            log.debug(harvest_object)
            result = self._create_or_update_package(package_dict,harvest_object)

            # In read-only mode the package roles are reset: the harvest user
            # becomes admin, 'visitor' and 'logged_in' become readers.
            if result and self.config.get('read_only',False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                # NOTE(review): pkg_role is never used after construction --
                # presumably creating the PackageRole is enough to register it;
                # TODO confirm.
                user_name = self.config.get('user',u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor',u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)

            # NOTE(review): True is returned even when `result` is falsy.
            log.debug('import_stage return true')
            return True
Example #9
0
    def import_stage(self, harvest_object):
        '''
        Import one harvested Socrata dataset into the CKAN server.

        The view held in ``harvest_object.content`` is converted into a CKAN
        package dict, the metadata is saved through ``socrata_db``, the
        configured default tags and groups are applied and the package is
        created or updated.

        :param harvest_object: harvest object whose ``content`` holds the
            Socrata view to import
        :returns: ``True`` when the stage ran to completion, ``False`` when
            there was nothing to import or the package failed validation
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' %
                                    harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            adaptor = socrataAdaptor()
            log.debug("Converting View")
            stripped_source = harvest_object.source.url.rstrip('/')
            package_dict = adaptor.convertViewXml(harvest_object.id,
                                                  stripped_source,
                                                  harvest_object.content)

            package_dict['catalogue_url'] = str(stripped_source)
            package_dict['platform'] = 'socrata'

            # Move a top-level category into the extras. The extras container
            # is created on demand so a dataset without extras does not abort
            # the import with a KeyError.
            if 'category' in package_dict:
                package_dict.setdefault('extras', {}).update(
                    {'category': package_dict.pop('category')})

            log.debug(package_dict)
            if package_dict['id'] not in ids:
                # First time this dataset is seen: stamp the creation time.
                package_dict['metadata_created'] = str(
                    datetime.datetime.now())
                socrata_db.save(package_dict)
                log.info('Metadata saved succesfully to MongoDb.')
            else:
                # Known dataset: keep the stored creation time, stamp the
                # update time and replace the stored document.
                document = socrata_db.find_one({"id": package_dict['id']})
                package_dict['metadata_created'] = document['metadata_created']
                package_dict['metadata_updated'] = str(
                    datetime.datetime.now())
                package_dict['updated_dataset'] = True
                socrata_db.remove({"id": package_dict['id']})
                socrata_db.save(package_dict)
                log.info('Metadata updated succesfully to MongoDb.')

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package, user=user,
                                      role=model.Role.READER)
            return True

        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' %
                                    (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            log.debug("Validation Error: %s", harvest_object.guid)
            # Report the failure explicitly instead of falling off the end
            # of the method with an implicit None.
            return False
    def import_stage(self, harvest_object):
        '''
        Import one harvested CKAN dataset.

        The JSON in ``harvest_object.content`` is parsed into a package dict,
        remote groups are discarded, the configured default tags, groups and
        extras are applied and the package is created or updated.

        :param harvest_object: harvest object whose ``content`` holds the
            JSON-encoded remote package
        :returns: the value returned by ``self._create_or_update_package``
            when the stage ran to completion, ``False`` when there was
            nothing to import or the package failed validation
        '''
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Ignore remote groups for the time being. pop() is used rather
            # than del so a remote dataset without a 'groups' key does not
            # raise an uncaught KeyError.
            package_dict.pop('groups', None)

            # Set default groups if needed. The remote groups were just
            # removed, so the defaults become the whole group list.
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'] = list(default_groups)

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                # With 'override_extras' set, defaults replace extras that
                # the remote dataset already defines.
                override_extras = self.config.get('override_extras', False)
                extras = package_dict.setdefault('extras', {})
                source = harvest_object.job.source
                for key, value in default_extras.iteritems():
                    if key in extras and not override_extras:
                        continue
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip('/'),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])

                    extras[key] = value

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                model.PackageRole(package=package,
                                  user=user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package,
                                      user=user,
                                      role=model.Role.READER)

            # Previously the method fell off the end here, so even a
            # successful import returned None.
            return result

        except ValidationError as e:
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
            return False
    def import_stage(self, harvest_object):
        # Import stage of the HTML harvester: parses the JSON held in
        # harvest_object.content, flattens its extras and tags, applies the
        # configured defaults (tags, groups, extras) and hands the package
        # dict to self._create_or_update_package().
        #
        # Returns False when no harvest object or no content was supplied, and
        # True when the remote dataset is itself a harvest source (skipped) or
        # once the body below has run to its end.
        #
        # NOTE(review): the `try` opened below has no `except`/`finally` clause
        # in this excerpt -- the handlers presumably follow in the full file;
        # confirm before relying on the error behaviour.
        log.debug('In HTMLHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            # Every single quote in the stored content is turned into a double
            # quote before parsing (and the modified text is written back onto
            # the harvest object).
            # NOTE(review): this also rewrites apostrophes inside values.
            harvest_object.content = harvest_object.content.replace("'", '"')

            #package_dict=harvest_object.content
            package_dict = json.loads(
                harvest_object.content.decode('utf-8', 'ignore'))

            ## handle notes validation errors such as the existence of:  " and /
            extrasjson = []
            try:
                extras = package_dict['extras']
            except:
                extras = ""
            j = 0
            ##transformations to json's extras
            # When the extras look like a list of {'key': ..., 'value': ...}
            # entries, rebuild them as a flat dict by assembling a JSON object
            # string by hand and parsing it again.
            if 'value' in str(extras) and 'key' in str(extras):

                extrasjson[:] = []
                extrasjson2 = ""
                while j < len(package_dict['extras']):
                    extra_key = package_dict['extras'][j]['key']
                    extra_value = package_dict['extras'][j]['value']
                    if len(extra_value) > 0:

                        c = 0
                        extra_value1 = ""

                        # Concatenate the elements of extra_value one by one
                        # into a single string.
                        while c < len(extra_value):
                            extra_value1 = extra_value1 + extra_value[c]
                            c += 1

                        c = 0
                        extra_value = extra_value1
                    # NOTE(review): keys and values are not JSON-escaped here,
                    # so embedded quotes or backslashes make the assembled
                    # string unparseable.
                    extra = '"' + str(
                        extra_key.encode('utf-8')) + '":' + '"' + str(
                            extra_value.encode('utf-8')) + '"'
                    extrasjson.append(extra)

                    j += 1

                k = 0
                extrasjson1 = ""

                while k < len(extrasjson):
                    extrasjson1 = extrasjson1 + extrasjson[k] + ","
                    k += 1

                k = 0
                j = 0

                extrasjson1 = "{" + extrasjson1.rstrip(',') + "}"

                # NOTE(review): `errorscounter` is not assigned anywhere else
                # in this method and no `global` statement is visible, so the
                # augmented assignment below would raise UnboundLocalError if
                # reached -- TODO confirm. If parsing fails, extrasjson2 keeps
                # its "" placeholder.
                try:
                    extrasjson2 = json.loads(extrasjson1)
                except:
                    errorscounter += 1

                if len(extrasjson) > 0:
                    package_dict.update({"extras": extrasjson2})

            # Flatten tags given as a list of {'name': ...} entries into a
            # plain list of names.
            # NOTE(review): `tagsarray` is not defined in this method --
            # presumably a module-level list that is reused and cleared after
            # each call; the bare `except` below hides any failure in this
            # section (a missing 'tags' key included).
            try:

                tags = package_dict['tags']
                j = 0

                if 'name' in str(tags):

                    while j < len(package_dict['tags']):

                        tag = package_dict['tags'][j]['name']
                        tagsarray.append(tag)
                        j += 1

                # NOTE(review): package_dict['tags'] is bound to the very list
                # object that is emptied in place just below -- verify that the
                # tags survive.
                if len(tagsarray) > 0:
                    package_dict.update({"tags": tagsarray})

                tagsarray[:] = []
                j = 0

            except:
                pass

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # 'remote_groups' selects how groups named by the remote dataset are
            # treated: any value other than 'only_local' / 'create' drops them.
            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {
                    'model': model,
                    'session': Session,
                    'user': '******'
                }

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        # API version 1 references groups by name, other
                        # versions by id.
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            # Fetch the group definition from the remote source
                            # and recreate it locally; skip it if the fetch
                            # fails.
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except:
                                log.error('Could not get remote group %s' %
                                          group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            context = {'model': model, 'session': Session, 'user': '******'}

            # Local harvest source organization
            #source_dataset = get_action('package_show')(context, {'id': harvest_object.source.id})
            #local_org = source_dataset.get('owner_org')

            #remote_orgs = self.config.get('remote_orgs', None)

            #if not remote_orgs in ('only_local', 'create'):
            ## Assign dataset to the source organization
            #package_dict['owner_org'] = local_org
            #else:
            #if not 'owner_org' in package_dict:
            #package_dict['owner_org'] = None

            ## check if remote org exist locally, otherwise remove
            #validated_org = None
            #remote_org = package_dict['owner_org']

            #if remote_org:
            #try:
            #data_dict = {'id': remote_org}
            #org = get_action('organization_show')(context, data_dict)
            #validated_org = org['id']
            #except NotFound, e:
            #log.info('Organization %s is not available' % remote_org)
            #if remote_orgs == 'create':
            #try:
            #org = self._get_group(harvest_object.source.url, remote_org)
            #for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']:
            #org.pop(key, None)
            #get_action('organization_create')(context, org)
            #log.info('Organization %s has been newly created' % remote_org)
            #validated_org = org['id']
            #except:
            #log.error('Could not get remote org %s' % remote_org)

            #package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            # NOTE(review): when remote groups were ignored above, 'groups' has
            # been popped from package_dict, so the lookup below raises KeyError
            # as soon as default_groups is non-empty -- confirm and guard.
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                # With 'override_extras' set, defaults replace extras that the
                # remote dataset already defines.
                override_extras = self.config.get('override_extras', False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(
                                harvest_source_id=harvest_object.job.source.id,
                                harvest_source_url=harvest_object.job.source.
                                url.strip('/'),
                                harvest_source_title=harvest_object.job.source.
                                title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            # In read-only mode the package roles are reset: the harvest user
            # becomes admin, 'visitor' and 'logged_in' become readers.
            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            # NOTE(review): True is returned even when `result` is falsy.
            return True
Example #12
0
    def import_stage(self, harvest_object):
        # Import stage of this CKAN harvester variant: parses the JSON held in
        # harvest_object.content, applies the configured defaults (tags,
        # groups, extras), requires the dataset's organization to exist
        # locally and hands the package dict to
        # self._create_or_update_package().
        #
        # Return values visible in this excerpt:
        #   False        no harvest object or no content was supplied
        #   <delegated>  result of self._delete_package() for DELETE content
        #   "unchanged"  no organization in the dataset, organization not found
        #                locally, or the local copy is not older than the
        #                remote one
        #   True         remote dataset is a harvest source (skipped), or the
        #                body below has run to its end
        #
        # NOTE(review): the `try` opened below has no `except`/`finally` clause
        # in this excerpt -- the handlers presumably follow in the full file;
        # confirm before relying on the error behaviour.
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        # Content equal to the DELETE marker means the package is to be removed.
        if harvest_object.content == DELETE:
            return self._delete_package(harvest_object)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])

            # 'remote_groups' selects how groups named by the remote dataset are
            # treated: any value other than 'only_local' / 'create' drops them.
            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {'model': model, 'session': Session, 'user': '******'}

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        # API version 1 references groups by name, other
                        # versions by id.
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            # Fetch the group definition from the remote source
                            # and recreate it locally; skip it if the fetch fails.
                            try:
                                group = self._get_group(harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s' % group_name)
                                continue

                            for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name']:
                                group.pop(key, None)
                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            context = {'model': model, 'session': Session, 'user': '******'}

            # Local harvest source organization
            # NOTE(review): the results of the next two .get() calls are
            # discarded; only the package_show call itself can have an effect
            # (it raises if the source dataset cannot be shown).
            source_dataset = get_action('package_show')(context, {'id': harvest_object.source.id})
            source_dataset.get('owner_org')

            self.config.get('remote_orgs', None)

            if 'owner_org' not in package_dict:
                package_dict['owner_org'] = None

            # check if remote org exist locally, otherwise remove
            validated_org = None
            remote_org = None
            if package_dict.get('organization'):
                remote_org = package_dict['organization']['name']

            # The dataset is only imported when its organization (looked up by
            # name) already exists locally; otherwise the stage stops here.
            if remote_org:
                try:
                    data_dict = {'id': remote_org}
                    org = get_action('organization_show')(context, data_dict)
                    validated_org = org['id']
                except NotFound:
                    log.info('No organization exist, not importing dataset')
                    return "unchanged"
            else:
                log.info('No organization in harvested dataset')
                return "unchanged"

            package_dict['owner_org'] = validated_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])

            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
            # NOTE(review): this assumes package_dict['extras'] exists and is a
            # dict; the "create 'extras' if missing" check only happens further
            # down. Deleting while looping relies on .keys() returning a list
            # (Python 2 behaviour).
            for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(
                            package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                # With 'override_extras' set, defaults replace extras that the
                # remote dataset already defines.
                override_extras = self.config.get('override_extras', False)
                if 'extras' not in package_dict:
                    package_dict['extras'] = {}
                for key, value in default_extras.iteritems():
                    if key not in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(harvest_source_id=harvest_object.job.source.id,
                                                 harvest_source_url=harvest_object.job.source.url.strip('/'),
                                                 harvest_source_title=harvest_object.job.source.title,
                                                 harvest_job_id=harvest_object.job.id,
                                                 harvest_object_id=harvest_object.id,
                                                 dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            # Check if package exists
            # Skip the update when the remote 'metadata_modified' is not newer
            # than the local one. The values are compared as-is -- presumably
            # ISO timestamp strings that sort chronologically; TODO confirm.
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)
                if 'metadata_modified' in package_dict and package_dict['metadata_modified'] <= existing_package_dict.get('metadata_modified'):
                    return "unchanged"
            except NotFound:
                pass

            result = self._create_or_update_package(package_dict, harvest_object)

            # In read-only mode the package roles are reset: the harvest user
            # becomes admin, 'visitor' and 'logged_in' become readers.
            if result and self.config.get('read_only', False) is True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package, user=user, role=model.Role.READER)

            # NOTE(review): True is returned even when `result` is falsy.
            return True
Example #13
0
    def import_stage(self, harvest_object):
        '''
        Create or update the local dataset described by a harvest object.

        The JSON stored in ``harvest_object.content`` is loaded, remote
        groups are discarded, the defaults configured on the harvest source
        (``default_tags``, ``default_groups``, ``default_extras``) are merged
        in and the result is passed to ``_create_or_update_package``. If the
        source config sets ``read_only``, the package roles are then reset so
        that the configured harvest user is admin and ``visitor`` /
        ``logged_in`` are readers.

        :param harvest_object: harvest object whose ``content`` is the remote
            dataset serialized as JSON
        :returns: False if the object is missing, has no content or the
            package fails validation; True once the dataset has been handed
            to ``_create_or_update_package``
        '''
        log.debug('In CKANHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Ignore remote groups for the time being. pop() instead of del so
            # that a remote dataset without a 'groups' key does not raise
            # KeyError.
            package_dict.pop('groups', None)

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            if harvest_object.source.publisher_id:
                package_dict['owner_org'] = harvest_object.source.publisher_id

            # Set default extras if needed
            default_extras = self.config.get('default_extras', {})
            if default_extras:
                override_extras = self.config.get('override_extras', False)
                extras = package_dict.setdefault('extras', {})
                for key, value in default_extras.iteritems():
                    if key not in extras or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(
                                harvest_source_id=harvest_object.job.source.id,
                                harvest_source_url=harvest_object.job.source.url.strip('/'),
                                harvest_source_title=harvest_object.job.source.title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict['id'])

                        extras[key] = value

            result = self._create_or_update_package(package_dict, harvest_object)

            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                harvest_user = model.User.get(self.config.get('user', u'harvest'))
                model.PackageRole(package=package, user=harvest_user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)

            # Report success explicitly; falling off the end would return
            # None, which callers checking the result read as a failure.
            return True
        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            return False
    def import_stage(self, harvest_object):
        # Import one harvested dataset into the local CKAN instance.
        #
        # Parses the JSON held in harvest_object.content, rewrites the remote
        # "extras" and "tags" structures, applies the defaults from the
        # harvest source configuration (tags, groups, extras) and hands the
        # result to self._create_or_update_package().
        #
        # Returns False when there is no harvest object or it has no content,
        # and True once the dataset has been processed or skipped because the
        # remote dataset is itself a harvest source.
        log.debug("In HTMLHarvester import_stage")
        if not harvest_object:
            log.error("No harvest object received")
            return False

        if harvest_object.content is None:
            self._save_object_error("Empty content for object %s" % harvest_object.id, harvest_object, "Import")
            return False

        self._set_config(harvest_object.job.source.config)

        # NOTE(review): no except/finally clause belonging to this try is
        # visible in this excerpt.
        try:
            # Every single quote in the raw content becomes a double quote
            # before parsing; this also rewrites apostrophes inside values.
            harvest_object.content = harvest_object.content.replace("'", '"')

            # package_dict=harvest_object.content
            # Bytes that are not valid UTF-8 are ignored when decoding.
            package_dict = json.loads(harvest_object.content.decode("utf-8", "ignore"))

            ## handle notes validation errors as existence of:  " and /
            extrasjson = []
            # Fall back to an empty string when the dataset has no "extras".
            try:
                extras = package_dict["extras"]
            except:
                extras = ""
            j = 0
            ##transformations to json's extras
            # The substring test on str(extras) is what decides whether the
            # extras are treated as a list of {"key": ..., "value": ...}
            # entries.
            if "value" in str(extras) and "key" in str(extras):

                extrasjson[:] = []
                extrasjson2 = ""
                while j < len(package_dict["extras"]):
                    extra_key = package_dict["extras"][j]["key"]
                    extra_value = package_dict["extras"][j]["value"]
                    if len(extra_value) > 0:

                        # Rebuild extra_value by concatenating its items one
                        # at a time; for a plain string this yields the same
                        # string again.
                        c = 0
                        extra_value1 = ""

                        while c < len(extra_value):
                            extra_value1 = extra_value1 + extra_value[c]
                            c += 1

                        c = 0
                        extra_value = extra_value1
                    # Hand-built '"key":"value"' fragment; nothing is escaped,
                    # so a quote or backslash in either part yields text that
                    # json.loads below may reject.
                    extra = '"' + str(extra_key.encode("utf-8")) + '":' + '"' + str(extra_value.encode("utf-8")) + '"'
                    extrasjson.append(extra)

                    j += 1

                k = 0
                extrasjson1 = ""

                # Join the fragments with commas ...
                while k < len(extrasjson):
                    extrasjson1 = extrasjson1 + extrasjson[k] + ","
                    k += 1

                k = 0
                j = 0

                # ... and wrap them into the text of a single JSON object.
                extrasjson1 = "{" + extrasjson1.rstrip(",") + "}"

                try:
                    extrasjson2 = json.loads(extrasjson1)
                except:
                    # NOTE(review): errorscounter is not assigned anywhere
                    # else in this function and there is no `global`
                    # statement, so this augmented assignment would raise
                    # UnboundLocalError when reached - confirm the intent.
                    errorscounter += 1

                # Replace the list-style extras with the parsed dict.
                if len(extrasjson) > 0:
                    package_dict.update({"extras": extrasjson2})

            # Collapse a list of {"name": ...} tag entries into a list of tag
            # names. Any exception in this section is silently ignored.
            try:

                tags = package_dict["tags"]
                j = 0

                if "name" in str(tags):

                    while j < len(package_dict["tags"]):

                        tag = package_dict["tags"][j]["name"]
                        # NOTE(review): tagsarray is never bound in this
                        # function; presumably a module-level list. If it is
                        # not defined, the NameError is swallowed by the bare
                        # except below and the tags stay untouched - verify.
                        tagsarray.append(tag)
                        j += 1

                if len(tagsarray) > 0:
                    package_dict.update({"tags": tagsarray})

                # NOTE(review): this empties the same list object that was
                # just stored under package_dict["tags"] - confirm intended.
                tagsarray[:] = []
                j = 0

            except:
                pass

            if package_dict.get("type") == "harvest":
                log.warn("Remote dataset is a harvest source, ignoring...")
                return True

            # Set default tags if needed
            default_tags = self.config.get("default_tags", [])
            if default_tags:
                if not "tags" in package_dict:
                    package_dict["tags"] = []
                package_dict["tags"].extend([t for t in default_tags if t not in package_dict["tags"]])

            # Remote groups are kept only when the source config asks for
            # "only_local" or "create"; otherwise the key is removed.
            remote_groups = self.config.get("remote_groups", None)
            if not remote_groups in ("only_local", "create"):
                # Ignore remote groups
                package_dict.pop("groups", None)
            else:
                if not "groups" in package_dict:
                    package_dict["groups"] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []
                context = {"model": model, "session": Session, "user": "******"}

                for group_name in package_dict["groups"]:
                    try:
                        data_dict = {"id": group_name}
                        group = get_action("group_show")(context, data_dict)
                        # API version 1 collects group names, any other
                        # version collects group ids.
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])
                    except NotFound, e:
                        log.info("Group %s is not available" % group_name)
                        if remote_groups == "create":
                            # Fetch the group definition from the remote
                            # source; skip this group if that fails.
                            try:
                                group = self._get_group(harvest_object.source.url, group_name)
                            except:
                                log.error("Could not get remote group %s" % group_name)
                                continue

                            # Drop these keys before passing the dict to
                            # group_create.
                            for key in ["packages", "created", "users", "groups", "tags", "extras", "display_name"]:
                                group.pop(key, None)
                            get_action("group_create")(context, group)
                            log.info("Group %s has been newly created" % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group["name"])
                            else:
                                validated_groups.append(group["id"])

                package_dict["groups"] = validated_groups

            context = {"model": model, "session": Session, "user": "******"}

            # Local harvest source organization
            # source_dataset = get_action('package_show')(context, {'id': harvest_object.source.id})
            # local_org = source_dataset.get('owner_org')

            # remote_orgs = self.config.get('remote_orgs', None)

            # if not remote_orgs in ('only_local', 'create'):
            ## Assign dataset to the source organization
            # package_dict['owner_org'] = local_org
            # else:
            # if not 'owner_org' in package_dict:
            # package_dict['owner_org'] = None

            ## check if remote org exist locally, otherwise remove
            # validated_org = None
            # remote_org = package_dict['owner_org']

            # if remote_org:
            # try:
            # data_dict = {'id': remote_org}
            # org = get_action('organization_show')(context, data_dict)
            # validated_org = org['id']
            # except NotFound, e:
            # log.info('Organization %s is not available' % remote_org)
            # if remote_orgs == 'create':
            # try:
            # org = self._get_group(harvest_object.source.url, remote_org)
            # for key in ['packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type']:
            # org.pop(key, None)
            # get_action('organization_create')(context, org)
            # log.info('Organization %s has been newly created' % remote_org)
            # validated_org = org['id']
            # except:
            # log.error('Could not get remote org %s' % remote_org)

            # package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            # NOTE(review): when remote groups were dropped above, "groups"
            # is no longer in package_dict and the lookup below raises
            # KeyError as soon as default_groups is configured.
            default_groups = self.config.get("default_groups", [])
            if default_groups:
                package_dict["groups"].extend([g for g in default_groups if g not in package_dict["groups"]])

            # Set default extras if needed
            default_extras = self.config.get("default_extras", {})
            if default_extras:
                override_extras = self.config.get("override_extras", False)
                if not "extras" in package_dict:
                    package_dict["extras"] = {}
                for key, value in default_extras.iteritems():
                    # Existing extras are only replaced when override_extras
                    # is set.
                    if not key in package_dict["extras"] or override_extras:
                        # Look for replacement strings
                        if isinstance(value, basestring):
                            value = value.format(
                                harvest_source_id=harvest_object.job.source.id,
                                harvest_source_url=harvest_object.job.source.url.strip("/"),
                                harvest_source_title=harvest_object.job.source.title,
                                harvest_job_id=harvest_object.job.id,
                                harvest_object_id=harvest_object.id,
                                dataset_id=package_dict["id"],
                            )

                        package_dict["extras"][key] = value

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get("resources", []):
                resource.pop("url_type", None)

            result = self._create_or_update_package(package_dict, harvest_object)

            # With read_only enabled in the source config, reset the roles on
            # the package that was just created or updated.
            if result and self.config.get("read_only", False) == True:

                package = model.Package.get(package_dict["id"])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get("user", u"harvest")
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u"visitor", u"logged_in"):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)

            # True is returned whatever value `result` holds.
            return True
Example #15
0
class CKANSchemingHarvester(CKANHarvester):
    '''
    A Harvester for CKAN instances with custom scheming dataset
    '''
    def info(self):
        # Return the descriptive metadata of this harvester: its name, title,
        # description and form_config_interface value.
        return {
            'name': 'ckan-scheming',
            'title': 'CKAN-scheming',
            'description':
            'Harvests remote CKAN instances with ckanext-scheming',
            'form_config_interface': 'Text'
        }

    def import_stage(self, harvest_object):
        # Import one harvested dataset into the local CKAN instance.
        #
        # Loads the JSON in harvest_object.content, applies default tags,
        # resolves remote groups and the owning organization according to the
        # source configuration, applies default groups and passes the result
        # to self._create_or_update_package().
        #
        # Returns False when there is no harvest object or it has no content,
        # and True once the dataset has been processed or skipped because the
        # remote dataset is itself a harvest source.
        #
        # The debug message below still names CKANHarvester.
        log.debug('In CKANHarvester import_stage')

        # Action context shared by every get_action() call in this method.
        context = {
            'model': model,
            'session': Session,
            'user': self._get_user_name()
        }
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        # NOTE(review): no except/finally clause belonging to this try is
        # visible in this excerpt.
        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if not 'tags' in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # Remote groups are kept only when the source config asks for
            # 'only_local' or 'create'; otherwise the key is removed.
            remote_groups = self.config.get('remote_groups', None)
            if not remote_groups in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        # API version 1 collects group names, any other
                        # version collects group ids.
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound, e:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            # Fetch the group definition from the remote
                            # source; skip this group if that fails.
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s' %
                                          group_name)
                                continue

                            # Drop these keys before passing the dict to
                            # group_create.
                            for key in [
                                    'packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created' %
                                     group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                context, {
                    'id': harvest_object.source.id
                })
            local_org = source_dataset.get('owner_org')

            remote_orgs = self.config.get('remote_orgs', None)

            if not remote_orgs in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                if not 'owner_org' in package_dict:
                    package_dict['owner_org'] = None

                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict['owner_org']

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(context,
                                                              data_dict)
                        validated_org = org['id']
                    except NotFound, e:
                        log.info('Organization %s is not available' %
                                 remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url, remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes organizations as groups
                                    # this especially targets older versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url, remote_org)

                                # Drop these keys before passing the dict to
                                # organization_create.
                                for key in [
                                        'packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name', 'type'
                                ]:
                                    org.pop(key, None)
                                get_action('organization_create')(context, org)
                                log.info(
                                    'Organization %s has been newly created' %
                                    remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                # Reached both when the remote lookup fails
                                # and when organization_create rejects the
                                # data; validated_org then stays None.
                                log.error('Could not get remote org %s' %
                                          remote_org)

                # Fall back to the harvest source's own organization when no
                # remote organization could be validated.
                package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if not 'groups' in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend([
                    g for g in default_groups
                    if g not in package_dict['groups']
                ])

# FIXME: enable only if not using ckanext-scheming dataset schemas
# handle extras in harvested schema
#
            # The string literal below is a bare expression statement with no
            # effect; it is how the extras handling is currently disabled.
            """
            # Find any extras whose values are not strings and try to convert
            # them to strings, as non-string extras are not allowed anymore in
            # CKAN 2.0.
	    for key in package_dict['extras'].keys():
                if not isinstance(package_dict['extras'][key], basestring):
                    try:
                        package_dict['extras'][key] = json.dumps(
                                package_dict['extras'][key])
                    except TypeError:
                        # If converting to a string fails, just delete it.
                        del package_dict['extras'][key]

            # Set default extras if needed
            default_extras = self.config.get('default_extras',{})
            if default_extras:
                override_extras = self.config.get('override_extras',False)
                if not 'extras' in package_dict:
                    package_dict['extras'] = {}
                for key,value in default_extras.iteritems():
                    if not key in package_dict['extras'] or override_extras:
                        # Look for replacement strings
                        if isinstance(value,basestring):
                            value = value.format(harvest_source_id=harvest_object.job.source.id,
                                     harvest_source_url=harvest_object.job.source.url.strip('/'),
                                     harvest_source_title=harvest_object.job.source.title,
                                     harvest_job_id=harvest_object.job.id,
                                     harvest_object_id=harvest_object.id,
                                     dataset_id=package_dict['id'])

                        package_dict['extras'][key] = value
	    """

            # Clear remote url_type for resources (eg datastore, upload) as we
            # are only creating normal resources with links to the remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            # With read_only enabled in the source config, reset the roles on
            # the package that was just created or updated.
            if result and self.config.get('read_only', False) == True:

                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(package=package,
                                             user=user,
                                             role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    pkg_role = model.PackageRole(package=package,
                                                 user=user,
                                                 role=model.Role.READER)

            # True is returned whatever value `result` holds.
            return True
Example #16
0
    def import_stage(self, harvest_object):
        '''
        Imports each dataset from Socrata, into the CKAN server

        The harvest object's content is converted into a package dict by
        ``socrataAdaptor.convertViewXml``, the ``default_tags`` and
        ``default_groups`` from the harvest source config are merged in and
        the result is passed to ``_create_or_update_package``. If the source
        config sets ``read_only``, the package roles are then reset so that
        the configured harvest user is admin and ``visitor`` /
        ``logged_in`` are readers.

        :param harvest_object: harvest object whose ``content`` holds the
            Socrata view to convert
        :returns: False if the object is missing, has no content or the
            package fails validation; True once the dataset has been handed
            to ``_create_or_update_package``
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            adaptor = socrataAdaptor()
            log.debug("Converting View")
            package_dict = adaptor.convertViewXml(
                harvest_object.id,
                harvest_object.source.url.rstrip('/'),
                harvest_object.content)
            log.debug(package_dict)

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                if 'tags' not in package_dict:
                    package_dict['tags'] = []
                package_dict['tags'].extend(
                    [t for t in default_tags if t not in package_dict['tags']])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []
                package_dict['groups'].extend(
                    [g for g in default_groups if g not in package_dict['groups']])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict, harvest_object)

            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                harvest_user = model.User.get(self.config.get('user', u'harvest'))
                model.PackageRole(package=package, user=harvest_user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)

            # Report success explicitly; falling off the end would return
            # None, which callers checking the result read as a failure.
            return True
        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            return False