harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'][key] = value print 'before create_or_update' print new_dict result = self._create_or_update_package(new_dict,harvest_object) #result = self.create_or_update_api(context, package_dict) if result and self.config.get('read_only',False) == True: package = model.Package.get(new_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user',u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor',u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) return True except ValidationError,e: self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import')
resource.pop('url_type', None) # Clear revision_id as the revision won't exist on this CKAN # and saving it will cause an IntegrityError with the foreign # key. resource.pop('revision_id', None) result = self._create_or_update_package(package_dict, harvest_object) if result is True and self.config.get('read_only', False) is True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user', u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor', u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
def setup_class(self):
    """Create the users, packages and role assignments used by the tests.

    Starts from ``CreateTestData.create()`` and then:

    * adds the user ``john``;
    * clears the roles on ``annakarenina`` and re-applies the default
      roles with ``annakarenina_creator`` passed as the creator;
    * clears every role on ``warandpeace``;
    * creates the packages ``restricted`` and ``vrestricted`` and the
      users ``mreditor``, ``mrreader`` and ``mrsysadmin``;
    * gives ``mreditor`` the EDITOR role on ``restricted`` and
      ``mrsysadmin`` the ADMIN role on ``model.System()``.

    Freshly loaded objects are finally stored on ``self`` for the tests.
    """
    model.Session.remove()
    CreateTestData.create()
    model.Session.remove()

    self.authorizer = authz.Authorizer()
    self.admin_role = model.Role.ADMIN
    self.editor_role = model.Role.EDITOR
    self.reader_role = model.Role.READER

    model.Session.add(model.User(name=u'john'))

    # annakarenina: reset, then apply the default roles for its creator
    anna_pkg = model.Package.by_name(u'annakarenina')
    model.clear_user_roles(anna_pkg)
    creator = model.User(name=u'annakarenina_creator')
    model.Session.add(creator)
    model.repo.commit_and_remove()
    model.setup_default_user_roles(anna_pkg, [creator])
    model.repo.commit_and_remove()

    # warandpeace: no roles at all
    model.clear_user_roles(model.Package.by_name(u'warandpeace'))

    # restricted packages - visitors can't change them
    self.mrsysadmin = u'mrsysadmin'
    pending = [
        model.Package(name=u'restricted'),
        model.Package(name=u'vrestricted'),
        model.User(name=u'mreditor'),
        model.User(name=u'mrreader'),
        model.User(name=self.mrsysadmin),
    ]
    model.repo.new_revision()
    model.Session.add_all(pending)
    model.repo.commit_and_remove()

    # Visitors get no role on either package; logged-in users get
    # EDITOR + READER on 'restricted' and nothing on 'vrestricted'.
    visitor_roles = []
    restricted_pkg = model.Package.by_name(u'restricted')
    vrestricted_pkg = model.Package.by_name(u'vrestricted')
    model.setup_user_roles(restricted_pkg, visitor_roles,
                           [model.Role.EDITOR, model.Role.READER])
    model.setup_user_roles(vrestricted_pkg, visitor_roles, [])
    model.repo.commit_and_remove()

    model.add_user_to_role(model.User.by_name(u'mreditor'),
                           model.Role.EDITOR, restricted_pkg)
    model.add_user_to_role(model.User.by_name(u'mrsysadmin'),
                           model.Role.ADMIN, model.System())
    model.repo.commit_and_remove()

    # Reload everything the tests refer to from a clean session.
    self.mreditor = model.User.by_name(u'mreditor')
    self.mrreader = model.User.by_name(u'mrreader')
    self.annakarenina_creator = model.User.by_name(u'annakarenina_creator')
    self.logged_in = model.User.by_name(model.PSEUDO_USER__LOGGED_IN)
    self.visitor = model.User.by_name(model.PSEUDO_USER__VISITOR)
    self.john = model.User.by_name(u'john')
    self.war = model.Package.by_name(u'warandpeace')
    self.anna = model.Package.by_name(u'annakarenina')
    self.restricted = model.Package.by_name(u'restricted')
    self.vrestricted = model.Package.by_name(u'vrestricted')
def import_stage(self, harvest_object):
    """Import one harvested remote CKAN dataset into the local instance.

    The JSON stored in ``harvest_object.content`` is adjusted according to
    the harvest source configuration (default tags/groups/extras, remote
    group and organization handling) and then handed to
    ``self._create_or_update_package``.

    :param harvest_object: the harvest object to import; its ``content``
        holds the remote dataset as a JSON string.
    :returns: ``True`` when the object was processed (including remote
        harvest-source datasets, which are ignored), ``False`` when the
        object is missing/empty or an error was recorded with
        ``self._save_object_error``.
    """
    log.debug("In CKANHarvester import_stage")

    context = {
        "model": model,
        "session": Session,
        "user": self._get_user_name(),
    }
    if not harvest_object:
        log.error("No harvest object received")
        return False

    if harvest_object.content is None:
        self._save_object_error(
            "Empty content for object %s" % harvest_object.id,
            harvest_object,
            "Import",
        )
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get("type") == "harvest":
            log.warning("Remote dataset is a harvest source, ignoring...")
            return True

        # Set default tags if needed
        default_tags = self.config.get("default_tags", [])
        if default_tags:
            tags = package_dict.setdefault("tags", [])
            tags.extend([t for t in default_tags if t not in tags])

        remote_groups = self.config.get("remote_groups", None)
        if remote_groups not in ("only_local", "create"):
            # Ignore remote groups
            package_dict.pop("groups", None)
        else:
            # Keep only groups that exist locally (or could be created)
            validated_groups = []
            for group_name in package_dict.get("groups", []):
                try:
                    group = get_action("group_show")(
                        context, {"id": group_name})
                except NotFound:
                    log.info("Group %s is not available" % group_name)
                    if remote_groups != "create":
                        continue
                    try:
                        group = self._get_group(
                            harvest_object.source.url, group_name)
                    except RemoteResourceError:
                        log.error(
                            "Could not get remote group %s" % group_name)
                        continue

                    # Drop the fields that are not passed on to
                    # group_create.
                    for key in ("packages", "created", "users", "groups",
                                "tags", "extras", "display_name"):
                        group.pop(key, None)

                    get_action("group_create")(context, group)
                    log.info(
                        "Group %s has been newly created" % group_name)

                if self.api_version == 1:
                    validated_groups.append(group["name"])
                else:
                    validated_groups.append(group["id"])

            package_dict["groups"] = validated_groups

        # Local harvest source organization
        source_dataset = get_action("package_show")(
            context, {"id": harvest_object.source.id})
        local_org = source_dataset.get("owner_org")

        remote_orgs = self.config.get("remote_orgs", None)
        if remote_orgs not in ("only_local", "create"):
            # Assign dataset to the source organization
            package_dict["owner_org"] = local_org
        else:
            # Use the remote org if it exists locally (or can be created),
            # otherwise fall back to the source organization.
            validated_org = None
            remote_org = package_dict.get("owner_org")

            if remote_org:
                try:
                    org = get_action("organization_show")(
                        context, {"id": remote_org})
                    validated_org = org["id"]
                except NotFound:
                    log.info(
                        "Organization %s is not available" % remote_org)
                    if remote_orgs == "create":
                        try:
                            try:
                                org = self._get_organization(
                                    harvest_object.source.url, remote_org)
                            except RemoteResourceError:
                                # fallback if remote CKAN exposes
                                # organizations as groups; this especially
                                # targets older versions of CKAN
                                org = self._get_group(
                                    harvest_object.source.url, remote_org)

                            for key in ("packages", "created", "users",
                                        "groups", "tags", "extras",
                                        "display_name", "type"):
                                org.pop(key, None)
                            get_action("organization_create")(context, org)
                            log.info(
                                "Organization %s has been newly created"
                                % remote_org)
                            validated_org = org["id"]
                        except (RemoteResourceError, ValidationError):
                            log.error(
                                "Could not get remote org %s" % remote_org)

            package_dict["owner_org"] = validated_org or local_org

        # Set default groups if needed
        default_groups = self.config.get("default_groups", [])
        if default_groups:
            groups = package_dict.setdefault("groups", [])
            groups.extend([g for g in default_groups if g not in groups])

        # Find any extras whose values are not strings and try to convert
        # them to strings, as non-string extras are not allowed anymore in
        # CKAN 2.0.  A dataset without an "extras" key is tolerated here,
        # in line with the default-extras handling below.
        extras = package_dict.get("extras", {})
        for key in list(extras):
            if isinstance(extras[key], str):
                continue
            try:
                extras[key] = json.dumps(extras[key])
            except TypeError:
                # If converting to a string fails, just delete it.
                del extras[key]

        # Set default extras if needed
        default_extras = self.config.get("default_extras", {})
        if default_extras:
            override_extras = self.config.get("override_extras", False)
            extras = package_dict.setdefault("extras", {})
            for key, value in default_extras.items():
                if key in extras and not override_extras:
                    continue
                # Look for replacement strings
                if isinstance(value, str):
                    source = harvest_object.job.source
                    value = value.format(
                        harvest_source_id=source.id,
                        harvest_source_url=source.url.strip("/"),
                        harvest_source_title=source.title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id,
                        dataset_id=package_dict["id"],
                    )
                extras[key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get("resources", []):
            resource.pop("url_type", None)

        result = self._create_or_update_package(package_dict, harvest_object)

        # `== True` is kept on purpose so that the accepted config values
        # stay exactly the same as before.
        if result and self.config.get("read_only", False) == True:  # noqa: E712
            package = model.Package.get(package_dict["id"])

            # Clear default permissions
            model.clear_user_roles(package)

            # NOTE(review): the PackageRole objects below are only
            # instantiated, never added to a session explicitly;
            # presumably the model layer registers them - confirm.

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get("user", "harvest"))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in ("visitor", "logged_in"):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        self._save_object_error(
            "Invalid package with GUID %s: %r"
            % (harvest_object.guid, e.error_dict),
            harvest_object,
            "Import",
        )
    except Exception as e:
        # Keep the traceback in the log; only the repr is stored on the
        # harvest object.
        log.exception(e)
        self._save_object_error("%r" % e, harvest_object, "Import")

    # Only reached when one of the handlers above recorded an error.
    return False
def import_stage(self, harvest_object):
    '''
    Imports each dataset from custom, into the CKAN server

    The dataset JSON in ``harvest_object.content`` is first mirrored into
    the ``custom_db`` MongoDB collection (inserted when no document with
    the same catalogue URL and id exists, otherwise saved over the
    existing document's ``_id``), the per-catalogue new/updated counters
    are refreshed, and the dataset is then passed to
    ``self._create_or_update_package``.

    :param harvest_object: harvest object whose ``content`` is the
        dataset as a JSON string.
    :returns: ``True`` on success, ``False`` when the object is missing
        or empty or when a ``ValidationError`` was recorded.
    '''
    log.debug('In CustomHarvester import_stage')
    print('In CustomoftHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
    package_dict = json.loads(harvest_object.content)
    log.debug(harvest_object.job.source.config)

    try:
        mainurl = str(harvest_object.source.url)
        package_dict.update({
            "catalogue_url": mainurl,
            "platform": "genericapi",
            "name": package_dict['id'].lower().strip(),
        })

        document = custom_db.find_one({
            "catalogue_url": harvest_object.source.url,
            'id': package_dict['id'],
        })

        if document is None:
            # First time this dataset is seen for this catalogue.
            package_dict["metadata_created"] = str(datetime.datetime.now())
            custom_db.save(package_dict)
            log.info('Metadata saved succesfully to MongoDb.')
            self._record_fetch_counters(mainurl, 'new')
        else:
            # Known dataset: keep its creation date / copied flag and
            # save over the same Mongo document.
            package_dict['metadata_created'] = document['metadata_created']
            if 'copied' in document:
                package_dict['copied'] = document['copied']
            package_dict['metadata_updated'] = str(datetime.datetime.now())
            package_dict['updated_dataset'] = True
            # `document` already is the stored record for this id and
            # catalogue, so its _id is reused instead of querying again.
            package_dict['_id'] = document['_id']
            custom_db.save(package_dict)
            log.info('Metadata updated succesfully to MongoDb.')
            self._record_fetch_counters(mainurl, 'updated')

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        log.debug(package_dict)
        result = self._create_or_update_package(package_dict, harvest_object)

        # `== True` is kept on purpose so that the accepted config values
        # stay exactly the same as before.
        if result and self.config.get('read_only', False) == True:  # noqa: E712
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get('user', u'harvest'))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        print('ValidationError')
        return False


def _record_fetch_counters(self, catalogue_url, counter):
    '''
    Count one more dataset for ``catalogue_url`` in ``db_fetch_temp``.

    ``counter`` is ``'new'`` or ``'updated'``.  ``db_fetch_temp`` holds a
    single running document with ``cat_url``, ``new`` and ``updated``.
    When that document belongs to a different catalogue, its counters are
    first copied to the matching ``db_jobs`` document (the previous values
    move to ``last_new`` / ``last_updated``) and the running document is
    restarted for ``catalogue_url``.
    '''
    other = 'updated' if counter == 'new' else 'new'

    fetch_document = db_fetch_temp.find_one()
    if fetch_document is None:
        db_fetch_temp.save({"cat_url": catalogue_url, counter: 1, other: 0})
        return

    if catalogue_url == fetch_document['cat_url']:
        fetch_document[counter] = fetch_document[counter] + 1
        db_fetch_temp.save(fetch_document)
        return

    # A different catalogue was running before: flush its totals.
    doc = db_jobs.find_one({'cat_url': fetch_document['cat_url']})
    if doc is None:
        # No job record to update; skip the flush rather than failing the
        # whole import on a missing bookkeeping document.
        log.debug('No job record found for %s', fetch_document['cat_url'])
    else:
        for name in ('new', 'updated'):
            if name in fetch_document:
                if name in doc:
                    doc['last_' + name] = doc[name]
                doc[name] = fetch_document[name]
                db_jobs.save(doc)

    fetch_document.update({"cat_url": catalogue_url, counter: 1, other: 0})
    db_fetch_temp.save(fetch_document)
def setup_class(self):
    """Prepare the authorization fixtures shared by this test class.

    On top of ``CreateTestData.create()`` this adds the user ``john``,
    resets ``annakarenina`` to the default roles (with
    ``annakarenina_creator`` as creator), strips all roles from
    ``warandpeace`` and creates the ``restricted`` / ``vrestricted``
    packages together with the users ``mreditor``, ``mrreader`` and
    ``mrsysadmin``.  The reloaded objects are stored as attributes on
    ``self``.
    """
    session = model.Session
    session.remove()
    CreateTestData.create()
    session.remove()

    self.authorizer = authz.Authorizer()
    self.admin_role = model.Role.ADMIN
    self.editor_role = model.Role.EDITOR
    self.reader_role = model.Role.READER

    session.add(model.User(name=u'john'))

    # setup annakarenina with default roles
    annakarenina = model.Package.by_name(u'annakarenina')
    model.clear_user_roles(annakarenina)
    anna_creator = model.User(name=u'annakarenina_creator')
    session.add(anna_creator)
    model.repo.commit_and_remove()
    model.setup_default_user_roles(annakarenina, [anna_creator])
    model.repo.commit_and_remove()

    # setup warandpeace with no roles
    model.clear_user_roles(model.Package.by_name(u'warandpeace'))

    # setup restricted packages - visitors can't change
    self.mrsysadmin = u'mrsysadmin'
    new_packages = [model.Package(name=pkg_name)
                    for pkg_name in (u'restricted', u'vrestricted')]
    new_users = [model.User(name=user_name)
                 for user_name in (u'mreditor', u'mrreader', self.mrsysadmin)]
    model.repo.new_revision()
    session.add_all(new_packages + new_users)
    model.repo.commit_and_remove()

    # No visitor roles on either package; logged-in users may edit and
    # read 'restricted' but get nothing on 'vrestricted'.
    visitor_roles = []
    restricted = model.Package.by_name(u'restricted')
    vrestricted = model.Package.by_name(u'vrestricted')
    model.setup_user_roles(
        restricted, visitor_roles, [model.Role.EDITOR, model.Role.READER])
    model.setup_user_roles(vrestricted, visitor_roles, [])
    model.repo.commit_and_remove()

    model.add_user_to_role(
        model.User.by_name(u'mreditor'), model.Role.EDITOR, restricted)
    model.add_user_to_role(
        model.User.by_name(u'mrsysadmin'), model.Role.ADMIN, model.System())
    model.repo.commit_and_remove()

    # Reload the users and packages the tests work with.
    for attr, user_name in (
            ('mreditor', u'mreditor'),
            ('mrreader', u'mrreader'),
            ('annakarenina_creator', u'annakarenina_creator'),
            ('logged_in', model.PSEUDO_USER__LOGGED_IN),
            ('visitor', model.PSEUDO_USER__VISITOR),
            ('john', u'john')):
        setattr(self, attr, model.User.by_name(user_name))

    for attr, pkg_name in (
            ('war', u'warandpeace'),
            ('anna', u'annakarenina'),
            ('restricted', u'restricted'),
            ('vrestricted', u'vrestricted')):
        setattr(self, attr, model.Package.by_name(pkg_name))
def import_stage(self, harvest_object):
    '''
    Convert one harvested OGD Wien dataset and import it into CKAN.

    A new package dict is built from selected fields of the JSON in
    ``harvest_object.content`` (fixed licence ``cc-by``, name prefixed
    with ``ogdwien_``, mapped update frequency and categories) and handed
    to ``self._create_or_update_package``.

    :param harvest_object: harvest object whose ``content`` is the
        remote dataset as a JSON string.
    :returns: the result of ``self._create_or_update_package`` on
        success, ``False`` when the object is missing/empty or an error
        was recorded.
    '''
    omit_tags = ['ogd', 'None']

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        old_content = json.loads(harvest_object.content)
        # Missing or null extras are treated as "no extras" instead of
        # failing on the first chained .get().
        old_extras = old_content.get('extras') or {}

        new_content = {
            'id': harvest_object.guid,
            # Common extras
            'extras': {},
            'maintainer': old_content.get('maintainer'),
            'maintainer_email': '',
            'author': old_content.get('maintainer'),
            'resources': old_content.get('resources') or [],
        }

        # NOTE(review): a non-empty resource name is replaced by the
        # description; behaviour kept as found - confirm it is intended.
        for resource in new_content['resources']:
            if resource.get('name') != "":
                resource['name'] = resource.get('description')

        if old_content.get('tags'):
            new_content['tags'] = [
                tag for tag in old_content.get('tags')
                if tag not in omit_tags]

        new_content['name'] = (
            'ogdwien_' + self._gen_new_name(old_content.get('title')))
        # Names longer than 100 characters are cut to 99 (kept as found).
        if len(new_content['name']) > 100:
            new_content['name'] = new_content['name'][:99]

        new_content['license_id'] = 'cc-by'
        new_content['url'] = old_extras.get('harvest_dataset_url')
        new_content['notes'] = old_content.get('notes')
        new_content['title'] = old_content.get('title')

        extras = new_content['extras']

        frq = self.map_frequency(old_extras.get('temporal_granularity'))
        if frq != '':
            extras['update_frequency'] = frq
            log.info("update_frequency: %s" % frq)

        # Only keep the temporal coverage when it contains at least two
        # consecutive digits.
        temp = old_extras.get('temporal_coverage')
        if temp and re.search("[0-9][0-9]+", temp):
            extras['begin_datetime'] = temp

        extras['attribute_description'] = old_extras.get('attributes')
        extras['geographic_toponym'] = old_extras.get('geographic_coverage')
        extras['remote_id'] = harvest_object.guid
        extras['remote_guid'] = old_content.get('title')
        extras['publisher'] = 'OGD Wien'
        extras['publisher_email'] = '*****@*****.**'

        new_content['groups'] = []
        for cat in old_extras.get('categories') or []:
            ng = self.map_category(cat)
            if ng != '':
                new_content['groups'].append(ng)
                log.info('Group: %s - %s' % (cat, ng))

        if self.config:
            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = new_content.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = new_content.setdefault('groups', [])
                groups.extend(
                    [g for g in default_groups if g not in groups])

        result = self._create_or_update_package(new_content, harvest_object)

        # self.config may be empty/None (see the guard above), so it is
        # checked again before being dereferenced.  `== True` is kept so
        # that the accepted config values stay the same.
        read_only = bool(self.config) and \
            self.config.get('read_only', False) == True  # noqa: E712
        if result and read_only:
            package = model.Package.get(new_content['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get('user', u'harvest'))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        return result
    except Exception as e:
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    '''
    Import one harvested dataset into the local CKAN instance.

    The JSON in ``harvest_object.content`` gets the harvest object's GUID
    as id, is adjusted according to the source configuration (default
    tags/groups/extras, remote group handling; remote organizations are
    ignored) and is then passed to ``self._create_or_update_package``.

    :param harvest_object: harvest object whose ``content`` is the
        dataset as a JSON string.
    :returns: ``True`` when the object was processed, ``False`` when it
        is missing/empty or an error was recorded with
        ``self._save_object_error``.
    '''
    log.debug('In NTPCHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)
        log.debug(package_dict)
        log.debug('=============================================')
        package_dict["id"] = harvest_object.guid

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        remote_groups = self.config.get('remote_groups', None)
        log.debug(remote_groups)
        log.debug('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
        if remote_groups not in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            # Keep only groups that exist locally (or could be created)
            validated_groups = []
            context = {'model': model, 'session': Session, 'user': '******'}

            for group_name in package_dict.get('groups', []):
                log.debug(group_name)
                try:
                    group = get_action('group_show')(
                        context, {'id': group_name})
                except NotFound:
                    log.info('Group %s is not available' % group_name)
                    if remote_groups != 'create':
                        continue
                    try:
                        group = self._get_group(
                            harvest_object.source.url, group_name)
                    except Exception:
                        log.error(
                            'Could not get remote group %s' % group_name)
                        continue

                    # Drop the fields that are not passed on to
                    # group_create.
                    for key in ('packages', 'created', 'users', 'groups',
                                'tags', 'extras', 'display_name'):
                        group.pop(key, None)
                    get_action('group_create')(context, group)
                    log.info(
                        'Group %s has been newly created' % group_name)

                if self.api_version == 1:
                    validated_groups.append(group['name'])
                else:
                    validated_groups.append(group['id'])

            package_dict['groups'] = validated_groups

        # Ignore remote orgs for the time being
        package_dict.pop('owner_org', None)

        # Set default groups if needed.  'groups' may have been removed
        # above, so it is re-created on demand.
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        # Find any extras whose values are not strings and try to convert
        # them to strings, as non-string extras are not allowed anymore in
        # CKAN 2.0.  Iterate over a copy of the keys because entries may
        # be deleted; a dataset without extras is tolerated.
        extras = package_dict.get('extras', {})
        for key in list(extras):
            if isinstance(extras[key], basestring):
                continue
            try:
                extras[key] = json.dumps(extras[key])
            except TypeError:
                # If converting to a string fails, just delete it.
                del extras[key]

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            extras = package_dict.setdefault('extras', {})
            for key, value in default_extras.iteritems():
                if key in extras and not override_extras:
                    continue
                # Look for replacement strings
                if isinstance(value, basestring):
                    source = harvest_object.job.source
                    value = value.format(
                        harvest_source_id=source.id,
                        harvest_source_url=source.url.strip('/'),
                        harvest_source_title=source.title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id,
                        dataset_id=package_dict['id'])
                extras[key] = value

        log.debug('_create_or_update_package')
        log.debug(package_dict)
        log.debug(harvest_object)
        result = self._create_or_update_package(package_dict, harvest_object)

        # `== True` is kept on purpose so that the accepted config values
        # stay exactly the same as before.
        if result and self.config.get('read_only', False) == True:  # noqa: E712
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get('user', u'harvest'))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        log.debug('import_stage return true')
        return True
    except Exception as e:
        # The try block had no handler at all; record the failure on the
        # harvest object and report it to the caller.
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    '''
    Imports each dataset from Socrata, into the CKAN server

    The harvested view is converted to a package dict with
    ``socrataAdaptor().convertViewXml``, mirrored into the ``socrata_db``
    MongoDB collection and then passed to
    ``self._create_or_update_package``.

    :param harvest_object: harvest object whose ``content`` is the
        harvested Socrata view.
    :returns: ``True`` on success, ``False`` when the object is missing
        or empty or when a ``ValidationError`` was recorded.
    '''
    log.debug('In SocrataHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)

    try:
        adaptor = socrataAdaptor()
        log.debug("Converting View")
        stripped_source = harvest_object.source.url.rstrip('/')
        package_dict = adaptor.convertViewXml(
            harvest_object.id, stripped_source, harvest_object.content)
        package_dict.update({
            "catalogue_url": str(stripped_source),
            "platform": "socrata",
        })

        # 'category' is moved from the top level into the extras.
        if 'category' in package_dict:
            package_dict['extras'].update(
                {'category': package_dict['category']})
            del package_dict['category']
        log.debug(package_dict)

        # `ids` lists the dataset ids considered already stored; the
        # stored document is looked up only for those.
        document = None
        if package_dict['id'] in ids:
            document = socrata_db.find_one({"id": package_dict['id']})

        if document is None:
            # New dataset, or an id listed in `ids` without a stored
            # document: store it as new instead of failing on the lookup.
            package_dict["metadata_created"] = str(datetime.datetime.now())
            socrata_db.save(package_dict)
            log.info('Metadata saved succesfully to MongoDb.')
        else:
            package_dict.update({
                'metadata_created': document['metadata_created'],
                'metadata_updated': str(datetime.datetime.now()),
                'updated_dataset': True,
            })
            socrata_db.remove({"id": package_dict['id']})
            socrata_db.save(package_dict)
            log.info('Metadata updated succesfully to MongoDb.')

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        log.debug(package_dict)
        result = self._create_or_update_package(package_dict, harvest_object)

        # `== True` is kept on purpose so that the accepted config values
        # stay exactly the same as before.
        if result and self.config.get('read_only', False) == True:  # noqa: E712
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get('user', u'harvest'))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        log.debug("Validation Error: %s", harvest_object.guid)
        return False
def import_stage(self, harvest_object):
    '''
    Import one harvested remote CKAN dataset into the local instance.

    The JSON in ``harvest_object.content`` is adjusted according to the
    source configuration (default tags, groups and extras; remote groups
    are dropped) and passed to ``self._create_or_update_package``.

    :param harvest_object: harvest object whose ``content`` is the
        remote dataset as a JSON string.
    :returns: ``True`` when the object was processed, ``False`` when it
        is missing/empty or a ``ValidationError`` was recorded.
    '''
    log.debug('In CKANHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        # Ignore remote groups for the time being.  pop() rather than del
        # so that a remote dataset without groups does not abort the
        # import with a KeyError.
        package_dict.pop('groups', None)

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            extras = package_dict.setdefault('extras', {})
            for key, value in default_extras.iteritems():
                if key in extras and not override_extras:
                    continue
                # Look for replacement strings
                if isinstance(value, basestring):
                    source = harvest_object.job.source
                    value = value.format(
                        harvest_source_id=source.id,
                        harvest_source_url=source.url.strip('/'),
                        harvest_source_title=source.title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id,
                        dataset_id=package_dict['id'])
                extras[key] = value

        result = self._create_or_update_package(package_dict, harvest_object)

        # `== True` is kept on purpose so that the accepted config values
        # stay exactly the same as before.
        if result and self.config.get('read_only', False) == True:  # noqa: E712
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            admin_user = model.User.get(self.config.get('user', u'harvest'))
            model.PackageRole(package=package, user=admin_user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.READER)

        # Report success explicitly, as the other import stages do.
        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    '''
    Import one dataset produced by the HTML harvester into CKAN.

    ``harvest_object.content`` is normalised to JSON and parsed. Extras
    given as a list of ``{'key': ..., 'value': ...}`` entries are turned
    into a plain mapping and tags given as ``{'name': ...}`` entries into a
    list of names. The source configuration's default tags, groups and
    extras are then applied, remote groups are validated (or created when
    ``remote_groups`` is ``'create'``) and the result is passed to
    ``self._create_or_update_package``.

    :param harvest_object: harvest object carrying the dataset in
        ``content``
    :returns: ``False`` when there is no object, no content, or the package
        failed validation; ``True`` otherwise (including remote datasets of
        type ``harvest``, which are skipped)
    '''
    log.debug('In HTMLHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        # NOTE(review): every apostrophe is turned into a double quote,
        # presumably because the content is stored single-quoted. This also
        # rewrites apostrophes inside text values; kept as-is because
        # stored content may depend on it.
        harvest_object.content = harvest_object.content.replace("'", '"')
        package_dict = json.loads(
            harvest_object.content.decode('utf-8', 'ignore'))

        # Turn extras given as [{'key': k, 'value': v}, ...] into {k: v}.
        # The mapping is built directly instead of concatenating a JSON
        # string by hand, so values containing quotes or backslashes no
        # longer break the conversion.
        extras = package_dict.get('extras', '')
        if isinstance(extras, list):
            extras_dict = {}
            for extra in extras:
                if not isinstance(extra, dict):
                    continue
                if 'key' not in extra or 'value' not in extra:
                    continue
                extra_value = extra['value']
                # Entries with an empty value are dropped
                if not extra_value:
                    continue
                if isinstance(extra_value, (list, tuple)):
                    # A sequence of strings is concatenated, as before
                    extra_value = ''.join(extra_value)
                extras_dict[extra['key']] = extra_value
            if extras_dict:
                package_dict['extras'] = extras_dict

        # Turn tags given as [{'name': n}, ...] into [n, ...]. A fresh list
        # is used for every call: the old code stored a shared list in
        # package_dict and then emptied that same list.
        tags = package_dict.get('tags')
        if isinstance(tags, list):
            tag_names = []
            found_dict_tag = False
            for tag in tags:
                if isinstance(tag, dict):
                    found_dict_tag = True
                    if 'name' in tag:
                        tag_names.append(tag['name'])
                else:
                    tag_names.append(tag)
            if found_dict_tag and tag_names:
                package_dict['tags'] = tag_names

        if package_dict.get('type') == 'harvest':
            log.warn('Remote dataset is a harvest source, ignoring...')
            return True

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        remote_groups = self.config.get('remote_groups', None)
        if remote_groups not in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            if 'groups' not in package_dict:
                package_dict['groups'] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []
            context = {
                'model': model,
                'session': Session,
                'user': '******'
            }

            for group_name in package_dict['groups']:
                try:
                    data_dict = {'id': group_name}
                    group = get_action('group_show')(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group['name'])
                    else:
                        validated_groups.append(group['id'])
                except NotFound:
                    log.info('Group %s is not available' % group_name)
                    if remote_groups == 'create':
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except Exception:
                            # Best effort: skip groups that cannot be
                            # fetched from the remote site
                            log.error('Could not get remote group %s'
                                      % group_name)
                            continue

                        for key in [
                                'packages', 'created', 'users', 'groups',
                                'tags', 'extras', 'display_name'
                        ]:
                            group.pop(key, None)

                        get_action('group_create')(context, group)
                        log.info('Group %s has been newly created'
                                 % group_name)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])

            package_dict['groups'] = validated_groups

        context = {'model': model, 'session': Session, 'user': '******'}

        # Organization handling (source organization / 'remote_orgs'
        # setting) is disabled in this harvester; 'owner_org' is left
        # exactly as delivered in the harvested content.

        # Set default groups if needed. 'groups' may have been removed
        # above, so create it on demand instead of raising KeyError.
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            if 'extras' not in package_dict:
                package_dict['extras'] = {}
            source = harvest_object.job.source
            for key, value in default_extras.iteritems():
                if key not in package_dict['extras'] or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip('/'),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])
                    package_dict['extras'][key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get('resources', []):
            resource.pop('url_type', None)

        result = self._create_or_update_package(package_dict, harvest_object)

        # Only an explicit boolean True in the config enables read-only mode
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        # Same handling as the other import_stage implementations in this
        # file; the try block previously had no handler at all.
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    '''
    Import one harvested dataset, requiring a locally known organization.

    The JSON document in ``harvest_object.content`` is parsed and completed
    with the configured default tags, groups and extras. Remote groups are
    validated against the local site (and created when ``remote_groups`` is
    ``'create'``). The dataset is only imported when its remote
    ``organization`` name resolves to a local organization, and only when
    it is newer than an already existing local copy.

    :param harvest_object: harvest object carrying the remote dataset as a
        JSON string in ``content`` (or the ``DELETE`` marker)
    :returns: ``False`` when there is no object, no content, or the package
        failed validation; the result of ``self._delete_package`` for a
        ``DELETE`` marker; the string ``"unchanged"`` when the dataset is
        skipped (no or unknown organization, or not modified); ``True``
        otherwise
    '''
    log.debug('In CKANHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    if harvest_object.content == DELETE:
        return self._delete_package(harvest_object)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get('type') == 'harvest':
            log.warn('Remote dataset is a harvest source, ignoring...')
            return True

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        remote_groups = self.config.get('remote_groups', None)
        if remote_groups not in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            if 'groups' not in package_dict:
                package_dict['groups'] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []
            context = {'model': model, 'session': Session, 'user': '******'}

            for group_name in package_dict['groups']:
                try:
                    data_dict = {'id': group_name}
                    group = get_action('group_show')(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group['name'])
                    else:
                        validated_groups.append(group['id'])
                except NotFound:
                    log.info('Group %s is not available' % group_name)
                    if remote_groups == 'create':
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except RemoteResourceError:
                            log.error('Could not get remote group %s'
                                      % group_name)
                            continue

                        for key in ['packages', 'created', 'users',
                                    'groups', 'tags', 'extras',
                                    'display_name']:
                            group.pop(key, None)

                        get_action('group_create')(context, group)
                        log.info('Group %s has been newly created'
                                 % group_name)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])

            package_dict['groups'] = validated_groups

        context = {'model': model, 'session': Session, 'user': '******'}

        # Local harvest source organization
        # NOTE(review): the returned dataset is not used any more (the
        # source organization is never assigned). The call is kept because
        # it presumably fails when the source dataset is missing - confirm
        # whether that check is still wanted.
        get_action('package_show')(context, {'id': harvest_object.source.id})

        # check if remote org exist locally, otherwise skip the dataset
        remote_org = None
        if package_dict.get('organization'):
            remote_org = package_dict['organization']['name']

        if not remote_org:
            log.info('No organization in harvested dataset')
            return "unchanged"

        try:
            data_dict = {'id': remote_org}
            org = get_action('organization_show')(context, data_dict)
        except NotFound:
            log.info('No organization exist, not importing dataset')
            return "unchanged"

        package_dict['owner_org'] = org['id']

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        # Find any extras whose values are not strings and try to convert
        # them to strings, as non-string extras are not allowed anymore in
        # CKAN 2.0. A dataset without 'extras' is tolerated, and the keys
        # are copied first because entries may be deleted while looping.
        extras = package_dict.get('extras', {})
        for key in list(extras.keys()):
            if not isinstance(extras[key], basestring):
                try:
                    extras[key] = json.dumps(extras[key])
                except TypeError:
                    # If converting to a string fails, just delete it.
                    del extras[key]

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            if 'extras' not in package_dict:
                package_dict['extras'] = {}
            source = harvest_object.job.source
            for key, value in default_extras.iteritems():
                if key not in package_dict['extras'] or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip('/'),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])
                    package_dict['extras'][key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get('resources', []):
            resource.pop('url_type', None)

        # Check if package exists; skip it when the local copy is at least
        # as recent as the remote one
        data_dict = {'id': package_dict['id']}
        try:
            existing_package_dict = get_action('package_show')(
                context, data_dict)
            if ('metadata_modified' in package_dict and
                    package_dict['metadata_modified'] <=
                    existing_package_dict.get('metadata_modified')):
                return "unchanged"
        except NotFound:
            pass

        result = self._create_or_update_package(package_dict, harvest_object)

        if result and self.config.get('read_only', False) is True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        # Same handling as the other import_stage implementations in this
        # file; the try block previously had no handler at all.
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    '''
    Import one harvested dataset and assign it to the source's publisher.

    The JSON document in ``harvest_object.content`` is parsed, completed
    with the configured default tags, groups and extras, and handed to
    ``self._create_or_update_package``. Remote groups are discarded. When
    the harvest source has a ``publisher_id`` it is stored as the dataset's
    ``owner_org``.

    When the source configuration has ``read_only`` set to ``True`` and the
    create/update call reported success, the package roles are cleared and
    re-created: the configured harvest user becomes admin, ``visitor`` and
    ``logged_in`` become readers.

    :param harvest_object: harvest object carrying the remote dataset as a
        JSON string in ``content``
    :returns: ``False`` when there is no object, no content, or the package
        failed validation; ``True`` otherwise
    '''
    log.debug('In CKANHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        # Ignore remote groups for the time being. pop() instead of del so
        # that a remote dataset without a 'groups' key does not raise
        # KeyError (which the handler below would not catch).
        package_dict.pop('groups', None)

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        # The dataset belongs to the publisher of the harvest source
        if harvest_object.source.publisher_id:
            package_dict['owner_org'] = harvest_object.source.publisher_id

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})
        if default_extras:
            override_extras = self.config.get('override_extras', False)
            extras = package_dict.setdefault('extras', {})
            source = harvest_object.job.source
            for key, value in default_extras.iteritems():
                if key not in extras or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip('/'),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict['id'])
                    extras[key] = value

        result = self._create_or_update_package(package_dict, harvest_object)

        # Only an explicit boolean True in the config enables read-only mode
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        # Previously the method fell off the end here and returned None,
        # which callers cannot tell apart from a failed import.
        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        return False
def import_stage(self, harvest_object):
    """
    Import one dataset produced by the HTML harvester into CKAN.

    ``harvest_object.content`` is normalised to JSON and parsed. Extras
    given as a list of ``{"key": ..., "value": ...}`` entries are turned
    into a plain mapping and tags given as ``{"name": ...}`` entries into a
    list of names. The source configuration's default tags, groups and
    extras are then applied, remote groups are validated (or created when
    ``remote_groups`` is ``"create"``) and the result is passed to
    ``self._create_or_update_package``.

    :param harvest_object: harvest object carrying the dataset in
        ``content``
    :returns: ``False`` when there is no object, no content, or the package
        failed validation; ``True`` otherwise (including remote datasets of
        type ``harvest``, which are skipped)
    """
    log.debug("In HTMLHarvester import_stage")

    if not harvest_object:
        log.error("No harvest object received")
        return False

    if harvest_object.content is None:
        self._save_object_error(
            "Empty content for object %s" % harvest_object.id,
            harvest_object, "Import")
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        # NOTE(review): every apostrophe is turned into a double quote,
        # presumably because the content is stored single-quoted. This also
        # rewrites apostrophes inside text values; kept as-is because
        # stored content may depend on it.
        harvest_object.content = harvest_object.content.replace("'", '"')
        package_dict = json.loads(
            harvest_object.content.decode("utf-8", "ignore"))

        # Turn extras given as [{"key": k, "value": v}, ...] into {k: v}.
        # The mapping is built directly instead of concatenating a JSON
        # string by hand, so values containing quotes or backslashes no
        # longer break the conversion.
        extras = package_dict.get("extras", "")
        if isinstance(extras, list):
            extras_dict = {}
            for extra in extras:
                if not isinstance(extra, dict):
                    continue
                if "key" not in extra or "value" not in extra:
                    continue
                extra_value = extra["value"]
                # Entries with an empty value are dropped
                if not extra_value:
                    continue
                if isinstance(extra_value, (list, tuple)):
                    # A sequence of strings is concatenated, as before
                    extra_value = "".join(extra_value)
                extras_dict[extra["key"]] = extra_value
            if extras_dict:
                package_dict["extras"] = extras_dict

        # Turn tags given as [{"name": n}, ...] into [n, ...]. A fresh list
        # is used for every call: the old code stored a shared list in
        # package_dict and then emptied that same list.
        tags = package_dict.get("tags")
        if isinstance(tags, list):
            tag_names = []
            found_dict_tag = False
            for tag in tags:
                if isinstance(tag, dict):
                    found_dict_tag = True
                    if "name" in tag:
                        tag_names.append(tag["name"])
                else:
                    tag_names.append(tag)
            if found_dict_tag and tag_names:
                package_dict["tags"] = tag_names

        if package_dict.get("type") == "harvest":
            log.warn("Remote dataset is a harvest source, ignoring...")
            return True

        # Set default tags if needed
        default_tags = self.config.get("default_tags", [])
        if default_tags:
            tags = package_dict.setdefault("tags", [])
            tags.extend([t for t in default_tags if t not in tags])

        remote_groups = self.config.get("remote_groups", None)
        if remote_groups not in ("only_local", "create"):
            # Ignore remote groups
            package_dict.pop("groups", None)
        else:
            if "groups" not in package_dict:
                package_dict["groups"] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []
            context = {"model": model, "session": Session, "user": "******"}

            for group_name in package_dict["groups"]:
                try:
                    data_dict = {"id": group_name}
                    group = get_action("group_show")(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group["name"])
                    else:
                        validated_groups.append(group["id"])
                except NotFound:
                    log.info("Group %s is not available" % group_name)
                    if remote_groups == "create":
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except Exception:
                            # Best effort: skip groups that cannot be
                            # fetched from the remote site
                            log.error("Could not get remote group %s"
                                      % group_name)
                            continue

                        for key in ["packages", "created", "users",
                                    "groups", "tags", "extras",
                                    "display_name"]:
                            group.pop(key, None)

                        get_action("group_create")(context, group)
                        log.info("Group %s has been newly created"
                                 % group_name)
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])

            package_dict["groups"] = validated_groups

        context = {"model": model, "session": Session, "user": "******"}

        # Organization handling (source organization / "remote_orgs"
        # setting) is disabled in this harvester; "owner_org" is left
        # exactly as delivered in the harvested content.

        # Set default groups if needed. "groups" may have been removed
        # above, so create it on demand instead of raising KeyError.
        default_groups = self.config.get("default_groups", [])
        if default_groups:
            groups = package_dict.setdefault("groups", [])
            groups.extend([g for g in default_groups if g not in groups])

        # Set default extras if needed
        default_extras = self.config.get("default_extras", {})
        if default_extras:
            override_extras = self.config.get("override_extras", False)
            if "extras" not in package_dict:
                package_dict["extras"] = {}
            source = harvest_object.job.source
            for key, value in default_extras.iteritems():
                if key not in package_dict["extras"] or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=source.id,
                            harvest_source_url=source.url.strip("/"),
                            harvest_source_title=source.title,
                            harvest_job_id=harvest_object.job.id,
                            harvest_object_id=harvest_object.id,
                            dataset_id=package_dict["id"],
                        )
                    package_dict["extras"][key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get("resources", []):
            resource.pop("url_type", None)

        result = self._create_or_update_package(package_dict, harvest_object)

        # Only an explicit boolean True in the config enables read-only mode
        if result and self.config.get("read_only", False) == True:
            package = model.Package.get(package_dict["id"])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get("user", u"harvest")
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u"visitor", u"logged_in"):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        return True
    except ValidationError as e:
        # Same handling as the other import_stage implementations in this
        # file; the try block previously had no handler at all.
        self._save_object_error(
            "Invalid package with GUID %s: %r"
            % (harvest_object.guid, e.error_dict),
            harvest_object, "Import")
        return False
class CKANSchemingHarvester(CKANHarvester):
    '''
    A Harvester for CKAN instances with custom scheming dataset
    '''

    def info(self):
        '''
        Describe this harvester.

        :returns: dict with the ``name``, ``title``, ``description`` and
            ``form_config_interface`` of the harvester
        '''
        return {
            'name': 'ckan-scheming',
            'title': 'CKAN-scheming',
            'description':
                'Harvests remote CKAN instances with ckanext-scheming',
            'form_config_interface': 'Text'
        }

    def import_stage(self, harvest_object):
        '''
        Import one harvested dataset without touching its extras.

        The JSON document in ``harvest_object.content`` is parsed and
        completed with the configured default tags and groups. Remote
        groups and the remote organization are validated against the local
        site (and created when the ``remote_groups`` / ``remote_orgs``
        setting is ``'create'``); otherwise the dataset is assigned to the
        organization of the harvest source. The result is passed to
        ``self._create_or_update_package``.

        :param harvest_object: harvest object carrying the remote dataset
            as a JSON string in ``content``
        :returns: ``False`` when there is no object, no content, or the
            package failed validation; ``True`` otherwise (including
            remote datasets of type ``harvest``, which are skipped)
        '''
        log.debug('In CKANHarvester import_stage')

        context = {
            'model': model,
            'session': Session,
            'user': self._get_user_name()
        }

        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error(
                'Empty content for object %s' % harvest_object.id,
                harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        try:
            package_dict = json.loads(harvest_object.content)

            if package_dict.get('type') == 'harvest':
                log.warn('Remote dataset is a harvest source, ignoring...')
                return True

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            remote_groups = self.config.get('remote_groups', None)
            if remote_groups not in ('only_local', 'create'):
                # Ignore remote groups
                package_dict.pop('groups', None)
            else:
                if 'groups' not in package_dict:
                    package_dict['groups'] = []

                # check if remote groups exist locally, otherwise remove
                validated_groups = []

                for group_name in package_dict['groups']:
                    try:
                        data_dict = {'id': group_name}
                        group = get_action('group_show')(context, data_dict)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])
                    except NotFound:
                        log.info('Group %s is not available' % group_name)
                        if remote_groups == 'create':
                            try:
                                group = self._get_group(
                                    harvest_object.source.url, group_name)
                            except RemoteResourceError:
                                log.error('Could not get remote group %s'
                                          % group_name)
                                continue

                            for key in [
                                    'packages', 'created', 'users',
                                    'groups', 'tags', 'extras',
                                    'display_name'
                            ]:
                                group.pop(key, None)

                            get_action('group_create')(context, group)
                            log.info('Group %s has been newly created'
                                     % group_name)
                            if self.api_version == 1:
                                validated_groups.append(group['name'])
                            else:
                                validated_groups.append(group['id'])

                package_dict['groups'] = validated_groups

            # Local harvest source organization
            source_dataset = get_action('package_show')(
                context, {'id': harvest_object.source.id})
            local_org = source_dataset.get('owner_org')

            remote_orgs = self.config.get('remote_orgs', None)

            if remote_orgs not in ('only_local', 'create'):
                # Assign dataset to the source organization
                package_dict['owner_org'] = local_org
            else:
                # check if remote org exist locally, otherwise remove
                validated_org = None
                remote_org = package_dict.get('owner_org')

                if remote_org:
                    try:
                        data_dict = {'id': remote_org}
                        org = get_action('organization_show')(
                            context, data_dict)
                        validated_org = org['id']
                    except NotFound:
                        log.info('Organization %s is not available'
                                 % remote_org)
                        if remote_orgs == 'create':
                            try:
                                try:
                                    org = self._get_organization(
                                        harvest_object.source.url,
                                        remote_org)
                                except RemoteResourceError:
                                    # fallback if remote CKAN exposes
                                    # organizations as groups
                                    # this especially targets older
                                    # versions of CKAN
                                    org = self._get_group(
                                        harvest_object.source.url,
                                        remote_org)

                                for key in [
                                        'packages', 'created', 'users',
                                        'groups', 'tags', 'extras',
                                        'display_name', 'type'
                                ]:
                                    org.pop(key, None)

                                get_action('organization_create')(
                                    context, org)
                                log.info(
                                    'Organization %s has been newly created'
                                    % remote_org)
                                validated_org = org['id']
                            except (RemoteResourceError, ValidationError):
                                log.error('Could not get remote org %s'
                                          % remote_org)

                # Fall back to the source organization when the remote one
                # is missing or could not be resolved
                package_dict['owner_org'] = validated_org or local_org

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend(
                    [g for g in default_groups if g not in groups])

            # FIXME: enable only if not using ckanext-scheming dataset
            # schemas.
            # NOTE(review): the generic extras handling (converting
            # non-string extras to strings and applying 'default_extras' /
            # 'override_extras') was disabled here by wrapping it in a
            # string literal; extras are passed through untouched.

            # Clear remote url_type for resources (eg datastore, upload) as
            # we are only creating normal resources with links to the
            # remote ones
            for resource in package_dict.get('resources', []):
                resource.pop('url_type', None)

            result = self._create_or_update_package(
                package_dict, harvest_object)

            # Only an explicit boolean True in the config enables
            # read-only mode
            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    user = model.User.get(user_name)
                    model.PackageRole(package=package, user=user,
                                      role=model.Role.READER)

            return True
        except ValidationError as e:
            # Same handling as the other import_stage implementations in
            # this file; the try block previously had no handler at all.
            self._save_object_error(
                'Invalid package with GUID %s: %r'
                % (harvest_object.guid, e.error_dict),
                harvest_object, 'Import')
            return False
def import_stage(self, harvest_object):
    '''
    Imports each dataset from Socrata, into the CKAN server

    The content of the harvest object is converted to a package dict with
    ``socrataAdaptor.convertViewXml``, completed with the configured
    default tags and groups and handed to
    ``self._create_or_update_package``.

    When the source configuration has ``read_only`` set to ``True`` and the
    create/update call reported success, the package roles are cleared and
    re-created: the configured harvest user becomes admin, ``visitor`` and
    ``logged_in`` become readers.

    :param harvest_object: harvest object carrying the Socrata view in
        ``content``
    :returns: ``False`` when there is no object, no content, or the package
        failed validation; ``True`` otherwise
    '''
    log.debug('In SocrataHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
    log.debug(harvest_object.job.source.config)

    try:
        adaptor = socrataAdaptor()
        log.debug("Converting View")
        package_dict = adaptor.convertViewXml(
            harvest_object.id,
            harvest_object.source.url.rstrip('/'),
            harvest_object.content)
        log.debug(package_dict)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            tags = package_dict.setdefault('tags', [])
            tags.extend([t for t in default_tags if t not in tags])

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            groups = package_dict.setdefault('groups', [])
            groups.extend([g for g in default_groups if g not in groups])

        log.debug(package_dict)

        result = self._create_or_update_package(package_dict, harvest_object)

        # Only an explicit boolean True in the config enables read-only mode
        if result and self.config.get('read_only', False) == True:
            package = model.Package.get(package_dict['id'])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get('user', u'harvest')
            user = model.User.get(user_name)
            model.PackageRole(package=package, user=user,
                              role=model.Role.ADMIN)

            # Other users can only read
            for user_name in (u'visitor', u'logged_in'):
                user = model.User.get(user_name)
                model.PackageRole(package=package, user=user,
                                  role=model.Role.READER)

        # Previously the method fell off the end here and returned None,
        # which callers cannot tell apart from a failed import.
        return True
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r'
            % (harvest_object.guid, e.error_dict),
            harvest_object, 'Import')
        return False