def import_stage(self, harvest_object):
    """Normalise a harvested OGD Austria package and hand it to the base importer.

    Rewrites id, licence, name and schema extras, coerces ``resources`` and
    ``tags`` to lists, stores the result back on ``harvest_object.content``
    and delegates to the parent class's ``import_stage``.
    """
    # The original parsed harvest_object.content twice into two identical
    # dicts, rebound a dead ``new_content = {}`` and kept an unused
    # ``omit_tags`` local; one parse suffices.
    old_content = json.loads(harvest_object.content)
    new_content = old_content  # mutated in place, as before
    new_content['id'] = old_content.get('extras').get('metadata_identifier') or old_content.get('id')
    # Map the remote licence onto a local id, defaulting to cc-by.
    new_content['license_id'] = helper.map_license(old_content.get('license'), 'cc-by')
    new_content['name'] = self._gen_new_name(old_content.get('title'))
    new_content['metadata_modified'] = (old_content.get('extras').get('metadata_modified')
                                        or old_content.get('metadata_modified')
                                        or '')
    new_content['extras']['publisher'] = u'Land Oberösterreich'
    # OGD Austria 2.1 schema defaults for missing values.
    new_content['extras']['schema_language'] = old_content.get('extras').get('schema_language') or 'ger'
    new_content['extras']['schema_name'] = old_content.get('extras').get('schema_name') or 'OGD Austria Metadata 2.1'
    new_content['extras']['schema_characterset'] = old_content.get('extras').get('schema_characterset') or 'utf8'
    # Coerce resources/tags to lists; read them before reassigning because
    # new_content and old_content are the same dict.
    resources = old_content.get('resources')
    if isinstance(resources, list):
        new_content['resources'] = resources
    else:
        new_content['resources'] = [resources]
    tags = old_content.get('tags')
    if isinstance(tags, list):
        new_content['tags'] = tags
    else:
        new_content['tags'] = [tags]
    harvest_object.content = json.dumps(new_content)
    # (dropped the stray debug ``print harvest_object``)
    super(DataOoeGvAtHarvester, self).import_stage(harvest_object)
def test_0_check_setup(self):
    """Sanity-check the fixtures: instance A holds two packages, B is empty."""
    endpoint = '/api/rest/package'
    body_b = self.app.get(endpoint).body
    body_a = self.sub_app_get(endpoint)
    packages_b = json.loads(body_b or '[]')
    packages_a = json.loads(body_a or '[]')
    assert len(packages_a) == 2
    assert len(packages_b) == 0
def gather_stage(self,harvest_job):
    # Gather the ids of remote packages to harvest.  When the previous job
    # succeeded, only packages changed since then are requested; otherwise
    # (or when the remote lacks revision filtering) everything is fetched.
    # NOTE(review): this excerpt appears to end early — ``get_all_packages``
    # and ``package_ids`` are built but the code that uses them afterwards
    # is not visible here.
    log.debug('In CKANHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True
    package_ids = []
    self._set_config(harvest_job.source.config)
    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source==harvest_job.source) \
        .filter(HarvestJob.gather_finished!=None) \
        .filter(HarvestJob.id!=harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()
    # Get source URL
    base_url = harvest_job.source.url.rstrip('/')
    base_rest_url = base_url + self._get_rest_api_offset()
    base_search_url = base_url + self._get_search_api_offset()
    # Incremental harvest is only attempted when the last job finished
    # cleanly and actually produced objects.
    if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0):
        if not self.config.get('force_all',False):
            get_all_packages = False
            # Request only the packages modified since last harvest job
            last_time = previous_job.gather_finished.isoformat()
            url = base_search_url + '/revision?since_time=%s' % last_time
            try:
                content = self._get_content(url)
                revision_ids = json.loads(content)
                if len(revision_ids):
                    for revision_id in revision_ids:
                        url = base_rest_url + '/revision/%s' % revision_id
                        try:
                            content = self._get_content(url)
                        except ContentFetchError,e:
                            self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
                            continue
                        revision = json.loads(content)
                        # Deduplicate package ids across revisions.
                        for package_id in revision['packages']:
                            if not package_id in package_ids:
                                package_ids.append(package_id)
                else:
                    log.info('No packages have been updated on the remote CKAN instance since the last harvest job')
                    return None
            except urllib2.HTTPError,e:
                # HTTP 400 means the remote CKAN predates revision filtering:
                # fall back to harvesting everything.
                if e.getcode() == 400:
                    log.info('CKAN instance %s does not suport revision filtering' % base_url)
                    get_all_packages = True
                else:
                    self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job)
                    return None
def import_stage(self, harvest_object):
    # Filter and transform the harvested package, delegate to the base
    # importer, then sync package relationships and re-upload any resources
    # that must be hosted locally.
    package_dict = json.loads(harvest_object.content)
    if not self._should_import_local(package_dict):
        # Not wanted locally: mark deleted rather than dropping it outright.
        package_dict['state'] = 'deleted'
    else:
        package_dict = self._apply_package_extras_white_list(package_dict)
        package_dict = self._apply_package_resource_extras_black_list(package_dict)
        package_dict = self._fix_date_in_fields(package_dict)
        package_dict = self._set_license(package_dict)
        package_dict = self._pop_black_list_resources_by_type(package_dict)
    harvest_object.content = json.dumps(package_dict)
    # Resources that need re-uploading are removed from the dict and handled
    # after the base import has created/updated the package.
    upload_resources = self._pop_upload_resources(package_dict)
    import_stage_result = super(GuiaHarvesterPlugin, self).import_stage(harvest_object)
    if import_stage_result:
        package_dict = json.loads(harvest_object.content)
        harvested_rels = package_dict.get('relationships', [])
        try:
            this_package = model.Package.get(package_dict['name'])
            if not this_package:
                raise logic.NotFound()
        except logic.NotFound as nf:
            log.info('import_stage(): could not find package "{0}"; relationships not updated: {1}'.format(package_dict['name'], nf))
            return import_stage_result
        existing_rels = this_package.get_relationships()
        self._update_relationships(existing_rels, harvested_rels)
    # Re-upload each file resource; errors are recorded per resource and do
    # not abort the rest of the import.
    for resource_dict in upload_resources:
        resource_url = resource_dict['url']
        resource_filename = resource_url.split('/')[-1]
        try:
            response = requests.get(resource_url)
            resource_file = StringIO(response.content)
        except Exception,e:
            self._save_object_error('Resource not harvested for package "{0}". Unable to fetch resource from "{1}": {2}'.format(package_dict['name'], resource_url, e), harvest_object, 'Import')
            continue
        # Wrap the downloaded bytes as a FieldStorage so resource_create
        # treats them as a file upload.
        cfs = FieldStorage()
        cfs.file = resource_file
        cfs.filename = resource_filename
        resource_dict['upload'] = cfs
        # Drop fields the action API would reject or recompute.
        if 'created' in resource_dict:
            del resource_dict['created']
        if 'last_modified' in resource_dict:
            del resource_dict['last_modified']
        if 'api' in resource_dict:
            del resource_dict['api']
        try:
            the_resource = toolkit.get_action('resource_create')(data_dict=resource_dict)
        except Exception,e:
            self._save_object_error('Resource not harvested for package "{0}". Unable to import the resource originally from "{1}": {2}'.format(package_dict['name'], resource_url, e), harvest_object, 'Import')
            continue
def get_pkg_ids_for_organizations(orgs):
    """Return the set of package ids belonging to the given organizations.

    Pages through the search API (using the number of ids gathered so far
    as the offset) until all results reported by ``count`` are collected.
    Relies on ``base_search_url`` and ``self`` from the enclosing scope.
    """
    pkg_ids = set()
    for organization in orgs:
        url = base_search_url + '/dataset?organization=%s' % organization
        content = self._get_content(url)
        content_json = json.loads(content)
        result_count = int(content_json['count'])
        pkg_ids |= set(content_json['results'])
        # BUG FIX: the original condition was
        #   while len(pkg_ids) < result_count or not content_json['results']:
        # which spins forever once the server returns an empty page.  Keep
        # paging only while more results are expected AND the last page was
        # non-empty.
        while len(pkg_ids) < result_count and content_json['results']:
            url = base_search_url + '/dataset?organization=%s&offset=%s' % (organization, len(pkg_ids))
            content = self._get_content(url)
            content_json = json.loads(content)
            pkg_ids |= set(content_json['results'])
    return pkg_ids
def fetch_stage(self,harvest_object):
    ''' Fetches the list of datasets from the catalog '''
    # Looks up per-catalogue config from MongoDB, builds the fetch URL for
    # this object's guid and tries three ways to decode the response as JSON.
    log.debug('In CustomHarvester fetch_stage')
    self._set_config(harvest_object.job.source.config)
    db=client.odm
    db_jobs=db.jobs
    config=db_jobs.find_one({"cat_url":harvest_object.source.url})
    api_key=config['apikey']
    dataset_url=config['dataset_url']
    metadata_mappings=json.loads(config['metadata_mappings'])
    if "data.norge.no" in harvest_object.source.url.rstrip('/'):
        # data.norge.no paginates its DCAT feed; hard-coded first four pages.
        many_datasets_list=['/api/dcat/data.json?page=1','/api/dcat/data.json?page=2','/api/dcat/data.json?page=3','/api/dcat/data.json?page=4']
    else:
        # NOTE(review): neither ``many_datasets_list`` nor
        # ``datasets_list_url`` is defined on this path — this branch would
        # raise NameError if reached; presumably dead or defined elsewhere.
        many_datasets_list.append(datasets_list_url)
    if dataset_url!="":
        # Substitute api key and dataset id placeholders into the template.
        fetch_url=harvest_object.source.url.rstrip('/')+dataset_url.replace("{api}",api_key).replace("{id}", harvest_object.guid)
        #print(fetch_url)
    else:
        fetch_url=""
    dataset={}
    features=[]
    if fetch_url!="":
        result=urllib2.urlopen(fetch_url)
        try:
            try:
                # First attempt: plain JSON body.
                dataset=json.load(result)
            except:
                try:
                    # Second attempt: re-request with an explicit JSON Accept header.
                    headers = {'Accept':'application/json'}
                    r=urllib2.Request(fetch_url,headers=headers)
                    dataset=json.loads(urllib2.urlopen(r).read())
                except:
                    # Last resort: JSONP-style payload ``null(...)`` — rewrite
                    # it into an assignment and exec it.
                    # NOTE(review): exec on remote content is dangerous
                    # (arbitrary code execution if the endpoint is hostile).
                    result=urllib2.urlopen(fetch_url)
                    read=result.read()
                    read=read.replace("null(","dataset=").rstrip(')')
                    exec(read)
                    #print(dataset)
        except Exception, e:
            log.exception('Could not load ' + fetch_url)
            self._save_gather_error('%r'%e.message,harvest_object)
def validate_config(self,config):
    # Validate the harvest-source config JSON; raises ValueError for bad
    # entries.  An empty config is returned unchanged.
    # NOTE(review): the excerpt ends inside the outer ``try:`` — its
    # except clause and the final return are outside this view.
    print 'VALIDATE CONFIG'
    if not config:
        return config
    try:
        config_obj = json.loads(config)
        if 'api_version' in config_obj:
            try:
                int(config_obj['api_version'])
            except ValueError:
                raise ValueError('api_version must be an integer')
        if 'default_tags' in config_obj:
            if not isinstance(config_obj['default_tags'],list):
                raise ValueError('default_tags must be a list')
        if 'default_groups' in config_obj:
            if not isinstance(config_obj['default_groups'],list):
                raise ValueError('default_groups must be a list')
            # Check if default groups exist
            context = {'model':model,'user':c.user}
            for group_name in config_obj['default_groups']:
                try:
                    group = get_action('group_show')(context,{'id':group_name})
                except NotFound,e:
                    raise ValueError('Default group not found')
        if 'default_extras' in config_obj:
            if not isinstance(config_obj['default_extras'],dict):
                raise ValueError('default_extras must be a dictionary')
        # 'from'/'until' bound the harvest window; both must be ISO-8601 Z times.
        if 'from' in config_obj:
            try:
                datetime.strptime(config_obj['from'], '%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                raise ValueError("Incorrect date format, should be yyyy-mm-ddThh:mm:ssZ ")
            # int(config_obj['from'])
        if 'until' in config_obj:
            try:
                datetime.strptime(config_obj['until'], '%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                raise ValueError("Incorrect date format, should be yyyy-mm-ddThh:mm:ssZ ")
        #if 'vocabulary' in config_obj:
        #    if config_obj['vocabulary'] != 'metashare' and config_obj['vocabulary'] != 'olac' and config_obj['vocabulary'] !='cmdi':
        #        raise ValueError("Incorrect vocabulary, please choose between metashare, olac and cmdi")
        #else:
        #    raise ValueError("Please provide a vocabulary, you can choose between metashare, olac and cmdi")
        if 'user' in config_obj:
            # Check if user exists
            context = {'model':model,'user':c.user}
            try:
                user = get_action('user_show')(context,{'id':config_obj.get('user')})
            except NotFound,e:
                raise ValueError('User not found')
def test_revisions__since_revision_id__latest(self):
    """Revisions since the penultimate revision must include the latest dataset."""
    newest_rev, since_rev = self._get_last_and_penultimate_revisions()
    url = "/api/util/revisions?since-revision-id=%s" % since_rev.id
    response = self.app.get(url, status=[200])
    data = json.loads(response.body)
    assert isinstance(data, dict), data
    assert set(data.keys()) >= set(("since_timestamp", "datasets")), data.keys()
    assert_equal(data["since_revision_id"], since_rev.id)
    assert_equal(data["newest_revision_id"], newest_rev.id)
    assert_equal(data["number_of_revisions"], 2)
    assert_equal(data["results_limited"], False)
    first = data["datasets"][0]
    assert_equal(first["name"], "latest")
    assert_equal(first["notes"].strip(), "Latest dataset.")
    assert first["publisher_title"] in ("National Health Service", "Department of Health"), first["publisher_title"]
    expected_keys = set(("title", "dataset_link", "notes", "publisher_title", "publisher_link"))
    assert set(first.keys()) >= expected_keys, first.keys()
    # Resolving the links below requires a real database backend.
    if model.engine_is_sqlite():
        raise SkipTest("Link tests need postgres")
    page = self.app.get(first["dataset_link"], status=[200])
    assert "latest" in page.body
    page = self.app.get(first["publisher_link"], status=[200])
    assert "National Health Service" in page.body, page
def import_stage(self,harvest_object):
    # Import a harvested Dati Piemonte package: stamp catalogue/EU extras on
    # the dict and create or update the local package.  Returns False for
    # missing/empty input; otherwise returns the base helper's result.
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id,harvest_object,'Import')
        return False
    try:
        package_dict = json.loads(harvest_object.content)
        package_dict['id'] = harvest_object.guid
        package_dict['name'] = self._gen_new_name(package_dict['title'])
        # Common extras
        package_dict['extras']['harvest_catalogue_name'] = u'Dati Piemonte'
        package_dict['extras']['harvest_catalogue_url'] = u'http://dati.piemonte.it/'
        package_dict['extras']['eu_country'] = u'IT'
        package_dict['extras']['eu_nuts1'] = u'ITC'
        package_dict['extras']['eu_nuts2'] = u'ITC1'
        return self._create_or_update_package(package_dict, harvest_object)
    except Exception, e:
        # Record the failure against this object; implicitly returns None.
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')
def group_to_api2(group, context):
    """Dictize *group* for API v2: JSON-decode extras, sort package ids."""
    dictized = group_dictize(group, context)
    dictized["extras"] = {extra["key"]: json.loads(extra["value"])
                          for extra in dictized["extras"]}
    dictized["packages"] = sorted(package["id"] for package in dictized["packages"])
    return dictized
def _get_group(self, base_url, group_name):
    """Fetch a group dict from the remote CKAN REST API.

    Builds ``<base_url><rest offset>/group/<group_name>``, fetches it and
    returns the decoded JSON.  Any fetch/decode error propagates to the
    caller unchanged.
    """
    url = base_url + self._get_rest_api_offset() + '/group/' + group_name
    # The original wrapped this in ``try: ... except Exception, e: raise e``,
    # which only discarded the traceback; let errors propagate untouched.
    content = self._get_content(url)
    return json.loads(content)
def _set_config(self,config_str): if config_str: self.config = json.loads(config_str) self.api_version = int(self.config['api_version']) log.debug('Using config: %r', self.config) else: self.config = {}
def fetch_stage(self, harvest_object): log.debug('In SwisstopoHarvester fetch_stage') # Get the URL log.debug(json.loads(harvest_object.content)) name = json.loads(harvest_object.content)['name'] log.debug(harvest_object.content) # Get contents try: harvest_object.save() log.debug('successfully processed ' + name) return True except Exception, e: log.exception(e) raise
def validate_config(self, config):
    """Validate a harvest-source config string.

    An empty or missing config is returned unchanged.  Otherwise the string
    must parse as JSON (``json.loads`` raises ValueError when it does not);
    the original string — not the parsed object — is returned.
    """
    if not config:
        return config
    json.loads(config)  # raises on malformed JSON; parsed value is unused
    return config
def validate_config(self,config):
    # Validate the harvest-source config JSON; raises ValueError for bad
    # entries.  An empty config is returned unchanged.
    # NOTE(review): the excerpt ends inside the outer ``try:`` — its except
    # clause and the final return are outside this view.
    if not config:
        return config
    try:
        config_obj = json.loads(config)
        if 'default_tags' in config_obj:
            if not isinstance(config_obj['default_tags'],list):
                raise ValueError('default_tags must be a list')
        if 'default_groups' in config_obj:
            if not isinstance(config_obj['default_groups'],list):
                raise ValueError('default_groups must be a list')
            # Check if default groups exist
            context = {'model':model,'user':c.user}
            for group_name in config_obj['default_groups']:
                try:
                    group = get_action('group_show')(context,{'id':group_name})
                except NotFound,e:
                    raise ValueError('Default group not found')
        if 'default_extras' in config_obj:
            if not isinstance(config_obj['default_extras'],dict):
                raise ValueError('default_extras must be a dictionary')
        if 'user' in config_obj:
            # Check if user exists
            context = {'model':model,'user':c.user}
            try:
                user = get_action('user_show')(context,{'id':config_obj.get('user')})
            except NotFound,e:
                raise ValueError('User not found')
def import_stage(self,harvest_object):
    # Import a harvested SRDA package: stamp catalogue extras, make sure
    # every extra value is a string, then create/update the local package.
    log.debug('In SRDAHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, 'Import')
        return False
    #self._set_config(harvest_object.job.source.config)
    try:
        package_dict = json.loads(harvest_object.content)
        package_dict["id"] = harvest_object.guid
        # Catalogue name/URL extras (keys are Chinese for "database name" /
        # "database URL").
        package_dict["extras"][u"資料庫名稱"] = u'SRDA'
        package_dict["extras"][u"資料庫網址"] = u'http://srda.sinica.edu.tw/'
        #print package_dict
        # Extras must be strings: JSON-encode anything that is not already.
        for key in package_dict['extras'].keys():
            if not isinstance(package_dict['extras'][key], basestring):
                try:
                    package_dict['extras'][key] = json.dumps(package_dict['extras'][key])
                except TypeError:
                    # If converting to a string fails, just delete it.
                    del package_dict['extras'][key]
        result = self._create_or_update_package(package_dict,harvest_object)
        return True
    except ValidationError,e:
        # Validation failures are recorded; implicitly returns None.
        self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import')
def fetch_stage(self, harvest_object):
    """Run the scraper over this object's dataset dict and persist the result."""
    log.debug('In fetch_stage ' + self.info()['title'])
    dataset = json.loads(harvest_object.content)
    dataset = self.scraper.fetch(dataset)
    harvest_object.content = json.dumps(dataset)
    harvest_object.save()
    return True
def test_revisions__since_revision_id__latest(self):
    """Querying revisions since the penultimate one returns the latest dataset."""
    last_rev, penultimate_rev = self._get_last_and_penultimate_revisions()
    result = self.app.get(
        '/api/util/revisions?since-revision-id=%s' % penultimate_rev.id,
        status=[200])
    payload = json.loads(result.body)
    assert isinstance(payload, dict), payload
    assert set(payload.keys()) >= set(('since_timestamp', 'datasets')), payload.keys()
    assert_equal(payload['since_revision_id'], penultimate_rev.id)
    assert_equal(payload['newest_revision_id'], last_rev.id)
    assert_equal(payload['number_of_revisions'], 2)
    assert_equal(payload['results_limited'], False)
    dataset = payload['datasets'][0]
    assert_equal(dataset['name'], 'latest')
    assert_equal(dataset['notes'].strip(), 'Latest dataset.')
    assert dataset['publisher_title'] in ('National Health Service', 'Department of Health'), dataset['publisher_title']
    required_keys = set(('title', 'dataset_link', 'notes', 'publisher_title', 'publisher_link'))
    assert set(dataset.keys()) >= required_keys, dataset.keys()
    # Following the embedded links needs a real database backend.
    if model.engine_is_sqlite():
        raise SkipTest("Link tests need postgres")
    linked = self.app.get(dataset['dataset_link'], status=[200])
    assert 'latest' in linked.body
    linked = self.app.get(dataset['publisher_link'], status=[200])
    assert 'National Health Service' in linked.body, linked
def test_create_package(self):
    """POST a new package via the REST API and verify the response and DB state."""
    fixture = self.get_package_fixture('test1')
    endpoint = '/api/rest/package'
    payload = '%s=1' % json.dumps(fixture)
    response = self.app.post(endpoint, payload, status=[201],
                             extra_environ=self.extra_environ_sysadmin)
    # The API response echoes the created package.
    created = json.loads(response.body)
    assert_equal(created['name'], fixture['name'])
    assert created['id']
    assert_equal(created['title'], fixture['title'])
    assert_equal(created['license_id'], fixture['license_id'])
    assert_equal(created['extras'].get('temporal_coverage-to'),
                 fixture['extras']['temporal_coverage-to'])
    assert_equal(created['resources'][0].get('description'),
                 fixture['resources'][0]['description'])
    assert_equal(set(created['tags']), set(fixture['tags']))
    # The package must also be persisted correctly.
    pkg = model.Package.by_name(fixture['name'])
    pkg_dict = get_action('package_show')(self.context, {'id': fixture['name']})
    assert_equal(pkg.name, fixture['name'])
    assert_equal(pkg.title, fixture['title'])
    assert_equal(pkg.extras.get('temporal_coverage-to'),
                 fixture['extras']['temporal_coverage-to'])
    assert_equal(pkg.resources[0].description,
                 fixture['resources'][0]['description'])
    assert_equal(set([tag['name'] for tag in pkg_dict['tags']]),
                 set(fixture['tags']))
def test_get_package(self):
    """GET a single package via the REST API and check every exposed field."""
    endpoint = '/api/rest/package/%s' % self.pkg_name
    response = self.app.get(endpoint, status=[200])
    content_type = response.header_dict['Content-Type']
    assert 'application/json' in content_type, content_type
    pkg = json.loads(response.body)
    assert_equal(pkg['name'], self.pkg_name)
    assert_equal(pkg['id'], self.pkg_id)
    assert_equal(pkg['notes'], u'Ratings for all articles on the Directgov website. One data file is available per day. Sets of files are organised by month on the download page')
    assert_equal(pkg['license_id'], 'uk-ogl')
    assert_equal(pkg['license'], u'UK Open Government Licence (OGL)')
    expected_tags = set(['article', 'cota', 'directgov', 'information', 'ranking', 'rating'])
    assert_equal(set(pkg['tags']), expected_tags)
    assert self._is_member_of_org(pkg, 'national-health-service')
    # The extras must contain at least the standard DGU keys.
    extras = pkg['extras']
    expected_extra_keys = set((
        'access_constraints', 'contact-email', 'contact-name', 'contact-phone',
        'foi-email', 'foi-name', 'foi-phone', 'foi-web', 'geographic_coverage',
        'mandate', 'temporal_coverage-to', 'temporal_coverage-from',
        'temporal_granularity'))
    assert set(extras.keys()) >= expected_extra_keys, set(extras.keys()) - expected_extra_keys
    assert_equal(extras.get('temporal_coverage-from'), '2010-01-01')
    assert_equal(len(pkg['resources']), 1)
    resource = pkg['resources'][0]
    assert_equal(resource['description'], 'Directgov Article Ratings')
    assert_equal(resource['url'], 'http://innovate-apps.direct.gov.uk/cota/')
    assert_equal(resource['format'], 'HTML')
def test_new(self):
    """Creating a dataset with a 'spatial' extra must store a PackageExtent row."""
    dataset_name = "test-spatial-dataset-1"
    new_url = url_for(controller="package", action="new")
    response = self.app.get(new_url, extra_environ=self.extra_environ)
    assert "Add - Datasets" in response
    form = response.forms["dataset-edit"]
    prefix = ""
    form[prefix + "name"] = dataset_name
    form[prefix + "extras__0__key"] = u"spatial"
    form[prefix + "extras__0__value"] = self.geojson_examples["point"]
    response = form.submit("save", extra_environ=self.extra_environ)
    assert not "Error" in response, response
    package = Package.get(dataset_name)
    # A PackageExtent object must have been created for the new package,
    # matching the submitted point geometry.
    extent = Session.query(PackageExtent).filter(PackageExtent.package_id == package.id).first()
    point = json.loads(self.geojson_examples["point"])
    assert extent
    assert extent.package_id == package.id
    assert Session.scalar(extent.the_geom.x) == point["coordinates"][0]
    assert Session.scalar(extent.the_geom.y) == point["coordinates"][1]
    assert Session.scalar(extent.the_geom.srid) == self.db_srid
def package_to_api1(pkg, context):
    # Convert a Package model object into the legacy API-v1 dict form:
    # flattened groups/tags, JSON-decoded extras, rendered notes, API-style
    # resources, licence/rating/url metadata and flattened relationships.
    dictized = package_dictize(pkg, context)
    dictized.pop("revision_timestamp")
    dictized["groups"] = [group["name"] for group in dictized["groups"]]
    # Vocabulary tags are internal and excluded from the free-tag list.
    dictized["tags"] = [tag["name"] for tag in dictized["tags"] if not tag.get("vocabulary_id")]
    # Extras are stored JSON-encoded; decode each value.
    dictized["extras"] = dict((extra["key"], json.loads(extra["value"])) for extra in dictized["extras"])
    dictized["notes_rendered"] = ckan.misc.MarkdownFormat().to_html(pkg.notes)
    resources = dictized["resources"]
    for resource in resources:
        resource_dict_to_api(resource, pkg.id, context)
    if pkg.resources:
        # API v1 exposed the first resource's url as download_url.
        dictized["download_url"] = pkg.resources[0].url
    dictized["license"] = pkg.license.title if pkg.license else None
    dictized["ratings_average"] = pkg.get_average_rating()
    dictized["ratings_count"] = len(pkg.ratings)
    site_url = config.get("ckan.site_url", None)
    if site_url:
        dictized["ckan_url"] = "%s/dataset/%s" % (site_url, pkg.name)
    metadata_modified = pkg.metadata_modified
    dictized["metadata_modified"] = metadata_modified.isoformat() if metadata_modified else None
    metadata_created = pkg.metadata_created
    dictized["metadata_created"] = metadata_created.isoformat() if metadata_created else None
    # Flatten subject/object relationships.  Object-side relationships get
    # their type reversed so they read from this package's point of view.
    subjects = dictized.pop("relationships_as_subject")
    objects = dictized.pop("relationships_as_object")
    relationships = []
    for relationship in objects:
        model = context["model"]
        swap_types = model.PackageRelationship.forward_to_reverse_type
        type = swap_types(relationship["type"])
        # NOTE(review): ``pkg.get(...)`` presumably resolves the Package
        # ``get`` classmethod via the instance (equivalent to
        # model.Package.get) — confirm against the model definition.
        relationships.append(
            {
                "subject": pkg.get(relationship["object_package_id"]).name,
                "type": type,
                "object": pkg.get(relationship["subject_package_id"]).name,
                "comment": relationship["comment"],
            }
        )
    for relationship in subjects:
        model = context["model"]
        relationships.append(
            {
                "subject": pkg.get(relationship["subject_package_id"]).name,
                "type": relationship["type"],
                "object": pkg.get(relationship["object_package_id"]).name,
                "comment": relationship["comment"],
            }
        )
    dictized["relationships"] = relationships
    return dictized
def check_spatial_extra(self,package):
    '''
    For a given package, looks at the spatial extent (as given in the
    extra "spatial" in GeoJSON format) and records it in PostGIS.

    Raises a ValidationError when the extra's value is not valid JSON or
    cannot be turned into a geometry.
    '''
    if not package.id:
        log.warning('Couldn\'t store spatial extent because no id was provided for the package')
        return
    # TODO: deleted extra
    for extra in package.extras_list:
        if extra.key == 'spatial':
            if extra.state == 'active' and extra.value:
                try:
                    log.debug('Received: %r' % extra.value)
                    geometry = json.loads(extra.value)
                except ValueError,e:
                    error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                    raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                except TypeError,e:
                    error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                    raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                try:
                    save_package_extent(package.id,geometry)
                except ValueError,e:
                    error_dict = {'spatial':[u'Error creating geometry: %s' % str(e)]}
                    raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                except Exception, e:
                    raise
                    # NOTE(review): everything below is unreachable because of
                    # the bare ``raise`` above — presumably leftover debugging;
                    # the reconstruction keeps the original statement order.
                    if bool(os.getenv('DEBUG')):
                        raise
                    error_dict = {'spatial':[u'Error: %s' % str(e)]}
                    raise p.toolkit.ValidationError(error_dict, error_summary=package_error_summary(error_dict))
def test_edit_package(self):
    """PUT an edit to an existing package and verify the response and DB state."""
    # create the package to be edited
    pkg_name = 'test4'
    fixture = self.get_package_fixture(pkg_name)
    pkg = CreateTestData.create_arbitrary(fixture)
    # send the edit over the REST API
    endpoint = '/api/rest/package/%s' % pkg_name
    edited = copy.deepcopy(fixture)
    edited['title'] = 'Edited title'
    payload = '%s=1' % json.dumps(edited)
    response = self.app.put(endpoint, payload, status=[200],
                            extra_environ=self.extra_environ_sysadmin)
    # the response must reflect the edit
    returned = json.loads(response.body)
    assert_equal(returned['name'], fixture['name'])
    assert returned['id']
    assert_equal(returned['title'], 'Edited title')
    assert_equal(returned['license_id'], fixture['license_id'])
    assert returned['organization']['name'] == fixture['groups'][0]
    assert_equal(returned['extras'].get('temporal_coverage-to'),
                 fixture['extras']['temporal_coverage-to'])
    assert_equal(returned['resources'][0].get('description'),
                 fixture['resources'][0]['description'])
    assert_equal(set(returned['tags']), set(fixture['tags']))
    # the edit must be persisted
    pkg = model.Package.by_name(fixture['name'])
    pkg_dict = get_action('package_show')(self.context, {'id': fixture['name']})
    assert_equal(pkg.name, fixture['name'])
    assert_equal(pkg.title, 'Edited title')
    assert pkg.get_organization().name == fixture['groups'][0]
    assert_equal(pkg.extras.get('temporal_coverage-to'),
                 fixture['extras']['temporal_coverage-to'])
    assert_equal(pkg.resources[0].description,
                 fixture['resources'][0]['description'])
    assert_equal(set([tag['name'] for tag in pkg_dict['tags']]),
                 set(fixture['tags']))
def _parse_recline_state(self, params):
    """Rebuild a read-only recline state dict from the request params.

    Only state_version 1 is understood; anything else yields None.
    """
    if int(request.params.get('state_version', '1')) != 1:
        return None
    state = {}
    for key, value in request.params.items():
        # Values arrive JSON-encoded where possible; keep raw strings otherwise.
        try:
            value = json.loads(value)
        except ValueError:
            pass
        state[key] = value
    state.pop('width', None)
    state.pop('height', None)
    state['readOnly'] = True
    # previous versions of recline setup used elasticsearch_url attribute
    # for data api url - see http://trac.ckan.org/ticket/2639
    # fix by relocating this to url attribute which is the default location
    if 'dataset' in state and 'elasticsearch_url' in state['dataset']:
        state['dataset']['url'] = state['dataset']['elasticsearch_url']
    # Ensure only the currentView is available, defaulting to grid view.
    if not state.get('currentView', None):
        state['currentView'] = 'grid'
    for key in list(state.keys()):
        if key.startswith('view-') and not key.endswith(state['currentView']):
            state.pop(key)
    return state
def check_spatial_extra(self,package):
    # Record the package's "spatial" extra (GeoJSON) in PostGIS, raising a
    # ValidationError when the value cannot be decoded or saved.
    if not package.id:
        log.warning('Couldn\'t store spatial extent because no id was provided for the package')
        return
    # TODO: deleted extra
    for extra in package.extras_list:
        if extra.key == 'spatial':
            if extra.state == 'active':
                try:
                    log.debug('Received: %r' % extra.value)
                    geometry = json.loads(extra.value)
                except ValueError,e:
                    error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                    raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                except TypeError,e:
                    error_dict = {'spatial':[u'Error decoding JSON object: %s' % str(e)]}
                    raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                try:
                    save_package_extent(package.id,geometry)
                except ValueError,e:
                    error_dict = {'spatial':[u'Error creating geometry: %s' % str(e)]}
                    raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
                except Exception, e:
                    # Any other failure is also surfaced as a validation error.
                    error_dict = {'spatial':[u'Error: %s' % str(e)]}
                    raise ValidationError(error_dict, error_summary=package_error_summary(error_dict))
def _parse_recline_state(self, params):
    """Build a read-only recline state dict from the request params.

    Returns None unless state_version is 1.
    """
    version = int(request.params.get('state_version', '1'))
    if version != 1:
        return None
    state = {}
    for param_key, param_value in request.params.items():
        # JSON-decode each value where possible, keep the raw string otherwise.
        try:
            state[param_key] = json.loads(param_value)
        except ValueError:
            state[param_key] = param_value
    state.pop('width', None)
    state.pop('height', None)
    state['readOnly'] = True
    # Ensure only the currentView survives, defaulting to grid view.
    if not state.get('currentView', None):
        state['currentView'] = 'grid'
    for param_key in list(state.keys()):
        if param_key.startswith('view-') and not param_key.endswith(state['currentView']):
            state.pop(param_key)
    return state
def _parse_recline_state(self, params):
    """Assemble a read-only recline state from request params (version 1 only)."""
    if int(request.params.get("state_version", "1")) != 1:
        return None
    result = {}
    for key, raw in request.params.items():
        # Decode JSON values where possible; keep the raw string otherwise.
        try:
            result[key] = json.loads(raw)
        except ValueError:
            result[key] = raw
    result.pop("width", None)
    result.pop("height", None)
    result["readOnly"] = True
    # previous versions of recline setup used elasticsearch_url attribute
    # for data api url - see http://trac.ckan.org/ticket/2639
    # fix by relocating this to url attribute which is the default location
    dataset = result.get("dataset")
    if dataset is not None and "elasticsearch_url" in dataset:
        dataset["url"] = dataset["elasticsearch_url"]
    # Keep only the current view, defaulting to grid when none is given.
    if not result.get("currentView", None):
        result["currentView"] = "grid"
    current = result["currentView"]
    for key in list(result.keys()):
        if key.startswith("view-") and not key.endswith(current):
            result.pop(key)
    return result
def test_create_extent(self):
    """A PackageExtent built from point GeoJSON round-trips through the DB."""
    dataset = factories.Dataset()
    point = json.loads(self.geojson_examples['point'])
    shape = asShape(point)
    extent = PackageExtent(package_id=dataset['id'],
                           the_geom=WKTElement(shape.wkt, self.db_srid))
    extent.save()
    assert_equals(extent.package_id, dataset['id'])
    if legacy_geoalchemy:
        assert_equals(Session.scalar(extent.the_geom.x),
                      point['coordinates'][0])
        assert_equals(Session.scalar(extent.the_geom.y),
                      point['coordinates'][1])
        assert_equals(Session.scalar(extent.the_geom.srid), self.db_srid)
    else:
        from sqlalchemy import func
        assert_equals(
            Session.query(func.ST_X(extent.the_geom)).first()[0],
            point['coordinates'][0])
        assert_equals(
            Session.query(func.ST_Y(extent.the_geom)).first()[0],
            point['coordinates'][1])
        assert_equals(extent.the_geom.srid, self.db_srid)
def before_index(self, pkg_dict):
    '''Adds the fulltext of a package to the dict what will be given to the
    solr for indexing.

    @param pkg_dict: flattened dict (except for multli-valued fields such as tags)
        containing all the terms which will be sent to the indexer
    @return: modified package dict
    '''
    # BUG FIX: the original guarded ``if pkg_dict and ...`` but then
    # dereferenced pkg_dict unconditionally below; bail out early instead.
    if not pkg_dict:
        return pkg_dict
    # ``dict.has_key`` is deprecated; use the ``in`` operator.
    if 'extras_full_text_search' in pkg_dict:
        del pkg_dict['extras_full_text_search']
    data_dict = json.loads(pkg_dict['data_dict'])
    fulltext = [x for x in data_dict['extras'] if 'full_text_search' in x['key']]
    if len(fulltext) > 0:
        # Strip the fulltext extras out of data_dict and index the first one.
        extras = [x for x in data_dict['extras'] if not 'full_text_search' in x['key']]
        data_dict['extras'] = extras
        pkg_dict['fulltext'] = fulltext[0]['value']
    else:
        # No inline fulltext: fall back to the stored fulltext record, if any.
        fulltext_dict = _get_fulltext(pkg_dict['id'])
        if fulltext_dict:
            pkg_dict['fulltext'] = fulltext_dict.text
    pkg_dict['data_dict'] = json.dumps(data_dict)
    return pkg_dict
def import_stage(self, harvest_object): log.debug('In DSPCKANHarvester import_stage') context = { 'model': model, 'session': Session, 'user': self._get_user_name() } if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) if package_dict.get('type') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return True # Set default tags if needed default_tags = self.config.get('default_tags', []) if default_tags: if not 'tags' in package_dict: package_dict['tags'] = [] package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) if not remote_groups in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if not 'groups' in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] for group_name in package_dict['groups']: try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available' % group_name) if remote_groups == 'create': try: group = self._get_group( harvest_object.source.url, group_name) except RemoteResourceError: log.error('Could not get remote group %s' % group_name) continue for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name' ]: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created' % group_name) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) 
package_dict['groups'] = validated_groups # Local harvest source organization source_dataset = get_action('package_show')( context, { 'id': harvest_object.source.id }) local_org = source_dataset.get('owner_org') remote_orgs = self.config.get('remote_orgs', None) if not remote_orgs in ('only_local', 'create'): # Assign dataset to the source organization package_dict['owner_org'] = local_org else: if not 'owner_org' in package_dict: package_dict['owner_org'] = None # check if remote org exist locally, otherwise remove validated_org = None remote_org = package_dict['owner_org'] if remote_org: try: data_dict = {'id': remote_org} org = get_action('organization_show')(context, data_dict) validated_org = org['id'] except NotFound, e: log.info('Organization %s is not available' % remote_org) if remote_orgs == 'create': try: try: org = self._get_organization( harvest_object.source.url, remote_org) except RemoteResourceError: # fallback if remote CKAN exposes organizations as groups # this especially targets older versions of CKAN org = self._get_group( harvest_object.source.url, remote_org) for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name', 'type' ]: org.pop(key, None) get_action('organization_create')(context, org) log.info( 'Organization %s has been newly created' % remote_org) validated_org = org['id'] except (RemoteResourceError, ValidationError): log.error('Could not get remote org %s' % remote_org) package_dict['owner_org'] = validated_org or local_org
def gather_stage(self, harvest_job): log.debug('In DSP\'s CKANHarvester gather_stage (%s)' % harvest_job.source.url) get_all_packages = True package_ids = [] self._set_config(harvest_job.source.config) # Check if this source has been harvested before previous_job = Session.query(HarvestJob) \ .filter(HarvestJob.source==harvest_job.source) \ .filter(HarvestJob.gather_finished!=None) \ .filter(HarvestJob.id!=harvest_job.id) \ .order_by(HarvestJob.gather_finished.desc()) \ .limit(1).first() # Get source URL base_url = harvest_job.source.url.rstrip('/') base_rest_url = base_url + self._get_rest_api_offset() base_search_url = base_url + self._get_search_api_offset() if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0): if not self.config.get('force_all', False): get_all_packages = False # Request only the packages modified since last harvest job last_time = previous_job.gather_finished.isoformat() url = base_search_url + '/revision?since_time=%s' % last_time try: content = self._get_content(url) revision_ids = json.loads(content) if len(revision_ids): for revision_id in revision_ids: url = base_rest_url + '/revision/%s' % revision_id try: content = self._get_content(url) except ContentFetchError, e: self._save_gather_error( 'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) continue revision = json.loads(content) for package_id in revision['packages']: if not package_id in package_ids: package_ids.append(package_id) else: log.info( 'No packages have been updated on the remote CKAN instance since the last harvest job' ) return None except urllib2.HTTPError, e: if e.getcode() == 400: log.info( 'CKAN instance %s does not suport revision filtering' % base_url) get_all_packages = True else: self._save_gather_error( 'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None
'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None if get_all_packages: # Request all remote packages url = base_rest_url + '/package' try: content = self._get_content(url) except ContentFetchError, e: self._save_gather_error( 'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None package_ids = json.loads(content) try: object_ids = [] if len(package_ids): for package_id in package_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No packages received for URL: %s' % url, harvest_job)
def import_stage(self, harvest_object): log.debug('In HRIHarvester import_stage') context = { 'model': model, 'session': model.Session, 'user': self._get_user_name() } if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) if package_dict.get('type') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return True # Set default tags if needed default_tags = self.config.get('default_tags', []) if default_tags: if 'tags' not in package_dict: package_dict['tags'] = [] package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) if remote_groups not in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if 'groups' not in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] for group_name in package_dict['groups']: try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available', group_name) if remote_groups == 'create': try: group = self._get_group( harvest_object.source.url, group_name) except RemoteResourceError: log.error('Could not get remote group %s', group_name) continue for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name' ]: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created', group_name) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) 
package_dict['groups'] = validated_groups # Find if remote org exists locally, otherwise don't import dataset if 'owner_org' not in package_dict: package_dict['owner_org'] = None remote_org = None if package_dict.get('organization'): remote_org = package_dict['organization']['name'] if remote_org: try: data_dict = {'id': remote_org} org = get_action('organization_show')(context, data_dict) package_dict['owner_org'] = org['id'] except NotFound: log.info('No organization exist, not importing dataset') return "unchanged" else: log.info('No organization in harvested dataset') return "unchanged" # Set default groups if needed default_groups = self.config.get('default_groups', []) if default_groups: if 'groups' not in package_dict: package_dict['groups'] = [] package_dict['groups'].extend([ g for g in default_groups if g not in package_dict['groups'] ]) # Set default extras if needed default_extras = self.config.get('default_extras', {}) def get_extra(key, package_dict): for extra in package_dict.get('extras', []): if extra['key'] == key: return extra if default_extras: override_extras = self.config.get('override_extras', False) if 'extras' not in package_dict: package_dict['extras'] = {} for key, value in default_extras.iteritems(): existing_extra = get_extra(key, package_dict) if existing_extra and not override_extras: continue # no need for the default if existing_extra: package_dict['extras'].remove(existing_extra) # Look for replacement strings if isinstance(value, basestring): value = value.format( harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source.url. strip('/'), harvest_source_title=harvest_object.job.source. 
title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'].append({'key': key, 'value': value}) for resource in package_dict.get('resources', []): # Clear remote url_type for resources (eg datastore, upload) as # we are only creating normal resources with links to the # remote ones resource.pop('url_type', None) # Clear revision_id as the revision won't exist on this CKAN # and saving it will cause an IntegrityError with the foreign # key. resource.pop('revision_id', None) result = self._create_or_update_package( package_dict, harvest_object, package_dict_form='package_show') return result
def import_stage(self, harvest_object): ''' The import stage will receive a HarvestObject object and will be responsible for: - performing any necessary action with the fetched object (e.g. create, update or delete a DataNorge package). Note: if this stage creates or updates a package, a reference to the package should be added to the HarvestObject. - setting the HarvestObject.package (if there is one) - setting the HarvestObject.current for this harvest: - True if successfully created/updated - False if successfully deleted - setting HarvestObject.current to False for previous harvest objects of this harvest source if the action was successful. - creating and storing any suitable HarvestObjectErrors that may occur. - creating the HarvestObject - Package relation (if necessary) - returning True if the action was done, "unchanged" if the object didn't need harvesting after all or False if there were errors. NB You can run this stage repeatedly using 'paster harvest import'. :param harvest_object: HarvestObject object :returns: True if the action was done, "unchanged" if the object didn't need harvesting after all or False if there were errors. 
''' log.debug('In DataNorgeHarvester import_stage') base_context = { 'model': model, 'session': model.Session, 'user': self._get_user_name() } if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) if package_dict.get('type', '') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return True organization_name = package_dict['publisher'].get('name') package_dict['owner_org'] = self._gen_new_name(organization_name) if not 'tags' in package_dict: package_dict['tags'] = [] # TODO: CKAN tags don't accept commas, while keywords from datanorge # do contain them. A solution for this may be to create groups from # the keywords, since they're not really seen as 'tags' in # datanorge. The tags in datanorge are not accessable via their API. default_tags = self.config.get('default_tags', False) if default_tags: package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) # Sets a description to the dataset. 
descriptions = package_dict.pop('description') notes = None for item in descriptions: if item.get('language') == 'nb': notes = item.get('value') if notes: package_dict['notes'] = notes if not 'resources' in package_dict: package_dict['resources'] = [] distribution = package_dict.get('distribution') if distribution: for resource in distribution: items = resource.get('description') name = 'Name' if items: for item in items: if item.get('language') == 'nb': name = item.get('value') package_dict['resources'].append({ 'url': resource.get('accessURL'), 'name': name, 'format': resource.get('format') }) source_dataset = \ get_action('package_show')(base_context.copy(), {'id': harvest_object.source.id}) # Local harvest source organization source_dataset = \ get_action('package_show')(base_context.copy(), {'id': harvest_object.source.id}) local_org = source_dataset.get('owner_org') create_orgs = self.config.get('create_orgs', True) if not create_orgs: # Assign dataset to the source package_dict['owner_org'] = local_org else: # check if remote org exist locally, otherwise remove validated_org = None remote_org = package_dict.get('owner_org', None) if remote_org: try: data_dict = {'id': remote_org} org = get_action('organization_show')( base_context.copy(), data_dict) if org.get('state') == 'deleted': patch_org = { 'id': org.get('id'), 'state': 'active' } get_action('organization_patch')( base_context.copy(), patch_org) validated_org = org['id'] except NotFound, e: log.info('Organization %s is not available', remote_org) if create_orgs: try: new_org = { 'name': package_dict.get('owner_org'), 'title': organization_name } try: html_source = \ BeautifulSoup( urllib.urlopen( package_dict.get('url') ).read() ) img_source = \ html_source.body.find( 'div', attrs={'class': 'logo'} ).img.get('src') except AttributeError, e: img_source = None log.debug('No logo was found for remote ' 'org %s.' 
% remote_org) if img_source: new_org['image_url'] = img_source org = get_action('organization_create')( base_context.copy(), new_org) log.info( 'Organization %s has been newly ' 'created', remote_org) validated_org = org['id'] except (RemoteResourceError, ValidationError): log.error('Could not get remote org %s' % remote_org)
def _import_package(self, harvest_object): package_dict = json.loads(harvest_object.content) package_dict['id'] = harvest_object.guid package_dict['name'] = munge_title_to_name(package_dict[u'datasetID']) context = self._create_new_context() # check if package already exists and existing_package = self._get_existing_package(package_dict) # get metadata for resources resource_metadata = package_dict.pop('resource_metadata', {}) new_resources = self._generate_resources_from_folder( package_dict['datasetFolder'] ) for resource in new_resources: if resource['name'] in resource_metadata: resource.update(resource_metadata[resource['name']]) # update existing resources, delete old ones, create new ones actions, resources_changed = self._resources_actions( existing_package, new_resources ) if existing_package and 'resources' in existing_package: package_dict['resources'] = existing_package['resources'] self._find_or_create_organization(package_dict, context.copy()) # import the package if it does not yet exists => it's a new package # or if this harvester is allowed to update packages if not existing_package: dataset_id = self._create_package(package_dict, harvest_object) self._create_notification_for_new_dataset(package_dict) log.debug('Dataset `%s` has been added' % package_dict['id']) else: # Don't change the dataset name even if the title has package_dict['name'] = existing_package['name'] package_dict['id'] = existing_package['id'] dataset_id = self._update_package(package_dict, harvest_object) log.debug('Dataset `%s` has been updated' % package_dict['id']) # create diffs if there is a previous package if existing_package: self._create_diffs(package_dict) # set the date_last_modified if any resource changed if self.config['update_date_last_modified'] and resources_changed: theme_plugin = StadtzhThemePlugin() package_schema = theme_plugin.update_package_schema() schema_context = self._create_new_context() schema_context['ignore_auth'] = True schema_context['schema'] = 
package_schema today = datetime.datetime.now().strftime('%d.%m.%Y') try: get_action('package_patch')( schema_context, {'id': dataset_id, 'dateLastUpdated': today} ) except p.toolkit.ValidationError, e: self._save_object_error( 'Update validation Error: %s' % str(e.error_summary), harvest_object, 'Import' ) return False log.info('Updated dateLastUpdated to %s', today)
c.editors = c.group.members_of_type(model.User, 'editor') if c.user: c.is_sysadmin = Authorizer().is_sysadmin(unicode(c.user)) c.can_admin = c.is_sysadmin or c.userobj in c.administrators c.can_edit = c.can_admin or c.userobj in c.editors c.restricted_to_publisher = 'publisher' in request.params parent_groups = c.group.get_groups('publisher') c.parent_publisher = parent_groups[0] if len(parent_groups) > 0 else None c.group_extras = [] for extra in sorted(c.group_dict.get('extras',[]), key=lambda x:x['key']): if extra.get('state') == 'deleted': continue k, v = extra['key'], extra['value'] v = json.loads(v) c.group_extras.append((k, v)) c.group_extras = dict(c.group_extras) return render('publisher/read.html') def report_users_not_assigned_to_groups(self): context = {'model': model, 'session': model.Session, 'user': c.user or c.author} try: check_access('group_create', context) except NotAuthorized: abort(401, _('Not authorized to see this page')) query = """SELECT * FROM public.user WHERE id NOT IN
def import_stage(self, harvest_object): log.debug('In CKANHarvester import_stage') if not harvest_object: log.error('No harvest object received') return False if harvest_object.content is None: self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False self._set_config(harvest_object.job.source.config) try: package_dict = json.loads(harvest_object.content) if package_dict.get('type') == 'harvest': log.warn('Remote dataset is a harvest source, ignoring...') return False # Set default tags if needed default_tags = self.config.get('default_tags', []) if default_tags: if not 'tags' in package_dict: package_dict['tags'] = [] package_dict['tags'].extend( [t for t in default_tags if t not in package_dict['tags']]) remote_groups = self.config.get('remote_groups', None) if not remote_groups in ('only_local', 'create'): # Ignore remote groups package_dict.pop('groups', None) else: if not 'groups' in package_dict: package_dict['groups'] = [] # check if remote groups exist locally, otherwise remove validated_groups = [] context = { 'model': model, 'session': Session, 'user': '******' } for group_name in package_dict['groups']: try: data_dict = {'id': group_name} group = get_action('group_show')(context, data_dict) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) except NotFound, e: log.info('Group %s is not available' % group_name) if remote_groups == 'create': try: group = self._get_group( harvest_object.source.url, group_name) except: log.error('Could not get remote group %s' % group_name) continue for key in [ 'packages', 'created', 'users', 'groups', 'tags', 'extras', 'display_name' ]: group.pop(key, None) get_action('group_create')(context, group) log.info('Group %s has been newly created' % group_name) if self.api_version == 1: validated_groups.append(group['name']) else: validated_groups.append(group['id']) package_dict['groups'] = validated_groups # Ignore 
remote orgs for the time being package_dict.pop('owner_org', None) # Set default groups if needed default_groups = self.config.get('default_groups', []) if default_groups: package_dict['groups'].extend([ g for g in default_groups if g not in package_dict['groups'] ]) # Find any extras whose values are not strings and try to convert # them to strings, as non-string extras are not allowed anymore in # CKAN 2.0. for key in package_dict['extras'].keys(): if not isinstance(package_dict['extras'][key], basestring): try: package_dict['extras'][key] = json.dumps( package_dict['extras'][key]) except TypeError: # If converting to a string fails, just delete it. del package_dict['extras'][key] # Set default extras if needed default_extras = self.config.get('default_extras', {}) if default_extras: override_extras = self.config.get('override_extras', False) if not 'extras' in package_dict: package_dict['extras'] = {} for key, value in default_extras.iteritems(): if not key in package_dict['extras'] or override_extras: # Look for replacement strings if isinstance(value, basestring): value = value.format( harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source. url.strip('/'), harvest_source_title=harvest_object.job.source. 
title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) package_dict['extras'][key] = value result = self._create_or_update_package(package_dict, harvest_object) if result and self.config.get('read_only', False) == True: package = model.Package.get(package_dict['id']) # Clear default permissions model.clear_user_roles(package) # Setup harvest user as admin user_name = self.config.get('user', u'harvest') user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) # Other users can only read for user_name in (u'visitor', u'logged_in'): user = model.User.get(user_name) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) return True
'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None if get_all_packages: # Request all remote packages url = base_rest_url + '/package' try: content = self._get_content(url) except Exception, e: self._save_gather_error( 'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None package_ids = json.loads(content) try: object_ids = [] if len(package_ids): for package_id in package_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No packages received for URL: %s' % url, harvest_job)
def gather_stage(self, harvest_job): log.error('In SpodHarvester gather_stage (%s)' % harvest_job.source.url) get_all_packages = True package_ids = [] self._set_config(harvest_job.source.config) # Check if this source has been harvested before previous_job = Session.query(HarvestJob) \ .filter(HarvestJob.source==harvest_job.source) \ .filter(HarvestJob.gather_finished!=None) \ .filter(HarvestJob.id!=harvest_job.id) \ .order_by(HarvestJob.gather_finished.desc()) \ .limit(1).first() # Get source URL base_url = harvest_job.source.url.rstrip('/') base_rest_url = base_url + self._get_rest_api_offset() base_search_url = base_url + self._get_search_api_offset() # Filter in/out datasets from particular organizations org_filter_include = self.config.get('organizations_filter_include', []) org_filter_exclude = self.config.get('organizations_filter_exclude', []) def get_pkg_ids_for_organizations(orgs): pkg_ids = set() for organization in orgs: url = base_search_url + '/dataset?organization=%s' % organization content = self._get_content(url) content_json = json.loads(content) result_count = int(content_json['count']) pkg_ids |= set(content_json['results']) while len( pkg_ids) < result_count or not content_json['results']: url = base_search_url + '/dataset?organization=%s&offset=%s' % ( organization, len(pkg_ids)) content = self._get_content(url) content_json = json.loads(content) pkg_ids |= set(content_json['results']) return pkg_ids include_pkg_ids = get_pkg_ids_for_organizations(org_filter_include) exclude_pkg_ids = get_pkg_ids_for_organizations(org_filter_exclude) if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0): if not self.config.get('force_all', False): get_all_packages = True if get_all_packages: # Request all remote packages log.error("Request all remote packages") url = base_rest_url + '/package' log.error(url) try: content = self._get_content(url) package_ids = json.loads(content) except ContentFetchError, e: 
log.error("Unable to get content for URL") self._save_gather_error( 'Unable to get content for URL: %s: %s' % (url, str(e)), harvest_job) return None except JSONDecodeError, e: log.error("Unable to decode content for URL") self._save_gather_error( 'Unable to decode content for URL: %s: %s' % (url, str(e)), harvest_job) return None
def before_index(self, pkg_dict):
    '''Index the dataset's spatial extent with the configured backend.

    @param pkg_dict: flattened dict about to be sent to solr; its
                     'extras_spatial' entry (if any) holds a GeoJSON
                     geometry serialized as a string
    @return: pkg_dict, possibly augmented with bbox fields ('solr'
             backend) or a 'spatial_geom' WKT string
             ('solr-spatial-field' backend); on any geometry problem the
             dict is returned unmodified and the error is logged
    '''
    # Imported locally so the plugin can be loaded without shapely.
    import shapely
    import shapely.geometry
    if pkg_dict.get('extras_spatial', None) and self.search_backend in (
            'solr', 'solr-spatial-field'):
        try:
            geometry = json.loads(pkg_dict['extras_spatial'])
        except ValueError as e:
            log.error('Geometry not valid GeoJSON, not indexing')
            return pkg_dict
        if self.search_backend == 'solr':
            # Only bbox supported for this backend
            # (a bbox here is a Polygon with one 5-point closed ring).
            if not (geometry['type'] == 'Polygon'
                    and len(geometry['coordinates']) == 1
                    and len(geometry['coordinates'][0]) == 5):
                log.error(
                    'Solr backend only supports bboxes (Polygons with 5 points), ignoring geometry {0}'
                    .format(pkg_dict['extras_spatial']))
                return pkg_dict
            # Ring points [0] and [2] are opposite corners; min/max makes
            # the result independent of the ring's winding direction.
            coords = geometry['coordinates']
            pkg_dict['maxy'] = max(coords[0][2][1], coords[0][0][1])
            pkg_dict['miny'] = min(coords[0][2][1], coords[0][0][1])
            pkg_dict['maxx'] = max(coords[0][2][0], coords[0][0][0])
            pkg_dict['minx'] = min(coords[0][2][0], coords[0][0][0])
            pkg_dict['bbox_area'] = (pkg_dict['maxx'] - pkg_dict['minx']) * \
                (pkg_dict['maxy'] - pkg_dict['miny'])
        elif self.search_backend == 'solr-spatial-field':
            wkt = None
            # Check potential problems with bboxes
            if geometry['type'] == 'Polygon' \
               and len(geometry['coordinates']) == 1 \
               and len(geometry['coordinates'][0]) == 5:
                # Check wrong bboxes (4 same points)
                xs = [p[0] for p in geometry['coordinates'][0]]
                ys = [p[1] for p in geometry['coordinates'][0]]
                if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
                    # Degenerate bbox: index it as a single point instead.
                    wkt = 'POINT({x} {y})'.format(x=xs[0], y=ys[0])
                else:
                    # Check if coordinates are defined counter-clockwise,
                    # otherwise we'll get wrong results from Solr
                    lr = shapely.geometry.polygon.LinearRing(
                        geometry['coordinates'][0])
                    if not lr.is_ccw:
                        lr.coords = list(lr.coords)[::-1]
                    polygon = shapely.geometry.polygon.Polygon(lr)
                    wkt = polygon.wkt
            if not wkt:
                # Not a bbox: index the geometry as-is, provided shapely
                # considers it valid.
                # NOTE(review): shapely.geometry.asShape is deprecated in
                # shapely 1.8+ in favour of shape() — confirm the pinned
                # shapely version before upgrading.
                shape = shapely.geometry.asShape(geometry)
                if not shape.is_valid:
                    log.error(
                        'Wrong geometry, not indexing package {0}'.format(
                            pkg_dict.get('name')))
                    return pkg_dict
                wkt = shape.wkt
            pkg_dict['spatial_geom'] = wkt
    return pkg_dict
def import_stage(self, harvest_object): package_dict = json.loads(harvest_object.content) if not self._should_import_local(package_dict): package_dict['state'] = 'deleted' else: package_dict = self._apply_package_extras_white_list(package_dict) package_dict = self._apply_package_resource_extras_black_list( package_dict) package_dict = self._fix_date_in_fields(package_dict) package_dict = self._set_license(package_dict) package_dict = self._pop_black_list_resources_by_type(package_dict) harvest_object.content = json.dumps(package_dict) upload_resources = self._pop_upload_resources(package_dict) import_stage_result = super(GuiaHarvesterPlugin, self).import_stage(harvest_object) if import_stage_result: package_dict = json.loads(harvest_object.content) harvested_rels = package_dict.get('relationships', []) try: this_package = model.Package.get(package_dict['name']) if not this_package: raise logic.NotFound() except logic.NotFound as nf: log.info( 'import_stage(): could not find package "{0}"; relationships not updated: {1}' .format(package_dict['name'], nf)) return import_stage_result existing_rels = this_package.get_relationships() self._update_relationships(existing_rels, harvested_rels) for resource_dict in upload_resources: resource_url = resource_dict['url'] resource_filename = resource_url.split('/')[-1] try: response = requests.get(resource_url) resource_file = StringIO(response.content) except Exception, e: self._save_object_error( 'Resource not harvested for package "{0}". 
Unable to fetch resource from "{1}": {2}' .format(package_dict['name'], resource_url, e), harvest_object, 'Import') continue cfs = FieldStorage() cfs.file = resource_file cfs.filename = resource_filename resource_dict['upload'] = cfs if 'created' in resource_dict: del resource_dict['created'] if 'last_modified' in resource_dict: del resource_dict['last_modified'] if 'api' in resource_dict: del resource_dict['api'] try: the_resource = toolkit.get_action('resource_create')( data_dict=resource_dict) except Exception, e: self._save_object_error( 'Resource not harvested for package "{0}". Unable to import the resource originally from "{1}": {2}' .format(package_dict['name'], resource_url, e), harvest_object, 'Import') continue
def loads(self, chars):
    '''Deserialize *chars* as JSON.

    Thin wrapper over ``json.loads`` that converts a parse failure into a
    generic ``Exception`` whose message includes the offending string.

    @param chars: a JSON-encoded string
    @return: the decoded Python object
    @raise Exception: if *chars* is not valid JSON
    '''
    try:
        return json.loads(chars)
    except ValueError as inst:
        # 'except X, e' / 'raise E, msg' are Python-2-only spellings; this
        # form behaves identically and is also valid on Python 3.
        raise Exception("Couldn't loads string '%s': %s" % (chars, inst))
def import_stage(self, harvest_object):
    """Import one harvested dataset into this (HRI) CKAN instance.

    Builds translated fields from the remote dict and its extras, fills
    required maintainer fields, applies source-config defaults
    (tags/groups/extras), renames and relocates extras, and finally
    delegates to ``_create_or_update_package``.

    NOTE(review): the enclosing ``try:`` below has no matching
    ``except``/``finally`` in this chunk — the error-handling tail of
    this function appears to have been lost in extraction; restore it
    before treating this text as runnable.
    """
    log.debug('In HRIHarvester import_stage')

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get('type') == 'harvest':
            log.warn('Remote dataset is a harvest source, ignoring...')
            return True

        # Set default translations
        lang = ckan_config['ckan.locale_default']

        def translated_field(name):
            # Start from an existing '<name>_translated' dict, defaulting the
            # site language to the plain field value.
            translated = package_dict.get('%s_translated' % name, {})
            translated[lang] = translated.get(lang, package_dict[name])
            # Process translations added as extras (keys like '<name>_<lang>')
            translated.update(
                (e['key'].split('_', 2)[1], e['value'])
                for e in package_dict.get('extras', [])
                if e['key'].startswith('%s_' % name))
            return translated

        def translated_extra_list(name):
            # Collect the extra `name` (if long enough) into a one-element
            # translated list, and strip that extra from the dict.
            translated = {lang: []}
            for x in package_dict.get('extras', []):
                if x['key'] == name and len(x['value']) > 2:
                    translated[lang] = [x['value']]
            package_dict['extras'] = [
                x for x in package_dict.get('extras', [])
                if x['key'] != name
            ]
            return translated

        package_dict['title_translated'] = translated_field('title')
        package_dict['notes_translated'] = translated_field('notes')
        package_dict['update_frequency'] = translated_extra_list(
            'update_frequency')

        # Set default values for required fields
        default_values = {
            'maintainer': package_dict.get('author') or '(not set)',
            'maintainer_email':
                package_dict.get('author_email') or '(not set)',
        }
        missing_values = ((k, v) for k, v in default_values.iteritems()
                          if not package_dict.get(k))
        package_dict.update(missing_values)

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        # Mirror tag names into the per-language 'keywords' field.
        # NOTE(review): assumes 'tags' is present here — verify for datasets
        # with no tags and no default_tags configured.
        keywords = package_dict.get('keywords', {})
        keywords[lang] = keywords.get(
            lang, [x['name'] for x in package_dict['tags']])
        package_dict['keywords'] = keywords

        remote_groups = self.config.get('remote_groups', None)
        if remote_groups not in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            if 'groups' not in package_dict:
                package_dict['groups'] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []
            for group_name in package_dict['groups']:
                try:
                    data_dict = {'id': group_name}
                    group = get_action('group_show')(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group['name'])
                    else:
                        validated_groups.append(group['id'])
                except NotFound, e:
                    log.info('Group %s is not available', group_name)
                    if remote_groups == 'create':
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name)
                        except RemoteResourceError:
                            log.error('Could not get remote group %s',
                                      group_name)
                            continue
                        # Strip fields that would clash on creation.
                        for key in ['packages', 'created', 'users', 'groups',
                                    'tags', 'extras', 'display_name']:
                            group.pop(key, None)
                        get_action('group_create')(context, group)
                        log.info('Group %s has been newly created',
                                 group_name)
                        if self.api_version == 1:
                            validated_groups.append(group['name'])
                        else:
                            validated_groups.append(group['id'])

            package_dict['groups'] = validated_groups

        # Find if remote org exists locally, otherwise don't import dataset
        if 'owner_org' not in package_dict:
            package_dict['owner_org'] = None
        remote_org = None
        if package_dict.get('organization'):
            remote_org = package_dict['organization']['name']
        if remote_org:
            try:
                data_dict = {'id': remote_org}
                org = get_action('organization_show')(context, data_dict)
                package_dict['owner_org'] = org['id']
            except NotFound:
                log.info('No organization exist, not importing dataset')
                return "unchanged"
        else:
            log.info('No organization in harvested dataset')
            return "unchanged"

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            package_dict['groups'].extend([
                g for g in default_groups
                if g not in package_dict['groups']
            ])

        # Map fields
        fields_to_map = [('url', 'maintainer_website')]
        for key_from, key_to in fields_to_map:
            if key_to not in package_dict and key_from in package_dict:
                package_dict[key_to] = package_dict[key_from]

        # Rename extras
        extras_to_rename_keys = {
            'geographic_coverage': 'geographical_coverage',
            'temporal_coverage-from': 'valid_from',
            'temporal_coverage-to': 'valid_till',
            'source': 'owner'
        }

        def map_extra(e):
            # Return a copy of extra `e` with its key renamed if mapped.
            result = {}
            result.update(e)
            result['key'] = extras_to_rename_keys.get(e['key'], e['key'])
            return result

        package_dict['extras'] = [
            map_extra(extra) for extra in package_dict.get('extras', [])
        ]

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})

        def get_extra(key, package_dict):
            # Return the existing extra dict for `key`, or None.
            for extra in package_dict.get('extras', []):
                if extra['key'] == key:
                    return extra

        if default_extras:
            override_extras = self.config.get('override_extras', False)
            if 'extras' not in package_dict:
                # NOTE(review): initialised to {} but .append() is used
                # below — appending would fail on a dict; confirm intent.
                package_dict['extras'] = {}
            for key, value in default_extras.iteritems():
                existing_extra = get_extra(key, package_dict)
                if existing_extra and not override_extras:
                    continue  # no need for the default
                if existing_extra:
                    package_dict['extras'].remove(existing_extra)
                # Look for replacement strings
                if isinstance(value, basestring):
                    value = value.format(
                        harvest_source_id=harvest_object.job.source.id,
                        harvest_source_url=harvest_object.job.source.url.
                        strip('/'),
                        harvest_source_title=harvest_object.job.source.
                        title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id,
                        dataset_id=package_dict['id'])

                package_dict['extras'].append({'key': key, 'value': value})

        # Convert extras from strings to datetimes
        extras_to_datetimes = ['valid_from', 'valid_till']

        def map_extra_to_date(e):
            # Parse date-bearing extras; others pass through untouched.
            if e['key'] not in extras_to_datetimes:
                return e
            result = {}
            result.update(e)
            result['value'] = self._parse_datetime(e['value'])
            return result

        package_dict['extras'] = [
            map_extra_to_date(extra)
            for extra in package_dict.get('extras', [])
        ]

        # Move extras to fields
        extras_to_fields_keys = [
            'collection_type', 'geographical_coverage', 'valid_from',
            'valid_till', 'owner'
        ]
        extras_to_fields = [
            x for x in package_dict.get('extras', [])
            if x['key'] in extras_to_fields_keys
            and x['key'] not in package_dict
        ]
        for x in extras_to_fields:
            package_dict[x['key']] = x['value']
        package_dict['extras'] = [
            x for x in package_dict.get('extras', [])
            if x['key'] not in extras_to_fields_keys
        ]

        for resource in package_dict.get('resources', []):
            # Clear remote url_type for resources (eg datastore, upload) as
            # we are only creating normal resources with links to the
            # remote ones
            resource.pop('url_type', None)

            # Clear revision_id as the revision won't exist on this CKAN
            # and saving it will cause an IntegrityError with the foreign
            # key.
            resource.pop('revision_id', None)

        # Ensure imported tags are valid
        tag_string_fields = ['geographical_coverage']
        for field in tag_string_fields:
            package_dict[field] = [
                t for t in self._parse_tag_string(
                    package_dict.get(field, '')) if t
            ]

        # Create or update package
        result = self._create_or_update_package(
            package_dict, harvest_object, package_dict_form='package_show')

        return result
        # NOTE(review): missing `except` for the `try:` above — see docstring.
def check_spatial_extra(self, package): ''' For a given package, looks at the spatial extent (as given in the extra "spatial" in GeoJSON format) and records it in PostGIS. ''' from ckanext.spatial.lib import save_package_extent if not package.id: log.warning( 'Couldn\'t store spatial extent because no id was provided for the package' ) return # TODO: deleted extra for extra in package.extras_list: if extra.key == 'spatial': if extra.state == 'active' and extra.value: try: log.debug('Received: %r' % extra.value) geometry = json.loads(extra.value) except ValueError as e: error_dict = { 'spatial': [ u'Error decoding JSON object: %s' % six.text_type(e) ] } raise p.toolkit.ValidationError( error_dict, error_summary=package_error_summary(error_dict)) except TypeError as e: error_dict = { 'spatial': [ u'Error decoding JSON object: %s' % six.text_type(e) ] } raise p.toolkit.ValidationError( error_dict, error_summary=package_error_summary(error_dict)) try: save_package_extent(package.id, geometry) except ValueError as e: error_dict = { 'spatial': [ u'Error creating geometry: %s' % six.text_type(e) ] } raise p.toolkit.ValidationError( error_dict, error_summary=package_error_summary(error_dict)) except Exception as e: if bool(os.getenv('DEBUG')): raise error_dict = { 'spatial': [u'Error: %s' % six.text_type(e)] } raise p.toolkit.ValidationError( error_dict, error_summary=package_error_summary(error_dict)) elif (extra.state == 'active' and not extra.value) or extra.state == 'deleted': # Delete extent from table save_package_extent(package.id, None) break
def _get_extent_object(self, geometry): if isinstance(geometry, six.string_types): geometry = json.loads(geometry) shape = asShape(geometry) return PackageExtent(package_id="xxx", the_geom=WKTElement(shape.wkt, 4326))
def import_stage(self, harvest_object):
    """Import one harvested dataset into this CKAN instance.

    Applies the harvest-source configuration to the harvested package
    dict — default tags, remote group/organization resolution (optionally
    creating them locally), default groups and extras — then delegates to
    ``_create_or_update_package``.  Returns that call's result, ``True``
    for ignored harvest-source datasets, or falls through after recording
    an error with ``_save_object_error``.
    """
    log.debug('In CKANHarvester import_stage')

    base_context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get('type') == 'harvest':
            log.warn('Remote dataset is a harvest source, ignoring...')
            return True

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if 'tags' not in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        remote_groups = self.config.get('remote_groups', None)
        if remote_groups not in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            if 'groups' not in package_dict:
                package_dict['groups'] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []

            for group_ in package_dict['groups']:
                try:
                    # Resolve by id first, then fall back to name.
                    try:
                        if 'id' in group_:
                            data_dict = {'id': group_['id']}
                            group = get_action('group_show')(
                                base_context.copy(), data_dict)
                        else:
                            raise NotFound
                    except NotFound as e:
                        if 'name' in group_:
                            data_dict = {'id': group_['name']}
                            group = get_action('group_show')(
                                base_context.copy(), data_dict)
                        else:
                            raise NotFound
                    # Found local group
                    validated_groups.append({
                        'id': group['id'],
                        'name': group['name']
                    })
                except NotFound as e:
                    log.info('Group %s is not available', group_)
                    if remote_groups == 'create':
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_)
                        except RemoteResourceError:
                            log.error('Could not get remote group %s',
                                      group_)
                            continue
                        # Strip fields that would clash on creation.
                        for key in [
                                'packages', 'created', 'users', 'groups',
                                'tags', 'extras', 'display_name'
                        ]:
                            group.pop(key, None)
                        get_action('group_create')(base_context.copy(),
                                                   group)
                        log.info('Group %s has been newly created', group_)
                        validated_groups.append({
                            'id': group['id'],
                            'name': group['name']
                        })

            package_dict['groups'] = validated_groups

        # Local harvest source organization
        source_dataset = get_action('package_show')(
            base_context.copy(), {'id': harvest_object.source.id})
        local_org = source_dataset.get('owner_org')

        remote_orgs = self.config.get('remote_orgs', None)
        if remote_orgs not in ('only_local', 'create'):
            # Assign dataset to the source organization
            package_dict['owner_org'] = local_org
        else:
            if 'owner_org' not in package_dict:
                package_dict['owner_org'] = None

            # check if remote org exist locally, otherwise remove
            validated_org = None
            remote_org = package_dict['owner_org']

            if remote_org:
                try:
                    data_dict = {'id': remote_org}
                    org = get_action('organization_show')(
                        base_context.copy(), data_dict)
                    validated_org = org['id']
                except NotFound as e:
                    log.info('Organization %s is not available', remote_org)
                    if remote_orgs == 'create':
                        try:
                            try:
                                org = self._get_organization(
                                    harvest_object.source.url, remote_org)
                            except RemoteResourceError:
                                # fallback if remote CKAN exposes organizations as groups
                                # this especially targets older versions of CKAN
                                org = self._get_group(
                                    harvest_object.source.url, remote_org)

                            for key in [
                                    'packages', 'created', 'users',
                                    'groups', 'tags', 'extras',
                                    'display_name', 'type'
                            ]:
                                org.pop(key, None)
                            get_action('organization_create')(
                                base_context.copy(), org)
                            log.info(
                                'Organization %s has been newly created',
                                remote_org)
                            validated_org = org['id']
                        except (RemoteResourceError, ValidationError):
                            log.error('Could not get remote org %s',
                                      remote_org)

            package_dict['owner_org'] = validated_org or local_org

        # Set default groups if needed
        default_groups = self.config.get('default_groups', [])
        if default_groups:
            if 'groups' not in package_dict:
                package_dict['groups'] = []
            existing_group_ids = [g['id'] for g in package_dict['groups']]
            # 'default_group_dicts' is populated from the configured group
            # names/ids (presumably by validate_config — confirm).
            package_dict['groups'].extend([
                g for g in self.config['default_group_dicts']
                if g['id'] not in existing_group_ids
            ])

        # Set default extras if needed
        default_extras = self.config.get('default_extras', {})

        def get_extra(key, package_dict):
            # Return the existing extra dict for `key`, or None.
            for extra in package_dict.get('extras', []):
                if extra['key'] == key:
                    return extra

        if default_extras:
            override_extras = self.config.get('override_extras', False)
            if 'extras' not in package_dict:
                package_dict['extras'] = []
            for key, value in default_extras.items():
                existing_extra = get_extra(key, package_dict)
                if existing_extra and not override_extras:
                    continue  # no need for the default
                if existing_extra:
                    package_dict['extras'].remove(existing_extra)
                # Look for replacement strings
                if isinstance(value, six.string_types):
                    value = value.format(
                        harvest_source_id=harvest_object.job.source.id,
                        harvest_source_url=harvest_object.job.source.url.
                        strip('/'),
                        harvest_source_title=harvest_object.job.source.
                        title,
                        harvest_job_id=harvest_object.job.id,
                        harvest_object_id=harvest_object.id,
                        dataset_id=package_dict['id'])

                package_dict['extras'].append({'key': key, 'value': value})

        for resource in package_dict.get('resources', []):
            # Clear remote url_type for resources (eg datastore, upload) as
            # we are only creating normal resources with links to the
            # remote ones
            resource.pop('url_type', None)

            # Clear revision_id as the revision won't exist on this CKAN
            # and saving it will cause an IntegrityError with the foreign
            # key.
            resource.pop('revision_id', None)

        # Subclass hook for final adjustments before the write.
        package_dict = self.modify_package_dict(package_dict,
                                                harvest_object)

        result = self._create_or_update_package(
            package_dict, harvest_object, package_dict_form='package_show')

        return result
    except ValidationError as e:
        self._save_object_error(
            'Invalid package with GUID %s: %r' %
            (harvest_object.guid, e.error_dict), harvest_object, 'Import')
    except Exception as e:
        self._save_object_error('%s' % e, harvest_object, 'Import')
def fetch_stage(self, harvest_object):
    """Fetch one CSW record and store its XML on the harvest object.

    Skips fetching for objects already marked 'delete'.  Optionally
    filters records by required keywords and by a required substring in
    the abstract; records failing a filter are flagged 'delete' and
    return 'unchanged' so they are not processed further.  Returns True
    on success, False after recording an error.
    """
    # Check harvest object status
    status = self._get_object_extra(harvest_object, 'status')
    if status == 'delete':
        # No need to fetch anything, just pass to the import stage
        return True

    # Shadows any module-level `log` deliberately with a stage-specific logger.
    log = logging.getLogger(__name__ + '.CSW.fetch')
    log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

    url = harvest_object.source.url
    try:
        self._setup_csw_client(url)
    except Exception as e:
        self._save_object_error('Error contacting the CSW server: %s' % e,
                                harvest_object)
        return False

    identifier = harvest_object.guid
    # NOTE(review): `esn` is read from the config but not used below —
    # presumably vestigial or consumed elsewhere; confirm.
    esn = self.source_config.get('esn', 'full')
    try:
        record = self.csw.getrecordbyid([identifier],
                                        outputschema=self.output_schema())
    except Exception as e:
        self._save_object_error(
            'Error getting the CSW record with GUID %s' % identifier,
            harvest_object)
        return False

    if record is None:
        self._save_object_error('Empty record for GUID %s' % identifier,
                                harvest_object)
        return False

    source_config = json.loads(harvest_object.source.config
                               ) if harvest_object.source.config else {}

    # Optional filter: require all configured keywords on the record.
    require_keywords = source_config.get('require_keywords', None)
    if require_keywords:
        record_keywords = set()
        for keyword_container in record.get('identification',
                                            {}).get('keywords', []):
            keywords = keyword_container.get('keywords', None)
            if keywords and isinstance(keywords, list):
                record_keywords.update(keywords)

        if not set(require_keywords).issubset(record_keywords):
            # Mark for deletion instead of importing.
            status_extra = self._get_extra(harvest_object, 'status')
            if status_extra is None:
                self._save_object_error(
                    'No status set for object with GUID %s' % identifier,
                    harvest_object)
                return False
            status_extra.value = 'delete'
            status_extra.save()
            # Should not be processed further
            return 'unchanged'
        else:
            log.info("Found tagged record with guid %s" % identifier)

    # Optional filter: require a substring in the record's abstract.
    require_in_abstract = source_config.get('require_in_abstract', None)
    if require_in_abstract:
        if not record.get('identification', {}).get('abstract', '') or\
           require_in_abstract not in record.get('identification',
                                                 {}).get('abstract', ""):
            status_extra = self._get_extra(harvest_object, 'status')
            if status_extra is None:
                self._save_object_error(
                    'No status set for object with GUID %s' % identifier,
                    harvest_object)
                return False
            status_extra.value = 'delete'
            status_extra.save()
            # Should not be processed further
            return 'unchanged'
        else:
            log.info("Found tagged record with guid %s" % identifier)

    try:
        # Save the fetch contents in the HarvestObject
        # Contents come from csw_client already declared and encoded as utf-8
        # Remove original XML declaration
        content = re.sub(r'<\?xml(.*)\?>', '', record['xml'])

        harvest_object.content = content.strip()
        harvest_object.save()
    except Exception as e:
        self._save_object_error('Error saving the harvest object for GUID %s [%r]' % \
                                (identifier, e), harvest_object)
        return False

    log.debug('XML content saved (len %s)', len(record['xml']))
    return True
def import_stage(self, harvest_object):
    """Import one harvested dataset (older Python-2-era variant).

    NOTE(review): this definition is truncated in this chunk — it stops
    right after the remote-group creation logic, and the enclosing
    ``try:`` has no ``except`` clause here.  The remainder (organization
    handling, package creation, error handling) appears to have been
    lost in extraction; do not treat this text as complete.
    """
    log.debug('In CKANHarvester import_stage')

    base_context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }
    if not harvest_object:
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get('type') == 'harvest':
            log.warn('Remote dataset is a harvest source, ignoring...')
            return True

        # Set default tags if needed
        default_tags = self.config.get('default_tags', [])
        if default_tags:
            if not 'tags' in package_dict:
                package_dict['tags'] = []
            package_dict['tags'].extend(
                [t for t in default_tags if t not in package_dict['tags']])

        remote_groups = self.config.get('remote_groups', None)
        if not remote_groups in ('only_local', 'create'):
            # Ignore remote groups
            package_dict.pop('groups', None)
        else:
            if not 'groups' in package_dict:
                package_dict['groups'] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []

            for group_ in package_dict['groups']:
                try:
                    # Resolve by id first, then fall back to name.
                    try:
                        if 'id' in group_:
                            data_dict = {'id': group_['id']}
                            group = get_action('group_show')(
                                base_context.copy(), data_dict)
                        else:
                            raise NotFound
                    except NotFound, e:
                        if 'name' in group_:
                            data_dict = {'id': group_['name']}
                            group = get_action('group_show')(
                                base_context.copy(), data_dict)
                        else:
                            raise NotFound
                    # Found local group
                    validated_groups.append({
                        'id': group['id'],
                        'name': group['name']
                    })
                except NotFound, e:
                    log.info('Group %s is not available', group_)
                    if remote_groups == 'create':
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_)
                        except RemoteResourceError:
                            log.error('Could not get remote group %s',
                                      group_)
                            continue
                        # Strip fields that would clash on creation.
                        for key in [
                                'packages', 'created', 'users', 'groups',
                                'tags', 'extras', 'display_name'
                        ]:
                            group.pop(key, None)
                        get_action('group_create')(base_context.copy(),
                                                   group)
                        log.info('Group %s has been newly created', group_)
                        validated_groups.append({
                            'id': group['id'],
                            'name': group['name']
                        })
        # NOTE(review): function truncated here — see docstring.
def validate_config(self, config):
    """Validate a harvest source configuration string.

    ``config`` is the JSON string entered on the harvest source form.
    Raises ``ValueError`` for any invalid entry.  Returns the config
    string — re-serialised when 'default_groups' is present, because the
    looked-up group dicts are stored back into it for later use.
    """
    if not config:
        return config

    try:
        config_obj = json.loads(config)

        if 'api_version' in config_obj:
            try:
                int(config_obj['api_version'])
            except ValueError:
                raise ValueError('api_version must be an integer')

        if 'default_tags' in config_obj:
            if not isinstance(config_obj['default_tags'], list):
                raise ValueError('default_tags must be a list')
            if config_obj['default_tags'] and \
                    not isinstance(config_obj['default_tags'][0], dict):
                raise ValueError('default_tags must be a list of '
                                 'dictionaries')

        if 'default_groups' in config_obj:
            if not isinstance(config_obj['default_groups'], list):
                raise ValueError('default_groups must be a *list* of group'
                                 ' names/ids')
            if config_obj['default_groups'] and \
                    not isinstance(config_obj['default_groups'][0],
                                   six.string_types):
                raise ValueError('default_groups must be a list of group '
                                 'names/ids (i.e. strings)')

            # Check if default groups exist
            context = {'model': model, 'user': toolkit.c.user}
            config_obj['default_group_dicts'] = []
            for group_name_or_id in config_obj['default_groups']:
                try:
                    group = get_action('group_show')(context, {
                        'id': group_name_or_id
                    })
                    # save the dict to the config object, as we'll need it
                    # in the import_stage of every dataset
                    config_obj['default_group_dicts'].append(group)
                except NotFound as e:
                    raise ValueError('Default group not found')
            # Re-serialise here so 'default_group_dicts' is persisted in
            # the returned config string.
            config = json.dumps(config_obj)

        if 'default_extras' in config_obj:
            if not isinstance(config_obj['default_extras'], dict):
                raise ValueError('default_extras must be a dictionary')

        if 'organizations_filter_include' in config_obj \
                and 'organizations_filter_exclude' in config_obj:
            raise ValueError(
                'Harvest configuration cannot contain both '
                'organizations_filter_include and organizations_filter_exclude'
            )

        if 'groups_filter_include' in config_obj \
                and 'groups_filter_exclude' in config_obj:
            raise ValueError(
                'Harvest configuration cannot contain both '
                'groups_filter_include and groups_filter_exclude')

        if 'user' in config_obj:
            # Check if user exists
            context = {'model': model, 'user': toolkit.c.user}
            try:
                get_action('user_show')(context, {
                    'id': config_obj.get('user')
                })
            except NotFound:
                raise ValueError('User not found')

        for key in ('read_only', 'force_all'):
            if key in config_obj:
                if not isinstance(config_obj[key], bool):
                    raise ValueError('%s must be boolean' % key)

    except ValueError as e:
        # Re-raised unchanged; kept so json.loads failures surface as-is.
        raise e

    return config
def import_stage(self, harvest_object):
    """Import one harvested dataset into this CKAN instance.

    Applies the harvest-source configuration (default tags, remote
    group/organization handling, default groups) to the harvested
    package dict, delegates to ``_create_or_update_package``, and — for
    read-only sources — rewrites the package's user roles.  Returns
    True on success; on error, records it via ``_save_object_error``.

    Fix: the "handle extras" section below was half-commented-out — the
    opening triple quote was commented (``# \"\"\"``) while the closing
    one was not, leaving an unterminated string literal (a syntax error)
    and Python-2-only code (``basestring``, ``iteritems``) dangling in
    the function body.  The whole disabled section is now consistently
    commented out, preserving it for reference per the FIXME.
    """
    log.debug("In CKANHarvester import_stage")

    context = {"model": model, "session": Session, "user": self._get_user_name()}
    if not harvest_object:
        log.error("No harvest object received")
        return False
    if harvest_object.content is None:
        self._save_object_error(
            "Empty content for object %s" % harvest_object.id,
            harvest_object,
            "Import",
        )
        return False

    self._set_config(harvest_object.job.source.config)

    try:
        package_dict = json.loads(harvest_object.content)

        if package_dict.get("type") == "harvest":
            log.warn("Remote dataset is a harvest source, ignoring...")
            return True

        # Set default tags if needed
        default_tags = self.config.get("default_tags", [])
        if default_tags:
            if "tags" not in package_dict:
                package_dict["tags"] = []
            package_dict["tags"].extend(
                [t for t in default_tags if t not in package_dict["tags"]]
            )

        remote_groups = self.config.get("remote_groups", None)
        if remote_groups not in ("only_local", "create"):
            # Ignore remote groups
            package_dict.pop("groups", None)
        else:
            if "groups" not in package_dict:
                package_dict["groups"] = []

            # check if remote groups exist locally, otherwise remove
            validated_groups = []
            for group_name in package_dict["groups"]:
                try:
                    data_dict = {"id": group_name}
                    group = get_action("group_show")(context, data_dict)
                    if self.api_version == 1:
                        validated_groups.append(group["name"])
                    else:
                        validated_groups.append(group["id"])
                except NotFound as e:
                    log.info("Group %s is not available" % group_name)
                    if remote_groups == "create":
                        try:
                            group = self._get_group(
                                harvest_object.source.url, group_name
                            )
                        except RemoteResourceError:
                            log.error("Could not get remote group %s" % group_name)
                            continue
                        # Strip fields that would clash on creation.
                        for key in [
                            "packages",
                            "created",
                            "users",
                            "groups",
                            "tags",
                            "extras",
                            "display_name",
                        ]:
                            group.pop(key, None)
                        get_action("group_create")(context, group)
                        log.info("Group %s has been newly created" % group_name)
                        if self.api_version == 1:
                            validated_groups.append(group["name"])
                        else:
                            validated_groups.append(group["id"])

            package_dict["groups"] = validated_groups

        # Local harvest source organization
        source_dataset = get_action("package_show")(
            context, {"id": harvest_object.source.id}
        )
        local_org = source_dataset.get("owner_org")

        remote_orgs = self.config.get("remote_orgs", None)
        if remote_orgs not in ("only_local", "create"):
            # Assign dataset to the source organization
            package_dict["owner_org"] = local_org
        else:
            if "owner_org" not in package_dict:
                package_dict["owner_org"] = None

            # check if remote org exist locally, otherwise remove
            validated_org = None
            remote_org = package_dict["owner_org"]

            if remote_org:
                try:
                    data_dict = {"id": remote_org}
                    org = get_action("organization_show")(context, data_dict)
                    validated_org = org["id"]
                except NotFound as e:
                    log.info("Organization %s is not available" % remote_org)
                    if remote_orgs == "create":
                        try:
                            try:
                                org = self._get_organization(
                                    harvest_object.source.url, remote_org
                                )
                            except RemoteResourceError:
                                # fallback if remote CKAN exposes organizations as groups
                                # this especially targets older versions of CKAN
                                org = self._get_group(
                                    harvest_object.source.url, remote_org
                                )
                            for key in [
                                "packages",
                                "created",
                                "users",
                                "groups",
                                "tags",
                                "extras",
                                "display_name",
                                "type",
                            ]:
                                org.pop(key, None)
                            get_action("organization_create")(context, org)
                            log.info(
                                "Organization %s has been newly created" % remote_org
                            )
                            validated_org = org["id"]
                        except (RemoteResourceError, ValidationError):
                            log.error("Could not get remote org %s" % remote_org)

            package_dict["owner_org"] = validated_org or local_org

        # Set default groups if needed
        default_groups = self.config.get("default_groups", [])
        if default_groups:
            if "groups" not in package_dict:
                package_dict["groups"] = []
            package_dict["groups"].extend(
                [g for g in default_groups if g not in package_dict["groups"]]
            )

        # FIXME: enable only if not using ckanext-scheming dataset schemas
        # handle extras in harvested schema.  Kept disabled (fully
        # commented out) for reference:
        #
        # # Find any extras whose values are not strings and try to convert
        # # them to strings, as non-string extras are not allowed anymore in
        # # CKAN 2.0.
        # for key in package_dict['extras'].keys():
        #     if not isinstance(package_dict['extras'][key], basestring):
        #         try:
        #             package_dict['extras'][key] = json.dumps(
        #                 package_dict['extras'][key])
        #         except TypeError:
        #             # If converting to a string fails, just delete it.
        #             del package_dict['extras'][key]
        #
        # # Set default extras if needed
        # default_extras = self.config.get('default_extras', {})
        # if default_extras:
        #     override_extras = self.config.get('override_extras', False)
        #     if not 'extras' in package_dict:
        #         package_dict['extras'] = {}
        #     for key, value in default_extras.iteritems():
        #         if not key in package_dict['extras'] or override_extras:
        #             # Look for replacement strings
        #             if isinstance(value, basestring):
        #                 value = value.format(
        #                     harvest_source_id=harvest_object.job.source.id,
        #                     harvest_source_url=harvest_object.job.source.url.strip('/'),
        #                     harvest_source_title=harvest_object.job.source.title,
        #                     harvest_job_id=harvest_object.job.id,
        #                     harvest_object_id=harvest_object.id,
        #                     dataset_id=package_dict['id'])
        #             package_dict['extras'][key] = value

        # Clear remote url_type for resources (eg datastore, upload) as we
        # are only creating normal resources with links to the remote ones
        for resource in package_dict.get("resources", []):
            resource.pop("url_type", None)

        result = self._create_or_update_package(package_dict, harvest_object)

        if result and self.config.get("read_only", False) == True:
            package = model.Package.get(package_dict["id"])

            # Clear default permissions
            model.clear_user_roles(package)

            # Setup harvest user as admin
            user_name = self.config.get("user", "harvest")
            user = model.User.get(user_name)
            pkg_role = model.PackageRole(
                package=package, user=user, role=model.Role.ADMIN
            )

            # Other users can only read
            for user_name in ("visitor", "logged_in"):
                user = model.User.get(user_name)
                pkg_role = model.PackageRole(
                    package=package, user=user, role=model.Role.READER
                )

        return True
    except ValidationError as e:
        self._save_object_error(
            "Invalid package with GUID %s: %r" % (harvest_object.guid, e.error_dict),
            harvest_object,
            "Import",
        )
    except Exception as e:
        self._save_object_error("%r" % e, harvest_object, "Import")
def _get_search_params(cls, request_params): if request_params.has_key('qjson'): try: params = json.loads(request_params['qjson'], encoding='utf8') except ValueError, e: raise ValueError, gettext('Malformed qjson value') + ': %r' % e
def import_stage(self, harvest_object): '''The import_stage contains lots of boiler plate, updating the harvest_objects correctly etc, so inherit this method and customize the get_package_dict method. * HOExtra.status should have been set to 'new_or_changed' or 'deleted' in the gather or fetch stages. * It follows that checking that the metadata date has changed should have been done in the gather or fetch stages * harvest_object.source.config can control default additions to the package, for extras etc ''' log.debug('Import stage for harvest object: %s', harvest_object.id) if not harvest_object: # something has gone wrong with the code log.error('No harvest object received') self._save_object_error('System error') return False if harvest_object.content is None: # fetched object is blank - error with the harvested server self._save_object_error( 'Empty content for object %s' % harvest_object.id, harvest_object, 'Import') return False source_config = json.loads(harvest_object.source.config or '{}') def get_extra(extras, key): for extra in extras: if extra.key == key: return extra.value return 'new' status = get_extra(harvest_object.extras, 'status') if not status in ['new', 'changed', 'new_or_changed', 'deleted']: log.error('Status is not set correctly: %r', status) self._save_object_error('System error', harvest_object, 'Import') return False # Get the last harvested object (if any) previous_object = \ model.Session.query(HarvestObject) \ .filter(HarvestObject.guid == harvest_object.guid) \ .filter(HarvestObject.current == True) \ .first() # Fix the obscure cases where the last harvested object is disconnected # from its package # i.e. 
harvest_object where current = true and package_id is null if previous_object and not previous_object.package_id: pkg = model.Session.query(model.Package) \ .filter_by(state='active') \ .join(model.PackageExtra) \ .filter_by(state='active') \ .filter_by(key='guid') \ .filter_by(value=harvest_object.guid) \ .first() if pkg: previous_object.package_id = pkg.id log.info( 'Previous harvest object %s had no package_id - ' 'have fixed with package: %s', previous_object.id, pkg.name) else: log.warning( 'Previous harvest object %s has no package_id - ' 'could not fix by finding GUID %r', previous_object.id, harvest_object.guid) user = self._get_user_name() context = { 'model': model, 'session': model.Session, 'user': user, 'api_version': 3, 'extras_as_string': True } if status == 'delete': # Delete package tk.get_action('package_delete')(context.copy(), { 'id': harvest_object.package_id }) log.info('Deleted package {0} with guid {1}'.format( harvest_object.package_id, harvest_object.guid)) previous_object.save() self._transfer_current(previous_object, harvest_object) return True # Set defaults for the package_dict, mainly from the source_config package_dict_defaults = PackageDictDefaults() package_id = previous_object.package_id if previous_object else None package_dict_defaults['id'] = package_id or unicode(uuid.uuid4()) existing_dataset = model.Package.get(package_id) if existing_dataset: package_dict_defaults['name'] = existing_dataset.name if existing_dataset and existing_dataset.owner_org: package_dict_defaults['owner_org'] = existing_dataset.owner_org else: source_dataset = tk.get_action('package_show')( context.copy(), { 'id': harvest_object.source.id }) package_dict_defaults['owner_org'] = source_dataset.get( 'owner_org') package_dict_defaults['tags'] = source_config.get('default_tags', []) package_dict_defaults['groups'] = source_config.get( 'default_groups', []) package_dict_defaults['extras'] = { 'import_source': 'harvest', # to identify all harvested datasets 
'harvest_object_id': harvest_object.id, 'guid': harvest_object.guid, 'metadata-date': harvest_object.metadata_modified_date.strftime('%Y-%m-%d') if harvest_object.metadata_modified_date else None, # Add provenance for this harvest, so at least that info is saved # even if the harvester doesn't fill it in properly with get_provenance(). 'metadata_provenance': self.get_metadata_provenance(harvest_object, harvested_provenance=None), } default_extras = source_config.get('default_extras', {}) if default_extras: env = dict( harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source.url.strip('/'), harvest_source_title=harvest_object.job.source.title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict_defaults['id']) for key, value in default_extras.iteritems(): # Look for replacement strings if isinstance(value, basestring): value = value.format(env) package_dict_defaults['extras'][key] = value if existing_dataset: extras_kept = set( pylons.config.get('ckan.harvest.extras_not_overwritten', '').split(' ')) for extra_key in extras_kept: if extra_key in existing_dataset.extras: package_dict_defaults['extras'][extra_key] = \ existing_dataset.extras.get(extra_key) if status in ('new', 'changed', 'new_or_changed'): # There are 2 circumstances that the status is wrong: # 1. we are using 'paster import' to reimport this object, yet # status is still 'new' from the previous harvest, yet it needs to # be 'changed' so that it does a package_update(). # 2. the first harvest excepted, so status is 'new' because the # harvest_object is there, but no package was created. # Simplest solution is to set it according to whether there is an # existing dataset. 
status = 'changed' if existing_dataset else 'new' # FIXME URGENTLY # harvest_object.extras # harvest_object.set_extra('status', status) harvest_object.save() try: package_dict = self.get_package_dict(harvest_object, package_dict_defaults, source_config, existing_dataset) except PackageDictError, e: log.error('Harvest PackageDictError in get_package_dict %s %r', e, harvest_object) self._save_object_error('Error converting to dataset: %s' % e, harvest_object, 'Import') return False
query['tie'] = '0.1' # this minimum match is explained # http://wiki.apache.org/solr/DisMaxQParserPlugin#mm_.28Minimum_.27Should.27_Match.29 query['mm'] = '2<-1 5<80%' query['qf'] = query.get('qf', QUERY_FIELDS) conn = make_connection() log.debug('Package query: %r' % query) try: solr_response = conn.raw_query(**query) except SolrException, e: raise SearchError( 'SOLR returned an error running query: %r Error: %r' % (query, e.reason)) try: data = json.loads(solr_response) response = data['response'] self.count = response.get('numFound', 0) self.results = response.get('docs', []) # #1683 Filter out the last row that is sometimes out of order self.results = self.results[:rows_to_return] # get any extras and add to 'extras' dict for result in self.results: extra_keys = filter(lambda x: x.startswith('extras_'), result.keys()) extras = {} for extra_key in extra_keys: value = result.pop(extra_key) extras[extra_key[len('extras_'):]] = value
# NOTE(review): fragment - interior of a classmethod that extracts and
# decodes a JSON request body; its enclosing 'def' is not present in
# this chunk.
try:
    # only POST/PUT requests carry a body
    if request.method in ['POST', 'PUT']:
        request_data = request.body
    else:
        request_data = None
except Exception, inst:
    msg = "Could not extract request body data: %s" % \
          (inst)
    raise ValueError(msg)
cls.log.debug('Retrieved request body: %r' % request.body)
if not request_data:
    msg = "No request body data"
    raise ValueError(msg)
if request_data:
    try:
        # encoding='utf8' is the Python 2 json.loads signature
        request_data = json.loads(request_data, encoding='utf8')
    except ValueError, e:
        raise ValueError('Error decoding JSON data. '
                         'Error: %r '
                         'JSON data extracted from the request: %r' %
                         (e, request_data))
    if not isinstance(request_data, dict):
        raise ValueError('Request data JSON decoded to %r but '
                         'it needs to be a dictionary.' % request_data)
    # ensure unicode values
    for key, val in request_data.items():
        # if val is str then assume it is ascii, since json converts
        # utf8 encoded JSON to unicode
        request_data[key] = cls._make_unicode(val)
cls.log.debug('Request data extracted: %r' % request_data)
return request_data
# NOTE(review): fragment - the leading 'else:' belongs to a method whose
# start is not present in this chunk.
else:
    return self._finish_not_found(
        gettext('Unknown register: %s') % register)

@classmethod
def _get_search_params(cls, request_params):
    """Extract search parameters from the request.

    Accepts either a 'qjson' parameter, a single '{json}' style key, or
    plain request params; raises ValueError when the result is not a
    dictionary-like object.
    """
    if request_params.has_key('qjson'):
        try:
            params = json.loads(request_params['qjson'], encoding='utf8')
        except ValueError, e:
            raise ValueError, gettext('Malformed qjson value') + ': %r' % e
    elif len(request_params) == 1 and \
        len(request_params.values()[0]) < 2 and \
        request_params.keys()[0].startswith('{'):
        # e.g. {some-json}='1' or {some-json}=''
        params = json.loads(request_params.keys()[0], encoding='utf8')
    else:
        # fall back to the raw request params mapping
        params = request_params
    if not isinstance(params, (UnicodeMultiDict, dict)):
        raise ValueError, _(
            'Request params must be in form of a json encoded dictionary.')
    return params

def markdown(self, ver=None):
    # Render the 'q' request parameter from Markdown to HTML.
    raw_markdown = request.params.get('q', '')
    results = ckan.misc.MarkdownFormat().to_html(raw_markdown)
    return self._finish_ok(results)

def tag_counts(self, ver=None):
    # NOTE(review): truncated - the rest of this method is not present
    # in this chunk.
    c.q = request.params.get('q', '')
def import_stage(self, harvest_object):
    """Convert a harvested London Datastore row (JSON in
    harvest_object.content) into a CKAN package dict.

    Errors are recorded against the harvest object via
    _save_object_error rather than raised.
    """
    if not harvest_object:
        # something has gone wrong with the harvesting code itself
        log.error('No harvest object received')
        return False
    if harvest_object.content is None:
        # the fetch stage produced no content for this object
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False
    try:
        row = json.loads(harvest_object.content)

        def csplit(txt):
            # split a comma-separated field into stripped values
            return [t.strip() for t in txt.split(",")]

        # Map the source row's columns onto a CKAN package dict.
        # NOTE(review): most keys are upper-case column names but
        # 'license_details' and 'spatial_ref' are lower-case - confirm
        # against the source schema.
        package_dict = {
            'title': row['TITLE'],
            'url': row['URL'],
            'notes': row['LONGDESC'],
            'author': row['AUTHOR_NAME'],
            'maintainer': row['MAINTAINER'],
            'maintainer_email': row['MAINTAINER_EMAIL'],
            'tags': csplit(row['TAGS']),
            'license_id': 'ukcrown',
            'extras': {
                'date_released': row['RELEASE_DATE'],
                'categories': csplit(row['CATEGORIES']),
                'geographical_granularity': row['GEOGRAPHY'],
                'geographical_coverage': row['EXTENT'],
                'temporal_granularity': row['UPDATE_FREQUENCY'],
                'temporal_coverage': row['DATE_RANGE'],
                'license_summary': row['LICENSE_SUMMARY'],
                'license_details': row['license_details'],
                'spatial_reference_system': row['spatial_ref'],
                'harvest_dataset_url': row['DATASTORE_URL'],
                # Common extras
                'harvest_catalogue_name': 'London Datastore',
                'harvest_catalogue_url': 'http://data.london.gov.uk',
                'eu_country': 'UK',
                'eu_nuts1': 'UKI'
            },
            'resources': []
        }

        def pkg_format(prefix, mime_type):
            # add a resource for the given format when the row carries a
            # <PREFIX>_URL value
            if row.get(prefix + "_URL"):
                package_dict['resources'].append({
                    'url': row.get(prefix + "_URL"),
                    'format': mime_type,
                    'description': "%s version" % prefix.lower()
                })

        pkg_format('EXCEL', 'application/vnd.ms-excel')
        pkg_format('CSV', 'text/csv')
        pkg_format('TAB', 'text/tsv')
        pkg_format('XML', 'text/xml')
        pkg_format('GOOGLEDOCS', 'api/vnd.google-spreadsheet')
        pkg_format('JSON', 'application/json')
        pkg_format('SHP', 'application/octet-stream+esri')
        pkg_format('KML', 'application/vnd.google-earth.kml+xml')
        # NOTE(review): package_dict is built but nothing visible here
        # creates or updates the package - the rest of this method
        # appears to be truncated in this copy.
    except Exception, e:
        log.exception(e)
        self._save_object_error('%r' % e, harvest_object, 'Import')
def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None): '''Does a dataset search on a remote CKAN and returns the results. Deals with paging to return all the results, not just the first page. ''' base_search_url = remote_ckan_base_url + self._get_search_api_offset() params = {'rows': '100', 'start': '0'} # There is the worry that datasets will be changed whilst we are paging # through them. # * In SOLR 4.7 there is a cursor, but not using that yet # because few CKANs are running that version yet. # * However we sort, then new names added or removed before the current # page would cause existing names on the next page to be missed or # double counted. # * Another approach might be to sort by metadata_modified and always # ask for changes since (and including) the date of the last item of # the day before. However if the entire page is of the exact same # time, then you end up in an infinite loop asking for the same page. # * We choose a balanced approach of sorting by ID, which means # datasets are only missed if some are removed, which is far less # likely than any being added. If some are missed then it is assumed # they will harvested the next time anyway. When datasets are added, # we are at risk of seeing datasets twice in the paging, so we detect # and remove any duplicates. params['sort'] = 'id asc' if fq_terms: params['fq'] = ' '.join(fq_terms) pkg_dicts = [] pkg_ids = set() previous_content = None while True: url = base_search_url + '?' + urllib.urlencode(params) log.debug('Searching for CKAN datasets: %s', url) try: content = self._get_content(url) except ContentFetchError, e: raise SearchError('Error sending request to search remote ' 'CKAN instance %s using URL %r. Error: %s' % (remote_ckan_base_url, url, e)) if previous_content and content == previous_content: raise SearchError('The paging doesn\'t seem to work. 
URL: %s' % url) try: response_dict = json.loads(content) except ValueError: raise SearchError( 'Response from remote CKAN was not JSON: %r' % content) try: pkg_dicts_page = response_dict.get('result', {}).get('results', []) except ValueError: raise SearchError('Response JSON did not contain ' 'result/results: %r' % response_dict) # Weed out any datasets found on previous pages (should datasets be # changing while we page) ids_in_page = set(p['id'] for p in pkg_dicts_page) duplicate_ids = ids_in_page & pkg_ids if duplicate_ids: pkg_dicts_page = [ p for p in pkg_dicts_page if p['id'] not in duplicate_ids ] pkg_ids |= ids_in_page pkg_dicts.extend(pkg_dicts_page) if len(pkg_dicts_page) == 0: break params['start'] = str(int(params['start']) + int(params['rows']))
class PackageSearchQuery(SearchQuery): def get_all_entity_ids(self, max_results=1000): """ Return a list of the IDs of all indexed packages. """ query = "*:*" fq = "+site_id:\"%s\" " % config.get('ckan.site_id') fq += "+state:active " conn = make_connection() try: data = conn.query(query, fq=fq, rows=max_results, fields='id') finally: conn.close() return [r.get('id') for r in data.results] def run(self, query): ''' Performs a dataset search using the given query. @param query - dictionary with keys like: q, fq, sort, rows, facet @return - dictionary with keys results and count May raise SearchQueryError or SearchError. ''' from solr import SolrException assert isinstance(query, (dict, MultiDict)) # check that query keys are valid if not set(query.keys()) <= VALID_SOLR_PARAMETERS: invalid_params = [s for s in set(query.keys()) - VALID_SOLR_PARAMETERS] raise SearchQueryError("Invalid search parameters: %s" % invalid_params) # default query is to return all documents q = query.get('q') if not q or q == '""' or q == "''": query['q'] = "*:*" # number of results query['rows'] = min(1000, int(query.get('rows', 10))) # order by score if no 'sort' term given order_by = query.get('sort') if order_by == 'rank' or order_by is None: query['sort'] = 'score desc, name asc' # show only results from this CKAN instance fq = query.get('fq', '') if not '+site_id:' in fq: fq += ' +site_id:"%s"' % config.get('ckan.site_id') # filter for package status if not '+state:' in fq: fq += " +state:active" query['fq'] = fq # faceting query['facet'] = query.get('facet', 'true') query['facet.limit'] = query.get('facet.limit', config.get('search.facets.limit', '50')) query['facet.mincount'] = query.get('facet.mincount', 1) # return the package ID and search scores query['fl'] = query.get('fl', 'name') # return results as json encoded string query['wt'] = query.get('wt', 'json') # query field weighting: disabled for now as solr 3.* is required for # the 'edismax' query parser, our current Ubuntu 
version only has # packages for 1.4 # # query['defType'] = 'edismax' # query['tie'] = '0.5' # query['qf'] = query.get('qf', QUERY_FIELDS) conn = make_connection() log.debug('Package query: %r' % query) try: solr_response = conn.raw_query(**query) except SolrException, e: raise SearchError('SOLR returned an error running query: %r Error: %r' % (query, e.reason)) try: data = json.loads(solr_response) response = data['response'] self.count = response.get('numFound', 0) self.results = response.get('docs', []) # get any extras and add to 'extras' dict for result in self.results: extra_keys = filter(lambda x: x.startswith('extras_'), result.keys()) extras = {} for extra_key in extra_keys: value = result.pop(extra_key) extras[extra_key[len('extras_'):]] = value if extra_keys: result['extras'] = extras # if just fetching the id or name, return a list instead of a dict if query.get('fl') in ['id', 'name']: self.results = [r.get(query.get('fl')) for r in self.results] # get facets and convert facets list to a dict self.facets = data.get('facet_counts', {}).get('facet_fields', {}) for field, values in self.facets.iteritems(): self.facets[field] = dict(zip(values[0::2], values[1::2])) except Exception, e: log.exception(e) raise SearchError(e)
def _getjson(self): return json.loads(self.body)