def data(self):
    # Get the Europe dataset
    rootdir = get_root_dir()
    data_file = os.path.join(rootdir, 'ckanext', 'offenedaten', 'data',
                             'eu.json')
    f = open(data_file, 'r')
    o = json.load(f)
    f.close()
    # Get the package count by country
    q = Session.query(
        distinct(PackageExtra.value),
        func.count(PackageExtra.value)
    ).\
        filter(PackageExtra.key == u'eu_country').\
        group_by(PackageExtra.value)
    values = dict(q.all())
    # Set the package count for each country
    for ft in o['features']:
        code = ft['properties']['NUTS']
        ft['properties']['packages'] = values.get(code, 0)
    response.content_type = 'application/json'
    response.pragma = None
    response.cache_control = 'public, max-age=3600'
    response.cache_expires(seconds=3600)
    return json.dumps(o)
def data(self):
    # Get the Europe dataset
    rootdir = get_root_dir()
    data_file = os.path.join(rootdir, 'ckanext', 'pdeu', 'data', 'eu.json')
    f = open(data_file, 'r')
    o = json.load(f)
    f.close()
    # Get the package count by country
    q = Session.query(
        distinct(PackageExtra.value),
        func.count(PackageExtra.value)
    ).\
        filter(PackageExtra.key == u'eu_country').\
        group_by(PackageExtra.value)
    values = dict(q.all())
    # Set the package count for each country
    for ft in o['features']:
        code = ft['properties']['NUTS']
        ft['properties']['packages'] = values.get(code, 0)
    response.content_type = 'application/json'
    response.pragma = None
    response.cache_control = 'public, max-age=3600'
    response.cache_expires(seconds=3600)
    return json.dumps(o)
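# Both data() variants above assume eu.json is a GeoJSON FeatureCollection
# whose features carry a NUTS country code in their properties. A minimal
# sketch of that shape and of the count merge; the codes and counts below
# are illustrative, not taken from the real file or database.
o = {
    "type": "FeatureCollection",
    "features": [
        {"type": "Feature",
         "properties": {"NUTS": "DE"},   # hypothetical country code
         "geometry": None},
        {"type": "Feature",
         "properties": {"NUTS": "FR"},
         "geometry": None},
    ],
}

# Stand-in for dict(q.all()): maps 'eu_country' extra values to package counts.
values = {"DE": 12}

for ft in o['features']:
    code = ft['properties']['NUTS']
    ft['properties']['packages'] = values.get(code, 0)

# o['features'][0]['properties'] -> {'NUTS': 'DE', 'packages': 12}
# o['features'][1]['properties'] -> {'NUTS': 'FR', 'packages': 0}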
def test_dump(self):
    assert os.path.exists(self.outpath)
    dumpeddata = json.load(open(self.outpath))
    assert dumpeddata['version'] == ckan.__version__
    tables = dumpeddata.keys()
    for key in ['Package', 'Tag', 'Group', 'PackageGroup', 'PackageExtra']:
        assert key in tables, '%r not in %s' % (key, tables)
    for key in ['User']:
        assert key not in tables, '%s should not be in %s' % (key, tables)
    assert len(dumpeddata['Package']) == 2, len(dumpeddata['Package'])
    assert len(dumpeddata['Tag']) == 3, len(dumpeddata['Tag'])
    assert len(dumpeddata['PackageRevision']) == 2, \
        len(dumpeddata['PackageRevision'])
    assert len(dumpeddata['Group']) == 2, len(dumpeddata['Group'])
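# For context, a sketch of the dump file shape that test_dump asserts: a
# top-level 'version' string plus one key per dumped table, each holding a
# list of records. Only the key names and counts come from the test; the
# version string and the empty record bodies are placeholders.
import json

dumpeddata = {
    "version": "1.5.1",            # stands in for ckan.__version__
    "Package": [{}, {}],           # 2 packages
    "PackageRevision": [{}, {}],   # 2 package revisions
    "Tag": [{}, {}, {}],           # 3 tags
    "Group": [{}, {}],             # 2 groups
    "PackageGroup": [{}],
    "PackageExtra": [{}],
    # no 'User' key: user data must not appear in the dump
}
print(json.dumps(dumpeddata, indent=2))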
def fetch_stage(self, harvest_object):
    '''
    Fetch the metadata of a single dataset from the catalogue
    '''
    log.debug('In CustomHarvester fetch_stage')
    self._set_config(harvest_object.job.source.config)

    # Per-catalogue settings are stored in the MongoDB 'jobs' collection.
    db = client.odm
    db_jobs = db.jobs
    config = db_jobs.find_one({"cat_url": harvest_object.source.url})
    api_key = config['apikey']
    dataset_url = config['dataset_url']
    datasets_list_url = config['datasets_list_url']
    metadata_mappings = json.loads(config['metadata_mappings'])

    if "data.norge.no" in harvest_object.source.url.rstrip('/'):
        many_datasets_list = ['/api/dcat/data.json?page=1',
                              '/api/dcat/data.json?page=2',
                              '/api/dcat/data.json?page=3',
                              '/api/dcat/data.json?page=4']
    else:
        many_datasets_list = [datasets_list_url]

    if dataset_url != "":
        fetch_url = harvest_object.source.url.rstrip('/') + \
            dataset_url.replace("{api}", api_key).replace("{id}", harvest_object.guid)
        #print(fetch_url)
    else:
        fetch_url = ""

    dataset = {}
    features = []
    if fetch_url != "":
        result = urllib2.urlopen(fetch_url)
        try:
            try:
                dataset = json.load(result)
            except:
                try:
                    # Retry with an explicit Accept header.
                    headers = {'Accept': 'application/json'}
                    r = urllib2.Request(fetch_url, headers=headers)
                    dataset = json.loads(urllib2.urlopen(r).read())
                except:
                    # Some endpoints wrap the JSON in a "null(...)" callback;
                    # turn it into an assignment and evaluate it.
                    result = urllib2.urlopen(fetch_url)
                    read = result.read()
                    read = read.replace("null(", "dataset=").rstrip(')')
                    exec(read)
            #print(dataset)
        except Exception, e:
            log.exception('Could not load ' + fetch_url)
            self._save_gather_error('%r' % e.message, harvest_object)
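# fetch_stage above (and gather_stage below) read their per-catalogue
# settings from a MongoDB 'jobs' collection keyed by the source URL. A
# sketch of the document they expect; only the key names are taken from the
# harvester code, the values are made up.
job_config = {
    "cat_url": "http://example-catalogue.org",       # harvest source URL
    "apikey": "SECRET-KEY",                          # substituted for {api}
    "dataset_url": "/api/datasets/{id}?key={api}",   # per-dataset endpoint
    "datasets_list_url": "/api/datasets?key={api}",  # full-list endpoint
    "datasets_list_identifier": "results",           # key holding the list, or ""
    "dataset_id": "id",                              # field used as the guid
    "metadata_mappings": "{}",                       # JSON string of field mappings
}
# The harvester looks this up with:
#   config = db_jobs.find_one({"cat_url": harvest_object.source.url})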
def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester gather_stage')
    base_url = harvest_job.source.url.strip("/")
    limit = 20
    ids = []
    for page in count():
        url = base_url + "/api/search/views.json?q=&limit=%s&page=%s" % (limit, page + 1)
        print "URL", url
        indexfh = urllib2.urlopen(url)
        result = json.load(indexfh)[0]
        indexfh.close()
        for res in result.get('results', []):
            id = res.get('id')
            obj = HarvestObject(guid=id, job=harvest_job,
                                content=json.dumps(res))
            obj.save()
            ids.append(obj.id)
        # Stop once we have paged past the reported total.
        if (page + 1) * limit > result.get('count'):
            break
    return ids
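# The pagination loop above assumes each /api/search/views.json page decodes
# to a structure whose first element is a dict with a total 'count' and a
# 'results' list of view records carrying an 'id'. A self-contained sketch
# with made-up pages showing how the loop terminates.
from itertools import count

limit = 20
# Stand-ins for json.load(indexfh)[0] on pages 1 and 2.
pages = [
    {"count": 25, "results": [{"id": "aaaa-%04d" % i} for i in range(20)]},
    {"count": 25, "results": [{"id": "bbbb-%04d" % i} for i in range(5)]},
]

ids = []
for page in count():
    result = pages[page]            # stands in for the HTTP round trip
    for res in result.get('results', []):
        ids.append(res.get('id'))
    if (page + 1) * limit > result.get('count'):
        break                       # 2 * 20 > 25, so we stop after page 2

print(len(ids))  # 25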
def gather_stage(self, harvest_job):
    # The gather stage scans a remote resource (in our case, the /data.json
    # file) for a list of datasets to import.

    log.debug('In datajson harvester gather_stage (%s)' % harvest_job.source.url)

    source = json.load(urllib2.urlopen(harvest_job.source.url))
    if len(source) == 0:
        return None

    # Loop through the packages we've already imported from this source
    # and go into their extra fields to get their source_datajson_identifier,
    # which corresponds to the /data.json 'identifier' field. Make a mapping
    # so we know how to update existing records.
    existing_datasets = {}
    for hobj in model.Session.query(HarvestObject).filter_by(
            source=harvest_job.source, current=True):
        try:
            pkg = get_action('package_show')(self.context(),
                                             {"id": hobj.package_id})
        except:
            # reference is broken
            continue
        for extra in pkg["extras"]:
            if extra["key"] == "source_datajson_identifier":
                existing_datasets[extra["value"]] = hobj.package_id

    # If we've lost an association to the HarvestSource, scan all packages
    # in the database.
    if False:
        for pkg in model.Session.query(Package):
            if pkg.extras.get("source_datajson_url") == harvest_job.source.url \
                    and pkg.extras.get("source_datajson_identifier"):
                existing_datasets[pkg.extras["source_datajson_identifier"]] = pkg.id

    # Create HarvestObjects for any records in the /data.json file.
    object_ids = []
    seen_datasets = set()
    for dataset in source:
        # Create a new HarvestObject for this identifier and save the
        # dataset metadata inside it for later.

        # Get the package_id of this resource if we've already imported
        # it into our system. Otherwise, assign a brand new GUID to the
        # HarvestObject. I'm not sure what the point is of that.
        if dataset['identifier'] in existing_datasets:
            pkg_id = existing_datasets[dataset["identifier"]]
            seen_datasets.add(pkg_id)
        else:
            pkg_id = uuid.uuid4().hex

        # Create a new HarvestObject and store in it the GUID of the
        # existing dataset (if it exists here already) and the dataset's
        # metadata from the /data.json file.
        obj = HarvestObject(guid=pkg_id, job=harvest_job,
                            content=json.dumps(dataset))
        obj.save()
        object_ids.append(obj.id)

    # Remove packages no longer in the /data.json file.
    for id in existing_datasets.values():
        if id not in seen_datasets:
            log.warn('deleting package %s because it is no longer in %s'
                     % (id, harvest_job.source.url))
            Session.query(Package).filter(Package.id == id).delete()

    return object_ids
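# The gather stage above treats the remote /data.json as a JSON array of
# dataset records; the only field this code relies on is 'identifier', which
# it mirrors into the source_datajson_identifier package extra. A sketch of
# such a record; fields other than 'identifier' are illustrative.
source = [
    {
        "identifier": "agency-dataset-0001",
        "title": "An example dataset",
        "description": "Placeholder record for illustration.",
        "keyword": ["example"],
    },
]

for dataset in source:
    print(dataset['identifier'])   # key used to match existing packages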
def gather_stage(self, harvest_job):
    log.debug('In CustomHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True

    db = client.odm
    db_jobs = db.jobs
    config = db_jobs.find_one({"cat_url": harvest_job.source.url})
    datasets_list_url = config['datasets_list_url']
    datasets_list_identifier = config['datasets_list_identifier']
    dataset_id = config['dataset_id']
    api_key = config['apikey']

    if "data.norge.no" in harvest_job.source.url.rstrip('/'):
        many_datasets_list = ['/api/dcat/data.json?page=1',
                              '/api/dcat/data.json?page=2',
                              '/api/dcat/data.json?page=3',
                              '/api/dcat/data.json?page=4']
    else:
        many_datasets_list = [datasets_list_url]

    j = 0
    all_datasets = []
    while j < len(many_datasets_list):
        url = harvest_job.source.url.rstrip('/') + \
            many_datasets_list[j].replace('{api}', api_key)
        print(url)
        result = urllib2.urlopen(url)
        try:
            datasets = json.load(result)
            if datasets_list_identifier != "":
                datasets = datasets[datasets_list_identifier]
        except:
            try:
                headers = {'Accept': 'application/json'}
                r = urllib2.Request(url, headers=headers)
                datasets = json.loads(urllib2.urlopen(r).read())
                if datasets_list_identifier != "":
                    datasets = datasets[datasets_list_identifier]
            except:
                # Some endpoints wrap the JSON in a "null(...)" callback;
                # turn it into an assignment and evaluate it.
                result = urllib2.urlopen(url)
                read = result.read()
                read = read.replace("null(", "datasets=").rstrip(')')
                exec(read)
        count = 0
        while count < len(datasets):
            all_datasets.append(datasets[count])
            count += 1
        datasets[:] = []
        j += 1

    i = 0
    package_ids = []
    while i < len(all_datasets):
        package_ids.append(all_datasets[i][dataset_id])
        i += 1
    #print('****package ids****')
    #print(package_ids)
    #print(len(package_ids))

    # Load existing dataset names and ids from MongoDB
    datasets = list(custom_db.find({'catalogue_url': harvest_job.source.url}))
    datasets_ids = []
    datasets_names = []
    j = 0
    while j < len(datasets):
        datasets_ids.append(datasets[j]['id'])
        j += 1

    # Check for deleted datasets that exist in mongo
    count_pkg_ids = 0
    while count_pkg_ids < len(package_ids):
        temp_pckg_id = package_ids[count_pkg_ids]
        if temp_pckg_id in datasets_ids:
            datasets_ids.remove(temp_pckg_id)
        count_pkg_ids += 1
    if len(datasets_ids) > 0:
        j = 0
        while j < len(datasets_ids):
            i = 0
            while i < len(datasets):
                if datasets_ids[j] in datasets[i]['id']:
                    document = datasets[i]
                    document.update({"deleted_dataset": True})
                    custom_db.save(document)
                i += 1
            j += 1

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: %s' % url,
                                    harvest_job)
            return None
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
            log.exception('Could not load ' + fetch_url)
            self._save_gather_error('%r' % e.message, harvest_object)
    # Case where the API does not return JSON per dataset, but all
    # datasets as a single list
    else:
        #print(harvest_object.guid)
        datasets_list_url = config['datasets_list_url']
        datasets_list_identifier = config['datasets_list_identifier']
        api_key = config['apikey']
        dataset_id = config['dataset_id']
        j = 0
        while j < len(many_datasets_list):
            url = harvest_object.job.source.url.rstrip('/') + \
                many_datasets_list[j].replace('{api}', api_key)
            result = urllib2.urlopen(url)
            datasets = json.load(result)
            if datasets_list_identifier != "":
                datasets = datasets[datasets_list_identifier]
            i = 0
            while i < len(datasets):
                if datasets[i][dataset_id] == harvest_object.guid:
                    dataset = datasets[i]
                i += 1
            j += 1
    content = {}
    db_jobs = db.jobs
    base_url = harvest_object.source.url
    #print(base_url)
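# Both stages expand the URL templates from the job configuration the same
# way: '{api}' is replaced with the catalogue API key and, for the
# per-dataset endpoint, '{id}' with the HarvestObject guid. A sketch with
# made-up values.
source_url = "http://example-catalogue.org/"
api_key = "SECRET-KEY"
dataset_url = "/api/datasets/{id}?key={api}"
datasets_list_url = "/api/datasets?key={api}"
guid = "abc-123"

fetch_url = source_url.rstrip('/') + \
    dataset_url.replace("{api}", api_key).replace("{id}", guid)
list_url = source_url.rstrip('/') + \
    datasets_list_url.replace("{api}", api_key)

print(fetch_url)  # http://example-catalogue.org/api/datasets/abc-123?key=SECRET-KEY
print(list_url)   # http://example-catalogue.org/api/datasets?key=SECRET-KEY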