Example no. 1
    def data(self):
        # Get the Europe dataset
        rootdir = get_root_dir()
        data_file = os.path.join(rootdir, 'ckanext', 'offenedaten', 'data', 'eu.json')
        f = open(data_file, 'r')
        o = json.load(f)
        f.close()

        # Get the package count by country
        q = Session.query(
                distinct(PackageExtra.value),
                func.count(PackageExtra.value)
            ).\
                filter(PackageExtra.key == u'eu_country').\
                group_by(PackageExtra.value)

        values = dict(q.all())
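        # `values` presumably maps each distinct 'eu_country' value (expected to
        # match the NUTS codes used in eu.json) to its dataset count,
        # e.g. {u'DE': 120, u'FR': 45}.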
        # Set the package count for each country
        
        for ft in o['features']:
            code = ft['properties']['NUTS']
            ft['properties']['packages'] = (values.get(code, 0))

        response.content_type = 'application/json'
        response.pragma = None
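        # Note: the value set below is nonstandard; standard Cache-Control
        # syntax would be 'public, max-age=3600'.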
        response.cache_control = 'public; max-age: 3600'
        response.cache_expires(seconds=3600)
        return json.dumps(o)
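For reference, a minimal standalone sketch of the enrichment step performed above; the values dict and FeatureCollection below are hypothetical stand-ins for the query result and eu.json:

import json

# Hypothetical stand-ins for the query result and the eu.json FeatureCollection.
values = {u'DE': 120, u'FR': 45}
o = {'features': [{'properties': {'NUTS': u'DE'}},
                  {'properties': {'NUTS': u'IT'}}]}

for ft in o['features']:
    code = ft['properties']['NUTS']
    ft['properties']['packages'] = values.get(code, 0)  # 0 for countries with no datasets

print json.dumps(o)  # every feature now carries a 'packages' count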
Example no. 2
    def data(self):
        # Get the Europe dataset
        rootdir = get_root_dir()
        data_file = os.path.join(rootdir, 'ckanext', 'pdeu', 'data', 'eu.json')
        f = open(data_file, 'r')
        o = json.load(f)
        f.close()

        # Get the package count by country
        q = Session.query(
                distinct(PackageExtra.value),
                func.count(PackageExtra.value)
            ).\
                filter(PackageExtra.key == u'eu_country').\
                group_by(PackageExtra.value)

        values = dict(q.all())
        # Set the package count for each country

        for ft in o['features']:
            code = ft['properties']['NUTS']
            ft['properties']['packages'] = (values.get(code, 0))

        response.content_type = 'application/json'
        response.pragma = None
        response.cache_control = 'public; max-age: 3600'
        response.cache_expires(seconds=3600)
        return json.dumps(o)
Example no. 3
 def test_dump(self):
     assert os.path.exists(self.outpath) 
     dumpeddata = json.load(open(self.outpath))
     assert dumpeddata['version'] == ckan.__version__
     tables = dumpeddata.keys()
     for key in ['Package', 'Tag', 'Group', 'PackageGroup', 'PackageExtra']:
         assert key in tables, '%r not in %s' % (key, tables)
     for key in ['User']:
         assert key not in tables, '%s should not be in %s' % (key, tables)
     assert len(dumpeddata['Package']) == 2, len(dumpeddata['Package'])
     assert len(dumpeddata['Tag']) == 3, len(dumpeddata['Tag'])
     assert len(dumpeddata['PackageRevision']) == 2, len(dumpeddata['PackageRevision'])
     assert len(dumpeddata['Group']) == 2, len(dumpeddata['Group'])
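The assertions above imply a dump file shaped roughly as follows; this is a hypothetical minimal fixture (a real dump contains full record dicts rather than empty placeholders):

import json
import ckan

# Hypothetical minimal dump that would satisfy the assertions in test_dump.
dump = {
    'version': ckan.__version__,    # compared against the running CKAN version
    'Package': [{}, {}],            # exactly two packages
    'PackageRevision': [{}, {}],
    'Tag': [{}, {}, {}],            # exactly three tags
    'Group': [{}, {}],
    'PackageGroup': [{}],
    'PackageExtra': [{}],
    # no 'User' key: user data must not appear in the dump
}
json.dump(dump, open('/tmp/dump.json', 'w'))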
Example no. 4
 def test_dump(self):
     assert os.path.exists(self.outpath)
     dumpeddata = json.load(open(self.outpath))
     assert dumpeddata['version'] == ckan.__version__
     tables = dumpeddata.keys()
     for key in ['Package', 'Tag', 'Group', 'PackageGroup', 'PackageExtra']:
         assert key in tables, '%r not in %s' % (key, tables)
     for key in ['User']:
         assert key not in tables, '%s should not be in %s' % (key, tables)
     assert len(dumpeddata['Package']) == 2, len(dumpeddata['Package'])
     assert len(dumpeddata['Tag']) == 3, len(dumpeddata['Tag'])
     assert len(dumpeddata['PackageRevision']) == 2, len(
         dumpeddata['PackageRevision'])
     assert len(dumpeddata['Group']) == 2, len(dumpeddata['Group'])
Example no. 5
    def fetch_stage(self, harvest_object):
        '''
        Fetches the metadata of a single dataset from the catalogue.
        '''
        log.debug('In CustomHarvester fetch_stage')

        self._set_config(harvest_object.job.source.config)
        db = client.odm
        db_jobs = db.jobs
        config = db_jobs.find_one({"cat_url": harvest_object.source.url})
        api_key = config['apikey']
        dataset_url = config['dataset_url']
        datasets_list_url = config['datasets_list_url']
        metadata_mappings = json.loads(config['metadata_mappings'])

        # Build the list of catalogue listing URLs. data.norge.no paginates its
        # DCAT feed, so its pages are listed explicitly; every other catalogue
        # exposes a single listing URL.
        many_datasets_list = []
        if "data.norge.no" in harvest_object.source.url.rstrip('/'):
            many_datasets_list = ['/api/dcat/data.json?page=1',
                                  '/api/dcat/data.json?page=2',
                                  '/api/dcat/data.json?page=3',
                                  '/api/dcat/data.json?page=4']
        else:
            many_datasets_list.append(datasets_list_url)

        # Per-dataset endpoint, with the API key and dataset id substituted in.
        if dataset_url != "":
            fetch_url = harvest_object.source.url.rstrip('/') + \
                dataset_url.replace("{api}", api_key).replace("{id}", harvest_object.guid)
            #print(fetch_url)
        else:
            fetch_url = ""

        dataset = {}
        features = []

        if fetch_url != "":
            result = urllib2.urlopen(fetch_url)
            try:
                try:
                    dataset = json.load(result)
                except:
                    try:
                        # Retry with an explicit Accept header in case the
                        # endpoint needs content negotiation.
                        headers = {'Accept': 'application/json'}
                        r = urllib2.Request(fetch_url, headers=headers)
                        dataset = json.loads(urllib2.urlopen(r).read())
                    except:
                        # Some endpoints wrap the JSON in a JSONP-style
                        # "null(...)" callback; strip the wrapper and exec the
                        # resulting assignment into `dataset`.
                        result = urllib2.urlopen(fetch_url)
                        read = result.read()
                        read = read.replace("null(", "dataset=").rstrip(')')
                        exec(read)
                #print(dataset)
            except Exception, e:
                log.exception('Could not load ' + fetch_url)
                self._save_gather_error('%r' % e.message, harvest_object)
(The method continues below, after the CustomHarvester gather_stage.)
Example no. 6
 def gather_stage(self, harvest_job):
     log.debug('In SocrataHarvester gather_stage')
     base_url = harvest_job.source.url.strip("/")
     limit = 20
     ids = []
     for page in count():
         url = base_url + "/api/search/views.json?q=&limit=%s&page=%s" % (limit, page+1)
         print "URL", url
         indexfh = urllib2.urlopen(url)
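         # The search endpoint evidently returns a JSON list whose first element
         # carries the 'count' and 'results' fields (hence the [0] below).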
         result = json.load(indexfh)[0]
         indexfh.close()
         for res in result.get('results', []):
             id = res.get('id')
             obj = HarvestObject(guid=id, job=harvest_job,
                     content=json.dumps(res))
             obj.save()
             ids.append(obj.id)
         # Stop once we have paged past the reported total number of views.
         if (page+1)*limit > result.get('count'):
             break
     return ids
Example no. 7
    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (in our case, the /data.json file) for
        # a list of datasets to import.
        
        log.debug('In datajson harvester gather_stage (%s)' % harvest_job.source.url)

        source = json.load(urllib2.urlopen(harvest_job.source.url))
        if len(source) == 0: return None

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_datajson_identifier,
        # which corresponds to the /data.json 'identifier' field. Make a mapping
        # so we know how to update existing records.
        existing_datasets = { }
        for hobj in model.Session.query(HarvestObject).filter_by(source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), { "id": hobj.package_id })
            except:
                # reference is broken
                continue
            for extra in pkg["extras"]:
                if extra["key"] == "source_datajson_identifier":
                    existing_datasets[extra["value"]] = hobj.package_id
                    
        # If we've lost an association to the HarvestSource, scan all packages in the database.
        if False:
            for pkg in model.Session.query(Package):
                if pkg.extras.get("source_datajson_url") == harvest_job.source.url \
                    and pkg.extras.get("source_datajson_identifier"):
                        existing_datasets[pkg.extras["source_datajson_identifier"]] = pkg.id
                    
        # Create HarvestObjects for any records in the /data.json file.
            
        object_ids = []
        seen_datasets = set()
        
        for dataset in source:
            # Create a new HarvestObject for this identifier and save the
            # dataset metadata inside it for later.
            
            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.
            
            if dataset['identifier'] in existing_datasets:
                pkg_id = existing_datasets[dataset["identifier"]]
                seen_datasets.add(pkg_id)
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the /data.json file.
            obj = HarvestObject(
                guid=pkg_id,
                job=harvest_job,
                content=json.dumps(dataset))
            obj.save()
            object_ids.append(obj.id)
            
        # Remove packages no longer in the /data.json file.
        for id in existing_datasets.values():
            if id not in seen_datasets:
                log.warn('deleting package %s because it is no longer in %s' % (id, harvest_job.source.url))
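                # NOTE: this query is only constructed, never executed; see the
                # sketch after this example for one way to actually remove the package.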
                Session.query(Package).filter(Package.id == id)
            
        return object_ids
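As noted above, the cleanup loop builds a query but never deletes anything. A minimal sketch of one way to complete it, reusing the names from the method above and assuming CKAN's usual soft-delete convention (setting a package's state to 'deleted'); this is not part of the original harvester:

        # Hypothetical completion of the cleanup loop above.
        for pkg_id in existing_datasets.values():
            if pkg_id not in seen_datasets:
                pkg = Session.query(Package).get(pkg_id)
                if pkg is not None:
                    pkg.state = 'deleted'   # CKAN soft-deletes packages via their state field
        Session.commit()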
Example no. 8
    def gather_stage(self, harvest_job):
        log.debug('In CustomHarvester gather_stage (%s)' % harvest_job.source.url)
        get_all_packages = True

        # Read the harvest source configuration for this catalogue from MongoDB.
        db = client.odm
        db_jobs = db.jobs
        config = db_jobs.find_one({"cat_url": harvest_job.source.url})
        datasets_list_url = config['datasets_list_url']
        datasets_list_identifier = config['datasets_list_identifier']
        dataset_id = config['dataset_id']
        api_key = config['apikey']

        # Build the list of catalogue listing URLs (data.norge.no paginates its DCAT feed).
        many_datasets_list = []
        if "data.norge.no" in harvest_job.source.url.rstrip('/'):
            many_datasets_list = ['/api/dcat/data.json?page=1',
                                  '/api/dcat/data.json?page=2',
                                  '/api/dcat/data.json?page=3',
                                  '/api/dcat/data.json?page=4']
        else:
            many_datasets_list.append(datasets_list_url)

        # Download every listing page and accumulate the raw dataset records.
        j = 0
        all_datasets = []
        while j < len(many_datasets_list):
            url = harvest_job.source.url.rstrip('/') + many_datasets_list[j].replace('{api}', api_key)
            print(url)
            result = urllib2.urlopen(url)
            try:
                datasets = json.load(result)
                if datasets_list_identifier != "":
                    datasets = datasets[datasets_list_identifier]
            except:
                try:
                    headers = {'Accept': 'application/json'}
                    r = urllib2.Request(url, headers=headers)
                    datasets = json.loads(urllib2.urlopen(r).read())
                    if datasets_list_identifier != "":
                        datasets = datasets[datasets_list_identifier]
                except:
                    # JSONP-style "null(...)" wrapper: turn it into an assignment and exec it.
                    result = urllib2.urlopen(url)
                    read = result.read()
                    read = read.replace("null(", "datasets=").rstrip(')')
                    exec(read)
            count = 0
            while count < len(datasets):
                all_datasets.append(datasets[count])
                count += 1
            datasets[:] = []
            j += 1

        i = 0
        package_ids = []
        while i < len(all_datasets):
            package_ids.append(all_datasets[i][dataset_id])
            i += 1

        #print('****package ids****')
        #print(package_ids)
        #print(len(package_ids))

        ### load existing dataset names and ids from MongoDB
        datasets = list(custom_db.find({'catalogue_url': harvest_job.source.url}))
        datasets_ids = []
        datasets_names = []
        j = 0
        while j < len(datasets):
            datasets_ids.append(datasets[j]['id'])
            j += 1

        ### check for deleted datasets that still exist in MongoDB
        count_pkg_ids = 0
        while count_pkg_ids < len(package_ids):
            temp_pckg_id = package_ids[count_pkg_ids]
            if temp_pckg_id in datasets_ids:
                datasets_ids.remove(temp_pckg_id)
            count_pkg_ids += 1
        # Whatever remains in datasets_ids is no longer in the catalogue: flag it as deleted.
        if len(datasets_ids) > 0:
            j = 0
            while j < len(datasets_ids):
                i = 0
                while i < len(datasets):
                    if datasets_ids[j] in datasets[i]['id']:
                        document = datasets[i]
                        document.update({"deleted_dataset": True})
                        custom_db.save(document)
                    i += 1
                j += 1

        try:
            object_ids = []
            if len(package_ids):
                for package_id in package_ids:
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)

                return object_ids

            else:
                self._save_gather_error('No packages received for URL: %s' % url,
                                        harvest_job)
                return None
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)
(Continuation of the CustomHarvester fetch_stage shown in Example no. 5 above.)

        ## Case where the API does not return JSON per dataset but all datasets as a list
        else:
            #print(harvest_object.guid)
            datasets_list_url = config['datasets_list_url']
            datasets_list_identifier = config['datasets_list_identifier']
            api_key = config['apikey']
            dataset_id = config['dataset_id']
            j = 0
            while j < len(many_datasets_list):
                url = harvest_object.job.source.url.rstrip('/') + many_datasets_list[j].replace('{api}', api_key)
                result = urllib2.urlopen(url)

                datasets = json.load(result)
                if datasets_list_identifier != "":
                    datasets = datasets[datasets_list_identifier]
                # Pick out the record whose id matches this harvest object's guid.
                i = 0
                while i < len(datasets):
                    if datasets[i][dataset_id] == harvest_object.guid:
                        dataset = datasets[i]
                    i += 1
                j += 1

        content = {}
        db_jobs = db.jobs
        base_url = harvest_object.source.url
        #print(base_url)
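The "null(...)" handling in the CustomHarvester fetch_stage and gather_stage above relies on exec. A safer sketch of the same unwrapping, assuming the endpoint wraps its JSON in a null(...) callback (the helper name is hypothetical):

import json
import urllib2

def load_possibly_wrapped_json(url):
    # Fetch the body and parse it, tolerating a JSONP-style "null(...)" wrapper.
    body = urllib2.urlopen(url).read().strip()
    if body.startswith('null(') and body.endswith(')'):
        body = body[len('null('):-1]
    return json.loads(body)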
Example no. 9
    def gather_stage(self, harvest_job):
        # The gather stage scans a remote resource (in our case, the /data.json file) for
        # a list of datasets to import.

        log.debug('In datajson harvester gather_stage (%s)' %
                  harvest_job.source.url)

        source = json.load(urllib2.urlopen(harvest_job.source.url))
        if len(source) == 0: return None

        # Loop through the packages we've already imported from this source
        # and go into their extra fields to get their source_datajson_identifier,
        # which corresponds to the /data.json 'identifier' field. Make a mapping
        # so we know how to update existing records.
        existing_datasets = {}
        for hobj in model.Session.query(HarvestObject).filter_by(
                source=harvest_job.source, current=True):
            try:
                pkg = get_action('package_show')(self.context(), {
                    "id": hobj.package_id
                })
            except:
                # reference is broken
                continue
            for extra in pkg["extras"]:
                if extra["key"] == "source_datajson_identifier":
                    existing_datasets[extra["value"]] = hobj.package_id

        # If we've lost an association to the HarvestSource, scan all packages in the database.
        if False:
            for pkg in model.Session.query(Package):
                if pkg.extras.get("source_datajson_url") == harvest_job.source.url \
                    and pkg.extras.get("source_datajson_identifier"):
                    existing_datasets[
                        pkg.extras["source_datajson_identifier"]] = pkg.id

        # Create HarvestObjects for any records in the /data.json file.

        object_ids = []
        seen_datasets = set()

        for dataset in source:
            # Create a new HarvestObject for this identifier and save the
            # dataset metadata inside it for later.

            # Get the package_id of this resource if we've already imported
            # it into our system. Otherwise, assign a brand new GUID to the
            # HarvestObject. I'm not sure what the point is of that.

            if dataset['identifier'] in existing_datasets:
                pkg_id = existing_datasets[dataset["identifier"]]
                seen_datasets.add(pkg_id)
            else:
                pkg_id = uuid.uuid4().hex

            # Create a new HarvestObject and store in it the GUID of the
            # existing dataset (if it exists here already) and the dataset's
            # metadata from the /data.json file.
            obj = HarvestObject(guid=pkg_id,
                                job=harvest_job,
                                content=json.dumps(dataset))
            obj.save()
            object_ids.append(obj.id)

        # Remove packages no longer in the /data.json file.
        for id in existing_datasets.values():
            if id not in seen_datasets:
                log.warn('deleting package %s because it is no longer in %s' %
                         (id, harvest_job.source.url))
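                # NOTE: this query is only constructed, never executed; see the note
                # and sketch after the earlier copy of this method.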
                Session.query(Package).filter(Package.id == id)

        return object_ids