Esempio n. 1
0
    def gather_stage(self, harvest_job):
        '''
        Collect the dataset identifiers published by a Socrata catalogue.

        The DCAT feed URL is derived from the harvest source URL and passed
        to ``socrataAdaptor.listDatasetIds``. One ``HarvestObject`` is saved
        for every returned identifier that does not contain ``"http"``.

        :param harvest_job: job whose ``source.url`` is the catalogue URL
        :returns: list with the ids of the saved harvest objects, or
                  ``None`` when no identifiers were received or an error
                  was recorded through ``_save_gather_error``
        '''
        source_url = harvest_job.source.url
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % source_url)

        dcatUrl = "%s/api/dcat.rdf" % source_url.rstrip('/')
        log.debug(dcatUrl)

        package_ids = socrataAdaptor().listDatasetIds(dcatUrl)
        log.debug('Package ids received: %r', package_ids)

        if not package_ids:
            # Report the URL that was actually queried; the previous code
            # referenced an undefined name here and raised NameError.
            self._save_gather_error('No packages received for URL: %s' %
                                    dcatUrl, harvest_job)
            return None

        try:
            object_ids = []
            for package_id in package_ids:
                # Identifiers containing "http" are skipped (presumably
                # they are links rather than dataset ids -- TODO confirm).
                if "http" in package_id:
                    continue
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        except Exception as e:
            # Exceptions have no ``message`` attribute on Python 3, so fall
            # back to the exception itself.
            self._save_gather_error('%r' % (getattr(e, 'message', e),),
                                    harvest_job)
            return None
Esempio n. 2
0
    def import_stage(self, harvest_object):
        '''
        Imports each dataset from Socrata, into the CKAN server

        The harvested content is converted to a package dict with
        ``socrataAdaptor.convertViewXml``, stored in the ``socrata_db``
        collection and then handed to ``_create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
                               dataset to convert
        :returns: ``True`` when the import ran to completion, ``False``
                  when the object is missing, has no content or fails
                  validation
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' %
                                    harvest_object.id,
                                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            d = socrataAdaptor()
            log.debug("Converting View")
            stripped_source = harvest_object.source.url.rstrip('/')
            package_dict = d.convertViewXml(harvest_object.id,
                                            stripped_source,
                                            harvest_object.content)

            package_dict.update({"catalogue_url": str(stripped_source)})
            package_dict.update({"platform": "socrata"})

            # Move the category into the extras, creating the extras
            # container when the adaptor did not provide one.
            if 'category' in package_dict:
                package_dict.setdefault('extras', {}).update(
                    {'category': package_dict.pop('category')})

            log.debug(package_dict)

            # ``ids`` may list an id whose document is no longer stored, so
            # the stored document itself decides between create and update.
            existing = None
            if package_dict['id'] in ids:
                existing = socrata_db.find_one({"id": package_dict['id']})

            if existing is None:
                package_dict.update({"metadata_created":
                                    str(datetime.datetime.now())})
                socrata_db.save(package_dict)
                log.info('Metadata saved succesfully to MongoDb.')
            else:
                # Keep the original creation date and flag the update.
                package_dict.update({
                    'metadata_created': existing['metadata_created'],
                    'metadata_updated': str(datetime.datetime.now()),
                    'updated_dataset': True,
                })
                socrata_db.remove({"id": package_dict['id']})
                socrata_db.save(package_dict)
                log.info('Metadata updated succesfully to MongoDb.')

            # Set default tags if needed
            default_tags = self.config.get('default_tags', [])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups', [])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,
                                                    harvest_object)

            # ``== True`` is kept on purpose: only True (or 1) enables the
            # read-only mode, other truthy config values do not.
            if result and self.config.get('read_only', False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user', u'harvest')
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor', u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)
            return True

        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' %
                                    (harvest_object.guid, e.error_dict),
                                    harvest_object, 'Import')
            log.debug("Validation Error: %s", harvest_object.guid)
            # Explicit falsy result, matching the other failure paths.
            return False
    def gather_stage(self,harvest_job):
        '''
        Collect the dataset identifiers published by a Socrata catalogue
        and flag stored datasets that the catalogue no longer lists.

        The identifiers come from ``socrataAdaptor.listDatasetIds`` for the
        catalogue's DCAT feed. Documents of this catalogue found in
        ``socrata_db`` whose id (or name) is missing from that list get
        ``deleted_dataset`` set to ``True``. One ``HarvestObject`` is then
        saved for every identifier that does not contain ``"http"``.

        :param harvest_job: job whose ``source.url`` is the catalogue URL
        :returns: list with the ids of the saved harvest objects, or
                  ``None`` when no identifiers were received or an error
                  was recorded through ``_save_gather_error``
        '''
        source_url = harvest_job.source.url
        log.debug('In SocrataHarvester 2 gather_stage (%s)' % source_url)
        catalogue_url = source_url.rstrip('/')

        dcatUrl = "%s/api/dcat.rdf" % catalogue_url
        log.debug(dcatUrl)

        package_ids = socrataAdaptor().listDatasetIds(dcatUrl)

        # Load the ids and names of the datasets already stored for this
        # catalogue.
        datasets = list(socrata_db.find({'catalogue_url': catalogue_url}))
        datasets_ids = [dataset['id'] for dataset in datasets]
        datasets_names = [dataset['name'] for dataset in datasets]

        # Cross out every stored id/name that the catalogue still lists;
        # whatever remains afterwards is no longer published.
        for package_id in package_ids:
            if package_id in datasets_ids:
                datasets_ids.remove(package_id)
            if package_id in datasets_names:
                datasets_names.remove(package_id)

        # The list that shrank is the one the received identifiers matched
        # against, so its leftovers identify the deleted datasets. Ids are
        # used when neither list (or both equally) shrank.
        if len(datasets_names) < len(datasets_ids):
            key, stale = 'name', set(datasets_names)
        else:
            key, stale = 'id', set(datasets_ids)

        # Exact comparison: a substring test would also flag datasets whose
        # id/name merely contains a stale value.
        for document in datasets:
            if document[key] in stale:
                document.update({"deleted_dataset": True})
                socrata_db.save(document)

        if not package_ids:
            # Report the URL that was actually queried; the previous code
            # referenced an undefined name here and raised NameError.
            self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                harvest_job)
            return None

        try:
            object_ids = []
            for package_id in package_ids:
                # Identifiers containing "http" are skipped (presumably
                # they are links rather than dataset ids -- TODO confirm).
                if "http" in package_id:
                    continue
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        except Exception as e:
            # Exceptions have no ``message`` attribute on Python 3, so fall
            # back to the exception itself.
            self._save_gather_error('%r' % (getattr(e, 'message', e),),
                                    harvest_job)
            return None
    def import_stage(self,harvest_object):
        '''
        Imports each dataset from Socrata, into the CKAN server

        The harvested content is converted to a package dict with
        ``socrataAdaptor.convertViewXml``, stored in the ``socrata_db``
        collection (as a new or an updated document), counted in the
        ``db_fetch_temp`` bookkeeping document and finally handed to
        ``_create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
                               dataset to convert
        :returns: ``True`` when the import ran to completion, ``False``
                  when the object is missing, has no content or fails
                  validation
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            catalogue_url = harvest_object.source.url.rstrip('/')
            mainurl = str(catalogue_url)

            # Best-effort lookup of the catalogue language in the job
            # document. The previous code queried with a name that is not
            # defined in this method and silently swallowed the failure,
            # so the language was always empty.
            language = ""
            try:
                job_doc = db_jobs.find_one({"cat_url": mainurl})
                language = job_doc['language']
            except Exception as e:
                log.debug('No language found for %s: %r', mainurl, e)

            d = socrataAdaptor()
            log.debug("Converting View")
            package_dict = d.convertViewXml(harvest_object.id, catalogue_url,
                                            harvest_object.content)
            package_dict.update({"catalogue_url": mainurl})
            package_dict.update({"platform": "socrata"})
            package_dict.update({"language": language})
            if 'notes_rendered' in package_dict:
                package_dict['notes'] = package_dict.pop('notes_rendered')
            # Move the category into the extras, creating the extras
            # container when the adaptor did not provide one.
            if 'category' in package_dict:
                package_dict.setdefault('extras', {}).update(
                    {'category': package_dict.pop('category')})
            log.debug(package_dict)

            document = socrata_db.find_one({"catalogue_url": catalogue_url,
                                            'id': package_dict['id']})
            if document is None:
                package_dict.update({"metadata_created": str(datetime.datetime.now())})
                socrata_db.save(package_dict)
                log.info('Metadata saved succesfully to MongoDb.')
                self._count_fetched_dataset(mainurl, 'new')
            else:
                if 'copied' in document:
                    package_dict.update({'copied': document['copied']})
                # Reuse the stored ``_id`` so the save replaces the existing
                # document, and keep its original creation date.
                package_dict.update({
                    'metadata_created': document['metadata_created'],
                    'metadata_updated': str(datetime.datetime.now()),
                    'updated_dataset': True,
                    '_id': document['_id'],
                })
                socrata_db.save(package_dict)
                log.info('Metadata updated succesfully to MongoDb.')
                self._count_fetched_dataset(mainurl, 'updated')

            # Set default tags if needed
            default_tags = self.config.get('default_tags',[])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups',[])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,harvest_object)

            # ``== True`` is kept on purpose: only True (or 1) enables the
            # read-only mode, other truthy config values do not.
            if result and self.config.get('read_only',False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user',u'harvest')
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor',u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)
            return True

        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                    harvest_object, 'Import')
            log.debug("Validation Error: %s", harvest_object.guid)
            # Explicit falsy result, matching the other failure paths.
            return False

    def _count_fetched_dataset(self, mainurl, counter):
        '''
        Count one imported dataset in the ``db_fetch_temp`` document.

        While the bookkeeping document belongs to ``mainurl`` the requested
        counter is incremented. When it belongs to another catalogue, its
        counters are first copied to that catalogue's ``db_jobs`` document
        (the previous values are kept as ``last_new`` / ``last_updated``)
        and the bookkeeping document is restarted for ``mainurl``.

        :param mainurl: catalogue URL the imported dataset belongs to
        :param counter: ``'new'`` or ``'updated'``
        '''
        fresh_counts = {"new": 0, "updated": 0}
        fresh_counts[counter] = 1

        fetch_document = db_fetch_temp.find_one()
        if fetch_document is None:
            fetch_document = {}
        elif mainurl == fetch_document['cat_url']:
            fetch_document[counter] = fetch_document[counter] + 1
            db_fetch_temp.save(fetch_document)
            return
        else:
            previous_url = fetch_document['cat_url']
            job_doc = db_jobs.find_one({'cat_url': previous_url})
            if job_doc is None:
                # Nothing to copy the counters to; the previous code
                # crashed here with an AttributeError on None.
                log.warning('No job document found for %s', previous_url)
            else:
                changed = False
                for key in ('new', 'updated'):
                    if key not in fetch_document:
                        continue
                    if key in job_doc:
                        job_doc['last_' + key] = job_doc[key]
                    job_doc[key] = fetch_document[key]
                    changed = True
                if changed:
                    db_jobs.save(job_doc)

        # Start counting for the current catalogue.
        fetch_document.update({"cat_url": mainurl})
        fetch_document.update(fresh_counts)
        db_fetch_temp.save(fetch_document)
Esempio n. 5
0
    def import_stage(self,harvest_object):
        '''
        Imports each dataset from Socrata, into the CKAN server

        The harvested content is converted to a package dict with
        ``socrataAdaptor.convertViewXml``, completed with the configured
        default tags and groups and handed to
        ``_create_or_update_package``.

        :param harvest_object: harvest object whose ``content`` holds the
                               dataset to convert
        :returns: ``True`` when the import ran to completion, ``False``
                  when the object is missing, has no content or fails
                  validation
        '''
        log.debug('In SocrataHarvester import_stage')
        if not harvest_object:
            log.error('No harvest object received')
            return False

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id,
                    harvest_object, 'Import')
            return False

        self._set_config(harvest_object.job.source.config)

        log.debug(harvest_object.job.source.config)
        try:
            d = socrataAdaptor()
            log.debug("Converting View")
            package_dict = d.convertViewXml(harvest_object.id,
                                            harvest_object.source.url.rstrip('/'),
                                            harvest_object.content)
            log.debug(package_dict)

            # Set default tags if needed
            default_tags = self.config.get('default_tags',[])
            if default_tags:
                tags = package_dict.setdefault('tags', [])
                tags.extend([t for t in default_tags if t not in tags])

            # Set default groups if needed
            default_groups = self.config.get('default_groups',[])
            if default_groups:
                groups = package_dict.setdefault('groups', [])
                groups.extend([g for g in default_groups if g not in groups])

            log.debug(package_dict)

            result = self._create_or_update_package(package_dict,harvest_object)

            # ``== True`` is kept on purpose: only True (or 1) enables the
            # read-only mode, other truthy config values do not.
            if result and self.config.get('read_only',False) == True:
                package = model.Package.get(package_dict['id'])

                # Clear default permissions
                model.clear_user_roles(package)

                # Setup harvest user as admin
                user_name = self.config.get('user',u'harvest')
                model.PackageRole(package=package,
                                  user=model.User.get(user_name),
                                  role=model.Role.ADMIN)

                # Other users can only read
                for user_name in (u'visitor',u'logged_in'):
                    model.PackageRole(package=package,
                                      user=model.User.get(user_name),
                                      role=model.Role.READER)

            # Report success explicitly; falling off the end returned None,
            # which callers cannot tell apart from the failure paths.
            return True

        except ValidationError as e:
            self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
                    harvest_object, 'Import')
            return False