def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)

    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)
    log.debug('Received %d package ids' % len(package_ids))

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                if "http" not in package_id:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % dcatUrl, harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
def harvest_object_create(context, data_dict):
    """ Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(
        data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[HarvestObjectExtra(key=k, value=v)
                for k, v in data.get('extras', {}).items()]
    )
    obj.save()
    return harvest_object_dictize(obj, context)
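# Hypothetical usage sketch for the action above: calling it from extension
# code through CKAN's action API. The guid and job id values are
# placeholders, not real identifiers.
import ckan.plugins.toolkit as toolkit

harvest_object = toolkit.get_action('harvest_object_create')(
    {'ignore_auth': True},
    {
        'guid': 'remote-dataset-42',      # placeholder remote identifier
        'job_id': 'some-harvest-job-id',  # must reference an existing job
        'extras': {'status': 'new'},
    })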
def gather_stage(self, harvest_job):
    print('Html Harvest Gather Stage')
    db2 = client.odm
    collection = db2.html_jobs
    backupi = 0

    ## Get source URL
    source_url = harvest_job.source.url

    ## mongoDb connection
    document = collection.find_one({"cat_url": source_url})
    id1 = document['_id']

    if 'btn_identifier' in document.keys():
        if document['btn_identifier'] is not None and document['btn_identifier'] != '':
            cat_url = document['cat_url']
            dataset_identifier = document['identifier']
            btn_identifier = document['btn_identifier']
            action_type = document['action_type']
            try:
                sleep_time = document['sleep_time']
            except KeyError:
                sleep_time = 3
            # Crawl pages that require javascript interaction
            package_ids = javascript_case.ParseJavascriptPages(
                cat_url, dataset_identifier, btn_identifier,
                action_type, sleep_time)
            print(package_ids)
        else:
            package_ids = harvester_final.read_data(id1, backupi)
    else:
        package_ids = harvester_final.read_data(id1, backupi)
    print(package_ids)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % source_url, harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
def gather_stage(self, harvest_job): print("Html Harvest Gather Stage") db2 = client.odm collection = db2.html_jobs backupi = 0 ## Get source URL source_url = harvest_job.source.url ## mongoDb connection document = collection.find_one({"cat_url": source_url}) id1 = document["_id"] if "btn_identifier" in document.keys(): if document["btn_identifier"] != None and document["btn_identifier"] != "": cat_url = document["cat_url"] dataset_identifier = document["identifier"] btn_identifier = document["btn_identifier"] action_type = document["action_type"] try: sleep_time = document["sleep_time"] except: sleep_time = 3 package_ids = javascript_case.ParseJavascriptPages( cat_url, dataset_identifier, btn_identifier, action_type, sleep_time ) print(package_ids) else: package_ids = harvester_final.read_data(id1, backupi) else: package_ids = harvester_final.read_data(id1, backupi) print(package_ids) # print(len(package_ids)) # package_ids=[] # package_ids.append('http://data.belgium.be/dataset/mortality-tables-gender') # package_ids.append('test') try: object_ids = [] if len(package_ids): for package_id in package_ids: # Create a new HarvestObject for this identifier obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error("No packages received for URL: %s" % source_url, harvest_job) return None except Exception, e: self._save_gather_error("%r" % e.message, harvest_job)
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    obj = HarvestObject.get(id)
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # html jobs are exempt from the retry counter
    if 'html_job:' not in str(id):
        obj.retry_times += 1
        obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
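# Minimal sketch of the message shape the callback above consumes, assuming
# a pika BlockingConnection; the routing key 'ckan.harvest.fetch' is a
# placeholder, not necessarily the queue the harvester actually binds to.
import json
import pika

connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.basic_publish(
    exchange='',                       # default exchange (assumption)
    routing_key='ckan.harvest.fetch',  # placeholder queue name
    body=json.dumps({'harvest_object_id': 'some-harvest-object-id'}))
connection.close()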
def harvest_object_show(context, data_dict):
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')
    if id:
        attr = data_dict.get('attr', None)
        obj = HarvestObject.get(id, attr=attr)
    elif dataset_id:
        model = context['model']
        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')
        obj = model.Session.query(HarvestObject) \
            .filter(HarvestObject.package_id == pkg.id) \
            .filter(HarvestObject.current) \
            .first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
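# Hypothetical usage sketch: harvest_object_show accepts either a harvest
# object id or a dataset id; both values below are placeholders.
import ckan.plugins.toolkit as toolkit

by_object_id = toolkit.get_action('harvest_object_show')(
    {}, {'id': 'some-harvest-object-id'})
by_dataset_id = toolkit.get_action('harvest_object_show')(
    {}, {'dataset_id': 'some-dataset-name-or-id'})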
def gather_stage(self, harvest_job):
    if harvest_job.source.url.startswith('basic_test'):
        obj = HarvestObject(guid='test1', job=harvest_job)
        obj.extras.append(HarvestObjectExtra(key='key', value='value'))
        obj2 = HarvestObject(guid='test2', job=harvest_job)
        obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
        obj.add()
        obj2.add()
        obj3.save()  # this commits obj3 together with the two objects added above
        return [obj.id, obj2.id, obj3.id]
    return []
def _create_harvest_objects(self, remote_ids, harvest_job):
    '''
    Given a list of remote ids and a Harvest Job, create as many Harvest
    Objects and return a list of their ids to be passed to the fetch stage.

    TODO: Not sure it is worth keeping this function
    '''
    try:
        object_ids = []
        if len(remote_ids):
            for remote_id in remote_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=remote_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No remote datasets could be identified', harvest_job)
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
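# Sketch of how a harvester's gather stage could delegate to the helper
# above; fetch_remote_ids() is a hypothetical stand-in for whatever call
# lists the dataset identifiers exposed by the remote catalogue.
def gather_stage(self, harvest_job):
    remote_ids = fetch_remote_ids(harvest_job.source.url)  # assumption
    return self._create_harvest_objects(remote_ids, harvest_job)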
def gather_stage(self, harvest_job):
    log.debug('In CustomHarvester gather_stage (%s)' % harvest_job.source.url)
    db = client.odm
    db_jobs = db.jobs
    config = db_jobs.find_one({"cat_url": harvest_job.source.url})
    datasets_list_url = config['datasets_list_url']
    datasets_list_identifier = config['datasets_list_identifier']
    dataset_id = config['dataset_id']
    api_key = config['apikey']

    # data.norge.no paginates its DCAT feed, so every page must be fetched
    if "data.norge.no" in harvest_job.source.url.rstrip('/'):
        many_datasets_list = ['/api/dcat/data.json?page=1',
                              '/api/dcat/data.json?page=2',
                              '/api/dcat/data.json?page=3',
                              '/api/dcat/data.json?page=4']
    else:
        many_datasets_list = [datasets_list_url]

    all_datasets = []
    for path in many_datasets_list:
        url = harvest_job.source.url.rstrip('/') + path.replace('{api}', api_key)
        log.debug(url)
        result = urllib2.urlopen(url)
        try:
            datasets = json.load(result)
            if datasets_list_identifier != "":
                datasets = datasets[datasets_list_identifier]
        except:
            try:
                # Retry with an explicit Accept header
                headers = {'Accept': 'application/json'}
                r = urllib2.Request(url, headers=headers)
                datasets = json.loads(urllib2.urlopen(r).read())
                if datasets_list_identifier != "":
                    datasets = datasets[datasets_list_identifier]
            except:
                # Fall back to JSONP-style responses: strip the padding and
                # evaluate the remaining literal, which assigns `datasets`
                result = urllib2.urlopen(url)
                read = result.read()
                read = read.replace("null(", "datasets=").rstrip(')')
                exec(read)
        all_datasets.extend(datasets)

    package_ids = [dataset[dataset_id] for dataset in all_datasets]

    ## load existing dataset ids from mongoDb
    datasets = list(custom_db.find({'catalogue_url': harvest_job.source.url}))
    datasets_ids = [d['id'] for d in datasets]

    ## check for deleted datasets that exist in mongo: any stored id that is
    ## no longer present remotely belongs to a deleted dataset
    for package_id in package_ids:
        if package_id in datasets_ids:
            datasets_ids.remove(package_id)

    for deleted_id in datasets_ids:
        for document in datasets:
            if deleted_id in document['id']:
                document.update({"deleted_dataset": True})
                custom_db.save(document)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In OpendatasoftHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    url = harvest_job.source.url.rstrip('/') + "/api/datasets/1.0/search/?rows=100000000"
    result = json.load(urllib2.urlopen(url))
    datasets = result['datasets']
    package_ids = [dataset['datasetid'] for dataset in datasets]

    ## load existing dataset ids from mongoDb
    datasets = list(opendatasoft_db.find({'catalogue_url': harvest_job.source.url}))
    datasets_ids = [d['id'] for d in datasets]

    ## check for deleted datasets that exist in mongo: any stored id that is
    ## no longer present remotely belongs to a deleted dataset
    for package_id in package_ids:
        if package_id in datasets_ids:
            datasets_ids.remove(package_id)

    for deleted_id in datasets_ids:
        for document in datasets:
            if deleted_id in document['id']:
                document.update({"deleted_dataset": True})
                opendatasoft_db.save(document)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % url, harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)

    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)

    ## load existing dataset names and ids from mongoDb
    datasets = list(socrata_db.find({'catalogue_url': harvest_job.source.url.rstrip('/')}))
    datasets_ids = [d['id'] for d in datasets]
    datasets_names = [d['name'] for d in datasets]

    ## check for deleted datasets that exist in mongo: remove every id/name
    ## still present remotely; whatever remains was deleted on the catalogue
    for package_id in package_ids:
        if package_id in datasets_ids:
            datasets_ids.remove(package_id)
        if package_id in datasets_names:
            datasets_names.remove(package_id)

    if len(datasets_names) < len(datasets_ids):
        for name in datasets_names:
            for document in datasets:
                if name in document['name']:
                    document.update({"deleted_dataset": True})
                    socrata_db.save(document)
    else:
        for deleted_id in datasets_ids:
            for document in datasets:
                if deleted_id in document['id']:
                    document.update({"deleted_dataset": True})
                    socrata_db.save(document)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                if "http" not in package_id:
                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=package_id, job=harvest_job)
                    obj.save()
                    object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % dcatUrl, harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
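# Side note: the deletion scan above is quadratic in the number of datasets.
# A set-difference sketch with the same outcome, assuming exact string ids
# rather than the substring test used above (hypothetical refactor):
remote_ids = set(package_ids)
stored_ids = set(d['id'] for d in datasets)
ids_of_deleted = stored_ids - remote_ids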
def gather_stage(self, harvest_job):
    print('Html Harvest Gather Stage')
    db2 = client.odm
    odm = db2.odm
    collection = db2.html_jobs
    backupi = 0

    # Derive the catalogue's base URL (scheme + host) from the source URL
    mainurl = ""
    ctlg_url = harvest_job.source.url
    if 'http://' in ctlg_url:
        mainurl1 = ctlg_url[ctlg_url.find('http://') + 7:]
        mainurl = 'http://' + mainurl1[0:mainurl1.find('/')]
    if 'https://' in ctlg_url:
        mainurl1 = ctlg_url[ctlg_url.find('https://') + 8:]
        mainurl = 'https://' + mainurl1[0:mainurl1.find('/')]

    ## load existing dataset urls from mongoDb
    datasets = list(odm.find({'catalogue_url': mainurl}))
    if len(datasets) == 0:
        datasets = list(odm.find({'catalogue_url': harvest_job.source.url.rstrip('/') + '/'}))
    datasets_ids = [d['url'] for d in datasets]

    ## Get source URL
    source_url = harvest_job.source.url

    ## mongoDb connection
    document = collection.find_one({"cat_url": source_url})
    id1 = document['_id']

    if 'btn_identifier' in document.keys():
        if document['btn_identifier'] is not None and document['btn_identifier'] != '':
            cat_url = document['cat_url']
            dataset_identifier = document['identifier']
            btn_identifier = document['btn_identifier']
            action_type = document['action_type']
            try:
                sleep_time = document['sleep_time']
            except KeyError:
                sleep_time = 3
            # Crawl pages that require javascript interaction
            package_ids = javascript_case.ParseJavascriptPages(
                cat_url, dataset_identifier, btn_identifier,
                action_type, sleep_time)
        else:
            package_ids = harvester_final.read_data(id1, backupi)
    else:
        package_ids = harvester_final.read_data(id1, backupi)

    ## check for deleted datasets that exist in mongo: any stored url that is
    ## no longer present remotely belongs to a deleted dataset
    if len(package_ids) > 0:
        for package_id in package_ids:
            if package_id in datasets_ids:
                datasets_ids.remove(package_id)
        for deleted_url in datasets_ids:
            for document in datasets:
                if deleted_url == document['url']:
                    document.update({"deleted_dataset": True})
                    odm.save(document)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % source_url, harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
            # if document == None:
            #     document = odm.find_one({"catalogue_url": str(base_url) + "/",
            #                              "id": datasets_ids[j]})
            #     document.update({"deleted_dataset": True})
            #     odm.save(document)
            j += 1

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error(
                'No packages received for URL: %s' % harvest_job.source.url,
                harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)

def fetch_stage(self, harvest_object):
    log.debug('In CKANHarvester fetch_stage')