def gather_stage(self, harvest_job):
    '''
    Gather the dataset identifiers exposed by a Socrata portal and create
    one HarvestObject per identifier.

    The identifiers are listed from the portal's DCAT feed
    (``<source url>/api/dcat.rdf``) via ``socrataAdaptor.listDatasetIds``.
    Identifiers containing the substring "http" are skipped.

    :param harvest_job: the job being gathered; ``harvest_job.source.url``
        is used as the portal base URL
    :returns: a list with the ids of the HarvestObjects that were created,
        or None when no identifiers were received or an error occurred
        (a gather error is saved on the job in both cases)
    '''
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)

    dcat_url = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcat_url)

    try:
        # Listing the ids is done inside the try so that a failing request
        # is recorded as a gather error instead of escaping the stage.
        package_ids = socrataAdaptor().listDatasetIds(dcat_url)
        log.debug('Received %d package ids: %r'
                  % (len(package_ids or ()), package_ids))

        if not package_ids:
            self._save_gather_error(
                'No packages received for URL: %s' % dcat_url, harvest_job)
            return None

        object_ids = []
        for package_id in package_ids:
            # NOTE(review): ids containing "http" are dropped; presumably
            # they are full URLs rather than plain dataset ids - confirm.
            if "http" in package_id:
                continue
            # Create a new HarvestObject for this identifier
            obj = HarvestObject(guid=package_id, job=harvest_job)
            obj.save()
            object_ids.append(obj.id)
        return object_ids
    except Exception as e:
        # `message` only exists on Python 2 exceptions; fall back to str(e)
        # so the handler itself cannot fail on interpreters without it.
        message = getattr(e, 'message', None)
        if message is None:
            message = str(e)
        # The one-element tuple keeps '%r' safe if the message is a tuple.
        self._save_gather_error('%r' % (message,), harvest_job)
        return None
def gather_stage(self, harvest_job):
    '''
    Gather the dataset identifiers of an HTML catalogue and create one
    HarvestObject per identifier.

    The harvesting configuration is the document of the ``html_jobs``
    collection (``client.odm``) whose ``cat_url`` equals the source URL.
    When that document carries a non-empty ``btn_identifier`` the ids are
    collected with ``javascript_case.ParseJavascriptPages``; otherwise
    ``harvester_final.read_data`` is used.

    :param harvest_job: the job being gathered; ``harvest_job.source.url``
        selects the configuration document
    :returns: a list with the ids of the HarvestObjects that were created,
        or None when no configuration or no identifiers were found or an
        error occurred (a gather error is saved on the job in those cases)
    '''
    print('Html Harvest Gather Stage')

    source_url = harvest_job.source.url
    # Forwarded unchanged as the second argument of read_data.
    # NOTE(review): meaning of this value is not visible here - confirm.
    backupi = 0

    try:
        document = client.odm.html_jobs.find_one({"cat_url": source_url})
        if document is None:
            # find_one returns None when nothing matches; report it instead
            # of failing with a TypeError on the first key access.
            self._save_gather_error(
                'No harvest configuration found for URL: %s' % source_url,
                harvest_job)
            return None

        btn_identifier = document.get('btn_identifier')
        if btn_identifier is not None and btn_identifier != '':
            package_ids = javascript_case.ParseJavascriptPages(
                document['cat_url'],
                document['identifier'],
                btn_identifier,
                document['action_type'],
                # Default pause of 3 when the document does not set one.
                document.get('sleep_time', 3))
        else:
            package_ids = harvester_final.read_data(document['_id'], backupi)
        print(package_ids)

        if not package_ids:
            self._save_gather_error(
                'No packages received for URL: %s' % source_url, harvest_job)
            return None

        object_ids = []
        for package_id in package_ids:
            # Create a new HarvestObject for this identifier
            obj = HarvestObject(guid=package_id, job=harvest_job)
            obj.save()
            object_ids.append(obj.id)
        return object_ids
    except Exception as e:
        # `message` only exists on Python 2 exceptions; fall back to str(e)
        # so the handler itself cannot fail on interpreters without it.
        message = getattr(e, 'message', None)
        if message is None:
            message = str(e)
        # The one-element tuple keeps '%r' safe if the message is a tuple.
        self._save_gather_error('%r' % (message,), harvest_job)
        return None
def _create_harvest_objects(self, remote_ids, harvest_job):
    '''
    Given a list of remote ids and a Harvest Job, create as many Harvest
    Objects and return a list of their ids to be passed to the fetch stage.

    :param remote_ids: iterable of remote dataset identifiers; each one
        becomes the ``guid`` of a new HarvestObject
    :param harvest_job: the job the new HarvestObjects are attached to
    :returns: a list with the ids of the created HarvestObjects, or None
        when ``remote_ids`` is empty/None or an error occurred (a gather
        error is saved on the job in those cases)

    TODO: Not sure it is worth keeping this function
    '''
    try:
        if not remote_ids:
            self._save_gather_error(
                'No remote datasets could be identified', harvest_job)
            return None

        object_ids = []
        for remote_id in remote_ids:
            # Create a new HarvestObject for this identifier
            obj = HarvestObject(guid=remote_id, job=harvest_job)
            obj.save()
            object_ids.append(obj.id)
        return object_ids
    except Exception as e:
        # `message` only exists on Python 2 exceptions; fall back to str(e)
        # so the handler itself cannot fail on interpreters without it.
        message = getattr(e, 'message', None)
        if message is None:
            message = str(e)
        # The one-element tuple keeps '%r' safe if the message is a tuple.
        self._save_gather_error('%r' % (message,), harvest_job)
        return None