def fetch_stage(self, harvest_object):
    # Check harvest object status
    status = self._get_object_extra(harvest_object, 'status')

    if status == 'delete':
        # No need to fetch anything, just pass to the import stage
        return True

    # We need to fetch the remote document

    # Get location
    url = self._get_object_extra(harvest_object, 'waf_location')
    if not url:
        self._save_object_error(
            'No location defined for object {0}'.format(harvest_object.id),
            harvest_object)
        return False

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
        self._save_object_error(msg, harvest_object)
        return False

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
        harvest_object.save()
    else:
        extra = HOExtra(
            object=harvest_object,
            key='original_document',
            value=content)
        extra.save()

        extra = HOExtra(
            object=harvest_object,
            key='original_format',
            value=document_format)
        extra.save()

    return True
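# fetch_stage() reads per-object values through the harvester's
# _get_object_extra() helper. A minimal sketch of what such a helper can look
# like, assuming the harvest object keeps its extras as a list of key/value
# records; shown as a free function for illustration, not the library's
# actual implementation:
def _get_object_extra(harvest_object, key):
    # Return the value of the first matching extra, or None if absent
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None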
import csv

import requests

# Note: extract_waf() and guess_standard() are assumed to be defined
# elsewhere in this module.


def add_status():
    records = open('wafurls.txt')
    results = open('wafurlsstatus.txt', 'w+')

    headers = 'count,count_with_date,server,status_code,error,standard,id,unapproved,url'
    results.write(headers + '\n')
    writer = csv.DictWriter(results, headers.split(','))

    for row in records:
        # Each wafurls.txt record is whitespace-separated: id, unapproved, url
        row_dict = dict(zip('id unapproved url'.split(), row.split()))
        try:
            response = requests.get(row_dict['url'], timeout=60)
            content = response.content
            server = str(response.headers.get('server'))
            # Choose a scraper based on the advertised server software
            if server == 'Microsoft-IIS/7.5':
                scraper = 'iis'
            elif ('apache' in server.lower() or 'nginx' in server.lower()
                    or not response.headers.get('server')):
                scraper = 'apache'
            else:
                scraper = 'other'
            row_dict['status_code'] = str(response.status_code)
            row_dict['server'] = server
            if content and response.status_code == 200:
                extracted_waf = extract_waf(content, row_dict['url'], scraper)
                row_dict['count'] = str(len(extracted_waf))
                row_dict['count_with_date'] = str(
                    len([i for i in extracted_waf if i[1]]))
                if extracted_waf:
                    # Fetch the first listed document and guess its standard
                    try:
                        content_doc = requests.get(
                            extracted_waf[0][0], timeout=60).content
                        standard = guess_standard(content_doc)
                        row_dict['standard'] = standard
                    except Exception as e:
                        print('Error guessing format. Error is', e)
            else:
                row_dict['count'] = "0"
                row_dict['count_with_date'] = "0"
        except Exception as e:
            # The original snippet breaks off here; recording the error and
            # writing out the row is an assumed completion.
            row_dict['error'] = str(e)
        writer.writerow(row_dict)
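# A quick illustration of how add_status() splits each wafurls.txt record
# into the id/unapproved/url fields; the sample line below is made up:
line = '42 0 http://example.com/waf/'
row_dict = dict(zip('id unapproved url'.split(), line.split()))
print(row_dict)
# -> {'id': '42', 'unapproved': '0', 'url': 'http://example.com/waf/'}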
class GeoDataGovGeoportalHarvester(CSWHarvester, GeoDataGovHarvester):
    '''
    A Harvester for CSW servers, with customizations for geo.data.gov
    '''
    def info(self):
        return {
            'name': 'geoportal',
            'title': 'Geoportal Server',
            'description': 'A Geoportal Server CSW endpoint',
        }

    def output_schema(self):
        return 'csw'

    def fetch_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.geoportal.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url
        identifier = harvest_object.guid

        # Rebuild the endpoint URL: drop the last two path segments of the
        # CSW endpoint and append Geoportal's REST document endpoint
        parts = urlparse.urlparse(url)
        url = urlparse.urlunparse((
            parts.scheme,
            parts.netloc,
            '/'.join(parts.path.rstrip('/').split('/')[:-2]),
            None, None, None))
        url = url.rstrip('/') + '/rest/document?id=%s' % identifier

        try:
            response = requests.get(url)
            content = response.content
        except Exception:
            self._save_object_error(
                'Error getting the record with GUID %s from %s' %
                (identifier, url), harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            # Contents come from csw_client already declared and encoded as utf-8
            # Remove original XML declaration
            content = re.sub(r'<\?xml(.*)\?>', '', content)

            document_format = guess_standard(content)
            if document_format == 'iso':
                harvest_object.content = content
                harvest_object.save()
            elif document_format == 'fgdc':
                extra = HOExtra(
                    object=harvest_object,
                    key='original_document',
                    value=content)
                extra.save()

                extra = HOExtra(
                    object=harvest_object,
                    key='original_format',
                    value=document_format)
                extra.save()
            else:
                harvest_object.report_status = 'ignored'
                harvest_object.save()
                return False
        except Exception as e:
            self._save_object_error(
                'Error saving the harvest object for GUID %s [%r]' %
                (identifier, e), harvest_object)
            return False

        # A fetch stage must return True for the import stage to run; the
        # original snippet is truncated before this point.
        return True
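# The URL rewriting in fetch_stage() drops the last two path segments of the
# CSW endpoint and appends Geoportal's REST document endpoint. A standalone
# trace of that logic (the endpoint URL and GUID are made up):
from urllib import parse as urlparse

csw_url = 'http://gpt.example.com/geoportal/csw/discovery'
parts = urlparse.urlparse(csw_url)
base = urlparse.urlunparse((
    parts.scheme, parts.netloc,
    '/'.join(parts.path.rstrip('/').split('/')[:-2]),
    None, None, None))
doc_url = base.rstrip('/') + '/rest/document?id=%s' % 'abc-123'
print(doc_url)
# -> http://gpt.example.com/geoportal/rest/document?id=abc-123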
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''A Harvester for individual spatial metadata documents
    TODO: Move to new logic
    '''
    implements(IHarvester)

    def info(self):
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document',
        }

    def get_original_url(self, harvest_object_id):
        '''Return the source URL for the given harvest object, or None.'''
        obj = model.Session.query(HarvestObject).filter(
            HarvestObject.id == harvest_object_id).first()
        if not obj:
            return None
        return obj.source.url

    def gather_stage(self, harvest_job):
        '''Fetch the single source document and create one harvest object.'''
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error(
                'Unable to get content for URL: %s: %r' % (url, e),
                harvest_job)
            return None

        # Use == rather than 'is' so SQLAlchemy builds the SQL comparison
        existing_object = model.Session.query(
            HarvestObject.guid, HarvestObject.package_id).filter(
            HarvestObject.current == True).filter(
            HarvestObject.harvest_source_id == harvest_job.source.id).first()

        def create_extras(url, status):
            return [HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'new'),
                guid=guid)
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
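# The GUID of a new single-doc harvest object is simply the md5 of the source
# URL, so re-harvesting the same URL is recognised as a change to the same
# object rather than a new one. For example (illustrative URL):
import hashlib

url = 'http://example.com/metadata/dataset.xml'
print(hashlib.md5(url.encode('utf8', 'ignore')).hexdigest())
# -> a stable 32-character hex digest derived from the URL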