def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('z3950Harvester gather_stage for job: %r', harvest_job)
    self.harvest_job = harvest_job

    # Get source URL
    source_url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get current objects out of the db
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = dict((res[0], res[1]) for res in query)
    current_guids = set(guid_to_package_id.keys())
    current_guids_in_harvest = set()

    # Get contents
    try:
        conn = zoom.Connection(source_url,
                               int(self.source_config.get('port', 210)))
        conn.databaseName = self.source_config.get('database', '')
        conn.preferredRecordSyntax = 'XML'
        conn.elementSetName = 'T'
        query = zoom.Query('CCL', 'metadata')
        res = conn.search(query)
        ids = []
        for num, result in enumerate(res):
            hash = hashlib.md5(result.data).hexdigest()
            if hash in current_guids:
                current_guids_in_harvest.add(hash)
            else:
                obj = HarvestObject(
                    job=harvest_job,
                    guid=hash,
                    extras=[
                        HOExtra(key='status', value='new'),
                        HOExtra(key='original_document',
                                value=result.data.decode('latin-1')),
                        HOExtra(key='original_format', value='fgdc')
                    ])
                obj.save()
                ids.append(obj.id)
        for guid in (current_guids - current_guids_in_harvest):
            obj = HarvestObject(
                job=harvest_job,
                guid=guid,
                package_id=guid_to_package_id[guid],
                extras=[HOExtra(key='status', value='delete')])
            obj.save()
            ids.append(obj.id)
        return ids
    except Exception as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (source_url, e), harvest_job)
        return None
def _gather_entry(self, entry, auth=None):
    # Create a harvest object for each entry
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package_query = Session.query(Package)
    query_filtered = package_query.filter(Package.name == entry_name)
    package = query_filtered.first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(
                entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))
            status = 'unchanged'

        obj = HarvestObject(guid=entry_guid,
                            job=self.job,
                            extras=[
                                HOExtra(key='status', value=status),
                                HOExtra(key='restart_date',
                                        value=entry_restart_date)
                            ])
        obj.content = entry['content']
        obj.package = package
        obj.save()
        return obj.id
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. '
                  'Creating a new harvest object.'.format(entry_name))
        obj = HarvestObject(guid=entry_guid,
                            job=self.job,
                            extras=[
                                HOExtra(key='status', value='new'),
                                HOExtra(key='restart_date',
                                        value=entry_restart_date)
                            ])
        obj.content = entry['content']
        obj.package = None
        obj.save()
        return obj.id
def create_extras(url, status):
    '''
    :param url:
    :param status:
    '''
    return [HOExtra(key=u'doc_location', value=url),
            HOExtra(key=u'status', value=status)]
def create_extras(url, date, status):
    # Note: `collection_package_id` is expected to be available from the
    # enclosing scope (this helper is nested inside the WAF collection
    # gather stage).
    extras = [HOExtra(key='waf_modified_date', value=date),
              HOExtra(key='waf_location', value=url),
              HOExtra(key='status', value=status)]
    if collection_package_id:
        extras.append(
            HOExtra(key='collection_package_id',
                    value=collection_package_id)
        )
    return extras
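# Sketch only: extras like 'waf_location' and 'status' created above are read
# back later through the harvester base-class helper _get_object_extra(), as
# fetch_stage() further down does. The real helper is provided by
# ckanext-spatial/ckanext-harvest; this is an assumed, simplified equivalent
# for context.
def _get_object_extra_sketch(harvest_object, key):
    # Return the value of the first extra matching `key`, or None.
    for extra in harvest_object.extras:
        if extra.key == key:
            return extra.value
    return None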
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.ITagEnricher.gather')
    log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

    # Save a reference
    self.job = harvest_job

    self._set_source_config(self.job.source.config)

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    org_id = model.Package.get(harvest_job.source.id).owner_org
    organization = logic.get_action('organization_show')(context,
                                                         {'id': org_id})

    # Exclude Sentinel-3 because it seems like iTag can't handle the curved
    # footprints.
    filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(
        organization['name'])  # noqa: E501

    ids = []

    # We'll limit this to 10 datasets per job so that results appear faster.
    start = 0
    rows = self.source_config.get('datasets_per_job', 10)
    untagged = logic.get_action('package_search')(context, {
        'fq': filter_query,
        'rows': rows,
        'start': start
    })
    results = untagged['results']

    for result in results:
        spatial = None
        for i in result['extras']:
            if i['key'] == 'spatial':
                spatial = i['value']
        if spatial:
            obj = HarvestObject(
                guid=result['id'],
                job=self.job,
                extras=[
                    HOExtra(key='status', value='change'),
                    HOExtra(key='spatial', value=spatial),
                    HOExtra(key='package', value=json.dumps(result))
                ])
            obj.save()
            ids.append(obj.id)

    return ids
def _gather_object(self, job, url, size, start_date, forecast_date):
    filename = parse_filename(url)
    filename_id = (filename.replace('-v02.0-fv02.0', '')
                   .replace('-fv02.0', '')
                   .replace('-sv01.00', '')
                   .replace('-sv05.00', '')
                   .replace('-v02', '')
                   .replace('-sv10.00', '')
                   .replace('-sv09.00', '')
                   .replace('-sv07.00', ''))

    status, package = self._was_harvested(filename_id, self.update_all)

    extras = [HOExtra(key='status', value=status)]

    assert start_date
    content = json.dumps(
        {
            'identifier': filename_id,
            'ftp_link': url,
            'size': size,
            'start_date': start_date,
            'forecast_date': forecast_date,
            'restart_date': start_date
        },
        default=str)

    obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
    obj.package = package
    obj.save()
    return obj.id
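# Sketch only: the import stage for this harvester is not shown in this
# section. It is assumed to decode the JSON stored in obj.content above and
# map the keys onto a package dict, roughly as below (key names mirror
# _gather_object; the mapping itself is illustrative).
import json


def _read_gathered_content_sketch(harvest_object):
    entry = json.loads(harvest_object.content)
    return {
        'name': entry['identifier'],
        'url': entry['ftp_link'],
        'size': entry['size'],
        'temporal_start': entry['start_date'],
    }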
def fetch_stage(self, harvest_object):
    # Check harvest object status
    status = self._get_object_extra(harvest_object, 'status')

    if status == 'delete':
        # No need to fetch anything, just pass to the import stage
        return True

    # We need to fetch the remote document

    # Get location
    url = self._get_object_extra(harvest_object, 'waf_location')
    if not url:
        self._save_object_error(
            'No location defined for object {0}'.format(harvest_object.id),
            harvest_object)
        return False

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
        self._save_object_error(msg, harvest_object)
        return False

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
        harvest_object.save()
    else:
        extra = HOExtra(
            object=harvest_object,
            key='original_document',
            value=content)
        extra.save()

        extra = HOExtra(
            object=harvest_object,
            key='original_format',
            value=document_format)
        extra.save()

    return True
def _create_harvest_object(self, content_dict):
    extras = [
        HOExtra(key='status', value='new'),
        HOExtra(key='restart_date', value=content_dict['date_string'])
    ]

    # The NextGEOSS harvester flow requires content in the import stage.
    content = json.dumps(content_dict)

    obj = HarvestObject(job=self.job,
                        guid=content_dict['identifier'],
                        extras=extras,
                        content=content)
    obj.save()
    return obj.id
def _create_object(self, ebv_type, dataset_info):
    extras = [HOExtra(key='status', value='new')]

    if ebv_type == 'tree_species':
        collectionID = 'TREE_SPECIES_DISTRIBUTION_HABITAT_SUITABILITY'
        collection_name = 'Tree Species Distribution Habitat Suitability'
        collection_description = 'European distribution of the tree species for the years 2000 (Habitat Suitability baseline), 2020, 2050 and 2080 (Habitat Suitability future), based on different models such as ENS, CCCMA, CSIRO, HADCM3.'  # noqa: E501
    elif ebv_type == 'flood_hazards':
        collectionID = 'FLOOD_HAZARD_EU_GL'
        collection_name = 'Flood Hazard Europe/Global'
        collection_description = 'The maps depict flood prone areas at global/European scale for flood events. Resolution is 30 arcseconds (approx. 1km). Cell values indicate water depth (in m). The map can be used to assess flood exposure and risk of population and assets. NOTE: this dataset is based on JRC elaborations and is not an official flood hazard map.'  # noqa: E501

    title = dataset_info[0]
    description = dataset_info[1]
    start_date = dataset_info[2]
    end_date = dataset_info[3]
    spatial = dataset_info[4]
    filename = dataset_info[5]
    identifier = dataset_info[6]
    download_url = dataset_info[7]
    tags = dataset_info[8]

    content = json.dumps(
        {
            'collectionID': collectionID,
            'title': title,
            'description': description,
            'start_date': start_date,
            'end_date': end_date,
            'identifier': identifier,
            'downloadURL': download_url,
            'spatial': spatial,
            'filename': filename,
            'collection_name': collection_name,
            'collection_description': collection_description,
            'tags': tags
        },
        default=str)

    obj = HarvestObject(job=self.job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=content)
    obj.save()
    return obj.id
def _create_object(self, sensor, dataset_info):
    extras = [HOExtra(key='status', value='new')]

    if sensor == 'avhrr':
        collectionID = 'LAI_1KM_AVHRR_8DAYS_GL'
    elif sensor == 'modis':
        collectionID = 'LAI_1KM_MODIS_8DAYS_GL'

    title = dataset_info[0]
    description = dataset_info[1]
    start_date = dataset_info[2]
    end_date = dataset_info[3]
    spatial = dataset_info[4]
    filename = dataset_info[5]
    identifier = dataset_info[6]
    downloadURL = dataset_info[7]
    thumbnailURL = dataset_info[8]
    metadataURL = dataset_info[9]
    tags = dataset_info[10]

    content = json.dumps(
        {
            'collectionID': collectionID,
            'title': title,
            'description': description,
            'start_date': start_date,
            'end_date': end_date,
            'identifier': identifier,
            'downloadURL': downloadURL,
            'thumbnailURL': thumbnailURL,
            'metadataURL': metadataURL,
            'spatial': spatial,
            'filename': filename,
            'tags': tags
        },
        default=str)

    obj = HarvestObject(job=self.job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=content)
    obj.save()
    return obj.id
def _gather_object(self, job, url, start_date):
    filename = parse_filename(url)
    filename_id = filename

    status, package = self._was_harvested(filename_id, self.update_all)

    extras = [HOExtra(key='status', value=status)]

    assert start_date
    content = json.dumps({
        'identifier': filename_id,
        'http_link': url,
        'start_date': start_date,
        'restart_date': start_date
    }, default=str)

    obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
    obj.package = package
    obj.save()
    return obj.id
def _gather_object(self, job, product, resources, manifest_content,
                   last_harvest_date):
    name = parse_filename(product).lower()
    status, package = self._was_harvested(name, self.update_all)

    extras = [HOExtra(key='status', value=status)]

    content = json.dumps(
        {
            'name': name,
            'restart_date': last_harvest_date.strftime('%Y-%m-%d'),
            'manifest_content': manifest_content,
            'resources': resources
        },
        default=str)

    obj = HarvestObject(job=job,
                        guid=unicode(uuid.uuid4()),
                        extras=extras,
                        content=content)
    obj.package = package
    obj.save()
    return obj.id
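# Sketch only: several gather stages below resume from extras stored on the
# previous harvest (e.g. _get_last_harvesting_index in the SatcenBetter and
# OSCAR harvesters). That helper is not part of this section; one plausible
# shape for a 'restart_date' lookup, mirroring the HOExtra join used by the
# ArcGIS harvester further down, would be:
def _get_last_restart_date_sketch(source_id):
    query = model.Session.query(HarvestObject.guid, HOExtra.value).\
        filter(HarvestObject.current == True).\
        join(HOExtra, HarvestObject.extras).\
        filter(HOExtra.key == 'restart_date').\
        filter(HarvestObject.harvest_source_id == source_id)  # noqa: E712
    dates = sorted(value for _, value in query)
    return dates[-1] if dates else None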
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.AKANA.gather')
    log.info('Akana gather_stage for job: %r', harvest_job)

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    # Get the current objects' guids and map them to package ids
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id). \
        filter(HarvestObject.current == True). \
        filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = guid_to_package_id.keys()

    # Get the Akana IDs and contents: request the objects from Akana based
    # on a tag search.
    url = harvest_job.source.url
    pa = PingAuth(environment=pingi_env)
    resp = pa.get(url)
    resp_dict = json.loads(resp.content)

    if resp.status_code == 200:
        try:
            ids = []
            obid = []
            x = 0
            for api in resp_dict:
                uuid = api['api-id'] + api['swagger']['info']['version'] \
                    + harvest_job.source_id
                ids.append(uuid)
                json_api = json.dumps(api)
                if uuid in guids_in_db:
                    log.info(
                        "This package is already in ckan and is going to be updated: %r",
                        uuid)
                    status = "update"
                else:
                    log.info("This package is being created: %r", uuid)
                    status = "new"
                obj = HarvestObject(
                    guid=ids[x],
                    job=harvest_job,
                    extras=[HOExtra(key='status', value=status)],
                    content=json_api)
                obj.save()
                obid.append(obj.id)
                x += 1

            obj_del = list(set(guids_in_db) - set(ids))
            if obj_del:
                for uuid in obj_del:
                    log.info("This package is being deleted: %r", uuid)
                    obj = HarvestObject(
                        guid=uuid,
                        job=harvest_job,
                        extras=[HOExtra(key='status', value="delete")],
                        content='')
                    model.Session.query(HarvestObject). \
                        filter_by(guid=uuid). \
                        update({'current': False}, False)
                    obj.save()
                    obid.append(obj.id)

            # Return the list of harvest object ids created above
            return obid
        except Exception as e:
            log.error('Exception: %s' % e)
            self._save_gather_error(
                'Error gathering the identifiers from the AKANA server [%s]'
                % str(e), harvest_job)
            return None
class ArcGISHarvester(SpatialHarvester, SingletonPlugin):

    implements(IHarvester)

    extent_template = Template('''
    {"type": "Polygon", "coordinates": [[[$minx, $miny], [$minx, $maxy], [$maxx, $maxy], [$maxx, $miny], [$minx, $miny]]]}
    ''')

    def info(self):
        '''
        Harvesting implementations must provide this method, which will
        return a dictionary containing different descriptors of the
        harvester. The returned dictionary should contain:

        * name: machine-readable name. This will be the value stored in the
          database, and the one used by ckanext-harvest to call the
          appropriate harvester.
        * title: human-readable name. This will appear in the form's select
          box in the WUI.
        * description: a small description of what the harvester does. This
          will appear on the form as a guidance to the user.

        A complete example may be::

            {
                'name': 'csw',
                'title': 'CSW Server',
                'description': 'A server that implements OGC's Catalog
                                Service for the Web (CSW) standard'
            }

        returns: A dictionary with the harvester descriptors
        '''
        return {
            'name': 'arcgis',
            'title': 'ArcGIS REST API',
            'description': 'An ArcGIS REST API endpoint'
        }

    def extra_schema(self):
        return {
            'private_datasets': [ignore_empty, boolean_validator],
            'extra_search_criteria': [ignore_empty, unicode],
        }

    def gather_stage(self, harvest_job):
        self.harvest_job = harvest_job
        source_url = harvest_job.source.url
        source_config = json.loads(harvest_job.source.config or '{}')
        extra_search_criteria = source_config.get('extra_search_criteria')

        num = 100
        modified_from = 0
        modified_to = 999999999999999999

        query_template = 'modified:[{modified_from}+TO+{modified_to}]'

        if extra_search_criteria:
            query_template = query_template + ' AND (%s)' % extra_search_criteria  # accountid:0123456789ABCDEF

        query = query_template.format(
            modified_from=str(modified_from).rjust(18, '0'),
            modified_to=str(modified_to).rjust(18, '0'),
        )

        start = 0
        new_metadata = {}

        while start != -1:
            search_path = 'sharing/search?f=pjson&q={query}&num={num}&start={start}'.format(
                query=query,
                num=num,
                start=start,
            )
            url = urlparse.urljoin(source_url, search_path)

            try:
                r = requests.get(url)
                r.raise_for_status()
            except requests.exceptions.RequestException as e:
                self._save_gather_error('Unable to get content for URL: %s: %r' %
                                        (url, e), harvest_job)
                return None

            results = r.json()

            for result in results['results']:
                if result['type'] not in TYPES:
                    continue
                new_metadata[result['id']] = result

            start = results['nextStart']

        existing_guids = dict()
        query = model.Session.query(HarvestObject.guid, HOExtra.value).\
            filter(HarvestObject.current == True).\
            join(HOExtra, HarvestObject.extras).\
            filter(HOExtra.key == 'arcgis_modified_date').\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id)
        for (guid, value) in query:
            existing_guids[guid] = value

        new = set(new_metadata) - set(existing_guids)

        harvest_objects = []

        for guid in new:
            date = str(new_metadata[guid]['modified'])
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date', value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='new')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        deleted = set(existing_guids) - set(new_metadata)

        for guid in deleted:
            obj = HarvestObject(job=harvest_job,
                                extras=[HOExtra(key='status', value='delete')],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        changed = set(existing_guids) & set(new_metadata)

        for guid in changed:
            date = str(new_metadata[guid]['modified'])
            if date == existing_guids[guid]:
                continue
            obj = HarvestObject(job=harvest_job,
                                content=json.dumps(new_metadata[guid]),
                                extras=[
                                    HOExtra(key='arcgis_modified_date', value=date),
                                    HOExtra(key='format', value='arcgis_json'),
                                    HOExtra(key='status', value='changed')
                                ],
                                guid=guid)
            obj.save()
            harvest_objects.append(obj.id)

        return harvest_objects
                existing_object.metadata_modified_date.date() < dataset_last_modified:
                status = 'changed'
            else:
                log.debug('Dataset unchanged: %s this="%s" previous="%s"',
                          dataset['title'], dataset_last_modified,
                          existing_object.metadata_modified_date)
                continue
        else:
            status = 'new'

        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            content=doc.serialize_node(dataset_node),
            harvest_source_reference=guid,
            metadata_modified_date=dataset_last_modified,
            extras=[HOExtra(key='status', value=status)],
        )
        obj.save()
        ids.append(obj.id)

    return ids

def fetch_stage(self, harvest_object):
    '''
    Check that we have content from the gather stage and just return success.

    :returns: True if everything went right, False if errors were found
    '''
    # There is no fetching because all the content for the objects was
    # retrieved in one request during the gather stage.
    return bool(harvest_object.content)
def gather_stage(self, harvest_job):
    self.log = logging.getLogger(__file__)
    self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                   harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    self.update_all = self.source_config.get('update_all', False)
    interface = INTERFACE(self.source_config, COLLECTION)

    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id, interface)
    )
    interface.update_index(last_product_index)
    interface.build_url()
    log.debug('URL: {}'.format(interface.current_url))

    ids = []
    try:
        results = interface.get_results()
    except Timeout as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)
        return ids
    if type(results) is not list:
        self._save_gather_error('{} error: {}'.format(
            results['status_code'], results['message']), self.job)
        return ids

    for entry in results:
        name_path = interface.get_name_path()

        name_url = get_field(entry,
                             name_path['relative_location'].split(","),
                             name_path['fixed_attributes'])
        entry_name = parse_name(name_url).lower()
        entry_guid = unicode(uuid.uuid4())

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key=interface.get_pagination_mechanism(),
                        value=interface.get_index())
            ])

        obj.content = json.dumps(entry)
        obj.package = None if status == 'new' else package
        obj.save()
        interface.increment_index()

        ids.append(obj.id)
    return ids
class GeoDataGovGeoportalHarvester(CSWHarvester, GeoDataGovHarvester):
    '''
    A Harvester for CSW servers, with customizations for geo.data.gov
    '''
    def info(self):
        return {
            'name': 'geoportal',
            'title': 'Geoportal Server',
            'description': 'A Geoportal Server CSW endpoint',
        }

    def output_schema(self):
        return 'csw'

    def fetch_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.geoportal.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url
        identifier = harvest_object.guid

        parts = urlparse.urlparse(url)
        url = urlparse.urlunparse((parts.scheme, parts.netloc, '/'.join(
            parts.path.rstrip('/').split('/')[:-2]), None, None, None))
        url = url.rstrip('/') + '/rest/document?id=%s' % identifier

        try:
            response = requests.get(url)
            content = response.content
        except Exception as e:
            self._save_object_error(
                'Error getting the record with GUID %s from %s' %
                (identifier, url), harvest_object)
            return False

        try:
            # Save the fetched contents in the HarvestObject.
            # Contents come from csw_client already declared and encoded as
            # utf-8. Remove the original XML declaration.
            content = re.sub(r'<\?xml(.*)\?>', '', content)

            document_format = guess_standard(content)
            if document_format == 'iso':
                harvest_object.content = content
                harvest_object.save()
            elif document_format == 'fgdc':
                extra = HOExtra(object=harvest_object,
                                key='original_document',
                                value=content)
                extra.save()

                extra = HOExtra(object=harvest_object,
                                key='original_format',
                                value=document_format)
                extra.save()
            else:
                harvest_object.report_status = 'ignored'
                harvest_object.save()
                return False
        except Exception as e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' %
                                    (identifier, e), harvest_object)
            return False

        return True
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.gather')
    log.debug('%s gather_stage for job: %r',
              self.harvester_name(), harvest_job)

    # Get source URL
    url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    try:
        index = self.create_index(url)
        log.debug(f'Index created for {self.harvester_name()}')
    except Exception as e:
        self._save_gather_error(
            'Error harvesting %s: %s' % (self.harvester_name(), e),
            harvest_job)
        log.warning(f"Error while creating index: {e}")
        return None

    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id)

    guid_to_package_id = {}

    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = set(guid_to_package_id.keys())

    # log.debug('Starting gathering for %s' % url)
    guids_in_harvest = index.keys()

    new = guids_in_harvest - guids_in_db
    delete = guids_in_db - guids_in_harvest
    change = guids_in_db & guids_in_harvest

    ids = []
    for guid in new:
        doc = index.get_as_string(guid)
        obj = HarvestObject(guid=guid, job=harvest_job, content=doc,
                            extras=[HOExtra(key='status', value='new')])
        obj.save()
        ids.append(obj.id)
    for guid in change:
        doc = index.get_as_string(guid)
        obj = HarvestObject(guid=guid, job=harvest_job, content=doc,
                            package_id=guid_to_package_id[guid],
                            extras=[HOExtra(key='status', value='change')])
        obj.save()
        ids.append(obj.id)
    for guid in delete:
        obj = HarvestObject(guid=guid, job=harvest_job,
                            package_id=guid_to_package_id[guid],
                            extras=[HOExtra(key='status', value='delete')])
        ids.append(obj.id)
        model.Session.query(HarvestObject).\
            filter_by(guid=guid).\
            update({'current': False}, False)
        obj.save()

    if len(ids) == 0:
        self._save_gather_error(
            'No records received from the %s service' % self.harvester_name(),
            harvest_job)
        return None

    return ids
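# Illustration only (hypothetical guids): the new/delete/change split used
# above is plain set arithmetic between the guids already in the database and
# the guids seen in the current harvest.
guids_in_db = {'a', 'b', 'c'}
guids_in_harvest = {'b', 'c', 'd'}

new = guids_in_harvest - guids_in_db      # {'d'}      -> create
delete = guids_in_db - guids_in_harvest   # {'a'}      -> mark for deletion
change = guids_in_db & guids_in_harvest   # {'b', 'c'} -> re-import/update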
def gather_stage(self, harvest_job):
    requests_cache.install_cache()
    requests_cache.clear()
    session = requests_cache.CachedSession()

    self.log = logging.getLogger(__file__)
    self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    base_url = self.source_config.get('oai_pmh_url')
    metadata_prefix = self.source_config.get('metadata_prefix')
    start_date = self.source_config.get('start_date', None)
    self.update_all = self.source_config.get('update_all', False)

    last_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'last_token')
    next_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'next_token')
    next_station = self._get_last_harvesting_index(self.job.source_id,
                                                   'next_station')
    restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                   'restart_date')
    restart_date = restart_date if last_token else None

    ids = []
    first_query = True
    while (ids == [] and next_token) or first_query:
        first_query = False
        current_token = last_token if next_station else next_token

        if current_token:
            query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                base_url, current_token)
        elif restart_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, restart_date)
        elif start_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, start_date)
        else:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                base_url, metadata_prefix)

        self.log.debug('Querying: {}.'.format(query_url))
        raw_list_ids = self.get_list_identifiers(session, query_url)
        list_stations, largest_datastamp = self.get_station_ids(raw_list_ids)
        next_token = self.get_resumption_token(raw_list_ids)
        last_token = current_token
        restart_date = restart_date if restart_date else ''
        restart_date = largest_datastamp if largest_datastamp > restart_date else restart_date

        if list_stations == []:
            next_station = None
        else:
            valid_deployment = None
            station_index = 0
            while not valid_deployment and station_index <= len(list_stations) - 1:
                station = list_stations[station_index]
                next_station = None if (next_station == station) else next_station
                if not next_station:
                    station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                        base_url, metadata_prefix, station)
                    self.log.debug('Querying station: {}.'.format(station))
                    record = self.get_record(session, station_query)
                    if record:
                        station_info = StationInfo(record)
                        if station_info.isValid():
                            station_info.id = station
                            observation_list = station_info.get_observations()
                            station_dict = station_info.get_dict()
                            station_info = None
                            for observation in observation_list:
                                observation_info = ObservationInfo(session,
                                                                   observation)
                                deployments_list = observation_info.get_deployments()
                                observation_dict = observation_info.get_dict()
                                observation_info = None
                                for deployment in deployments_list:
                                    deployment_info = DeploymentInfo(session,
                                                                     deployment)
                                    if deployment_info.isValid():
                                        deployment_dict = deployment_info.get_dict()
                                        deployment_info = None
                                        valid_deployment = True

                                        if station_index + 1 <= len(list_stations) - 1:
                                            next_station = list_stations[station_index + 1]
                                        else:
                                            next_station = None

                                        entry_guid = unicode(uuid.uuid4())
                                        entry_id = '{}_{}'.format(
                                            station_dict['id'],
                                            deployment_dict['id'])
                                        entry_name = clean_snakecase(entry_id)
                                        self.log.debug('Gathering %s',
                                                       entry_name)

                                        content = {}
                                        content['station'] = station_dict
                                        content['observation'] = observation_dict
                                        content['deployment'] = deployment_dict

                                        package_query = Session.query(Package)
                                        query_filtered = package_query.filter(
                                            Package.name == entry_name)
                                        package = query_filtered.first()

                                        if package:
                                            # Meaning we've previously harvested this,
                                            # but we may want to reharvest it now.
                                            previous_obj = Session.query(HarvestObject) \
                                                .filter(HarvestObject.guid == entry_guid) \
                                                .filter(HarvestObject.current == True) \
                                                .first()  # noqa: E712
                                            if previous_obj:
                                                previous_obj.current = False
                                                previous_obj.save()

                                            if self.update_all:
                                                self.log.debug(
                                                    '{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                                                status = 'change'
                                            else:
                                                self.log.debug(
                                                    '{} will not be updated.'.format(entry_name))
                                                status = 'unchanged'
                                        else:
                                            # It's a product we haven't harvested before.
                                            self.log.debug(
                                                '{} has not been harvested before. '
                                                'Creating a new harvest object.'.format(entry_name))
                                            status = 'new'

                                        obj = HarvestObject(
                                            guid=entry_guid,
                                            job=self.job,
                                            extras=[
                                                HOExtra(key='status', value=status),
                                                HOExtra(key='last_token', value=last_token),
                                                HOExtra(key='next_token', value=next_token),
                                                HOExtra(key='next_station', value=next_station),
                                                HOExtra(key='restart_date', value=restart_date)
                                            ])
                                        obj.content = json.dumps(content)
                                        obj.package = None if status == 'new' else package
                                        obj.save()
                                        ids.append(obj.id)
                            if not valid_deployment:
                                self.log.debug(
                                    'Station {} does not have valid deployments.'.format(station))
                        else:
                            self.log.debug(
                                'Station {} is not valid.'.format(station))
                station_index += 1
    return ids
def create_extras(url, status):
    return [
        HOExtra(key='doc_location', value=url),
        HOExtra(key='status', value=status)
    ]
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''A Harvester for individual spatial metadata documents
    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        ''' '''
        return {
            u'name': u'single-doc',
            u'title': u'Single spatial metadata document',
            u'description': u'A single spatial metadata document'
        }

    def get_original_url(self, harvest_object_id):
        '''
        :param harvest_object_id:
        '''
        obj = model.Session.query(HarvestObject).filter(
            HarvestObject.id == harvest_object_id).first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        '''
        :param harvest_job:
        '''
        log = logging.getLogger(__name__ + u'.individual.gather')
        log.debug(u'DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            self._save_gather_error(u'Unable to get content for URL: %s: %r'
                                    % (url, e), harvest_job)
            return None

        existing_object = model.Session.query(
            HarvestObject.guid, HarvestObject.package_id).filter(
            HarvestObject.current == True).filter(  # noqa: E712
            HarvestObject.harvest_source_id == harvest_job.source.id).first()

        def create_extras(url, status):
            '''
            :param url:
            :param status:
            '''
            return [HOExtra(key=u'doc_location', value=url),
                    HOExtra(key=u'status', value=status)]

        if not existing_object:
            guid = hashlib.md5(url.encode(u'utf8', u'ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, u'new'),
                                           guid=guid)
        else:
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, u'change'),
                                           guid=existing_object.guid,
                                           package_id=existing_object.package_id)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == u'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(
                object=harvest_object,
                key=u'original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key=u'original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]
def gather_stage(self, harvest_job):
    self.log = logging.getLogger(__file__)
    self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    max_dataset = self.source_config.get('max_dataset', 100)
    wfs_url = self.source_config.get('wfs_url')
    wfs_version = self.source_config.get('wfs_version')
    collection = self.source_config.get('collection')
    typename = COLLECTION[collection].get('collection_typename')
    tag_typename = COLLECTION[collection].get('tag_typename', None)
    self.update_all = self.source_config.get('update_all', False)

    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id)
    )

    if last_product_index:
        last_product_index = last_product_index + 1
    else:
        last_product_index = 0

    wfs = WFS(url=wfs_url, version=wfs_version)

    wfs.set_collection(typename)
    sortby = ['When']

    result = wfs.make_request(max_dataset, sortby, last_product_index)
    entries = result['features']
    name = '{}_{}'.format(collection.lower(), '{}')
    ids = []
    for entry in entries:
        entry_guid = unicode(uuid.uuid4())
        entry_name = name.format(convert_to_clean_snakecase(entry['id']))
        log.debug('gathering %s', entry_name)

        content = {}
        content['collection_content'] = entry
        if tag_typename:
            wfs.set_collection(tag_typename)
            filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
            result = wfs.make_request(constraint=filterxml)
            result = wfs.get_request(constraint=filterxml)
            content['tag_url'] = result

        package_query = Session.query(Package)
        query_filtered = package_query.filter(Package.name == entry_name)
        package = query_filtered.first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key='index', value=last_product_index)
            ])
        obj.content = json.dumps(content)
        obj.package = None if status == 'new' else package
        obj.save()
        last_product_index += 1
        ids.append(obj.id)
    return ids
class WAFCollectionHarvester(GeoDataGovWAFHarvester):

    def info(self):
        return {
            'name': 'waf-collection',
            'title': 'Web Accessible Folder (WAF) Homogeneous Collection',
            'description': 'A Web Accessible Folder (WAF) displaying a list '
                           'of spatial metadata documents with a collection '
                           'record'
        }

    def extra_schema(self):
        extra_schema = super(WAFCollectionHarvester, self).extra_schema()
        extra_schema['collection_metadata_url'] = [not_empty, unicode]
        return extra_schema

    def get_package_dict(self, iso_values, harvest_object):
        package_dict = super(WAFCollectionHarvester, self).get_package_dict(
            iso_values, harvest_object)
        if not package_dict:
            return None

        collection_package_id = self._get_object_extra(
            harvest_object, 'collection_package_id')
        if collection_package_id:
            package_dict['extras'].append(
                dict(key='collection_package_id', value=collection_package_id))

        collection_metadata = self._get_object_extra(harvest_object,
                                                     'collection_metadata')
        if collection_metadata:
            package_dict['extras'].append(
                dict(key='collection_metadata', value=collection_metadata))
            status = self._get_object_extra(harvest_object, 'status')
            if status == 'change':
                self.force_import = True
            else:
                self.force_import = False

        return package_dict

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        collection_metadata_url = self.source_config.get('collection_metadata_url')

        if not collection_metadata_url:
            self._save_gather_error('collection url does not exist',
                                    harvest_job)
            return None

        try:
            response = requests.get(source_url, timeout=60)
            content = response.content
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        guid = hashlib.md5(
            collection_metadata_url.encode('utf8', 'ignore')).hexdigest()

        existing_harvest_object = model.Session.\
            query(HarvestObject.guid, HarvestObject.package_id, HOExtra.value).\
            join(HOExtra, HarvestObject.extras).\
            filter(HOExtra.key == 'collection_metadata').\
            filter(HOExtra.value == 'true').\
            filter(HarvestObject.current == True).\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
            first()

        if existing_harvest_object:
            status = 'change'
            guid = existing_harvest_object.guid
            package_id = existing_harvest_object.package_id
        else:
            status, package_id = 'new', None

        obj = HarvestObject(job=harvest_job,
                            extras=[
                                HOExtra(key='collection_metadata', value='true'),
                                HOExtra(key='waf_location',
                                        value=collection_metadata_url),
                                HOExtra(key='status', value=status)
                            ],
                            guid=guid,
                            status=status,
                            package_id=package_id)

        queue.fetch_and_import_stages(self, obj)
        if obj.state == 'ERROR':
            self._save_gather_error(
                'Collection object failed to harvest, not harvesting',
                harvest_job)
            return None

        return GeoDataGovWAFHarvester.gather_stage(
            self, harvest_job, collection_package_id=obj.package_id)
def _gather_entry(self, entry, path, row, update_all=False):
    # Create a harvest object for each entry
    entry_guid = unicode(uuid.uuid4())
    entry_name = entry.lower()
    log.debug('gathering %s', entry)

    package_query = Session.query(Package)
    query_filtered = package_query.filter(Package.name == entry_name)
    package = query_filtered.first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if update_all:
            log.debug('{} already exists and will be updated.'.format(
                entry_name))
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))
            status = 'unchanged'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key='path', value=path),
                HOExtra(key='row', value=row)
            ])
        obj.content = entry
        obj.package = package
        obj.save()
        return obj.id
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. '
                  'Creating a new harvest object.'.format(entry_name))
        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value='new'),
                HOExtra(key='path', value=path),
                HOExtra(key='row', value=row)
            ])
        obj.content = entry
        obj.package = None
        obj.save()
        return obj.id
def gather_stage(self, harvest_job):
    logger.debug('CswHarvester gather_stage for job: %r', harvest_job)

    # Get source URL
    url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    parts = urlparse.urlparse(url)
    params = {'keywords__slug__in': self.keywords, 'limit': 10000}
    url = urlparse.urlunparse((
        parts.scheme,
        parts.netloc,
        '/api/layers',
        None,
        urllib.urlencode(params, True),
        None
    ))

    query = model.Session.query(
        HarvestObject.guid, HarvestObject.package_id
    ).filter(HarvestObject.current == True).filter(
        HarvestObject.harvest_source_id == harvest_job.source.id
    )
    guid_to_package_id = {}
    for guid, package_id in query:
        guid_to_package_id[guid] = package_id

    guids_in_db = set(guid_to_package_id.keys())

    logger.debug('Starting gathering for %s' % url)
    guids_in_harvest = set()
    try:
        for obj in requests.get(url).json()['objects']:
            try:
                uuid = obj['uuid']
                logger.info('Got identifier %s from the PacGeo', uuid)
                guids_in_harvest.add(uuid)
            except Exception as e:
                self._save_gather_error(
                    'Error for the identifier from <%r>: %s' % (obj, e),
                    harvest_job
                )
                continue
    except Exception as e:
        logger.error('Exception: %s', e)
        self._save_gather_error(
            'Error gathering the identifiers from the PacGeo server [%s]' %
            str(e), harvest_job
        )
        return None

    new = guids_in_harvest - guids_in_db
    delete = guids_in_db - guids_in_harvest
    change = guids_in_db & guids_in_harvest

    ids = []
    for guid in new:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            extras=[HOExtra(key='status', value='new')]
        )
        obj.save()
        ids.append(obj.id)
    for guid in change:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HOExtra(key='status', value='change')]
        )
        obj.save()
        ids.append(obj.id)
    for guid in delete:
        obj = HarvestObject(
            guid=guid,
            job=harvest_job,
            package_id=guid_to_package_id[guid],
            extras=[HOExtra(key='status', value='delete')]
        )
        model.Session.query(HarvestObject).filter_by(guid=guid).update({
            'current': False
        }, False)
        obj.save()
        ids.append(obj.id)

    if len(ids) == 0:
        self._save_gather_error(
            'No records received from the CSW server', harvest_job
        )
        return None

    return ids
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.individual.gather')
    log.debug('DocHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (url, e), harvest_job)
        return None

    existing_object = model.Session.query(HarvestObject.guid,
                                          HarvestObject.package_id).\
        filter(HarvestObject.current == True).\
        filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
        first()

    def create_extras(url, status):
        return [
            HOExtra(key='doc_location', value=url),
            HOExtra(key='status', value=status)
        ]

    if not existing_object:
        guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
        harvest_object = HarvestObject(job=harvest_job,
                                       extras=create_extras(url, 'new'),
                                       guid=guid)
    else:
        harvest_object = HarvestObject(
            job=harvest_job,
            extras=create_extras(url, 'change'),
            guid=existing_object.guid,
            package_id=existing_object.package_id)

    harvest_object.add()

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
    else:
        extra = HOExtra(object=harvest_object,
                        key='original_document',
                        value=content)
        extra.save()

        extra = HOExtra(object=harvest_object,
                        key='original_format',
                        value=document_format)
        extra.save()

    harvest_object.save()

    return [harvest_object.id]
def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.
    """
    ids = []
    new_counter = 0
    first_query = True
    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url, verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       status_code, timeout))
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code,
                                                          r.text), self.job)
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       r.status_code, elapsed))
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, r.status_code,
                                   r.elapsed.total_seconds()))

        soup = Soup(r.content, 'lxml')
        json_content = json.loads(soup.text)

        # Get the URL for the next loop, or None to break the loop
        log.debug(harvest_url)
        harvest_url = self._get_next_url(harvest_url, json_content)

        # Get the entries from the results
        entry_list = self._get_entries_from_results(json_content)
        if first_query:
            entries = entry_list
        else:
            entries = entry_list[1:]
        first_query = False

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(entry_name))
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry['content'])
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                obj = HarvestObject(guid=entry_guid,
                                    job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry['content'])
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))

    return ids
def _parse_products(self, products):
    """
    Iterate through the results, create harvest objects,
    and return the ids.
    """
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        entry_guid = entry['imgtif'].split('/')[1].lower() \
            + "_" + entry['type'] + "_" + str(entry['intid'])
        entry_name = entry['imgtif'].split('/')[1].lower() \
            + "_" + entry['type'] + "_" + str(entry['intid'])
        entry_restart_date = entry['master']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(
                    entry_name))
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package
            obj.save()
            ids.append(obj.id)
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. '
                      'Creating a new harvest object.'.format(entry_name))
            obj = HarvestObject(guid=entry_guid,
                                job=self.job,
                                extras=[
                                    HOExtra(key='status', value='new'),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            new_counter += 1
            obj.content = json.dumps(entry)
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))

    return ids
    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        msg = u'Could not harvest WAF link {0}: {1}'.format(url, e)
        self._save_object_error(msg, harvest_object)
        return False

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == u'iso':
        harvest_object.content = content
        harvest_object.save()
    else:
        extra = HOExtra(
            object=harvest_object,
            key=u'original_document',
            value=content)
        extra.save()

        extra = HOExtra(
            object=harvest_object,
            key=u'original_format',
            value=document_format)
        extra.save()

    return True


apache = parse.SkipTo(parse.CaselessLiteral(u'<a href='), include=True).suppress() \
    + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName(u'url') \
    + parse.SkipTo(u'</a>',
def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None,
                   password=None, provider=None):  # noqa: E501
    """
    Iterate through the results, create harvest objects,
    and return the ids.
    """
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url,
                             auth=HTTPBasicAuth(username, password),
                             verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e),
                                    self.job)
            status_code = 408
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       status_code, timeout))
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code,
                                                          r.text), self.job)
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       r.status_code, elapsed))
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, r.status_code,
                                   r.elapsed.total_seconds()))

        soup = Soup(r.content, 'lxml')

        # Get the URL for the next loop, or None to break the loop
        harvest_url = self._get_next_url(soup)

        # Get the entries from the results
        entries = self._get_entries_from_results(soup)

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                # We need package_show to ensure that all the conversions
                # are carried out.
                context = {"user": "******", "ignore_auth": True,
                           "model": model, "session": Session}
                pkg_dict = logic.get_action('package_show')(
                    context, {"id": package.name})

                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(
                        entry_name))
                    status = 'change'
                    update_counter += 1
                # E.g., a Sentinel dataset exists,
                # but doesn't have a NOA resource yet.
                elif self.flagged_extra and not get_pkg_dict_extra(
                        pkg_dict, self.flagged_extra):
                    log.debug('{} already exists and will be extended.'.format(
                        entry_name))
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} will not be updated.'.format(entry_name))
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. '
                          'Creating a new harvest object.'.format(entry_name))
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, update_counter))

    return ids