def _gather_entry(self, entry, auth=None):
    # Create a harvest object for each entry
    entry_guid = entry['guid']
    log.debug('gathering %s', entry_guid)
    entry_name = entry['identifier'].replace('v101_', '').replace('.hdf5', '')  # noqa: E501
    entry_restart_date = entry['restart_date']

    package = Session.query(Package) \
        .filter(Package.name == entry_name).first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if self.update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'

        obj = HarvestObject(guid=entry_guid, job=self.job, extras=[
            HOExtra(key='status', value=status),
            HOExtra(key='restart_date', value=entry_restart_date)
        ])
        obj.content = entry['content']
        obj.package = package
        obj.save()
        return obj.id
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
        obj = HarvestObject(guid=entry_guid, job=self.job, extras=[
            HOExtra(key='status', value='new'),
            HOExtra(key='restart_date', value=entry_restart_date)
        ])
        obj.content = entry['content']
        obj.package = None
        obj.save()
        return obj.id
def _run_import(self, xml, job):
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')(
            {'user': '******'}, {'name': 'test'})

    record = _get_record(xml)
    metadata = CmdiReader()(record)
    metadata['unified']['owner_org'] = "test"

    harvest_object = HarvestObject()
    harvest_object.content = json.dumps(metadata.getMap())
    harvest_object.id = xml
    harvest_object.guid = xml
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.save()

    self.harvester.import_stage(harvest_object)
    return harvest_object
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created
          HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error(
            'Could not gather anything from %s!' % harvest_job.source.url,
            harvest_job)
        return None
    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)
    query = self.config['query'] if 'query' in self.config else ''
    try:
        # 'set' shadows the builtin, so unpack each set as 'set_'.
        for set_ in client.listSets():
            identifier, name, _ = set_
            if 'query' in self.config:
                if query in name:
                    sets.append((identifier, name))
            else:
                sets.append((identifier, name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)

    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({
            'set': set_id,
            'set_name': set_name,
            'domain': domain
        })
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    model.repo.commit()
    return harvest_objs
def gather_stage(self, harvest_job, encoding=None):
    self._set_config(harvest_job.source.config)
    # Request all remote packages
    try:
        content = self._get_content(harvest_job.source.url)
        LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url)

        object_ids = []
        packages = []
        file_content = StringIO.StringIO(content)
        archive = zipfile.ZipFile(file_content, 'r')
        for name in archive.namelist():
            if name.endswith('.json'):
                archive_content = archive.read(name)
                if encoding is not None:
                    archive_content = archive_content.decode(encoding)
                else:
                    archive_content = self.lstrip_bom(archive_content)

                package = json.loads(archive_content)
                normalize_api_dataset(package)
                packages.append(package)

                obj = HarvestObject(guid=package['name'], job=harvest_job)
                obj.content = json.dumps(package)
                obj.save()
                object_ids.append(obj.id)
    except zipfile.BadZipfile as err:
        self._save_gather_error(err.message, harvest_job)
        return None
    except ContentFetchError as err:
        self._save_gather_error(err.message, harvest_job)
        return None
    except Exception as err:
        error_template = 'Unable to get content for URL: %s: %s'
        error = error_template % (harvest_job.source.url, str(err))
        self._save_gather_error(error, harvest_job)
        return None

    if object_ids:
        # delete obsolete packages
        super(JSONZipBaseHarvester, self).delete_deprecated_datasets(
            packages, harvest_job)
        return object_ids
    else:
        self._save_gather_error(
            'No packages received for URL: %s' % harvest_job.source.url,
            harvest_job)
        return None
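
# `lstrip_bom` is called above but not defined in this file. A minimal sketch
# of what it might look like, assuming its job is to strip a UTF-8 byte-order
# mark so json.loads() succeeds; the body is an illustration, not the
# canonical implementation:
import codecs

def lstrip_bom(self, content):
    # Remove a leading UTF-8 BOM, if present.
    if content.startswith(codecs.BOM_UTF8):
        return content[len(codecs.BOM_UTF8):]
    return content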
def test_import(self):
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    harvest_object = self._run_import("cmdi_1.xml", job)

    self.assertEquals(len(harvest_object.errors), 0,
                      u"\n".join(unicode(error.message)
                                 for error in (harvest_object.errors or [])))

    package = get_action('package_show')({'user': '******'},
                                         {'id': 'urn-nbn-fi-lb-20140730180'})

    self.assertEquals(package.get('id', None),
                      'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('name', None), 'urn-nbn-fi-lb-20140730180')
    self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                    u'provider': provider,
                    u'type': u'metadata'}

    self.assertTrue(expected_pid in package.get('pids'))

    model.Session.flush()

    harvest_object = self._run_import("cmdi_2.xml", job)

    self.assertEquals(len(harvest_object.errors), 0,
                      u"\n".join(unicode(error.message)
                                 for error in (harvest_object.errors or [])))

    package = get_action('package_show')({'user': '******'},
                                         {'id': 'urn-nbn-fi-lb-20140730186'})

    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def test_harvester_urlerror(self):
    harv, job = self._create_harvester()
    urllib2.urlopen = realopen
    self.assert_(harv.gather_stage(job) == None)
    errs = Session.query(HarvestGatherError).all()
    self.assert_(len(errs) == 1)
    harv_obj = HarvestObject()
    harv_obj.job = job
    harv_obj.content = json.dumps({'url': "http://foo"})
    # XML error and URL error, also the lack of url in content
    self.assert_(harv.import_stage(harv_obj) == False)
    errs = Session.query(HarvestObjectError).all()
    print errs
    self.assert_(len(errs) == 1)
def _make_retry_lists(self, harvest_job, ident2rec, ident2set, from_until):
    recs = []
    for ident, harv in ident2rec.items():
        info = json.loads(harv.content)
        harv.content = None
        harv.save()
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        recs.append(harvest_obj.id)
        log.debug('Retrying record: %s' % harv.id)

    sets = []
    insertion_retries = set()

    def update_until(info, from_until):
        if 'until' not in info:
            return  # Wanted up to current time earlier.
        if 'until' not in from_until:
            del info['until']  # Want up to current time now.
            return
        fu = self._str_from_datetime(from_until['until'])
        if info['until'] < fu:  # Keep the later of the two dates.
            info['until'] = fu

    for name, obj in ident2set.items():
        info = json.loads(obj.content)
        obj.content = None
        obj.save()
        update_until(info, from_until)
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        sets.append(harvest_obj.id)
        if 'set' not in info:
            insertion_retries.add(name)
            log.debug('Retrying set insertions: %s' % info['set_name'])
        else:
            log.debug('Retrying set: %s' % info['set_name'])
    return recs, sets, insertion_retries
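
# The three branches of update_until() above are easy to misread, so here is
# a standalone illustration of the intended semantics. The datetime strings
# are invented for the example; in the harvester they come from
# self._str_from_datetime().
info = {'until': '2020-01-01T00:00:00'}

# The retry window carries a later 'until': keep the later of the two dates.
fu = '2021-06-01T00:00:00'
if info['until'] < fu:
    info['until'] = fu
assert info['until'] == '2021-06-01T00:00:00'

# The retry window has no 'until' at all: drop ours, i.e. harvest up to the
# current time from now on.
del info['until']
assert 'until' not in info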
def gather_stage(self, harvest_job):
    """Gather the URLs to fetch from a URL which has a list of links to
    XML documents containing the DDI documents.
    """
    self._set_config(self.config)
    previous_job = (
        Session.query(HarvestJob)
        .filter(HarvestJob.source == harvest_job.source)
        .filter(HarvestJob.gather_finished != None)  # noqa: E711
        .filter(HarvestJob.id != harvest_job.id)
        .order_by(HarvestJob.gather_finished.desc())
        .limit(1)
        .first()
    )
    if previous_job:
        self.incremental = True
    gather_url = harvest_job.source.url
    try:
        urls = urllib2.urlopen(gather_url)
        harvest_objs = []
        for url in urls.readlines():
            url = url.strip()  # readlines() keeps the trailing newline
            gather = True
            if self.incremental:
                request = urllib2.Request(url)
                request.get_method = lambda: "HEAD"
                doc_url = urllib2.urlopen(request)
                lastmod = parser.parse(doc_url.headers["last-modified"],
                                       ignoretz=True)
                # Skip documents not modified since the previous
                # successful gather.
                if lastmod < previous_job.gather_finished:
                    log.debug("Skipping unmodified %s", url)
                    gather = False
            if gather:
                harvest_obj = HarvestObject()
                harvest_obj.content = json.dumps({"url": url})
                harvest_obj.job = harvest_job
                harvest_obj.save()
                harvest_objs.append(harvest_obj.id)
    except urllib2.URLError:
        self._save_gather_error("Could not gather XML files from URL!",
                                harvest_job)
        return None
    return harvest_objs
def _run_import(self, xml, job):
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')({'user': '******'},
                                          {'name': 'test'})

    record = _get_record(xml)
    metadata = CmdiReader()(record)
    metadata['unified']['owner_org'] = "test"

    harvest_object = HarvestObject()
    harvest_object.content = json.dumps(metadata.getMap())
    harvest_object.id = xml
    harvest_object.guid = xml
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.save()

    self.harvester.import_stage(harvest_object)
    return harvest_object
def gather_stage(self, harvest_job):
    super(JSONDumpBaseCKANHarvester, self)._set_config(
        harvest_job.source.config)

    try:
        content = self._get_content(harvest_job.source.url)
    except ContentFetchError as err:
        self._save_gather_error(err.message, harvest_job)
        return None
    except Exception as err:
        error_template = 'Unable to get content for URL: %s: %s'
        error = error_template % (harvest_job.source.url, str(err))
        self._save_gather_error(error, harvest_job)
        return None

    object_ids = []
    packages = json.loads(content)
    for package in packages:
        normalize_api_dataset(package)
        obj = HarvestObject(guid=package['name'], job=harvest_job)
        obj.content = json.dumps(package)
        obj.save()
        object_ids.append(obj.id)

    if object_ids:
        # delete obsolete packages
        self.delete_deprecated_datasets(packages, harvest_job)
        return object_ids
    else:
        self._save_gather_error(
            'No packages received for URL: %s' % harvest_job.source.url,
            harvest_job)
        return None
def test_import(self):
    source = HarvestSource(url="http://localhost/test_cmdi", type="cmdi")
    source.save()
    job = HarvestJob(source=source)
    job.save()

    harvest_object = self._run_import("cmdi_1.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    self.assertEquals(len(harvest_object.errors), 0,
                      u"\n".join(unicode(error.message)
                                 for error in (harvest_object.errors or [])))

    package = get_action('package_show')({'user': '******'},
                                         {'id': package_id})

    self.assertEquals(package.get('name', None),
                      utils.pid_to_name(package.get('id', None)))
    self.assertEquals(utils.get_primary_pid(package),
                      u'http://urn.fi/urn:nbn:fi:lb-20140730180')
    self.assertEquals(package.get('notes', None), u'{"eng": "Test description"}')
    self.assertEquals(package.get('version', None), '2012-09-07')
    self.assertEquals(package.get('title', []), '{"eng": "Longi Corpus"}')
    self.assertEquals(package.get('license_id', None), 'undernegotiation')

    provider = config['ckan.site_url']
    expected_pid = {u'id': u'http://islrn.org/resources/248-895-085-557-0',
                    u'provider': provider,
                    u'type': u'relation',
                    u'relation': u'generalRelation'}

    self.assertTrue(expected_pid not in package.get('pids'))

    model.Session.flush()

    harvest_object = self._run_import("cmdi_2.xml", job)
    package_id = json.loads(harvest_object.content)['unified']['id']

    self.assertEquals(len(harvest_object.errors), 0,
                      u"\n".join(unicode(error.message)
                                 for error in (harvest_object.errors or [])))

    package = get_action('package_show')({'user': '******'},
                                         {'id': package_id})

    self.assertEquals(package['temporal_coverage_begin'], '1880')
    self.assertEquals(package['temporal_coverage_end'], '1939')
    self.assertEquals(package.get('license_id', None), 'other')

    # Delete package
    harvest_object = HarvestObject()
    harvest_object.content = None
    harvest_object.id = "test-cmdi-delete"
    harvest_object.guid = "test-cmdi-delete"
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.package_id = package.get('id')
    harvest_object.report_status = "deleted"
    harvest_object.save()

    self.harvester.import_stage(harvest_object)

    model.Session.flush()
    self.assertEquals(model.Package.get(package['id']).state, 'deleted')
def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0
    update_counter = 0
    base_url = self.source_config.get('source_url')

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        soup = self._make_request(harvest_url, timeout)
        if not soup:
            return ids

        search_results = soup.find('csw:searchresults', elementset="summary")
        records_returned = search_results['numberofrecordsreturned']
        next_record = search_results['nextrecord']
        number_records_matched = search_results['numberofrecordsmatched']
        # int() rather than eval(): these attributes come from a remote
        # response and must not be evaluated as Python code.
        if next_record != '0':
            current_record = str(int(next_record) - int(records_returned))  # noqa: E501
        else:
            current_record = str(int(number_records_matched) - int(records_returned))  # noqa: E501

        # Get the URL for the next loop, or None to break the loop
        # Only works if StartPosition is last URL parameter
        harvest_url = self._get_next_url(harvest_url, records_returned,
                                         next_record, limit)  # noqa: E501

        # Get the entries from the results
        entries = self._get_entries_from_results(soup, current_record,
                                                 next_record)  # noqa: E501

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = 'saeon_csag_' + entry['identifier'].lower().replace('.', '_').replace('/', '-')  # noqa: E501
            full_content = {}
            full_content['extra_content'] = self._get_entry_time_and_author(base_url, entry['identifier'], timeout)  # noqa: E501
            full_content['raw_content'] = entry['content']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} already exists and will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value=status),
                                            HOExtra(key='restart_record', value=entry['restart_record'])])  # noqa: E501
                obj.content = json.dumps(full_content)
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_record', value=entry['restart_record'])])  # noqa: E501
                new_counter += 1
                obj.content = json.dumps(full_content)
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
    return ids
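
# _get_next_url is called above but defined elsewhere. Given the caveat in
# the comment ("only works if StartPosition is the last URL parameter"), a
# plausible sketch is below; the signature is taken from the call site, the
# body is an assumption for illustration only.
def _get_next_url(self, harvest_url, records_returned, next_record, limit):
    # Stop paging when the server reports no further records, when the page
    # was empty, or when the next record would exceed the requested limit.
    if next_record == '0' or int(records_returned) == 0 \
            or int(next_record) > limit:
        return None
    # Rewrite only the trailing startPosition=... parameter.
    base, _, _ = harvest_url.rpartition('=')
    return '{}={}'.format(base, next_record)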
def _gather_stage(self, harvest_job):
    from_until = self._get_time_limits(harvest_job)
    client, identifier = self._get_client_identifier(
        harvest_job.source.url, harvest_job)
    if not identifier:
        raise RuntimeError('Could not get source identifier.')

    # Get things to retry.
    ident2rec, ident2set = {}, {}
    rec_idents = []
    domain = identifier.repositoryName()
    try:
        args = {self.metadata_prefix_key: self.metadata_prefix_value}
        if not self.config.get('force_all', False):
            args.update(from_until)
        for ident in client.listIdentifiers(**args):
            if ident.identifier() in ident2rec:
                continue  # On our retry list already, do not fetch twice.
            rec_idents.append(ident.identifier())
    except NoRecordsMatchError:
        log.debug('No records matched: %s' % domain)
        pass  # Ok. Just nothing to get.
    except Exception:
        # Once we know of something specific, handle it separately.
        log.debug(traceback.format_exc())
        self._save_gather_error(
            'Could not fetch identifier list.', harvest_job)
        raise RuntimeError('Could not fetch an identifier list.')

    # Gathering the set list here. Member identifiers in fetch.
    sets = []
    try:
        for set_ in client.listSets():
            identifier, name, _ = set_
            # Is the set due for retry without missing member insertions?
            # A set either failed in retry or misses packages, not both.
            # A set with failed insertions may have new members.
            if name in ident2set:
                continue
            sets.append((identifier, name,))
    except NoSetHierarchyError:
        log.debug('No sets: %s' % domain)
    except urllib2.URLError:
        # Possibly a timeout.
        self._save_gather_error('Could not fetch a set list.', harvest_job)
        # We got something, so perhaps records can be gotten; hence [].
        raise RuntimeError('Could not fetch set list.')

    # Since network errors can't occur any more, it's ok to create the
    # harvest objects to return to the caller: we are not missing anything
    # crucial.
    harvest_objs, set_objs, insertion_retries = [], [], set()
    for ident in rec_idents:
        info = {'fetch_type': 'record', 'record': ident, 'domain': domain}
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    log.info('Gathered %i records from %s.' % (len(harvest_objs), domain,))

    # Add sets to retry first.
    harvest_objs.extend(set_objs)
    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        info = {'fetch_type': 'set', 'set': set_id, 'set_name': set_name,
                'domain': domain}
        if 'from_' in from_until:
            info['from_'] = self._str_from_datetime(from_until['from_'])
        if 'until' in from_until:
            info['until'] = self._str_from_datetime(from_until['until'])
        harvest_obj.content = json.dumps(info)
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    log.info(
        'Gathered %i records/sets from %s.' % (len(harvest_objs), domain,))
    return harvest_objs
def gather_stage(self, harvest_job):
    self.log = logging.getLogger(__file__)
    self.log.debug('VITO Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    self.update_all = self.source_config.get('update_all', False)
    interface = INTERFACE(self.source_config, COLLECTION)

    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id, interface)
    )
    interface.update_index(last_product_index)
    interface.build_url_date()

    path_to_entries = interface.get_entries_path()

    ids = []
    try:
        results = interface.get_results()
        if results:
            entries = self.get_field(results, path_to_entries[:])
        else:
            return ids
    except Timeout as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
        return ids
    except Exception:
        # Any other failure ends this gather round without a report.
        return ids

    if entries is None:
        return ids
    elif not isinstance(entries, list):
        entries = [entries]

    identifier_path = interface.get_identifier_path()

    for entry in entries:
        entry_id = self.clean_snakecase(
            self.get_field(entry, identifier_path[:])[0])
        entry_guid = unicode(uuid.uuid4())
        package = Session.query(Package) \
            .filter(Package.name == entry_id).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_id))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_id))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_id))  # noqa: E501
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key=interface.get_pagination_mechanism(),
                        value=interface.get_index())
            ])
        obj.content = json.dumps(entry)
        obj.package = None if status == 'new' else package
        obj.save()
        interface.increment_index()
        ids.append(obj.id)
    return ids
def _crawl_results(self, harvest_url, limit=100, timeout=5, username=None, password=None, provider=None):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url,
                             auth=HTTPBasicAuth(username, password),
                             verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider, timestamp, status_code, timeout))  # noqa: E128, E501
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(log_message.format(self.provider, timestamp, r.status_code, elapsed))  # noqa: E128, E501
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(log_message.format(self.provider, timestamp, r.status_code, r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')

        # Get the URL for the next loop, or None to break the loop
        harvest_url = self._get_next_url(soup)

        # Get the entries from the results
        entries = self._get_entries_from_results(soup)

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                # We need package_show to ensure that all the conversions
                # are carried out.
                context = {"user": "******", "ignore_auth": True,
                           "model": model, "session": Session}
                pkg_dict = logic.get_action('package_show')(context, {"id": package.name})  # noqa: E501

                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                # E.g., a Sentinel dataset exists,
                # but doesn't have a NOA resource yet.
                elif self.flagged_extra and not get_pkg_dict_extra(pkg_dict, self.flagged_extra):  # noqa: E501
                    log.debug('{} already exists and will be extended.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value=status),
                                            HOExtra(key='restart_date', value=entry_restart_date)])
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[HOExtra(key='status', value='new'),
                                            HOExtra(key='restart_date', value=entry_restart_date)])
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, self.job.id, new_counter, update_counter))  # noqa: E128, E501
    return ids
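
# The "one request per second" pattern above (start_request / end_request /
# time.sleep) recurs in several of these crawlers. A small reusable sketch of
# the same idea, for illustration only; the decorator name is hypothetical
# and Python 2 compatible (hence the one-element list instead of nonlocal):
import time

def rate_limited(min_interval=1.0):
    """Decorator enforcing a minimum interval between calls."""
    def decorator(func):
        last_call = [0.0]
        def wrapper(*args, **kwargs):
            elapsed = time.time() - last_call[0]
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            result = func(*args, **kwargs)
            last_call[0] = time.time()
            return result
        return wrapper
    return decorator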
def gather_stage(self, harvest_job):
    self.log = logging.getLogger(__file__)
    self.log.debug('SatcenBetter Harvester gather_stage for job: %r',
                   harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    self.update_all = self.source_config.get('update_all', False)
    interface = INTERFACE(self.source_config, COLLECTION)

    last_product_index = (self._get_last_harvesting_index(
        harvest_job.source_id, interface))
    interface.update_index(last_product_index)
    interface.build_url()
    log.debug('URL: {}'.format(interface.current_url))  # noqa: E501

    ids = []
    try:
        results = interface.get_results()
    except Timeout as e:
        self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
        return ids
    if not isinstance(results, list):
        self._save_gather_error('{} error: {}'.format(
            results['status_code'], results['message']), self.job)  # noqa: E501
        return ids

    for entry in results:
        name_path = interface.get_name_path()
        name_url = get_field(entry,
                             name_path['relative_location'].split(","),
                             name_path['fixed_attributes'])
        entry_name = parse_name(name_url).lower()
        entry_guid = unicode(uuid.uuid4())

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key=interface.get_pagination_mechanism(),
                        value=interface.get_index())
            ])
        obj.content = json.dumps(entry)
        obj.package = None if status == 'new' else package
        obj.save()
        interface.increment_index()
        ids.append(obj.id)
    return ids
                job=harvest_job, content=json.dumps(pkg_dict))
            obj.save()
            object_ids.append(obj.id)

            for deleted_id in deleted_ids:
                # Original harvest object needs to be updated
                log.debug('Creating deletion HarvestObject for %s', deleted_id)
                obj = model.Session.query(HarvestObject)\
                    .filter(
                        HarvestObject.current == True  # noqa
                    )\
                    .filter(HarvestObject.guid == deleted_id).one()
                obj.job = harvest_job
                obj.content = '{"id":"%s", "delete":true}' % deleted_id
                obj.save()
                object_ids.append(obj.id)

            return object_ids
        except Exception, e:
            self._save_gather_error('%r' % e.message, harvest_job)

    def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None):
        '''Does a dataset search on a remote CKAN and returns the results.

        Deals with paging to return all the results, not just the first page.
        '''
        base_search_url = remote_ckan_base_url + self._get_search_api_offset()
        params = {'rows': '100', 'start': '0'}
        # There is the worry that datasets will be changed whilst we are paging
def _parse_products(self, products, mosquito_type):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        # Add mosquito type on object
        entry['mosquito_type'] = mosquito_type

        # Correct dates whose century was serialised as '00'
        if entry['dt_placement'].startswith('00'):
            entry['dt_corrected'] = '20' + entry['dt_placement'][2:]
            filename = "{}_{}_{}".format(mosquito_type, entry['station_id'],
                                         entry['dt_corrected'])
        else:
            filename = "{}_{}_{}".format(mosquito_type, entry['station_id'],
                                         entry['dt_placement'])

        # Sanitize filename
        filename = self._sanitize_filename(filename)

        # Add coast_mean on aedes for uniqueness
        if mosquito_type == 'aedes':
            filename = filename + '_' + str(int(entry['coast_mean_dist_1000']))

        entry_guid = filename
        entry_name = filename
        entry['filename'] = filename
        entry_restart_date = entry['dt_placement']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package
            obj.save()
            ids.append(obj.id)
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value='new'),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            new_counter += 1
            obj.content = json.dumps(entry)
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))  # noqa: E128, E501
    return ids
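
# Quick illustration of the '00' century fix above: a dt_placement whose
# year was serialised with a zeroed century becomes a 20xx date before it is
# used in the filename. The value here is invented for the example.
dt_placement = '0021-06-15'
assert '20' + dt_placement[2:] == '2021-06-15'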
def _parse_products(self, products):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0

    # Create a harvest object for each entry
    for entry in products:
        entry_name = (entry['imgtif'].split('/')[1].lower()
                      + "_" + entry['type'] + "_" + str(entry['intid']))
        entry_guid = entry_name
        entry_restart_date = entry['master']

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = model.Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'

            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value=status),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            obj.content = json.dumps(entry)
            obj.package = package
            obj.save()
            ids.append(obj.id)
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            obj = HarvestObject(guid=entry_guid, job=self.job,
                                extras=[
                                    HOExtra(key='status', value='new'),
                                    HOExtra(key='restart_date',
                                            value=entry_restart_date)
                                ])
            new_counter += 1
            obj.content = json.dumps(entry)
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    self.log = logging.getLogger(__file__)
    self.log.debug('SCENT Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    max_dataset = self.source_config.get('max_dataset', 100)
    wfs_url = self.source_config.get('wfs_url')
    wfs_version = self.source_config.get('wfs_version')
    collection = self.source_config.get('collection')
    typename = COLLECTION[collection].get('collection_typename')
    tag_typename = COLLECTION[collection].get('tag_typename', None)
    self.update_all = self.source_config.get('update_all', False)

    last_product_index = (
        self._get_last_harvesting_index(harvest_job.source_id)
    )
    if last_product_index:
        last_product_index = last_product_index + 1
    else:
        last_product_index = 0

    wfs = WFS(url=wfs_url, version=wfs_version)
    wfs.set_collection(typename)
    sortby = ['When']

    result = wfs.make_request(max_dataset, sortby, last_product_index)
    entries = result['features']
    name = '{}_{}'.format(collection.lower(), '{}')

    ids = []
    for entry in entries:
        entry_guid = unicode(uuid.uuid4())
        entry_name = name.format(convert_to_clean_snakecase(entry['id']))
        log.debug('gathering %s', entry_name)

        content = {}
        content['collection_content'] = entry
        if tag_typename:
            wfs.set_collection(tag_typename)
            filterxml = wfs.set_filter_equal_to('image_id', entry['id'])
            # Store the query URL for the entry's tags; the fetch stage
            # resolves it later.
            content['tag_url'] = wfs.get_request(constraint=filterxml)

        package = Session.query(Package) \
            .filter(Package.name == entry_name).first()

        if package:
            # Meaning we've previously harvested this,
            # but we may want to reharvest it now.
            previous_obj = Session.query(HarvestObject) \
                .filter(HarvestObject.guid == entry_guid) \
                .filter(HarvestObject.current == True) \
                .first()  # noqa: E712
            if previous_obj:
                previous_obj.current = False
                previous_obj.save()

            if self.update_all:
                log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                status = 'change'
            else:
                log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                status = 'unchanged'
        else:
            # It's a product we haven't harvested before.
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
            status = 'new'

        obj = HarvestObject(
            guid=entry_guid,
            job=self.job,
            extras=[
                HOExtra(key='status', value=status),
                HOExtra(key='index', value=last_product_index)
            ])
        obj.content = json.dumps(content)
        obj.package = None if status == 'new' else package
        obj.save()
        last_product_index += 1
        ids.append(obj.id)
    return ids
def _gather(self, job, config):
    ftp_user = config['username']
    ftp_passwd = config['password']
    source_type = config['harvester_type']
    ftp_source = create_ftp_source(source_type)

    if not hasattr(self, 'harvester_logger'):
        self.harvester_logger = self.make_harvester_logger()
    self.provider = 'deimos_imaging'

    existing_files = ftp_source._get_ftp_urls(ftp_user, ftp_passwd)

    metadata_dict = {}
    ids = []
    new_counter = 0
    for ftp_url in existing_files:
        filename = self.parse_filename(ftp_url)
        product_type = self.parse_filedirectory(ftp_url)
        identifier = filename

        content = {'identifier': identifier,
                   'product_type': product_type,
                   'ftp_link': ftp_url}  # noqa: E501

        # Metadata is shared per raw (L0R) product, so cache lookups.
        raw_id = identifier.replace(product_type, 'L0R')
        if raw_id in metadata_dict:
            metadata = metadata_dict[raw_id]
        else:
            metadata = self._get_metadata(raw_id)
            metadata_dict[raw_id] = metadata
        for key in metadata:
            content[key] = metadata[key]
        content = json.dumps(content, default=str)

        package = Session.query(Package) \
            .filter(Package.name == identifier.lower()).first()

        if package:
            log.debug('{} will not be updated.'.format(identifier))  # noqa: E501
            status = 'unchanged'
            obj = HarvestObject(guid=ftp_url, job=job,
                                extras=[HOExtra(key='status', value=status)])
            obj.content = content
            obj.package = package
            obj.save()
            ids.append(obj.id)
        else:
            log.debug('{} has not been harvested before. Creating a new harvest object.'.format(identifier))  # noqa: E501
            status = 'new'
            new_counter += 1
            extras = [HOExtra(key='status', value=status)]
            obj = HarvestObject(job=job, guid=ftp_url, extras=extras)
            obj.content = content
            obj.package = None
            obj.save()
            ids.append(obj.id)

    harvester_msg = '{:<12} | {} | Job ID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(harvester_msg.format(self.provider, timestamp, job.id, new_counter, '0'))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    requests_cache.install_cache()
    requests_cache.clear()
    session = requests_cache.CachedSession()

    self.log = logging.getLogger(__file__)
    self.log.debug('OSCAR Harvester gather_stage for job: %r', harvest_job)

    self.job = harvest_job
    self.source_config = self._get_config(harvest_job)
    base_url = self.source_config.get('oai_pmh_url')
    metadata_prefix = self.source_config.get('metadata_prefix')
    start_date = self.source_config.get('start_date', None)
    self.update_all = self.source_config.get('update_all', False)

    last_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'last_token')
    next_token = self._get_last_harvesting_index(self.job.source_id,
                                                 'next_token')
    next_station = self._get_last_harvesting_index(self.job.source_id,
                                                   'next_station')
    restart_date = self._get_last_harvesting_index(self.job.source_id,
                                                   'restart_date')
    restart_date = restart_date if last_token else None

    ids = []
    first_query = True
    while (ids == [] and next_token) or first_query:
        first_query = False
        current_token = last_token if next_station else next_token

        if current_token:
            query_url = "{}?verb=ListIdentifiers&resumptionToken={}".format(
                base_url, current_token)
        elif restart_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, restart_date)
        elif start_date:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}&from={}".format(
                base_url, metadata_prefix, start_date)
        else:
            query_url = "{}?verb=ListIdentifiers&metadataPrefix={}".format(
                base_url, metadata_prefix)

        self.log.debug('Querying: {}.'.format(query_url))
        raw_list_ids = self.get_list_identifiers(session, query_url)
        list_stations, largest_datastamp = self.get_station_ids(raw_list_ids)
        next_token = self.get_resumption_token(raw_list_ids)
        last_token = current_token

        restart_date = restart_date if restart_date else ''
        restart_date = largest_datastamp \
            if largest_datastamp > restart_date else restart_date

        if list_stations == []:
            next_station = None
        else:
            valid_deployment = None
            station_index = 0
            while not valid_deployment and \
                    station_index <= len(list_stations) - 1:
                station = list_stations[station_index]
                next_station = None if (next_station == station) \
                    else next_station
                if not next_station:
                    station_query = '{}?verb=GetRecord&metadataPrefix={}&identifier={}'.format(
                        base_url, metadata_prefix, station)
                    self.log.debug('Querying station: {}.'.format(station))
                    record = self.get_record(session, station_query)
                    if record:
                        station_info = StationInfo(record)
                        if station_info.isValid():
                            station_info.id = station
                            observation_list = station_info.get_observations()
                            station_dict = station_info.get_dict()
                            station_info = None
                            for observation in observation_list:
                                observation_info = ObservationInfo(
                                    session, observation)
                                deployments_list = \
                                    observation_info.get_deployments()
                                observation_dict = observation_info.get_dict()
                                observation_info = None
                                for deployment in deployments_list:
                                    deployment_info = DeploymentInfo(
                                        session, deployment)
                                    if not deployment_info.isValid():
                                        continue
                                    deployment_dict = \
                                        deployment_info.get_dict()
                                    deployment_info = None
                                    valid_deployment = True
                                    if station_index + 1 <= \
                                            len(list_stations) - 1:
                                        next_station = list_stations[
                                            station_index + 1]
                                    else:
                                        next_station = None

                                    entry_guid = unicode(uuid.uuid4())
                                    entry_id = '{}_{}'.format(
                                        station_dict['id'],
                                        deployment_dict['id'])
                                    entry_name = clean_snakecase(entry_id)
                                    self.log.debug('Gathering %s', entry_name)

                                    content = {}
                                    content['station'] = station_dict
                                    content['observation'] = observation_dict
                                    content['deployment'] = deployment_dict

                                    package = Session.query(Package) \
                                        .filter(Package.name == entry_name) \
                                        .first()

                                    if package:
                                        # Meaning we've previously harvested
                                        # this, but we may want to reharvest
                                        # it now.
                                        previous_obj = Session.query(HarvestObject) \
                                            .filter(HarvestObject.guid == entry_guid) \
                                            .filter(HarvestObject.current == True) \
                                            .first()  # noqa: E712
                                        if previous_obj:
                                            previous_obj.current = False
                                            previous_obj.save()
                                        if self.update_all:
                                            self.log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                                            status = 'change'
                                        else:
                                            self.log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                                            status = 'unchanged'
                                    else:
                                        # It's a product we haven't
                                        # harvested before.
                                        self.log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                                        status = 'new'

                                    obj = HarvestObject(
                                        guid=entry_guid,
                                        job=self.job,
                                        extras=[
                                            HOExtra(key='status',
                                                    value=status),
                                            HOExtra(key='last_token',
                                                    value=last_token),
                                            HOExtra(key='next_token',
                                                    value=next_token),
                                            HOExtra(key='next_station',
                                                    value=next_station),
                                            HOExtra(key='restart_date',
                                                    value=restart_date)
                                        ])
                                    obj.content = json.dumps(content)
                                    obj.package = None if status == 'new' \
                                        else package
                                    obj.save()
                                    ids.append(obj.id)
                            if not valid_deployment:
                                self.log.debug(
                                    'Station {} does not have valid deployments.'.format(station))  # noqa: E501
                        else:
                            self.log.debug(
                                'Station {} is not valid.'.format(station))
                station_index += 1
    return ids
def _gather_entry(self, entry, path, row, update_all=False):
    # Create a harvest object for each entry
    entry_guid = unicode(uuid.uuid4())
    entry_name = entry.lower()
    log.debug('gathering %s', entry)

    package = Session.query(Package) \
        .filter(Package.name == entry_name).first()

    if package:
        # Meaning we've previously harvested this,
        # but we may want to reharvest it now.
        previous_obj = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == entry_guid) \
            .filter(HarvestObject.current == True) \
            .first()  # noqa: E712
        if previous_obj:
            previous_obj.current = False
            previous_obj.save()

        if update_all:
            log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
            status = 'change'
        else:
            log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
            status = 'unchanged'
    else:
        # It's a product we haven't harvested before.
        log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
        status = 'new'
        package = None

    obj = HarvestObject(
        guid=entry_guid,
        job=self.job,
        extras=[
            HOExtra(key='status', value=status),
            HOExtra(key='path', value=path),
            HOExtra(key='row', value=row)
        ])
    obj.content = entry
    obj.package = package
    obj.save()
    return obj.id
def _crawl_results(self, harvest_url, timeout=5, limit=100, provider=None):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0
    first_query = True

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url, verify=False, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, r.status_code,
                                   r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')
        json_content = json.loads(soup.text)

        # Get the URL for the next loop, or None to break the loop
        log.debug(harvest_url)
        harvest_url = self._get_next_url(harvest_url, json_content)

        # Get the entries from the results; after the first page, skip the
        # first entry (it duplicates the last entry of the previous page)
        entry_list = self._get_entries_from_results(json_content)
        if first_query:
            entries = entry_list
        else:
            entries = entry_list[1:]
        first_query = False

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']
            entry_restart_date = entry['restart_date']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                else:
                    log.debug('{} will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[
                                        HOExtra(key='status', value=status),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                obj.content = json.dumps(entry['content'])
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(guid=entry_guid, job=self.job,
                                    extras=[
                                        HOExtra(key='status', value='new'),
                                        HOExtra(key='restart_date',
                                                value=entry_restart_date)
                                    ])
                new_counter += 1
                obj.content = json.dumps(entry['content'])
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, 0))  # noqa: E128, E501
    return ids
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.individual.gather')
    log.debug('DocHarvester gather_stage for job: %r', harvest_job)

    self.harvest_job = harvest_job

    # Get source URL
    url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # Get contents
    try:
        content = self._get_content_as_unicode(url)
    except Exception as e:
        self._save_gather_error(
            'Unable to get content for URL: %s: %r' % (url, e), harvest_job)
        return None

    existing_object = model.Session.query(HarvestObject.guid,
                                          HarvestObject.package_id) \
        .filter(HarvestObject.current == True) \
        .filter(HarvestObject.harvest_source_id == harvest_job.source.id) \
        .first()  # noqa: E712

    def create_extras(url, status):
        return [HOExtra(key='doc_location', value=url),
                HOExtra(key='status', value=status)]

    if not existing_object:
        guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
        harvest_object = HarvestObject(job=harvest_job,
                                       extras=create_extras(url, 'new'),
                                       guid=guid)
    else:
        harvest_object = HarvestObject(
            job=harvest_job,
            extras=create_extras(url, 'change'),
            guid=existing_object.guid,
            package_id=existing_object.package_id)

    harvest_object.add()

    # Check if it is an ISO document
    document_format = guess_standard(content)
    if document_format == 'iso':
        harvest_object.content = content
    else:
        extra = HOExtra(object=harvest_object,
                        key='original_document',
                        value=content)
        extra.save()
        extra = HOExtra(object=harvest_object,
                        key='original_format',
                        value=document_format)
        extra.save()

    harvest_object.save()
    return [harvest_object.id]
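
# guess_standard comes from ckanext-spatial's harvester base module. As a
# rough sketch of the idea only (not the library's actual implementation):
# look for telltale root elements to classify the metadata standard.
def guess_standard_sketch(content):
    lowered = content.lower()
    if 'md_metadata' in lowered or 'mi_metadata' in lowered:
        return 'iso'
    if '</metadata>' in lowered:
        return 'fgdc'
    return 'unknown'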
        return True

    # Get source URL
    url = harvest_object.source.url.rstrip('/')
    url = url + self._get_rest_api_offset() + '/package/' \
        + harvest_object.guid

    # Get contents
    try:
        content = self._get_content(url)
    except ContentFetchError, e:
        self._save_object_error(
            'Unable to get content for package: %s: %r' % (url, e),
            harvest_object)
        return None

    # Save the fetched contents in the HarvestObject
    harvest_object.content = content
    harvest_object.save()
    return True

def import_stage(self, harvest_object):
    log.debug('In CKANHarvester import_stage')

    if not harvest_object:
        log.error('No harvest object received')
        return False

    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False

    self._set_config(harvest_object.job.source.config)
def _crawl_results(self, harvest_url, limit=100, timeout=5):  # noqa: E501
    """Iterate through the results, create harvest objects,
    and return the ids."""
    ids = []
    new_counter = 0
    update_counter = 0

    while len(ids) < limit and harvest_url:
        # We'll limit ourselves to one request per second
        start_request = time.time()

        # Make a request to the website
        timestamp = str(datetime.utcnow())
        log_message = '{:<12} | {} | {} | {}s'
        try:
            r = requests.get(harvest_url, timeout=timeout)
        except Timeout as e:
            self._save_gather_error('Request timed out: {}'.format(e), self.job)  # noqa: E501
            status_code = 408
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       status_code, timeout))  # noqa: E128
            return ids
        if r.status_code != 200:
            self._save_gather_error('{} error: {}'.format(r.status_code, r.text), self.job)  # noqa: E501
            elapsed = 9999
            if hasattr(self, 'provider_logger'):
                self.provider_logger.info(
                    log_message.format(self.provider, timestamp,
                                       r.status_code, elapsed))  # noqa: E128
            return ids

        if hasattr(self, 'provider_logger'):
            self.provider_logger.info(
                log_message.format(self.provider, timestamp, r.status_code,
                                   r.elapsed.total_seconds()))  # noqa: E128, E501

        soup = Soup(r.content, 'lxml')

        search_results = soup.find('csw:searchresults', elementset="summary")
        records_returned = search_results['numberofrecordsreturned']
        next_record = search_results['nextrecord']
        number_records_matched = search_results['numberofrecordsmatched']
        # int() rather than eval(): these attributes come from a remote
        # response and must not be evaluated as Python code.
        if next_record != '0':
            current_record = str(int(next_record) - int(records_returned))  # noqa: E501
        else:
            current_record = str(int(number_records_matched) - int(records_returned))  # noqa: E501

        # Get the URL for the next loop, or None to break the loop
        # Only works if StartPosition is last URL parameter
        harvest_url = self._get_next_url(harvest_url, records_returned,
                                         next_record, limit)  # noqa: E501

        # Get the entries from the results
        entries = self._get_entries_from_results(soup, current_record,
                                                 next_record)  # noqa: E501

        # Create a harvest object for each entry
        for entry in entries:
            entry_guid = entry['guid']
            entry_name = entry['identifier']

            package = Session.query(Package) \
                .filter(Package.name == entry_name).first()

            if package:
                # Meaning we've previously harvested this,
                # but we may want to reharvest it now.
                previous_obj = model.Session.query(HarvestObject) \
                    .filter(HarvestObject.guid == entry_guid) \
                    .filter(HarvestObject.current == True) \
                    .first()  # noqa: E712
                if previous_obj:
                    previous_obj.current = False
                    previous_obj.save()

                if self.update_all:
                    log.debug('{} already exists and will be updated.'.format(entry_name))  # noqa: E501
                    status = 'change'
                    update_counter += 1
                else:
                    log.debug('{} already exists and will not be updated.'.format(entry_name))  # noqa: E501
                    status = 'unchanged'

                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value=status),
                        HOExtra(key='restart_record',
                                value=entry['restart_record'])
                    ])  # noqa: E501
                obj.content = entry['content']
                obj.package = package
                obj.save()
                ids.append(obj.id)
            else:
                # It's a product we haven't harvested before.
                log.debug('{} has not been harvested before. Creating a new harvest object.'.format(entry_name))  # noqa: E501
                obj = HarvestObject(
                    guid=entry_guid,
                    job=self.job,
                    extras=[
                        HOExtra(key='status', value='new'),
                        HOExtra(key='restart_record',
                                value=entry['restart_record'])
                    ])  # noqa: E501
                new_counter += 1
                obj.content = entry['content']
                obj.package = None
                obj.save()
                ids.append(obj.id)

        end_request = time.time()
        request_time = end_request - start_request
        if request_time < 1.0:
            time.sleep(1 - request_time)

    harvester_msg = '{:<12} | {} | jobID:{} | {} | {}'
    if hasattr(self, 'harvester_logger'):
        timestamp = str(datetime.utcnow())
        self.harvester_logger.info(
            harvester_msg.format(self.provider, timestamp, self.job.id,
                                 new_counter, update_counter))  # noqa: E128, E501
    return ids