def gather_stage(self, harvest_job):
    log.debug('In ZhstatHarvester gather_stage')
    ids = []
    parser = etree.XMLParser(encoding='utf-8')
    for dataset in etree.fromstring(self._fetch_metadata(), parser=parser):
        # Get the german data if one is available,
        # otherwise get the first one
        base_datas = dataset.xpath("data[@xml:lang='de']")
        if len(base_datas) != 0:
            base_data = base_datas[0]
        else:
            base_data = dataset.find('data')

        metadata = self._generate_metadata(base_data, dataset)
        if metadata:
            obj = HarvestObject(
                guid=dataset.get('id'),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + dataset.get('id') + ' to the queue')
            ids.append(obj.id)
        else:
            log.debug(
                'Skipping %s since no resources or groups are available'
                % dataset.get('id'))

    return ids
def gather_stage(self, harvest_job):
    log.debug('In ArrayExpressHarvester.gather_stage(%s)' % harvest_job.source.url)
    # Get feed contents
    self._set_config(harvest_job.source.config)

    #previous_job = Session.query(HarvestJob) \
    #                      .filter(HarvestJob.source==harvest_job.source) \
    #                      .filter(HarvestJob.gather_finished!=None) \
    #                      .filter(HarvestJob.id!=harvest_job.id) \
    #                      .order_by(HarvestJob.gather_finished.desc()) \
    #                      .limit(1).first()

    baseURL = harvest_job.source.url + "/xml/v2/experiments"

    #if (previous_job and not previous_job.gather_errors
    #        and not len(previous_job.objects) == 0):
    #    if not self.config.get('force_all', False):
    #        last_time = harvest_job.gather_started.isoformat()
    #        today = format(datetime.date.today())
    #        self.params['date'] = '[' + last_time + ' ' + today + ']'

    url = baseURL + "?" + self.getParams()
    print "Fetching from " + url
    doc = etree.parse(url)

    ids = []
    for accessionElement in doc.findall('//experiment/accession'):
        accession = accessionElement.text.strip()
        obj = HarvestObject(guid=accession, job=harvest_job, content=accession)
        print "ArrayExpress accession: " + accession
        obj.save()
        ids.append(obj.id)
    print ids
    return ids
def gather_stage(self, harvest_job):
    log.debug('In SocrataHarvester 2 gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True

    dcatUrl = "%s/api/dcat.rdf" % harvest_job.source.url.rstrip('/')
    log.debug(dcatUrl)

    adaptorInstance = socrataAdaptor()
    package_ids = adaptorInstance.listDatasetIds(dcatUrl)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: %s' % dcatUrl,
                                    harvest_job)
            return None
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job):
    log.debug('In NTPCHarvester gather_stage (%s)' % harvest_job.source.url)
    url = self.PREFIX_URL + self.CATALOGUE_INDEX_URL
    get_all_packages = True

    try:
        package_ids = []
        dataset_count = self._get_ntpc_dataset_count(url)
        msg_count = 0

        for x in range(dataset_count / 10 + 1):
            page_url = url + '?currentPage=%s' % (x + 1)
            data = urllib2.urlopen(page_url)
            doc = html.parse(data)
            for div in doc.findall("//a[@href]"):
                if '/NTPC/od/query;' in div.attrib['href']:
                    link = div.attrib['href']
                    id = sha1(link).hexdigest()
                    obj = HarvestObject(guid=id, job=harvest_job, content=link)
                    obj.save()
                    package_ids.append(obj.id)
                    msg_count = msg_count + 1

        if msg_count == 0:
            self._save_gather_error('No packages received for URL: %s' % url,
                                    harvest_job)
            return None
        return package_ids
    except Exception, e:
        self._save_gather_error('%r' % e.message, harvest_job)
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In RostockTestHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/api/rest/package' content = self._get_content(package_list_url) package_ids = json.loads(content) try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid = package_id, job = harvest_job) obj.save() object_ids.append(obj.id) log.info('Got ID from source: %s' %package_id) return object_ids else: self._save_gather_error('No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r'%e.message,harvest_job)
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In ' + self.city + 'CKANHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/3/action/package_list' content = self._get_content(package_list_url) content_json = json.loads(content) package_ids = content_json['result'] try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error('No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
def _gather_ids(self, url=None, jar=None):
    log.debug('Page %s' % self.page)
    if jar is None:
        jar = CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    url = url or self.INITIAL_INDEX
    fh = opener.open(url)
    doc = html.parse(fh)
    fh.close()

    new_ids = []
    for a in doc.findall(".//div[@class='main']//a"):
        href = a.get('href').split('?', 1)[0]
        id = href.split('/').pop()
        if id not in self.gathered_ids:
            log.debug('Got Id: %s' % id)
            obj = HarvestObject(guid=sha1(id).hexdigest(), job=self.job, content=id)
            obj.save()
            self.object_ids.append(obj.id)
            new_ids.append(id)

    if len(new_ids) == 0:  # or self.page == 2:
        return self.gathered_ids
    else:
        self.gathered_ids.extend(new_ids)
        inputs = []
        for input in doc.findall(".//form[@id='main_form']//input"):
            inputs.append((input.get('name'), input.get('value')))
        inputs.append(('listbox_nextPage:method', ''))
        next_url = self.INDEX_URL + '?' + urllib.urlencode(inputs)
        self.page = self.page + 1
        return self._gather_ids(url=next_url, jar=jar)
def gather_stage(self, harvest_job):
    log.debug('In FSOHarvester gather_stage')
    http = urllib3.PoolManager()
    metadata_file = http.request('GET', self.METADATA_FILE_URL)

    ids = []
    parser = etree.XMLParser(encoding='utf-8')
    for package in etree.fromstring(metadata_file.data, parser=parser):
        # Get the german dataset if one is available, otherwise get the first one
        base_datasets = package.xpath("dataset[@xml:lang='de']")
        if len(base_datasets) != 0:
            base_dataset = base_datasets[0]
        else:
            base_dataset = package.find('dataset')

        metadata = self._generate_metadata(base_dataset, package)
        if metadata:
            obj = HarvestObject(
                guid=self._create_uuid(base_dataset.get('datasetID')),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + base_dataset.get('datasetID') + ' to the queue')
            ids.append(obj.id)
        else:
            log.debug('Skipping ' + base_dataset.get('datasetID') +
                      ' since no resources or groups are available')

    return ids
def gather_stage(self, harvest_job): """Retrieve datasets""" log.debug('In KoelnCKANHarvester gather_stage (%s)' % harvest_job.source.url) package_ids = [] self._set_config(None) base_url = harvest_job.source.url.rstrip('/') package_list_url = base_url + '/3/action/package_list' content = self._get_content(package_list_url) content_json = json.loads(content) package_ids = content_json['result'] try: object_ids = [] if len(package_ids): for package_id in package_ids: obj = HarvestObject(guid=package_id, job=harvest_job) obj.save() object_ids.append(obj.id) return object_ids else: self._save_gather_error( 'No packages received for URL: %s' % url, harvest_job) return None except Exception, e: self._save_gather_error('%r' % e.message, harvest_job)
def _gather_object(self, job, url, size, start_date, forecast_date):
    filename = parse_filename(url)
    filename_id = (filename.replace('-v02.0-fv02.0', '')
                           .replace('-fv02.0', '')
                           .replace('-sv01.00', '')
                           .replace('-sv05.00', '')
                           .replace('-v02', '')
                           .replace('-sv10.00', '')
                           .replace('-sv09.00', '')
                           .replace('-sv07.00', ''))

    status, package = self._was_harvested(filename_id, self.update_all)
    extras = [HOExtra(key='status', value=status)]

    assert start_date
    content = json.dumps(
        {
            'identifier': filename_id,
            'ftp_link': url,
            'size': size,
            'start_date': start_date,
            'forecast_date': forecast_date,
            'restart_date': start_date
        },
        default=str)

    obj = HarvestObject(job=job, guid=url, extras=extras, content=content)
    obj.package = package
    obj.save()
    return obj.id
def _run_import(self, xml, job):
    if not model.User.get('harvest'):
        model.User(name='harvest', sysadmin=True).save()
    if not model.Group.get('test'):
        get_action('organization_create')({'user': '******'}, {'name': 'test'})

    record = _get_record(xml)
    metadata = CmdiReader()(record)
    metadata['unified']['owner_org'] = "test"

    harvest_object = HarvestObject()
    harvest_object.content = json.dumps(metadata.getMap())
    harvest_object.id = xml
    harvest_object.guid = xml
    harvest_object.source = job.source
    harvest_object.harvest_source_id = None
    harvest_object.job = job
    harvest_object.save()

    self.harvester.import_stage(harvest_object)
    return harvest_object
def gather_stage(self, harvest_job):
    log.debug('In SNLHarvester gather_stage')
    metadata_path = self._fetch_metadata_file()
    ids = []
    try:
        parser = MetaDataParser(metadata_path)

        for dataset in parser.list_datasets():
            metadata = parser.parse_set(dataset)
            metadata['translations'].extend(
                self._metadata_term_translations()
            )
            log.debug(metadata)

            obj = HarvestObject(
                guid=metadata['id'],
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + metadata['id'] + ' to the queue')
            ids.append(obj.id)
    finally:
        temp_dir = os.path.dirname(metadata_path)
        log.debug('Deleting directory ' + temp_dir)
        shutil.rmtree(temp_dir)

    return ids
def gather_stage(self, harvest_job):
    log.debug('In SRDAHarvester gather_stage (%s)' % harvest_job.source.url)
    get_all_packages = True
    package_ids = []

    data = urllib2.urlopen(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    doc = html.parse(data)
    for td in doc.findall("//td[@class='left_p12_title']/a"):
        link = td.get('href')
        if re.match(r"/search/fsciitem", link):
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            package_ids.append(obj.id)

    self._set_config(harvest_job.source.config)

    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
                          .filter(HarvestJob.source==harvest_job.source) \
                          .filter(HarvestJob.gather_finished!=None) \
                          .filter(HarvestJob.id!=harvest_job.id) \
                          .order_by(HarvestJob.gather_finished.desc()) \
                          .limit(1).first()

    return package_ids
def setup_class(cls):
    try:
        from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra
    except ImportError:
        raise SkipTest('The harvester extension is needed for these tests')

    cls.content1 = '<xml>Content 1</xml>'
    ho1 = HarvestObject(
        guid='test-ho-1',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content1)

    cls.content2 = '<xml>Content 2</xml>'
    cls.original_content2 = '<xml>Original Content 2</xml>'
    ho2 = HarvestObject(
        guid='test-ho-2',
        job=HarvestJob(source=HarvestSource(url='http://', type='xx')),
        content=cls.content2)

    hoe = HarvestObjectExtra(
        key='original_document',
        value=cls.original_content2,
        object=ho2)

    Session.add(ho1)
    Session.add(ho2)
    Session.add(hoe)
    Session.commit()

    cls.object_id_1 = ho1.id
    cls.object_id_2 = ho2.id
def gather_stage(self, harvest_job): """ The gather stage will recieve a HarvestJob object and will be responsible for: - gathering all the necessary objects to fetch on a later. stage (e.g. for a CSW server, perform a GetRecords request) - creating the necessary HarvestObjects in the database, specifying the guid and a reference to its source and job. - creating and storing any suitable HarvestGatherErrors that may occur. - returning a list with all the ids of the created HarvestObjects. :param harvest_job: HarvestJob object :returns: A list of HarvestObject ids """ log.debug("in gather stage: %s" % harvest_job.source.url) try: harvest_obj_ids = [] registry = self._create_metadata_registry() self._set_config(harvest_job.source.config) client = oaipmh.client.Client(harvest_job.source.url, registry, self.credentials) client.identify() # check if identify works for header in self._identifier_generator(client): harvest_obj = HarvestObject(guid=header.identifier(), job=harvest_job) harvest_obj.save() harvest_obj_ids.append(harvest_obj.id) except: log.exception("Gather stage failed %s" % harvest_job.source.url) self._save_gather_error("Could not gather anything from %s!" % harvest_job.source.url, harvest_job) return None return harvest_obj_ids
def harvest_object_create(context, data_dict):
    """
    Create a new harvest object

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)
    """
    check_access('harvest_object_create', context, data_dict)
    data, errors = _validate(data_dict, harvest_object_create_schema(), context)

    if errors:
        raise logic.ValidationError(errors)

    obj = HarvestObject(
        guid=data.get('guid'),
        content=data.get('content'),
        job=data['job_id'],
        harvest_source_id=data.get('source_id'),
        package_id=data.get('package_id'),
        extras=[
            HarvestObjectExtra(key=k, value=v)
            for k, v in data.get('extras', {}).items()
        ]
    )

    obj.save()
    return harvest_object_dictize(obj, context)
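# Usage sketch (not from the original source): the action above is normally
# invoked through CKAN's action API via toolkit.get_action. The guid, content,
# job id and extras below are hypothetical placeholders; per the docstring,
# 'job_id' is the only required key.
import ckan.plugins.toolkit as toolkit

new_object = toolkit.get_action('harvest_object_create')(
    {'user': 'harvest'},                 # context with an authorised user
    {
        'guid': 'example-guid',
        'content': '<xml>example</xml>',
        'job_id': 'job-id',
        'extras': {'status': 'new'},
    },
)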
def test_zfaulty_xml_unknown_errors(self):
    harv, job = self._create_harvester()
    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    gathered = harv.gather_stage(job)
    urllib2.urlopen = mock.Mock(return_value=open("FSD2355.xml"))
    harvest_obj = HarvestObject.get(gathered[0])
    self.assert_(harv.fetch_stage(harvest_obj))
    self.assert_(harv.import_stage(harvest_obj))
    print Package.text_search(
        Session.query(Package), 'Kansalaiskeskustelu ydinvoimasta 2006').all()
    self.assert_(len(Package.text_search(
        Session.query(Package),
        'Kansalaiskeskustelu ydinvoimasta 2006').all()) >= 1)

    res = "http://www.fsd.uta.fi/fi/aineistot/luettelo/FSD0115/FSD0115.xml"
    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    gathered = harv.gather_stage(job)
    urllib2.urlopen = mock.Mock(return_value=open("FSD2362.xml"))
    harvest_obj = HarvestObject.get(gathered[0])
    self.assert_(harv.fetch_stage(harvest_obj))
    self.assert_(harv.import_stage(harvest_obj))
    self.assert_(len(Package.text_search(
        Session.query(Package),
        'Energia-asennetutkimus 2004').all()) >= 1)
def gather_stage(self, harvest_job):
    try:
        config = json.loads(harvest_job.source.config)
        ckan_term_url = config['ckan_term_url']
    except Exception as e:
        log.exception(e)
        raise ConfigError(
            "In order to run the translation harvester "
            "you need to specify 'ckan_term_url' "
            "in your harvester config json"
        )

    log.debug('Gathering term from %s' % ckan_term_url)
    try:
        terms = self._get_terms(ckan_term_url)
        obj = HarvestObject(
            job=harvest_job,
            content=json.dumps(terms)
        )
        obj.save()
        return [obj.id]
    except Exception as e:
        log.exception(e)
        raise e
def test_ckan_duplicated_name(self):
    dataset0 = {
        'owner_org': self.org['id'],
        'holder_name': 'test holder',
        'holder_identifier': 'abcdef',
        'notes': 'some notes',
        'modified': '2000-01-01',
        'theme': 'AGRI',
        'frequency': 'UNKNOWN',
        'publisher_name': 'publisher',
        'identifier': 'aasdfa',
        'publisher_identifier': 'publisher',
        'resources': [],
        'extras': [],
    }

    dataset1 = {
        'owner_org': self.org['id'],
        'title': 'duplicated title',
        'name': 'duplicated-title',
        'id': 'dummyid'
    }
    dataset1.update(dataset0)
    data = json.dumps(dataset1)

    harvest_dict = self._create_harvest_obj('http://mock/source/',
                                            name='dupname1',
                                            owner_org=self.org['id'])
    harvest_obj = HarvestObject.get(harvest_dict['id'])
    harvest_obj.content = data
    h = DCATRDFHarvester()
    import_successful = h.import_stage(harvest_obj)
    self.assertTrue(import_successful, harvest_obj.errors)
    Session.flush()
    dataset1['_id'] = harvest_obj.package_id

    dataset2 = {'title': 'duplicated title',
                'name': 'duplicated-title',
                'id': 'dummyid2'}
    dataset2.update(dataset0)
    dataset2['identifier'] = 'otherid'
    data = json.dumps(dataset2)

    harvest_dict = self._create_harvest_obj('http://mock/source/',
                                            name='dupname2',
                                            owner_org=self.org['id'])
    harvest_obj = HarvestObject.get(harvest_dict['id'])
    harvest_obj.content = data
    h = DCATRDFHarvester()
    import_successful = h.import_stage(harvest_obj)
    self.assertTrue(import_successful, harvest_obj.errors)
    Session.flush()
    dataset2['_id'] = harvest_obj.package_id

    # duplicated names are mangled, one should have numeric suffix
    pkg_dict = helpers.call_action('package_show', context={},
                                   name_or_id=dataset1['_id'])
    self.assertEqual(pkg_dict['title'], dataset1['title'])
    self.assertEqual(pkg_dict['name'], 'duplicated-title')

    pkg_dict = helpers.call_action('package_show', context={},
                                   name_or_id=dataset2['_id'])
    self.assertEqual(pkg_dict['title'], dataset2['title'])
    self.assertEqual(pkg_dict['name'], 'duplicated-title1')
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.VariantStore.gather')
    log.debug('VariantStoreHarvester gather_stage for job: %r', harvest_job)
    self._set_config(harvest_job.source.config, log=log)

    obj = HarvestObject(guid=self.guid, job=harvest_job)
    obj.save()
    return [obj.id]
def doi_update(context, data_dict):
    model = context['model']
    new_package = data_dict
    source_hash = hashlib.sha1(json.dumps(data_dict, sort_keys=True)).hexdigest()

    old_package = p.toolkit.get_action('package_show')(
        {'model': model, 'ignore_auth': True},
        {"id": new_package['id']})

    for extra in old_package['extras']:
        if extra['key'] == 'source_hash':
            old_source_hash = extra['value']
            break
    else:
        old_source_hash = None

    if source_hash == old_source_hash and old_package.get('state') == 'active':
        print str(datetime.datetime.now()) + ' No change for doi id ' + new_package['id']
        return

    new_package["extras"].append({"key": "source_hash", "value": source_hash})
    new_package["extras"].append({"key": "metadata-source", "value": "doi"})
    new_package["extras"].append({"key": "source_doi_import_identifier", "value": True})
    new_package.pop("name", None)

    owner_org = model.Group.get(
        ORG_MAPPING.get(new_package['organization']['name']))
    if not owner_org:
        print str(datetime.datetime.now()) + ' Fail to update doi id ' + \
            new_package['id'] + '. Organization ' + \
            new_package['organization']['name'] + ' does not exist.'
        return
    new_package['owner_org'] = owner_org.name

    group_name = new_package.pop('owner_name', None)

    resources = []
    for resource in new_package['resources']:
        resource.pop('resource_group_id', None)
        resource.pop('revision_id', None)
        resource.pop('id', None)
        resources.append(resource)
    new_package['resources'] = resources

    obj = HarvestObject(guid=uuid.uuid4().hex,
                        job=context['harvest_job'],
                        content=context['harvestobj'])
    obj.save()
    new_package["extras"].append({"key": "harvest_object_id", "value": obj.id})

    context['return_id_only'] = True
    p.toolkit.get_action('package_update')(context, new_package)
    print str(datetime.datetime.now()) + ' Updated doi id ' + new_package['id']
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)
    self.harvest_job = harvest_job

    # Get source URL
    url = harvest_job.source.url

    # Get contents
    try:
        content = self._get_content(url)
    except Exception as e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (url, e), harvest_job)
        return None

    ids = []
    try:
        for url in self._extract_urls(content, url):
            try:
                content = self._get_content(url)
            except Exception as e:
                msg = 'Couldn\'t harvest WAF link: %s: %s' % (url, e)
                self._save_gather_error(msg, harvest_job)
                continue
            else:
                # We need to extract the guid to pass it to the next stage
                try:
                    gemini_string, gemini_guid = self.get_gemini_string_and_guid(
                        content, url)
                    if gemini_guid:
                        log.debug('Got GUID %s' % gemini_guid)
                        # Create a new HarvestObject for this identifier
                        # Generally the content will be set in the fetch stage,
                        # but as we already have it, we might as well save a request
                        obj = HarvestObject(guid=gemini_guid,
                                            job=harvest_job,
                                            content=gemini_string)
                        obj.save()
                        ids.append(obj.id)
                except Exception as e:
                    msg = 'Could not get GUID for source %s: %r' % (url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
    except Exception as e:
        msg = 'Error extracting URLs from %s' % url
        self._save_gather_error(msg, harvest_job)
        return None

    if len(ids) > 0:
        return ids
    else:
        self._save_gather_error(
            'Couldn\'t find any links to metadata files', harvest_job)
        return None
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error('Could not gather anything from %s!' %
                                harvest_job.source.url, harvest_job)
        return None

    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)

    query = self.config['query'] if 'query' in self.config else ''
    try:
        for set in client.listSets():
            identifier, name, _ = set
            if 'query' in self.config:
                if query in name:
                    sets.append((identifier, name))
            else:
                sets.append((identifier, name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)

    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({
            'set': set_id,
            'set_name': set_name,
            'domain': domain
        })
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)

    model.repo.commit()
    return harvest_objs
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.CSW.gather')
    log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
    # Get source URL
    url = harvest_job.source.url

    try:
        self._setup_csw_client(url)
    except Exception as e:
        self._save_gather_error('Error contacting the CSW server: %s' % e,
                                harvest_job)
        return None

    log.debug('Starting gathering for %s' % url)
    used_identifiers = []
    ids = []
    try:
        for identifier in self.csw.getidentifiers(page=10):
            try:
                log.info('Got identifier %s from the CSW', identifier)
                if identifier in used_identifiers:
                    log.error('CSW identifier %r already used, skipping...'
                              % identifier)
                    continue
                if identifier is None:
                    log.error('CSW returned identifier %r, skipping...'
                              % identifier)
                    ## log an error here? happens with the dutch data
                    continue

                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=identifier, job=harvest_job)
                obj.save()

                ids.append(obj.id)
                used_identifiers.append(identifier)
            except Exception as e:
                self._save_gather_error(
                    'Error for the identifier %s [%r]' % (identifier, e),
                    harvest_job)
                continue
    except Exception as e:
        log.error('Exception: %s' % text_traceback())
        self._save_gather_error(
            'Error gathering the identifiers from the CSW server [%s]'
            % six.text_type(e), harvest_job)
        return None

    if len(ids) == 0:
        self._save_gather_error('No records received from the CSW server',
                                harvest_job)
        return None

    return ids
def gather_stage(self, harvest_job):
    log.debug('In SFAHarvester gather_stage')
    try:
        file_path = self._fetch_metadata_file()
        ids = []

        de_rows = self._get_row_dict_array(0, file_path)
        for row in de_rows:
            # Construct the metadata dict for the dataset on CKAN
            metadata = {
                'datasetID': row[u'id'],
                'title': row[u'title'],
                'url': row[u'url'],
                'notes': row[u'notes'],
                'author': row[u'author'],
                'maintainer': row[u'maintainer'],
                'maintainer_email': row[u'maintainer_email'],
                'license_id': row[u'licence'],
                'license_url': row[u'licence_url'],
                'translations': [],
                'tags': row[u'tags'].split(u', '),
                'groups': [row[u'groups']]
            }

            metadata['resources'] = self._generate_resources_dict_array(
                row[u'id']
            )
            metadata['resources'][0]['version'] = row[u'version']
            log.debug(metadata['resources'])

            # Adding term translations
            metadata['translations'].extend(
                self._generate_term_translations(1, file_path)  # fr
            )
            metadata['translations'].extend(
                self._generate_term_translations(2, file_path)  # it
            )
            metadata['translations'].extend(
                self._generate_term_translations(3, file_path)  # en
            )
            log.debug(metadata['translations'])

            obj = HarvestObject(
                guid=self._create_uuid(row[u'id']),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + row[u'id'] + ' to the queue')
            ids.append(obj.id)

        log.debug(de_rows)
    except Exception:
        return False
    return ids
def gather_stage(self, harvest_job):
    log.debug('In FOPHHarvester gather_stage')
    try:
        file_path = self._fetch_metadata_file()
        ids = []

        de_cols = self._get_col_dict_array(0, file_path)
        for col in de_cols:
            # Construct the metadata dict for the dataset on CKAN
            metadata = {
                'datasetID': col[u'id'],
                'title': col[u'title'],
                'url': col[u'url'],
                'notes': col[u'notes'],
                'author': col[u'author'],
                'author_email': col[u'author_email'],
                'maintainer': col[u'maintainer'],
                'maintainer_email': col[u'maintainer_email'],
                'license_id': col[u'license_id'].lower(),
                'version': col[u'version'],
                'translations': [],
                'tags': []
            }

            tags = col[u'tags'].split(u', ')
            tags = [munge_tag(tag) for tag in tags]
            metadata['tags'] = tags

            metadata['resources'] = self._generate_resources_dict_array(
                col[u'id'])
            metadata['resources'][0]['version'] = col[u'version']
            log.debug(metadata['resources'])

            # Adding term translations
            metadata['translations'].extend(
                self._generate_term_translations(1, file_path))  # fr
            metadata['translations'].extend(
                self._generate_term_translations(2, file_path))  # it
            metadata['translations'].extend(
                self._generate_term_translations(3, file_path))  # en

            log.debug(metadata['translations'])

            obj = HarvestObject(
                guid=self._create_uuid(col[u'id']),
                job=harvest_job,
                content=json.dumps(metadata)
            )
            obj.save()
            log.debug('adding ' + col[u'id'] + ' to the queue')
            ids.append(obj.id)

        log.debug(de_cols)
    except Exception:
        return False
    return ids
def _gen_harvest_obj_for_files(self, harvest_job):
    ids = []
    for dataset_name, dataset in self.DATASETS.iteritems():
        csw = ckan_csw.SwisstopoCkanMetadata()
        metadata = csw.get_ckan_metadata(dataset['csw_query'], 'de').copy()
        metadata_fr = csw.get_ckan_metadata(dataset['csw_query'], 'fr').copy()
        metadata_it = csw.get_ckan_metadata(dataset['csw_query'], 'it').copy()
        metadata_en = csw.get_ckan_metadata(dataset['csw_query'], 'en').copy()
        log.debug(metadata)

        metadata['translations'] = self._generate_term_translations()
        log.debug("Translations: %s" % metadata['translations'])

        metadata_trans = {
            u'de': metadata,
            u'fr': metadata_fr,
            u'it': metadata_it,
            u'en': metadata_en,
        }
        metadata['translations'].extend(
            self._generate_metadata_translations(metadata_trans)
        )

        metadata['resources'] = self._generate_resources_dict_array(
            dataset_name
        )
        metadata['resources'].extend(
            self._generate_api_resources(metadata, dataset_name)
        )
        log.debug(metadata['resources'])

        metadata['license_id'], metadata['license_url'] = (
            self._get_org_license(dataset_name)
        )
        metadata['layer_name'] = dataset_name

        obj = HarvestObject(
            guid=metadata['id'],
            job=harvest_job,
            content=json.dumps(metadata)
        )
        obj.save()
        log.debug('adding ' + dataset_name + ' to the queue')
        ids.append(obj.id)

    return ids
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.info("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        registry = self._create_metadata_registry()
        self._set_config(harvest_job.source.config)
        client = oaipmh.client.Client(harvest_job.source.url,
                                      registry,
                                      self.credentials,
                                      force_http_get=self.force_http_get)
        # Start looking from here
        client.identify()  # check if identify works
        for header in self._identifier_generator(client):
            harvest_obj = HarvestObject(guid=header.identifier(),
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
            log.info("Harvest obj %s created" % harvest_obj.id)
            # return harvest_obj_ids  # This is to get only one record
    except urllib.error.HTTPError as e:
        log.exception(
            'Gather stage failed on %s (%s): %s, %s' %
            (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    except Exception as e:
        log.exception('Gather stage failed on %s: %s' % (
            harvest_job.source.url,
            str(e),
        ))
        self._save_gather_error(
            'Could not gather anything from %s: %s / %s' %
            (harvest_job.source.url, str(e), traceback.format_exc()),
            harvest_job)
        return None

    log.info("Gather stage successfully finished with %s harvest objects"
             % len(harvest_obj_ids))
    return harvest_obj_ids
def gather_stage(self, harvest_job, encoding=None):
    self._set_config(harvest_job.source.config)

    # Request all remote packages
    try:
        content = self._get_content(harvest_job.source.url)
        LOGGER.debug('Grabbing zip file: %s', harvest_job.source.url)

        object_ids = []
        packages = []
        file_content = StringIO.StringIO(content)
        archive = zipfile.ZipFile(file_content, 'r')
        for name in archive.namelist():
            if name.endswith('.json'):
                archive_content = archive.read(name)
                if encoding is not None:
                    archive_content = archive_content.decode(encoding)
                else:
                    archive_content = self.lstrip_bom(archive_content)

                package = json.loads(archive_content)
                normalize_api_dataset(package)
                packages.append(package)

                obj = HarvestObject(guid=package['name'], job=harvest_job)
                obj.content = json.dumps(package)
                obj.save()
                object_ids.append(obj.id)
    except zipfile.BadZipfile as err:
        self._save_gather_error(err.message, harvest_job)
        return None
    except ContentFetchError as err:
        self._save_gather_error(err.message, harvest_job)
        return None
    except Exception as err:
        error_template = 'Unable to get content for URL: %s: %s'
        error = error_template % (harvest_job.source.url, str(err))
        self._save_gather_error(error, harvest_job)
        return None

    if object_ids:
        # delete obsolete packages
        super(JSONZipBaseHarvester, self).delete_deprecated_datasets(
            packages, harvest_job
        )
        return object_ids
    else:
        self._save_gather_error(
            'No packages received for URL: %s' % harvest_job.source.url,
            harvest_job
        )
        return None
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.ITagEnricher.gather')
    log.debug('ITagEnricher gather_stage for job: %r', harvest_job)

    # Save a reference
    self.job = harvest_job

    self._set_source_config(self.job.source.config)

    context = {
        'model': model,
        'session': model.Session,
        'user': self._get_user_name()
    }

    org_id = model.Package.get(harvest_job.source.id).owner_org
    organization = logic.get_action('organization_show')(context, {'id': org_id})  # noqa: E501

    # Exclude Sentinel-3 because it seems like iTag can't handle the curved
    # footprints.
    filter_query = '+organization:{} -itag:tagged -FamilyName:Sentinel-3'.format(organization['name'])  # noqa: E501

    ids = []

    # We'll limit this to 10 datasets per job so that results appear
    # faster
    start = 0
    rows = self.source_config.get('datasets_per_job', 10)
    untagged = logic.get_action('package_search')(context, {
        'fq': filter_query,
        'rows': rows,
        'start': start
    })
    results = untagged['results']

    for result in results:
        spatial = None
        for i in result['extras']:
            if i['key'] == 'spatial':
                spatial = i['value']
        if spatial:
            obj = HarvestObject(
                guid=result['id'],
                job=self.job,
                extras=[
                    HOExtra(key='status', value='change'),  # noqa: E501
                    HOExtra(key='spatial', value=spatial),  # noqa: E501
                    HOExtra(key='package', value=json.dumps(result))
                ])  # noqa: E501
            obj.save()
            ids.append(obj.id)

    return ids
def populate_harvest_job(self, harvest_job, set_ids, config, client):
    # Check if this source has been harvested before
    previous_job = Session.query(HarvestJob) \
        .filter(HarvestJob.source == harvest_job.source) \
        .filter(HarvestJob.gather_finished != None) \
        .filter(HarvestJob.id != harvest_job.id) \
        .order_by(HarvestJob.gather_finished.desc()) \
        .limit(1).first()

    last_time = None
    if previous_job and previous_job.finished and \
            model.Package.get(harvest_job.source.id).metadata_modified < previous_job.gather_started:
        last_time = previous_job.gather_started.isoformat()

    # Collect package ids
    package_ids = list(self.get_package_ids(set_ids, config, last_time, client))
    log.debug('Identifiers: %s', package_ids)

    if not self._recreate(harvest_job) and package_ids:
        converted_identifiers = {}
        for identifier in package_ids:
            converted_identifiers[datapid_to_name(identifier)] = identifier
            if identifier.endswith(u'm'):
                converted_identifiers[datapid_to_name(u"%ss" % identifier[0:-1])] = identifier

        for package in model.Session.query(model.Package).filter(
                model.Package.name.in_(converted_identifiers.keys())).all():
            converted_name = package.name
            if converted_identifiers[converted_name] not in package_ids:
                converted_name = "%sm" % converted_name[0:-1]
            package_ids.remove(converted_identifiers[converted_name])

    if previous_job:
        for previous_error in [error.guid for error in Session.query(HarvestObject).
                               filter(HarvestObject.harvest_job_id == previous_job.id).
                               filter(HarvestObject.state == 'ERROR').all()]:
            if previous_error not in package_ids:
                package_ids.append(previous_error)

    try:
        object_ids = []
        if len(package_ids):
            for package_id in islice(package_ids, config['limit']) if 'limit' in config else package_ids:
                # Create a new HarvestObject for this identifier
                obj = HarvestObject(guid=package_id, job=harvest_job)
                obj.save()
                object_ids.append(obj.id)

            log.debug('Object ids: {i}'.format(i=object_ids))
            return object_ids
        else:
            self._save_gather_error('No packages received for URL: {u}'.format(
                u=harvest_job.source.url), harvest_job)
            return None
    except Exception as e:
        self._save_gather_error('Gather: {e}'.format(e=e), harvest_job)
        raise
def gather_stage(self, harvest_job):
    log.debug('In DataWienGvAt gather_stage')
    doc = etree.parse(self.CATALOGUE_FEED_URL)
    ids = []
    for link in doc.findall("//item/link"):
        link = link.text
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def delete_geocat_ids(self, harvest_job, harvest_obj_ids, packages_to_delete):
    delete_harvest_obj_ids = []
    for package_info in packages_to_delete:
        obj = HarvestObject(
            guid=package_info[1].name,
            job=harvest_job,
            extras=[HarvestObjectExtra(key='import_action', value='delete')]
        )
        obj.save()
        delete_harvest_obj_ids.append(obj.id)
    return delete_harvest_obj_ids
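# Illustrative sketch (assumption, not from the original source): the
# 'import_action' extra attached above only has an effect if the import stage
# looks for it. A minimal check could look like this; _delete_package is a
# hypothetical helper on the same harvester.
def import_stage(self, harvest_object):
    import_action = None
    for extra in harvest_object.extras:
        if extra.key == 'import_action':
            import_action = extra.value

    if import_action == 'delete':
        # The gather stage flagged this object for deletion rather than import.
        return self._delete_package(harvest_object.guid)

    # ... regular import handling would continue here ...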
def gather_stage(self, harvest_job):
    log.debug('In OpenDataCatHarvester gather_stage')
    # Get feed contents
    doc = etree.parse(self.INDEX_URL)
    ids = []
    for link_element in doc.findall('//item/link'):
        link = link_element.text.strip()
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job):
    log.debug('In OpendataParisFr gather_stage')
    doc = html.parse(self.PREFIX_URL + self.CATALOGUE_INDEX_URL)
    ids = []
    for link in doc.findall("//div[@class='animate download-portlet-element']/a"):
        link = link.get('href')
        if "#comments" not in link:
            id = sha1(link).hexdigest()
            obj = HarvestObject(guid=id, job=harvest_job, content=link)
            obj.save()
            ids.append(obj.id)
    return ids
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    logger.debug("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        self._set_config(harvest_job.source.config)

        skip_licenses = {
            'c12c3333-1ad7-4a3a-a629-ed51fcb636ac',
            'a270745d-07d5-4e93-94fc-ba6e0afc97fb',
        }
        # TODO: switch
        # for record in json.loads(open('/tmp/data.json').read())['dataset']:
        for record in requests.get(
                urlparse.urljoin(harvest_job.source.url, 'data.json')).json()['dataset']:
            license_id = record.get('license', 'cc-by').strip('/').split('/')[-1]
            if license_id in skip_licenses:
                continue
            if 'hub.pacificdata' == record.get('isPartOf'):
                continue
            if 'Info' in record.get('theme', []):
                continue
            harvest_obj = HarvestObject(guid=record['identifier'],
                                        content=json.dumps(record),
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
    except urllib2.HTTPError, e:
        logger.exception(
            'Gather stage failed on %s (%s): %s, %s' %
            (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    return harvest_obj_ids
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    logger.debug("in gather stage: %s" % harvest_job.source.url)
    try:
        harvest_obj_ids = []
        self._set_config(harvest_job.source.config)
        url = urljoin(harvest_job.source.url, '/v1/dataset/search')

        for record in self._fetch_record_outline(url):
            # if record['key'] != 'a38c7d49-5a5d-4aa6-a64e-421178bd06d7':
            #     continue
            harvest_obj = HarvestObject(guid=record['key'],
                                        content=record['country'],
                                        job=harvest_job)
            harvest_obj.save()
            harvest_obj_ids.append(harvest_obj.id)
            # TODO: remove
            # break
    except (HTTPError) as e:
        logger.exception(
            'Gather stage failed on %s (%s): %s, %s' %
            (harvest_job.source.url, e.fp.read(), e.reason, e.hdrs))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    except (Exception) as e:
        logger.exception('Gather stage failed on %s: %s' % (
            harvest_job.source.url,
            str(e),
        ))
        self._save_gather_error(
            'Could not gather anything from %s' % harvest_job.source.url,
            harvest_job)
        return None
    return harvest_obj_ids
def test_harvester_urlerror(self):
    harv, job = self._create_harvester()
    urllib2.urlopen = realopen
    self.assert_(harv.gather_stage(job) == None)
    errs = Session.query(HarvestGatherError).all()
    self.assert_(len(errs) == 1)

    harv_obj = HarvestObject()
    harv_obj.job = job
    harv_obj.content = json.dumps({'url': "http://foo"})
    # XML error and URL error, also the lack of url in content
    self.assert_(harv.import_stage(harv_obj) == False)
    errs = Session.query(HarvestObjectError).all()
    print errs
    self.assert_(len(errs) == 1)
def gather_stage(self, harvest_job):
    log.debug('In OpenGovSeHarvester gather_stage')
    # Get feed contents
    doc = etree.parse(self.INDEX_URL)
    ids = []
    for id_element in doc.findall('//{%(ns)s}entry/{%(ns)s}id' % {'ns': self.ATOM_NS}):
        link = id_element.text.strip()
        log.debug('Got link: %s' % link)
        id = sha1(link).hexdigest()
        obj = HarvestObject(guid=id, job=harvest_job, content=link)
        obj.save()
        ids.append(obj.id)
    return ids
def _save_harvest_object(self, metadata, harvest_job):
    '''
    Save the harvest object with the given metadata dict and harvest_job
    '''
    obj = HarvestObject(
        guid=metadata['datasetID'],
        job=harvest_job,
        content=json.dumps(metadata)
    )
    obj.save()
    log.debug('adding ' + metadata['datasetID'] + ' to the queue')
    return obj.id
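# Usage sketch (assumption, not from the original source): a gather_stage
# that delegates HarvestObject creation to the helper above. The
# _fetch_metadata_sets() iterator is a hypothetical placeholder for whatever
# yields one metadata dict (carrying at least 'datasetID') per dataset.
def gather_stage(self, harvest_job):
    ids = []
    for metadata in self._fetch_metadata_sets():
        ids.append(self._save_harvest_object(metadata, harvest_job))
    return ids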
def test_harvest_basic(self):
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/esdInventory_live_truncated.xml',
        'type': u'inventory',
    }
    source, job = self._create_source_and_job(source_fixture)

    # Gather
    harvester = InventoryHarvester()
    # mock boundary stuff to avoid needing PostGIS - it is not tested here
    # and that allows this test to run on sqlite
    with patch('ckanext.dgulocal.harvester.get_boundary') as get_boundary:
        get_boundary.return_value = None
        object_ids = harvester.gather_stage(job)
    assert_equal(len(object_ids), 3)
    assert len(job.gather_errors) == 0

    # Fetch
    for object_id in object_ids:
        harvest_object = HarvestObject.get(object_id)
        assert harvest_object
        success = harvester.fetch_stage(harvest_object)
        assert_equal(success, True)
        assert not harvest_object.errors

    # Import
    objects = []
    for object_id in object_ids:
        obj = HarvestObject.get(object_id)
        assert obj
        objects.append(obj)
        harvester.import_stage(obj)
        assert not harvest_object.errors

    pkgs = Session.query(Package).filter(
        Package.type != u'harvest_source').all()
    assert_equal(len(pkgs), 3)

    pkg_ids = [pkg.id for pkg in pkgs]
    for obj in objects:
        assert obj.current == True
        assert obj.package_id in pkg_ids
def gather_stage(self, harvest_job):
    log = logging.getLogger(__name__ + '.WAF.gather')
    log.debug('z3950Harvester gather_stage for job: %r', harvest_job)
    self.harvest_job = harvest_job

    # Get source URL
    source_url = harvest_job.source.url

    self._set_source_config(harvest_job.source.config)

    # get current objects out of db
    query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
        filter(HarvestObject.current==True).\
        filter(HarvestObject.harvest_source_id==harvest_job.source.id)

    guid_to_package_id = dict((res[0], res[1]) for res in query)
    current_guids = set(guid_to_package_id.keys())
    current_guids_in_harvest = set()

    # Get contents
    try:
        conn = zoom.Connection(source_url,
                               int(self.source_config.get('port', 210)))
        conn.databaseName = self.source_config.get('database', '')
        conn.preferredRecordSyntax = 'XML'
        conn.elementSetName = 'T'
        query = zoom.Query('CCL', 'metadata')
        res = conn.search(query)
        ids = []
        for num, result in enumerate(res):
            hash = hashlib.md5(result.data).hexdigest()
            if hash in current_guids:
                current_guids_in_harvest.add(hash)
            else:
                obj = HarvestObject(
                    job=harvest_job,
                    guid=hash,
                    extras=[
                        HOExtra(key='status', value='new'),
                        HOExtra(key='original_document',
                                value=result.data.decode('latin-1')),
                        HOExtra(key='original_format', value='fgdc')
                    ])
                obj.save()
                ids.append(obj.id)

        for guid in (current_guids - current_guids_in_harvest):
            obj = HarvestObject(
                job=harvest_job,
                guid=guid,
                package_id=guid_to_package_id[guid],
                extras=[HOExtra(key='status', value='delete')])
            obj.save()
            ids.append(obj.id)

        return ids
    except Exception, e:
        self._save_gather_error('Unable to get content for URL: %s: %r' %
                                (source_url, e), harvest_job)
        return None
def gather_stage(self, harvest_object):
    log.debug('In OdgovltHarvester gather_stage')
    sync = IvpkIrsSync(sa.create_engine(harvest_object.source.url))
    sync.sync_groups()
    ids = []
    for ivpk_dataset in sync.get_ivpk_datasets():
        content = json.dumps(dict(ivpk_dataset), cls=DatetimeEncoder)
        obj = HarvestObject(guid=ivpk_dataset.ID,
                            job=harvest_object,
                            content=content)
        obj.save()
        ids.append(obj.id)
    return ids
def fetch_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(method.delivery_tag)
        return False

    obj = HarvestObject.get(id)
    if not obj:
        log.error('Harvest object does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    obj.retry_times += 1
    obj.save()

    if obj.retry_times >= 5:
        obj.state = "ERROR"
        obj.save()
        log.error('Too many consecutive retries for object {0}'.format(obj.id))
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == obj.source.type:
            fetch_and_import_stages(harvester, obj)

    model.Session.remove()
    channel.basic_ack(method.delivery_tag)
def test_harvester(self):
    job = HarvestJob(source=self.source)
    harvester = InventoryHarvester()

    # Gather all of the datasets from the XML content and make sure
    # we have created some harvest objects
    result = harvester.gather_stage(
        job, test_content=self._get_file_content('inventory.xml'))
    self.assertEqual(len(result), 79)

    # We only want one for testing
    harvest_object_id = result[0]
    harvest_obj = HarvestObject.get(harvest_object_id)

    # Run the fetch stage
    fetch_result = harvester.fetch_stage(harvest_obj)
    self.assertTrue(fetch_result)

    # Make sure we can create a dataset by running the import stage
    harvester.import_stage(harvest_obj)
    self.assertIsNotNone(harvest_obj.package_id)

    # Get the newly created package and make sure it is in the correct
    # organisation
    pkg = toolkit.get_action('package_show')(
        {'ignore_auth': True, 'user': self.sysadmin['name']},
        {'id': harvest_obj.package_id},
    )
    self.assertEqual(pkg['organization']['id'], self.publisher['id'])
def _run_job_for_single_document(self, job, force_import=False,
                                 expect_gather_errors=False,
                                 expect_obj_errors=False):
    harvester = GeminiDocHarvester()
    harvester.force_import = force_import

    object_ids = harvester.gather_stage(job)
    assert object_ids, len(object_ids) == 1
    if expect_gather_errors:
        assert len(job.gather_errors) > 0
    else:
        assert len(job.gather_errors) == 0

    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    assert obj, obj.content

    harvester.import_stage(obj)
    Session.refresh(obj)
    if expect_obj_errors:
        assert len(obj.errors) > 0
    else:
        assert len(obj.errors) == 0

    job.status = u"Finished"
    job.save()

    return obj
def test_harvest_basic(self):
    # Create source
    source_fixture = {
        "url": u"http://127.0.0.1:8999/waf/index.html",
        "type": u"gemini-waf",
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiWafHarvester()

    # We need to send an actual job, not the dict
    object_ids = harvester.gather_stage(job)

    assert len(object_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert harvester.fetch_stage(object_ids) == True

    objects = []
    for object_id in object_ids:
        obj = HarvestObject.get(object_id)
        assert obj
        objects.append(obj)
        harvester.import_stage(obj)

    pkgs = Session.query(Package).all()
    assert len(pkgs) == 2

    pkg_ids = [pkg.id for pkg in pkgs]
    for obj in objects:
        assert obj.current == True
        assert obj.package_id in pkg_ids
def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
    context, harvest_source, harvest_job = self._create_harvest_source_and_job_if_not_existing()

    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job['id'],
        'extras': {'a key': 'a value'},
        'source_id': harvest_source['id']
    }
    harvest_object = toolkit.get_action('harvest_object_create')(
        context, data_dict)

    harvest_object_model = HarvestObject.get(harvest_object['id'])

    # create a HarvestObjectError
    msg = 'HarvestObjectError occurred: %s' % harvest_job['id']
    harvest_object_error = HarvestObjectError(message=msg,
                                              object=harvest_object_model)
    harvest_object_error.save()

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source['id']})

    send_error_mail(
        context,
        harvest_source['id'],
        status
    )

    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
def harvest_object_show(context, data_dict):
    p.toolkit.check_access('harvest_object_show', context, data_dict)

    id = data_dict.get('id')
    dataset_id = data_dict.get('dataset_id')

    if id:
        attr = data_dict.get('attr', None)
        obj = HarvestObject.get(id, attr=attr)
    elif dataset_id:
        model = context['model']

        pkg = model.Package.get(dataset_id)
        if not pkg:
            raise p.toolkit.ObjectNotFound('Dataset not found')

        obj = model.Session.query(HarvestObject) \
            .filter(HarvestObject.package_id == pkg.id) \
            .filter(
                HarvestObject.current == True  # noqa: E711
            ).first()
    else:
        raise p.toolkit.ValidationError(
            'Please provide either an "id" or a "dataset_id" parameter')

    if not obj:
        raise p.toolkit.ObjectNotFound('Harvest object not found')

    return harvest_object_dictize(obj, context)
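# Usage sketch (not from the original source): looking up the current harvest
# object for a dataset via the action above; 'some-dataset' is a hypothetical
# dataset id or name.
import ckan.plugins.toolkit as toolkit

harvest_object = toolkit.get_action('harvest_object_show')(
    {'ignore_auth': True},
    {'dataset_id': 'some-dataset'},
)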