def _save_object_error(self, message, obj, stage=u'Fetch'):
    '''
    Helper function to create an error during the fetch or import stage.
    '''
    harvest_error = HarvestObjectError(message=message,
                                       object=obj,
                                       stage=stage)
    harvest_error.save()
    log.error(message)
def _save_object_error(self, message, obj, stage=u'Fetch'):
    '''
    Helper function to create an error during the fetch or import stage.

    Persists a HarvestObjectError for ``obj`` and logs the message.
    '''
    HarvestObjectError(message=message, object=obj, stage=stage).save()
    log.error(message)
def _save_object_error(self, message, obj, stage=u"Fetch", line=None):
    '''
    Helper function to create an error during the fetch or import stage.

    :param message: human-readable error description
    :param obj: the HarvestObject the error belongs to
    :param stage: harvest stage the error occurred in (default 'Fetch')
    :param line: optional source line number associated with the error
    '''
    err = HarvestObjectError(message=message, object=obj,
                             stage=stage, line=line)
    try:
        err.save()
    except InvalidRequestError:
        # The session is in a failed state from an earlier error;
        # roll it back and retry the save once.
        Session.rollback()
        err.save()
    # Log the message as well, for consistency with the other
    # _save_object_error variants in this codebase.
    log.error(message)
def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
    '''A HarvestObjectError on a job must be counted and trigger an email.'''
    context, harvest_source, harvest_job = \
        self._create_harvest_source_and_job_if_not_existing()

    # Create a harvest object attached to the job.
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job['id'],
        'extras': {'a key': 'a value'},
        'source_id': harvest_source['id'],
    }
    harvest_object = toolkit.get_action('harvest_object_create')(
        context, data_dict)
    obj_model = HarvestObject.get(harvest_object['id'])

    # create a HarvestObjectError
    msg = 'HarvestObjectError occured: %s' % harvest_job['id']
    HarvestObjectError(message=msg, object=obj_model).save()

    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source['id']})

    send_error_mail(context, harvest_source['id'], status)

    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
def import_stage(self, harvest_object):
    ''' save to CKAN

    Deserializes the harvested package from ``harvest_object.content``
    and creates it in CKAN. On a name collision the action is switched
    to 'update'; on any other failure a HarvestObjectError is saved and
    False is returned.
    '''
    logger.info('Importing {}'.format(harvest_object.id))
    self.set_paths()
    package_dict = json.loads(harvest_object.content)
    action = package_dict.pop('action')

    extras = package_dict.get('extras', [])
    extras = self.update_extras(extras, harvest_object)
    package_dict['extras'] = extras

    # Resources are stripped from the dict before package_create;
    # presumably handled separately later — confirm against full file.
    resources = package_dict.pop('resources', [])

    # Save (create or update) to CKAN
    # Using base class function ._create_or_update_package
    # seems no useful to deal with resources
    user_name = self._get_user_name()
    context = {'model': model, 'session': model.Session, 'user': user_name}

    if action == 'create':
        try:
            pkg = p.toolkit.get_action('package_create')(context, package_dict)
        except Exception as e:  # 'as e' works on Py2.6+ and Py3
            logger.error('Error creating package {}: {}'.format(
                str(e), package_dict))
            # TODO: this should not happen
            # Use substring membership: .find(...) > 0 missed a match
            # at index 0.
            if 'already in use' in str(e):
                action = 'update'
            else:
                msg = 'Import CREATE error. pkg name: {}. \n\tError: {}'.format(
                    package_dict.get('name', 'unnamed'), e)
                harvest_object_error = HarvestObjectError(
                    message=msg, object=harvest_object)
                harvest_object_error.save()
                return False
def _save_object_error(self, message, obj, stage=u'Fetch', line=None):
    '''
    Persist a HarvestObjectError for ``obj`` and log the message.

    :param message: human-readable error description
    :param obj: the HarvestObject the error belongs to
    :param stage: harvest stage the error occurred in (default 'Fetch')
    :param line: optional source line number associated with the error
    '''
    err = HarvestObjectError(message=message, object=obj,
                             stage=stage, line=line)
    try:
        err.save()
    except InvalidRequestError:
        # Session is in a failed state; roll back and retry the save once.
        Session.rollback()
        err.save()
    # Log for consistency with the other _save_object_error variants.
    log.error(message)
def is_part_of_to_package_id(self, ipo, harvest_object):
    """ Get an identifier from external source using isPartOf
        and returns the parent dataset or raises an
        ParentNotHarvestedException.
        Only search for datasets that are the parent of a collection.

        :param ipo: the isPartOf identifier to resolve
        :param harvest_object: current HarvestObject (used to match the
            harvest source and to record errors)
        :raises ParentNotHarvestedException: if no matching parent dataset
            from the same harvest source is found
        """
    ps = p.toolkit.get_action('package_search')
    query = 'extras_identifier:{} AND extras_collection_metadata:true'.format(
        ipo)
    results = ps(self.context(), {"fq": query})
    log.info('Package search results {}'.format(results))

    if results['count'] > 0:
        # event if we have only one we need to be sure is the parent I need
        # possible check identifier collision
        # check the URL of the source to validate
        datasets = results['results']
        harvest_source = harvest_object.source
        for dataset in datasets:
            extras = dataset.get('extras', [])
            identifiers = [
                extra['value'] for extra in extras
                if extra['key'] == 'identifier'
            ]
            if ipo not in identifiers:
                log.error('BAD SEARCH for {}:{}'.format(ipo, identifiers))
                continue
            dataset_harvest_source_id = self.get_harvest_source_id(
                dataset['id'])
            if harvest_source.id == dataset_harvest_source_id:
                log.info('Parent dataset identified correctly')
                return dataset
            else:
                log.info('{} not found at {} for {}'.format(
                    harvest_source.id, dataset_harvest_source_id, ipo))

    # we have 0 or bad results
    msg = 'Parent identifier not found: "{}"'.format(ipo)
    log.error(msg)
    try:
        harvest_object_error = HarvestObjectError(message=msg,
                                                  object=harvest_object)
        harvest_object_error.save()
        harvest_object.state = "ERROR"
        harvest_object.save()
    except Exception as e:
        # Best effort only: recording the error must not mask the real
        # failure, but a bare 'except: pass' hid genuine save problems
        # (and swallowed SystemExit/KeyboardInterrupt). Log and continue.
        log.error('Unable to save HarvestObjectError: {}'.format(e))
    raise ParentNotHarvestedException(
        'Unable to find parent dataset. Raising error to allow re-run later'
    )
def _get_xml_url_content(xml_url, urlopen_timeout, harvest_object):
    '''Fetch external XML content from ``xml_url``.

    Returns the ``requests`` response on success, or ``''`` on any
    fetch/parse failure (after recording a HarvestObjectError).

    :param xml_url: URL of the external XML document
    :param urlopen_timeout: timeout in seconds for the HTTP request
    :param harvest_object: HarvestObject to attach errors to
    '''
    try:
        try:
            r = requests.get(xml_url, timeout=urlopen_timeout)
            ET.XML(r.content)  # test for valid xml
            return r
        except ET.ParseError as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
            log.warn(msg)
            err = HarvestObjectError(message=msg,
                                     object=harvest_object,
                                     stage='Import')
            err.save()
        except requests.exceptions.Timeout as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
            log.warn(msg)
            err = HarvestObjectError(message=msg,
                                     object=harvest_object,
                                     stage='Import')
            err.save()
        except requests.exceptions.TooManyRedirects as e:
            # requests' TooManyRedirects has no .code attribute;
            # using it raised AttributeError inside the handler.
            msg = 'HTTP too many redirects: %s' % str(e)
            log.warn(msg)
            err = HarvestObjectError(message=msg,
                                     object=harvest_object,
                                     stage='Import')
            err.save()
        except requests.exceptions.RequestException as e:
            # RequestException has no .code attribute either.
            msg = 'HTTP request exception: %s' % str(e)
            log.warn(msg)
            err = HarvestObjectError(message=msg,
                                     object=harvest_object,
                                     stage='Import')
            err.save()
        except Exception as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
            log.warn(msg)
            err = HarvestObjectError(message=msg,
                                     object=harvest_object,
                                     stage='Import')
            err.save()
        # BUG FIX: this was 'finally: return '''. A return inside finally
        # overrode the successful 'return r' (the function could never
        # return the response) and swallowed any exception raised while
        # saving the error, making the StaleDataError handler unreachable.
        return ''
    except StaleDataError as e:
        log.warn('Harvest object %s is stail. Error object not created. %s'
                 % (harvest_object.id, str(e)))
        return ''
def _save_object_error(self, message, obj, stage=u'Fetch'):
    '''Persist a HarvestObjectError for the given stage and log it.'''
    error = HarvestObjectError(message=message, object=obj, stage=stage)
    error.save()
    log.error(message)