def _save_object_error(self, message, obj, stage=u'Fetch'):
    """Record an error raised during the fetch or import stage.

    Persists a HarvestObjectError attached to *obj* for the given
    *stage* and logs the message at error level.
    """
    error = HarvestObjectError(message=message, object=obj, stage=stage)
    error.save()
    log.error(message)
def _save_object_error(self, message, obj, stage=u"Fetch", line=None):
    """Create and persist a HarvestObjectError for the given object.

    :param message: error description
    :param obj: HarvestObject the error belongs to
    :param stage: harvest stage the error occurred in (default u"Fetch")
    :param line: optional line number associated with the error

    If the DB session is in an invalid state the first save raises
    InvalidRequestError; roll the session back and retry once.
    """
    err = HarvestObjectError(message=message, object=obj,
                             stage=stage, line=line)
    try:
        err.save()
    # `except X as e` replaces the Python-2-only `except X, e` form and is
    # valid on both Python 2.6+ and Python 3; the exception value was unused.
    except InvalidRequestError:
        # Session is dirty from an earlier failure; reset and retry once.
        Session.rollback()
        err.save()
def _save_object_error(self, message, obj, stage=u'Fetch'):
    """Persist an error for *obj* raised at the given harvest *stage*
    (fetch or import) and log it at error level."""
    harvest_error = HarvestObjectError(message=message,
                                       object=obj,
                                       stage=stage)
    harvest_error.save()
    log.error(message)
def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
    # Verifies that when a harvest job has a HarvestObjectError, the source
    # status reports one errored object and send_error_mail triggers the
    # (mocked) mailer.

    # Arrange: a harvest source/job pair and one harvest object under it.
    context, harvest_source, harvest_job = self._create_harvest_source_and_job_if_not_existing()
    data_dict = {
        'guid': 'guid',
        'content': 'content',
        'job_id': harvest_job['id'],
        'extras': {'a key': 'a value'},
        'source_id': harvest_source['id']
    }
    harvest_object = toolkit.get_action('harvest_object_create')(
        context, data_dict)
    harvest_object_model = HarvestObject.get(harvest_object['id'])

    # create a HarvestObjectError attached to that object
    msg = 'HarvestObjectError occured: %s' % harvest_job['id']
    harvest_object_error = HarvestObjectError(message=msg,
                                              object=harvest_object_model)
    harvest_object_error.save()

    # Act: compute the source status and send the error mail.
    status = toolkit.get_action('harvest_source_show_status')(
        context, {'id': harvest_source['id']})
    send_error_mail(
        context,
        harvest_source['id'],
        status
    )

    # Assert: exactly one errored object, and the mailer was invoked.
    assert_equal(1, status['last_job']['stats']['errored'])
    assert mock_mailer_mail_recipient.called
def import_stage(self, harvest_object):
    ''' save to CKAN '''
    # NOTE(review): this fragment only handles the 'create' action; if the
    # package name is already in use, `action` is flipped to 'update' but no
    # update branch is visible here — presumably handled further down; confirm.
    logger.info('Importing {}'.format(harvest_object.id))
    self.set_paths()

    # The gather stage serialized the dataset dict (plus an 'action' key)
    # into the harvest object's content.
    package_dict = json.loads(harvest_object.content)
    action = package_dict.pop('action')
    extras = package_dict.get('extras', [])
    extras = self.update_extras(extras, harvest_object)
    package_dict['extras'] = extras
    resources = package_dict.pop('resources', [])

    # Save (create or update) to CKAN
    # Using base class function ._create_or_update_package
    #   seems no useful to deal with resources
    user_name = self._get_user_name()
    context = {'model': model, 'session': model.Session, 'user': user_name}
    if action == 'create':
        try:
            pkg = p.toolkit.get_action('package_create')(context, package_dict)
        # NOTE(review): Python-2-only `except X, e` syntax.
        except Exception, e:
            logger.error('Error creating package {}: {}'.format(str(e), package_dict))
            # TODO: should not happen
            # NOTE(review): find() > 0 misses a match at index 0;
            # `'already in use' in str(e)` would be safer.
            if str(e).find('already in use') > 0:
                action = 'update'
            else:
                # Record the failure against the harvest object and abort.
                msg = 'Import CREATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                harvest_object_error.save()
                return False
def is_part_of_to_package_id(self, ipo, harvest_object):
    """ Get an identifier from external source using isPartOf
        and returns the parent dataset or raises
        a ParentNotHarvestedException.
        Only search for datasets that are the parent of a collection.

        :param ipo: the isPartOf identifier to look up
        :param harvest_object: current HarvestObject (used to validate that
            a candidate parent comes from the same harvest source)
        :returns: the parent dataset dict
        :raises ParentNotHarvestedException: when no valid parent is found
        """
    ps = p.toolkit.get_action('package_search')
    query = 'extras_identifier:{} AND extras_collection_metadata:true'.format(
        ipo)
    results = ps(self.context(), {"fq": query})
    log.info('Package search results {}'.format(results))

    if results['count'] > 0:
        # Even if there is only one result we must be sure it is the right
        # parent: identifiers can collide, so also validate the candidate
        # against this harvest object's source.
        datasets = results['results']
        harvest_source = harvest_object.source
        for dataset in datasets:
            extras = dataset.get('extras', [])
            identifiers = [
                extra['value'] for extra in extras
                if extra['key'] == 'identifier'
            ]
            if ipo not in identifiers:
                log.error('BAD SEARCH for {}:{}'.format(ipo, identifiers))
                continue
            dataset_harvest_source_id = self.get_harvest_source_id(
                dataset['id'])
            if harvest_source.id == dataset_harvest_source_id:
                log.info('Parent dataset identified correctly')
                return dataset
            else:
                log.info('{} not found at {} for {}'.format(
                    harvest_source.id, dataset_harvest_source_id, ipo))

    # zero results, or only results that failed validation
    msg = 'Parent identifier not found: "{}"'.format(ipo)
    log.error(msg)
    try:
        harvest_object_error = HarvestObjectError(message=msg,
                                                  object=harvest_object)
        harvest_object_error.save()
        harvest_object.state = "ERROR"
        harvest_object.save()
    # Best effort: recording the error must not mask the
    # ParentNotHarvestedException below. Narrowed from a bare `except:`
    # (which also swallowed SystemExit/KeyboardInterrupt) and made visible
    # in the logs instead of failing silently.
    except Exception:
        log.exception('Unable to save HarvestObjectError for {}'.format(ipo))
    raise ParentNotHarvestedException(
        'Unable to find parent dataset. Raising error to allow re-run later'
    )
def _save_object_error(self, message, obj, stage=u'Fetch', line=None):
    """Create and persist a HarvestObjectError for the given object.

    :param message: error description
    :param obj: HarvestObject the error belongs to
    :param stage: harvest stage the error occurred in (default u'Fetch')
    :param line: optional line number associated with the error

    If the DB session is in an invalid state the first save raises
    InvalidRequestError; roll the session back and retry once.
    """
    err = HarvestObjectError(message=message, object=obj,
                             stage=stage, line=line)
    try:
        err.save()
    # `except X as e` replaces the Python-2-only `except X, e` form and is
    # valid on both Python 2.6+ and Python 3; the exception value was unused.
    except InvalidRequestError:
        # Session is dirty from an earlier failure; reset and retry once.
        Session.rollback()
        err.save()
class SIUTransparenciaHarvester(HarvesterBase):
    # CKAN harvester for the SIU "Portal de Transparencia": gathers datasets
    # from the SIU portal, creates one HarvestObject per dataset and imports
    # them as CKAN packages.

    def set_paths(self):
        # Resolve a writable folder for harvest results, preferring
        # CKAN_STORAGE_PATH and falling back to this module's directory.
        here = os.path.dirname(os.path.abspath(__file__))
        base = os.environ.get('CKAN_STORAGE_PATH', here)
        self.results_folder_path = os.path.join(base, 'siu-harvester-results')
        if not os.path.isdir(self.results_folder_path):
            os.makedirs(self.results_folder_path)
        # Library that manages the data in the SIU portal.
        self.siu_data_lib = SIUPoratlTransparenciaData()

    ## IHarvester

    def info(self):
        '''
        :returns: A dictionary with the harvester descriptors
        '''
        return {
            'name': 'siu_transp',
            'title': 'SIU Portal de transparencia',
            'description': 'Extraer y publicar datos del portal de transparecnia de SIU',
            'form_config_interface': 'Text'
        }

    def validate_config(self, config):
        '''
        [optional]
        Harvesters can provide this method to validate the configuration
        entered in the form. It should return a single string, which will be
        stored in the database.  Exceptions raised will be shown in the form's
        error messages.

        :param config: Config string coming from the form
        :returns: A string with the validated configuration options
        '''
        if not config:
            raise ValueError('Set up the required configuration settings')
        try:
            config_obj = json.loads(config)
        except ValueError as e:
            # NOTE(review): re-raised unchanged; letting it propagate would
            # be equivalent.
            raise e

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = config_obj.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            config_obj.update(update_config)

        # Required settings after any remote merge. 'owner_org' intentionally
        # not required (see commented entry).
        required_cfg = ['username', 'password']  # , 'owner_org']
        faileds = []
        for req in required_cfg:
            if req not in config_obj:
                faileds.append(req)
        if len(faileds) > 0:
            raise ValueError('Missing configs: {}'.format(faileds))
        return config

    def gather_stage(self, harvest_job):
        ''' analyze the source, return a list of IDs
            and create one HarvestObject per dataset '''
        logger.info('Starts Gather SIU Transp')
        # load paths
        self.set_paths()
        self.siu_data_lib.get_query_files()

        # basic things you'll need
        self.source = harvest_job.source
        self.source_config = json.loads(self.source.config)

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = self.source_config.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            self.source_config.update(update_config)

        # Credentials for the SIU portal come from the source configuration.
        self.siu_data_lib.base_url = self.source.url
        self.siu_data_lib.username = self.source_config['username']
        self.siu_data_lib.password = self.source_config['password']

        # ####################################
        # get previous harvested packages
        pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
        prev_names = [pkg['name'] for pkg in pfr['results']]
        logger.info('Get previous harvested objects {}'.format(prev_names))
        # TODO
        # ####################################

        object_ids = []  # list of IDs to process; this is what the function returns
        self.source_dataset = get_harvest_source(self.source.id)
        owner_org = self.source_dataset.get('owner_org')
        logger.info('Gather SIU Transp to ORG {}'.format(owner_org))

        # Iterate over each query to obtain different data sets.
        # Each file in siu_transp_data/queries produces multiple datasets to publish.
        report = []  # summary of all results
        logger.info('Iter files')
        # Check whether the config asks to override metadata on the datasets
        # produced by each file.
        override = self.source_config.get('override', {})
        logger.info("General override {}".format(override))
        for qf in self.siu_data_lib.query_files:
            # 'only_files' optionally restricts the run to a subset of files.
            only_files = self.source_config.get('only_files', None)
            query_file_name = qf.split('/')[-1]
            if only_files is not None:
                if query_file_name not in only_files:
                    logger.info('Skipping file by config {}'.format(query_file_name))
                    continue
            logger.info('Gather SIU Transp FILE {}'.format(qf))
            stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
            # open to read query params
            stqf.open()
            # request all data
            stqf.request_all(results_folder_path=self.results_folder_path)
            # Persist any request errors as gather errors on this job.
            for err in stqf.errors:
                hgerr = HarvestGatherError(message=err, job=harvest_job)
                hgerr.save()

            # ====== Prepare dict to override datasets metadata ============
            override_this = override.get(query_file_name, {})
            logger.info("To override {}: {}".format(query_file_name, override_this))
            # extras need to be {"key": "extra name", "value": "extra value"}
            extras = override_this.get('extras', {})
            new_extras = []
            for extra_key, extra_value in extras.iteritems():
                logger.info("Override extra found {}: {}".format(extra_key, extra_value))
                if not isinstance(extra_value, str):
                    extra_value = str(extra_value)
                new_extras.append({"key": extra_key, "value": extra_value})
            if len(new_extras) > 0:
                override_this['extras'] = new_extras
            # tags need to be {"name": "tag name"}
            tags = override_this.get('tags', [])
            new_tags = []
            for tag in tags:
                logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
                new_tags.append({"name": tag})
            if len(new_tags) > 0:
                override_this['tags'] = new_tags
            # groups need to be {"name": "tag name"}
            groups = override_this.get('groups', [])
            new_groups = []
            for group in groups:
                logger.info("Override group found {}".format(group))
                # check if groups must be created
                context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
                try:
                    p.toolkit.get_action('group_create')(context, {"name": group})
                except Exception as e:
                    # Group probably exists already; creation is best-effort.
                    logger.error('Error creating group (skipped) {}: {}'.format(group, e))
                new_groups.append({"name": group})
            if len(new_groups) > 0:
                override_this['groups'] = new_groups
            # ================================

            report += stqf.requests
            for dataset in stqf.datasets:
                # Decide create vs update by comparing against previously
                # harvested package names.
                if dataset['name'] in prev_names:
                    action = 'update'
                    # leave this list just with packages to remove
                    prev_names.remove(dataset['name'])
                else:
                    action = 'create'
                logger.info('Dataset {} to {}'.format(dataset['name'], action))
                ho_dict = {
                    'title': dataset['title'],
                    'name': dataset['name'],
                    'owner_org': owner_org,
                    'notes': dataset['notes'],
                    'tags': dataset['tags'],
                    'resources': dataset['resources'],
                    'action': action
                }
                # fix extras if they exists
                ho_dict.update(override_this)
                logger.info("Overrided ho_dict {}".format(ho_dict))
                # Each harvest object will be passed to other stages in harvest process
                obj = HarvestObject(guid=dataset['name'],
                                    job=harvest_job,
                                    content=json.dumps(ho_dict))
                obj.save()
                logger.info('Objects ID appends {}'.format(obj.id))
                object_ids.append(obj.id)

        # TODO compare with previous harvested data to remove dataset no more at harvest source

        # final summary
        logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))
        return object_ids

    def fetch_stage(self, harvest_object):
        ''' download and get what you need before import to CKAN
            Already downloaded in Gather stage '''
        logger.info('Fetching {}'.format(harvest_object.id))
        return True

    def import_stage(self, harvest_object):
        ''' save to CKAN '''
        logger.info('Importing {}'.format(harvest_object.id))
        self.set_paths()

        # The gather stage serialized the dataset dict (plus an 'action' key)
        # into the harvest object's content.
        package_dict = json.loads(harvest_object.content)
        action = package_dict.pop('action')
        extras = package_dict.get('extras', [])
        extras = self.update_extras(extras, harvest_object)
        package_dict['extras'] = extras
        resources = package_dict.pop('resources', [])

        # Save (create or update) to CKAN
        # Using base class function ._create_or_update_package
        #   seems no useful to deal with resources
        user_name = self._get_user_name()
        context = {'model': model, 'session': model.Session, 'user': user_name}
        if action == 'create':
            try:
                pkg = p.toolkit.get_action('package_create')(context, package_dict)
            # NOTE(review): Python-2-only `except X, e` syntax.
            except Exception, e:
                logger.error('Error creating package {}: {}'.format(str(e), package_dict))
                # TODO: should not happen
                # NOTE(review): find() > 0 misses a match at index 0;
                # `'already in use' in str(e)` would be safer.
                if str(e).find('already in use') > 0:
                    # Name collision: fall through to the update branch below.
                    action = 'update'
                else:
                    msg = 'Import CREATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                    harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                    harvest_object_error.save()
                    return False

        if action == 'update':
            try:
                pkg = p.toolkit.get_action('package_update')(context, package_dict)
            except Exception, e:
                # Record the failure against the harvest object and abort.
                msg = 'Import UPDATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                harvest_object_error.save()
                logger.error(msg)
                return False
def _save_object_error(self, message, obj, stage=u'Fetch'):
    """Store *message* as a HarvestObjectError on *obj* for the given
    harvest *stage*, then log it."""
    new_error = HarvestObjectError(message=message, object=obj, stage=stage)
    new_error.save()
    log.error(message)
def _get_xml_url_content(xml_url, urlopen_timeout, harvest_object):
    """Fetch external XML content and validate it parses.

    :param xml_url: URL of the external XML document
    :param urlopen_timeout: timeout (seconds) for the HTTP request
    :param harvest_object: HarvestObject to attach an error to on failure
    :returns: the requests response on success, '' on any failure

    On any fetch/parse failure a HarvestObjectError (stage 'Import') is
    saved; if saving it raises StaleDataError the problem is logged and
    '' is still returned.
    """
    # BUG FIX: the original had `finally: return ''` on the inner try,
    # which overrode the success-path `return r` (the function could never
    # return the response) and also swallowed StaleDataError before the
    # outer handler could see it. The error return now happens only after
    # a handled failure.
    try:
        try:
            r = requests.get(xml_url, timeout=urlopen_timeout)
            ET.XML(r.content)  # test for valid xml
            return r
        except ET.ParseError as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
        except requests.exceptions.Timeout as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
        except requests.exceptions.TooManyRedirects as e:
            # BUG FIX: requests exceptions have no `.code` attribute; the
            # original `%s % e.code` raised AttributeError in the handler.
            msg = 'HTTP too many redirects: %s' % str(e)
        except requests.exceptions.RequestException as e:
            # BUG FIX: same `.code` AttributeError as above.
            msg = 'HTTP request exception: %s' % str(e)
        except Exception as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
        # A failure was handled above: record it and return the sentinel.
        log.warn(msg)
        err = HarvestObjectError(message=msg,
                                 object=harvest_object,
                                 stage='Import')
        err.save()
        return ''
    except StaleDataError as e:
        # err.save() can hit a stale harvest object; log and give up.
        log.warn('Harvest object %s is stail. Error object not created. %s'
                 % (harvest_object.id, str(e)))
        return ''