コード例 #1
0
ファイル: base.py プロジェクト: Web5design/ckanext-harvest
 def _save_object_error(self, message, obj, stage=u'Fetch'):
     '''
     Record a failure for a harvest object and log it.

     Builds a HarvestObjectError from ``message``, ``obj`` and ``stage``
     (``u'Fetch'`` unless the caller passes another value), saves it, and
     then writes ``message`` to the error log.
     '''
     error_fields = dict(message=message, object=obj, stage=stage)
     object_error = HarvestObjectError(**error_fields)
     object_error.save()
     log.error(message)
コード例 #2
0
ファイル: base.py プロジェクト: raphaelstolt/ckanext-harvest
 def _save_object_error(self, message, obj, stage=u"Fetch", line=None):
     '''
     Create and save a HarvestObjectError for ``obj``.

     :param message: error text stored on the error record
     :param obj: harvest object the error is attached to
     :param stage: stage name stored on the record (default ``u"Fetch"``)
     :param line: optional line value stored on the record
     '''
     err = HarvestObjectError(message=message, object=obj, stage=stage, line=line)
     try:
         err.save()
     except InvalidRequestError:
         # The first save failed with InvalidRequestError: roll the session
         # back and retry once. A failure of the retry propagates.
         Session.rollback()
         err.save()
コード例 #3
0
 def _save_object_error(self, message, obj, stage=u'Fetch'):
     '''
     Store an error for ``obj`` raised during the given stage, then log it.

     The error record is a HarvestObjectError carrying ``message``, ``obj``
     and ``stage``; ``message`` is also sent to ``log.error``.
     '''
     HarvestObjectError(
         message=message,
         object=obj,
         stage=stage,
     ).save()
     log.error(message)
コード例 #4
0
    def test_error_mail_sent_with_object_error(self, mock_mailer_mail_recipient):
        '''
        An error mail is sent when the last job has a HarvestObjectError.

        NOTE(review): ``mock_mailer_mail_recipient`` is presumably injected by
        a patch decorator that is outside this view - confirm.
        '''
        context, harvest_source, harvest_job = self._create_harvest_source_and_job_if_not_existing()
        job_id = harvest_job['id']
        source_id = harvest_source['id']

        # Create the harvest object through the action API.
        create_harvest_object = toolkit.get_action('harvest_object_create')
        harvest_object = create_harvest_object(context, {
            'guid': 'guid',
            'content': 'content',
            'job_id': job_id,
            'extras': {'a key': 'a value'},
            'source_id': source_id,
        })
        harvest_object_model = HarvestObject.get(harvest_object['id'])

        # Attach a HarvestObjectError to the freshly created object.
        HarvestObjectError(
            message='HarvestObjectError occured: %s' % job_id,
            object=harvest_object_model,
        ).save()

        show_status = toolkit.get_action('harvest_source_show_status')
        status = show_status(context, {'id': source_id})

        send_error_mail(context, source_id, status)

        # Exactly one errored object is reported and the mailer was invoked.
        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
コード例 #5
0
    def import_stage(self, harvest_object):
        ''' Save the harvested dataset to CKAN.

        Loads the package dict serialized in ``harvest_object.content``,
        refreshes its extras through ``self.update_extras`` and, when the
        stored action is ``'create'``, calls the ``package_create`` action.

        :param harvest_object: object whose ``content`` is a JSON package
            dict that includes an ``'action'`` key
        :returns: False when creation fails for a reason other than the name
            being already in use (a HarvestObjectError is saved first)
        '''
        logger.info('Importing {}'.format(harvest_object.id))
        self.set_paths()

        package_dict = json.loads(harvest_object.content)
        action = package_dict.pop('action')
        extras = package_dict.get('extras', [])

        extras = self.update_extras(extras, harvest_object)
        package_dict['extras'] = extras
        # Resources are taken out of the dict that is sent to CKAN.
        resources = package_dict.pop('resources', [])

        # Save (create or update) to CKAN.
        # The base class helper ._create_or_update_package does not seem
        #   useful for dealing with resources.
        user_name = self._get_user_name()
        context = {'model': model, 'session': model.Session, 'user': user_name}

        if action == 'create':
            try:
                pkg = p.toolkit.get_action('package_create')(context, package_dict)
            except Exception as e:
                logger.error('Error creating package {}: {}'.format(str(e), package_dict))
                # TODO: this should not happen
                # Substring test instead of find() > 0, which missed a match
                # located at the very start of the error text.
                if 'already in use' in str(e):
                    action = 'update'
                else:
                    msg = 'Import CREATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                    harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                    harvest_object_error.save()
                    return False
コード例 #6
0
    def is_part_of_to_package_id(self, ipo, harvest_object):
        """ Find the parent dataset referenced by an external isPartOf identifier.

        Searches for packages whose ``identifier`` extra equals ``ipo`` and
        that are flagged as collection metadata, then returns the first
        result that belongs to the same harvest source as ``harvest_object``.

        :param ipo: identifier taken from the external source's isPartOf
        :param harvest_object: object being processed; its ``source`` is
            compared with the harvest source of every candidate dataset
        :returns: the matching dataset dict from the package_search results
        :raises ParentNotHarvestedException: when no matching parent exists;
            before raising, an error record is saved on a best-effort basis
            and the harvest object is marked as "ERROR"
        """
        ps = p.toolkit.get_action('package_search')
        query = 'extras_identifier:{} AND extras_collection_metadata:true'.format(
            ipo)
        results = ps(self.context(), {"fq": query})
        log.info('Package search results {}'.format(results))

        # Even with a single hit we must make sure it is the parent we need:
        # identifiers may collide, so the harvest source is checked as well.
        if results['count'] > 0:
            datasets = results['results']
            harvest_source = harvest_object.source

            for dataset in datasets:
                extras = dataset.get('extras', [])
                identifiers = [
                    extra['value'] for extra in extras
                    if extra['key'] == 'identifier'
                ]
                if ipo not in identifiers:
                    log.error('BAD SEARCH for {}:{}'.format(ipo, identifiers))
                    continue

                dataset_harvest_source_id = self.get_harvest_source_id(
                    dataset['id'])

                if harvest_source.id == dataset_harvest_source_id:
                    log.info('Parent dataset identified correctly')
                    return dataset
                else:
                    log.info('{} not found at {} for {}'.format(
                        harvest_source.id, dataset_harvest_source_id, ipo))

        # Zero results, or only bad ones.
        msg = 'Parent identifier not found: "{}"'.format(ipo)
        log.error(msg)
        try:
            harvest_object_error = HarvestObjectError(message=msg,
                                                      object=harvest_object)
            harvest_object_error.save()
            harvest_object.state = "ERROR"
            harvest_object.save()
        except Exception as e:
            # Recording the failure stays best-effort, but it is no longer
            # silent and no longer traps KeyboardInterrupt/SystemExit.
            log.error('Unable to save the parent-not-found error: {}'.format(e))
        raise ParentNotHarvestedException(
            'Unable to find parent dataset. Raising error to allow re-run later'
        )
コード例 #7
0
ファイル: base.py プロジェクト: ngds/ckanext-harvest
 def _save_object_error(self, message, obj, stage=u'Fetch', line=None):
     '''
     Create and save a HarvestObjectError for ``obj``.

     :param message: error text stored on the error record
     :param obj: harvest object the error is attached to
     :param stage: stage name stored on the record (default ``u'Fetch'``)
     :param line: optional line value stored on the record
     '''
     err = HarvestObjectError(message=message,
                              object=obj,
                              stage=stage,
                              line=line)
     try:
         err.save()
     except InvalidRequestError:
         # The first save failed with InvalidRequestError: roll the session
         # back and retry once. A failure of the retry propagates.
         Session.rollback()
         err.save()
コード例 #8
0
class SIUTransparenciaHarvester(HarvesterBase):

    def set_paths(self):
        '''
        Prepare the results folder and the SIU data library.

        The results folder is ``siu-harvester-results`` under the
        ``CKAN_STORAGE_PATH`` environment variable, or under this module's
        directory when that variable is not set. It is created if missing.
        '''
        module_dir = os.path.dirname(os.path.abspath(__file__))
        storage_root = os.environ.get('CKAN_STORAGE_PATH', module_dir)
        results_dir = os.path.join(storage_root, 'siu-harvester-results')
        self.results_folder_path = results_dir
        if not os.path.isdir(results_dir):
            os.makedirs(results_dir)

        # Library that manages the data in the SIU portal.
        self.siu_data_lib = SIUPoratlTransparenciaData()
        
    ## IHarvester
    def info(self):
        '''
        Describe this harvester.

        :returns: A dictionary with the harvester descriptors (``name``,
            ``title``, ``description`` and ``form_config_interface``)
        '''
        descriptors = dict(
            name='siu_transp',
            title='SIU Portal de transparencia',
            description='Extraer y publicar datos del portal de transparecnia de SIU',
            form_config_interface='Text',
        )
        return descriptors


    def validate_config(self, config):
        '''

        [optional]

        Validate the configuration entered in the harvest source form.

        The config must be a JSON object. If it has a ``from_url`` key, the
        JSON document at that URL is downloaded and merged over it before the
        required keys (``username`` and ``password``) are checked. Exceptions
        raised will be shown in the form's error messages.

        :param config: Config string coming from the form
        :returns: The config string exactly as received, once validated
        :raises ValueError: if the config is empty, is not valid JSON, is not
            a JSON object, or lacks a required key
        '''

        if not config:
            raise ValueError('Set up the required configuration settings')

        # json.loads already raises ValueError on malformed JSON; it is left
        # to propagate untouched.
        config_obj = json.loads(config)
        if not isinstance(config_obj, dict):
            # A JSON list/number/string would otherwise fail below with an
            # AttributeError on .get()
            raise ValueError('The configuration must be a JSON object')

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = config_obj.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            config_obj.update(update_config)

        required_cfg = ['username', 'password']  # , 'owner_org']
        faileds = [req for req in required_cfg if req not in config_obj]

        if faileds:
            raise ValueError('Missing configs: {}'.format(faileds))

        return config

    def gather_stage(self, harvest_job):
        '''
        Analyze the source and create one HarvestObject per dataset.

        For every query file exposed by the SIU data library (optionally
        restricted by the ``only_files`` config key) all data is requested,
        request errors are saved as HarvestGatherError records, and each
        resulting dataset is serialized into a HarvestObject. Metadata found
        under the ``override`` config key for that file name is merged over
        each dataset dict.

        :param harvest_job: job being run; its ``source`` provides the URL
            and the JSON config
        :returns: list with the ids of the HarvestObjects created
        '''
        logger.info('Starts Gather SIU Transp')
        # load paths
        self.set_paths()
        self.siu_data_lib.get_query_files()

        # basic things you'll need
        self.source = harvest_job.source
        self.source_config = json.loads(self.source.config)

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = self.source_config.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            self.source_config.update(update_config)

        self.siu_data_lib.base_url = self.source.url
        self.siu_data_lib.username = self.source_config['username']
        self.siu_data_lib.password = self.source_config['password']

        # ####################################
        # get previous harvested packages
        pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
        prev_names = [pkg['name'] for pkg in pfr['results']]
        logger.info('Get previous harvested objects {}'.format(prev_names))
        # TODO
        # ####################################

        object_ids = []  # list of IDs to process; this is what the function returns

        self.source_dataset = get_harvest_source(self.source.id)
        owner_org = self.source_dataset.get('owner_org')
        logger.info('Gather SIU Transp to ORG {}'.format(owner_org))

        # Iterate over every query to obtain the different data sets.
        # For each file in siu_transp_data/queries, multiple datasets will be generated for publishing.

        report = []  # summary of all the results
        logger.info('Iter files')

        # check whether the config asks to override metadata in the datasets of each file
        override = self.source_config.get('override', {})
        logger.info("General override {}".format(override))

        for qf in self.siu_data_lib.query_files:
            only_files = self.source_config.get('only_files', None)
            query_file_name = qf.split('/')[-1]
            if only_files is not None:
                if query_file_name not in only_files:
                    logger.info('Skipping file by config {}'.format(query_file_name))
                    continue

            logger.info('Gather SIU Transp FILE {}'.format(qf))
            stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
            # open to read query params
            stqf.open()
            # request all data
            stqf.request_all(results_folder_path=self.results_folder_path)
            # persist every error collected on the query file as a gather error
            for err in stqf.errors:
                hgerr = HarvestGatherError(message=err, job=harvest_job)
                hgerr.save()


            # ====== Prepare dict to override datasets metadata ============
            # NOTE: when the file has an entry under 'override', override_this
            # is that same dict object, so the assignments below modify
            # self.source_config in place.
            override_this = override.get(query_file_name, {})
            logger.info("To override {}: {}".format(query_file_name, override_this))

            # extras need to be {"key": "extra name", "value": "extra value"}
            extras = override_this.get('extras', {})
            new_extras = []
            for extra_key, extra_value in extras.iteritems():
                logger.info("Override extra found {}: {}".format(extra_key, extra_value))
                if not isinstance(extra_value, str):
                    extra_value = str(extra_value)
                new_extras.append({"key": extra_key, "value": extra_value})

            if len(new_extras) > 0:
                override_this['extras'] = new_extras

            # tags need to be {"name": "tag name"}
            tags = override_this.get('tags', [])
            new_tags = []
            for tag in tags:
                logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
                new_tags.append({"name": tag})

            if len(new_tags) > 0:
                override_this['tags'] = new_tags

            # groups need to be {"name": "group name"}
            groups = override_this.get('groups', [])
            new_groups = []
            for group in groups:
                logger.info("Override group found {}".format(group))
                # try to create the group; a failure is only logged and the
                # group is still added to the override list
                context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
                try:
                    p.toolkit.get_action('group_create')(context, {"name": group})
                except Exception as e:
                    logger.error('Error creating group (skipped) {}: {}'.format(group, e))

                new_groups.append({"name": group})

            if len(new_groups) > 0:
                override_this['groups'] = new_groups

            # ================================

            report += stqf.requests
            for dataset in stqf.datasets:
                if dataset['name'] in prev_names:
                    action = 'update'
                    # leave this list just with packages to remove
                    prev_names.remove(dataset['name'])
                else:
                    action = 'create'
                logger.info('Dataset {} to {}'.format(dataset['name'], action))
                ho_dict = {
                    'title': dataset['title'],
                    'name': dataset['name'],
                    'owner_org': owner_org,
                    'notes': dataset['notes'],
                    'tags': dataset['tags'],
                    'resources': dataset['resources'],
                    'action': action
                }

                # apply the per-file overrides (extras, tags, groups, ...) on top
                ho_dict.update(override_this)
                logger.info("Overrided ho_dict {}".format(ho_dict))


                # Each harvest object will be passed to other stages in harvest process
                obj = HarvestObject(guid=dataset['name'],
                                    job=harvest_job,
                                    content=json.dumps(ho_dict))
                obj.save()
                logger.info('Objects ID appends {}'.format(obj.id))
                object_ids.append(obj.id)

        # TODO compare with previous harvested data to remove dataset no more at harvest source

        # final summary
        logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))
        return object_ids
    
    def fetch_stage(self, harvest_object):
        ''' Download and get what is needed before importing to CKAN.

        Nothing is fetched here (the data was already downloaded in the
        gather stage); the object id is logged and True is returned.
        '''
        fetch_message = 'Fetching {}'.format(harvest_object.id)
        logger.info(fetch_message)
        return True
    
    def import_stage(self, harvest_object):
        ''' Save the harvested dataset to CKAN.

        Loads the package dict serialized in ``harvest_object.content``,
        refreshes its extras through ``self.update_extras`` and then calls
        ``package_create`` or ``package_update`` depending on the stored
        action. A create that fails because the name is already in use falls
        back to an update.

        :param harvest_object: object whose ``content`` is a JSON package
            dict that includes an ``'action'`` key
        :returns: False when the create or the update fails (a
            HarvestObjectError is saved first)
        '''
        logger.info('Importing {}'.format(harvest_object.id))
        self.set_paths()

        package_dict = json.loads(harvest_object.content)
        action = package_dict.pop('action')
        extras = package_dict.get('extras', [])

        extras = self.update_extras(extras, harvest_object)
        package_dict['extras'] = extras
        # Resources are taken out of the dict that is sent to CKAN.
        resources = package_dict.pop('resources', [])

        # Save (create or update) to CKAN.
        # The base class helper ._create_or_update_package does not seem
        #   useful for dealing with resources.
        user_name = self._get_user_name()
        context = {'model': model, 'session': model.Session, 'user': user_name}

        if action == 'create':
            try:
                pkg = p.toolkit.get_action('package_create')(context, package_dict)
            except Exception as e:
                logger.error('Error creating package {}: {}'.format(str(e), package_dict))
                # TODO: this should not happen
                # Substring test instead of find() > 0, which missed a match
                # located at the very start of the error text.
                if 'already in use' in str(e):
                    action = 'update'
                else:
                    msg = 'Import CREATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                    harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                    harvest_object_error.save()
                    return False

        # Not an elif: a failed create may have switched the action to update.
        if action == 'update':
            try:
                pkg = p.toolkit.get_action('package_update')(context, package_dict)
            except Exception as e:
                msg = 'Import UPDATE error. pkg name: {}. \n\tError: {}'.format(package_dict.get('name', 'unnamed'), e)
                harvest_object_error = HarvestObjectError(message=msg, object=harvest_object)
                harvest_object_error.save()
                logger.error(msg)
                return False
コード例 #9
0
 def _save_object_error(self, message, obj, stage=u'Fetch'):
     # Save a HarvestObjectError describing the failure, then send the same
     # message to the error log.
     failure = HarvestObjectError(stage=stage, object=obj, message=message)
     failure.save()
     log.error(message)
コード例 #10
0
def _get_xml_url_content(xml_url, urlopen_timeout, harvest_object):
    '''
    Download ``xml_url`` and return the response if its body parses as XML.

    Any download or parse failure is logged as a warning and recorded as an
    'Import' stage HarvestObjectError on ``harvest_object``.

    The previous version ended its inner ``try`` with ``finally: return ''``,
    which replaced the successful ``return r`` and discarded every exception,
    so the function always returned '' and the StaleDataError handler could
    never run.

    :param xml_url: URL to download
    :param urlopen_timeout: timeout handed to ``requests.get``
    :param harvest_object: harvest object the error record is attached to
    :returns: the ``requests`` response on success, '' on any handled failure
    '''
    try:
        try:
            r = requests.get(xml_url, timeout=urlopen_timeout)
            ET.XML(r.content)  # test for valid xml
            return r
        except (ET.ParseError, requests.exceptions.Timeout) as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)
        except requests.exceptions.TooManyRedirects as e:
            # requests exceptions have no ``code`` attribute; use the
            # exception text instead.
            msg = 'HTTP too many redirects: %s' % str(e)
        except requests.exceptions.RequestException as e:
            msg = 'HTTP request exception: %s' % str(e)
        except Exception as e:
            msg = '%s: %s. From external XML content at %s' % (
                type(e).__name__, str(e), xml_url)

        # Only reached when one of the handlers above built a message.
        log.warn(msg)
        err = HarvestObjectError(message=msg,
                                 object=harvest_object,
                                 stage='Import')
        err.save()
    except StaleDataError as e:
        log.warn('Harvest object %s is stail. Error object not created. %s' %
                 (harvest_object.id, str(e)))

    return ''