def get_record_type(cls, xml):
        '''
        Return the "type" of an ISO19139 record.

        The type is the record's 'resource-type' value,
        e.g. "dataset", "series", "service".

        xml - etree of the ISO19139 XML record
        '''
        return GeminiDocument(xml_tree=xml).read_value('resource-type')
Exemple #2
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Iterates over every active package that has an active
        'resource-type' extra equal to 'service'. For each one the
        'coupled-resource' extra is parsed as JSON and every entry's href
        is resolved in turn:

        1. If extract_guid() finds a GUID in the href, the harvest object
           is looked up by that GUID and the coupling is recorded with
           cls.add_coupling().
        2. Otherwise hrefs containing a known-bad fragment are rejected.
        3. Otherwise the href is downloaded and parsed as a Gemini
           document to read its 'guid' value.

        Outcomes are tallied in service_stats / couple_stats and logged.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Substrings that mark an href as a known bad couple. Constant, so
        # built once rather than once per coupled resource.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        service_records = (model.Session.query(model.Package)
                           .filter_by(state='active')
                           .join(model.PackageExtra)
                           .filter_by(state='active')
                           .filter_by(key='resource-type')
                           .filter_by(value='service'))
        for service_record in service_records:

            # Find coupled dataset records
            # NOTE(review): the query above only returns records whose
            # 'resource-type' is 'service', so this value can presumably
            # never be 'view' or 'download' - a different extra (the
            # service type) may have been intended. TODO confirm.
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    # The service type belongs in the stats category; the
                    # record name is the value, as in every other add().
                    service_stats.add('No coupled-resource extra for %s type (where it is mandatory)' % service_type,
                                      service_record.name)
                else:
                    service_stats.add('No coupled-resource extra (but not mandatory for this service type)', service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&amp;REQUEST=GetRecordById&amp;ID=9df8df52-d788-37a8-e044-0003ba9b0d98&amp;elementSetName=full&amp;OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        # NOTE(review): a whitespace-only guid is reported
                        # here but the lookup below is still attempted.
                        couple_stats.add('Guid was blank', couple_id)
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # Known bad couples are weeded out. Every matching fragment
                # is reported, so one href can add several stats entries.
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' % e,
                                     couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    # break, not continue: the remaining coupled resources
                    # of this service record are not processed.
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' % res.reason,
                                     couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError).
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break
Exemple #3
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Iterates over every active package that has an active
        'resource-type' extra equal to 'service'. For each one the
        'coupled-resource' extra is parsed as JSON and every entry's href
        is resolved in turn:

        1. If extract_guid() finds a GUID in the href, the harvest object
           is looked up by that GUID and the coupling is recorded with
           cls.add_coupling().
        2. Otherwise hrefs containing a known-bad fragment are rejected.
        3. Otherwise the href is downloaded and parsed as a Gemini
           document to read its 'guid' value.

        Outcomes are tallied in service_stats / couple_stats and logged.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Substrings that mark an href as a known bad couple. Constant, so
        # built once rather than once per coupled resource.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        service_records = (model.Session.query(model.Package)
                           .filter_by(state='active')
                           .join(model.PackageExtra)
                           .filter_by(state='active')
                           .filter_by(key='resource-type')
                           .filter_by(value='service'))
        for service_record in service_records:

            # Find coupled dataset records
            # NOTE(review): the query above only returns records whose
            # 'resource-type' is 'service', so this value can presumably
            # never be 'view' or 'download' - a different extra (the
            # service type) may have been intended. TODO confirm.
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    # The service type belongs in the stats category; the
                    # record name is the value, as in every other add().
                    service_stats.add('No coupled-resource extra for %s type (where it is mandatory)' % service_type,
                                      service_record.name)
                else:
                    service_stats.add('No coupled-resource extra (but not mandatory for this service type)', service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&amp;REQUEST=GetRecordById&amp;ID=9df8df52-d788-37a8-e044-0003ba9b0d98&amp;elementSetName=full&amp;OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        # NOTE(review): a whitespace-only guid is reported
                        # here but the lookup below is still attempted.
                        couple_stats.add('Guid was blank', couple_id)
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # Known bad couples are weeded out. Every matching fragment
                # is reported, so one href can add several stats entries.
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' % e,
                                     couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    # break, not continue: the remaining coupled resources
                    # of this service record are not processed.
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' % res.reason,
                                     couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError).
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break