def _features_as_json(self, features):
     '''Returns features in JSON format, with this structure:
     {"type": "FeatureCollection",
      "features":
       [
         {   "type": "Feature",
             "geometry": {"type": "Point", "coordinates": [102.0, 0.5]},
             "properties": {"ID": 11,
                            "SchoolName": "Camden",
                            "SchoolType": "Primary",
                            "StreetName": "Camden Road",
                            "Town": "Carshalton",
                            "Postcode": "SM5 2NS",
                            "TelephoneNumber": "020 86477324",
                            "Easting": 527700.179,
                            "Northing": 164916.916}
         },
         ...
       ]
     }
     '''
     feature_dicts = []
     for feature in features:
         # ignore feature['datasetid']
         try:
             properties = json.loads(feature['properties'])
         except ValueError:
             log.error('Properties did not parse as JSON. Dataset: %s Properties: %r',
                       feature['datasetid'], feature['properties'])
             properties = 'Error loading properties'
         coords = parse_point_wkt(feature['geom'])
         feature_dict = {'type': 'Feature',
                         'geometry': {
                             'type': 'Point',
                             'coordinates': coords,
                             },
                         'properties': properties,
                         }
         feature_dicts.append(feature_dict)
     features_dict = {'type': 'FeatureCollection',
                      'features': feature_dicts}
     return json.dumps(features_dict)
Ejemplo n.º 2
0
def update_coupled_resources(package, harvest_source_reference):
    '''Update the harvest_coupled_resource_table with the details of this
    harvested package's couplings.

    Only packages whose 'resource-type' extra is 'service' are processed
    by the code below; for any other resource type the function returns
    without touching the table.

    For a service record, each couple in the 'coupled-resource' extra is
    reconciled against the HarvestCoupledResource rows:

    * a row already linking this service to the couple's ref is kept;
    * a row with that ref and no service has this service filled in;
    * otherwise a new row (service, ref, no dataset) is created.

    Rows previously linked to this service that no longer appear in the
    package have their service value cleared.

    :param package: the Package object containing extra fields with couples
                    to update in the table.
    :param harvest_source_reference: the ref of this package being harvested.
                    This is not relevant if it is a service record, but
                    essential if it is a dataset.
    '''
    resource_type = package.extras['resource-type']
    if resource_type == 'service':
        # When a service record is harvested, ensure the couples listed
        # in it match the couples in the HarvestCoupledResource objects,
        # ignoring their dataset values (they might be filled in or not).
        pkg_couples_str = package.extras['coupled-resource']
        pkg_couples = json.loads(pkg_couples_str)
        log.info('Service Record %s has %i coupled resources to update',
                 package.name, len(pkg_couples))

        table_couples_matching_service = \
            HarvestCoupledResource.get_by_service_record(package)
        # Start from every row currently linked to this service and cross
        # them off as we find them in the package.
        table_couples_not_matching_pkg = table_couples_matching_service.all()

        for pkg_couple in pkg_couples:
            try:
                ref = extract_harvest_source_reference_from_coupled_resource(
                    pkg_couple)
            except CoupledResourceParseError as e:
                log.warning('Error parsing couple: %s Ignoring couple=%s',
                            e, pkg_couple)
                continue
            # Match both service and ref
            matching_table_couples = table_couples_matching_service.filter_by(
                harvest_source_reference=ref)
            if matching_table_couples.count() > 0:
                # Test: test_02_reharvest_existing_service
                # Note down the matches so we don't delete them later
                for matching_table_couple in matching_table_couples:
                    log.info('Service couple is already there (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                    # The package may list the same ref more than once, in
                    # which case the row has already been crossed off.
                    if matching_table_couple in table_couples_not_matching_pkg:
                        table_couples_not_matching_pkg.remove(
                            matching_table_couple)
                continue
            # Match just ref with blank service
            matching_table_couples = \
                HarvestCoupledResource.get_by_harvest_source_reference(ref)\
                .filter_by(service_record=None)
            if matching_table_couples.count() == 0:
                # Test: test_06_harvest_service_not_matching_a_dataset
                # create the row
                obj = HarvestCoupledResource(service_record=package,
                                             harvest_source_reference=ref)
                model.Session.add(obj)
                log.info('Ref is new for this service - adding (%s, %s, None)',
                         package.name, ref)
                model.Session.commit()
            else:
                # Test: test_04_harvest_service_to_match_existing_dataset
                for matching_table_couple in matching_table_couples:
                    # fill in the service value
                    matching_table_couple.service_record = package
                    log.info('Service filled into couple matching ref (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

        # Delete service value for any table_couples not matching the package
        # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
        for table_couple in table_couples_not_matching_pkg:
            # Log the row's own ref: the loop variable `ref` above belongs to
            # the last package couple (and is unset if none parsed).
            log.info('Service couple not matched - deleted service (%s->None, %s, %s)',
                     _package_name(table_couple.service_record),
                     table_couple.harvest_source_reference,
                     _package_name(table_couple.dataset_record))
            table_couple.service_record = None
            model.Session.commit()
        return
Ejemplo n.º 3
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Every active package with an active 'resource-type' extra equal to
        'service' is examined. Its 'coupled-resource' extra is parsed as
        JSON and each couple's href is handled as follows:

        * if extract_guid() yields a GUID, the harvest object with that GUID
          is looked up and cls.add_coupling() is called for the pair;
        * if the href contains a known-bad fragment it is recorded as an
          invalid couple;
        * otherwise the href is fetched over HTTP and the response parsed
          as a Gemini document to read its GUID.

        Outcomes are tallied in service_stats / couple_stats and logged.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Known bad couples are weeded out: hrefs containing any of these
        # fragments are not followed. Defined once, outside the loops.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        service_records = (model.Session.query(model.Package)
                           .filter_by(state='active')
                           .join(model.PackageExtra)
                           .filter_by(state='active')
                           .filter_by(key='resource-type')
                           .filter_by(value='service'))
        for service_record in service_records:

            # Find coupled dataset records
            # NOTE(review): the query above only returns records whose
            # 'resource-type' is 'service', so the ('view', 'download')
            # branch below looks unreachable - confirm which extra was meant.
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    service_stats.add(
                        'No coupled-resource extra for %s type (where it is mandatory)' % service_type,
                        service_record.name)
                else:
                    service_stats.add(
                        'No coupled-resource extra (but not mandatory for this service type)',
                        service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # The href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&REQUEST=GetRecordById&ID=9df8df52-d788-37a8-e044-0003ba9b0d98&elementSetName=full&OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        # NOTE(review): a whitespace-only guid is reported
                        # but the lookup below still runs - confirm intent.
                        couple_stats.add('Guid was blank', couple_id)
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # No GUID in the href: reject it if it matches a known bad
                # pattern (every matching pattern is recorded).
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' % e,
                                     couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    # A failed download abandons the remaining couples of
                    # this service record.
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' % res.reason,
                                     couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    # NOTE(review): the GUID read here is not used further
                    # in this block.
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError).
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break
Ejemplo n.º 4
0
    def validate_json(self, received_data):
        '''Report whether received_data can be parsed as JSON.

        :param received_data: the text to check.
        :returns: True if json.loads() accepts the data, False if it raises
                  ValueError (malformed JSON) or TypeError (input of a type
                  json.loads() rejects, e.g. None).
        '''
        try:
            json.loads(received_data)
        except (TypeError, ValueError):
            return False
        # Explicit success value: falling off the end would return None,
        # which callers cannot tell apart from False in a truth test.
        return True
def update_coupled_resources(package, harvest_source_reference):
    '''Update the harvest_coupled_resource_table with the details of this
    harvested package's couplings.

    Only packages whose 'resource-type' extra is 'service' are processed
    by the code below; for any other resource type the function returns
    without touching the table.

    For a service record, each couple in the 'coupled-resource' extra is
    reconciled against the HarvestCoupledResource rows:

    * a row already linking this service to the couple's ref is kept;
    * a row with that ref and no service has this service filled in;
    * otherwise a new row (service, ref, no dataset) is created.

    Rows previously linked to this service that no longer appear in the
    package have their service value cleared.

    :param package: the Package object containing extra fields with couples
                    to update in the table.
    :param harvest_source_reference: the ref of this package being harvested.
                    This is not relevant if it is a service record, but
                    essential if it is a dataset.
    '''
    resource_type = package.extras['resource-type']
    if resource_type == 'service':
        # When a service record is harvested, ensure the couples listed
        # in it match the couples in the HarvestCoupledResource objects,
        # ignoring their dataset values (they might be filled in or not).
        pkg_couples_str = package.extras['coupled-resource']
        pkg_couples = json.loads(pkg_couples_str)
        log.info('Service Record %s has %i coupled resources to update',
                 package.name, len(pkg_couples))

        table_couples_matching_service = \
            HarvestCoupledResource.get_by_service_record(package)
        # Start from every row currently linked to this service and cross
        # them off as we find them in the package.
        table_couples_not_matching_pkg = table_couples_matching_service.all()

        for pkg_couple in pkg_couples:
            try:
                ref = extract_harvest_source_reference_from_coupled_resource(
                    pkg_couple)
            except CoupledResourceParseError as e:
                log.warning('Error parsing couple: %s Ignoring couple=%s', e,
                            pkg_couple)
                continue
            # Match both service and ref
            matching_table_couples = table_couples_matching_service.filter_by(
                harvest_source_reference=ref)
            if matching_table_couples.count() > 0:
                # Test: test_02_reharvest_existing_service
                # Note down the matches so we don't delete them later
                for matching_table_couple in matching_table_couples:
                    log.info(
                        'Service couple is already there (%s, %s, %s)',
                        package.name, ref,
                        _package_name(matching_table_couple.dataset_record))
                    # The package may list the same ref more than once, in
                    # which case the row has already been crossed off.
                    if matching_table_couple in table_couples_not_matching_pkg:
                        table_couples_not_matching_pkg.remove(
                            matching_table_couple)
                continue
            # Match just ref with blank service
            matching_table_couples = \
                HarvestCoupledResource.get_by_harvest_source_reference(ref)\
                .filter_by(service_record=None)
            if matching_table_couples.count() == 0:
                # Test: test_06_harvest_service_not_matching_a_dataset
                # create the row
                obj = HarvestCoupledResource(service_record=package,
                                             harvest_source_reference=ref)
                model.Session.add(obj)
                log.info('Ref is new for this service - adding (%s, %s, None)',
                         package.name, ref)
                model.Session.commit()
            else:
                # Test: test_04_harvest_service_to_match_existing_dataset
                for matching_table_couple in matching_table_couples:
                    # fill in the service value
                    matching_table_couple.service_record = package
                    log.info(
                        'Service filled into couple matching ref (%s, %s, %s)',
                        package.name, ref,
                        _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

        # Delete service value for any table_couples not matching the package
        # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
        for table_couple in table_couples_not_matching_pkg:
            # Log the row's own ref: the loop variable `ref` above belongs to
            # the last package couple (and is unset if none parsed).
            log.info(
                'Service couple not matched - deleted service (%s->None, %s, %s)',
                _package_name(table_couple.service_record),
                table_couple.harvest_source_reference,
                _package_name(table_couple.dataset_record))
            table_couple.service_record = None
            model.Session.commit()
        return
Ejemplo n.º 6
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Every active package with an active 'resource-type' extra equal to
        'service' is examined. Its 'coupled-resource' extra is parsed as
        JSON and each couple's href is handled as follows:

        * if extract_guid() yields a GUID, the harvest object with that GUID
          is looked up and cls.add_coupling() is called for the pair;
        * if the href contains a known-bad fragment it is recorded as an
          invalid couple;
        * otherwise the href is fetched over HTTP and the response parsed
          as a Gemini document to read its GUID.

        Outcomes are tallied in service_stats / couple_stats and logged.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Known bad couples are weeded out: hrefs containing any of these
        # fragments are not followed. Defined once, outside the loops.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        service_records = (model.Session.query(model.Package)
                           .filter_by(state='active')
                           .join(model.PackageExtra)
                           .filter_by(state='active')
                           .filter_by(key='resource-type')
                           .filter_by(value='service'))
        for service_record in service_records:

            # Find coupled dataset records
            # NOTE(review): the query above only returns records whose
            # 'resource-type' is 'service', so the ('view', 'download')
            # branch below looks unreachable - confirm which extra was meant.
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    service_stats.add(
                        'No coupled-resource extra for %s type (where it is mandatory)' % service_type,
                        service_record.name)
                else:
                    service_stats.add(
                        'No coupled-resource extra (but not mandatory for this service type)',
                        service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # The href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&REQUEST=GetRecordById&ID=9df8df52-d788-37a8-e044-0003ba9b0d98&elementSetName=full&OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        # NOTE(review): a whitespace-only guid is reported
                        # but the lookup below still runs - confirm intent.
                        couple_stats.add('Guid was blank', couple_id)
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # No GUID in the href: reject it if it matches a known bad
                # pattern (every matching pattern is recorded).
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' % e,
                                     couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    # A failed download abandons the remaining couples of
                    # this service record.
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' % res.reason,
                                     couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    # NOTE(review): the GUID read here is not used further
                    # in this block.
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError).
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break
Ejemplo n.º 7
0
	def validate_json(self, received_data):
		'''Report whether received_data can be parsed as JSON.

		:param received_data: the text to check.
		:returns: True if json.loads() accepts the data, False if it raises
		          ValueError (malformed JSON) or TypeError (input of a type
		          json.loads() rejects, e.g. None).
		'''
		try:
			json.loads(received_data)
		except (TypeError, ValueError):
			return False
		# Explicit success value: falling off the end would return None,
		# which callers cannot tell apart from False in a truth test.
		return True