def _features_as_json(self, features):
    '''Serialise feature rows as a GeoJSON FeatureCollection string.

    Each item of `features` is indexed by these keys:
      * 'properties' - a JSON-encoded string
      * 'geom' - handed to parse_point_wkt() to obtain the coordinates
      * 'datasetid' - only used when logging a properties parse error

    The returned string has this structure:

    {"type": "FeatureCollection",
     "features": [
       {"type": "Feature",
        "geometry": {"type": "Point", "coordinates": [102.0, 0.5]},
        "properties": {"ID": 11,
                       "SchoolName": "Camden",
                       "SchoolType": "Primary",
                       "StreetName": "Camden Road",
                       "Town": "Carshalton",
                       "Postcode": "SM5 2NS",
                       "TelephoneNumber": "020 86477324",
                       "Easting": 527700.179,
                       "Northing": 164916.916}
       },
       ...
     ]
    }

    When a row's properties do not parse as JSON the error is logged and
    the string 'Error loading properties' is emitted in their place.
    '''
    def as_geojson_feature(row):
        # Build the GeoJSON Feature dict for a single row.
        try:
            props = json.loads(row['properties'])
        except ValueError:
            log.error('Properties did not parse as JSON. Dataset: %s Properties: %r',
                      row['datasetid'], row['properties'])
            props = 'Error loading properties'
        point = {
            'type': 'Point',
            'coordinates': parse_point_wkt(row['geom']),
        }
        return {
            'type': 'Feature',
            'geometry': point,
            'properties': props,
        }

    collection = {
        'type': 'FeatureCollection',
        'features': [as_geojson_feature(row) for row in features],
    }
    return json.dumps(collection)
def update_coupled_resources(package, harvest_source_reference):
    '''Update the harvest_coupled_resource_table with the details of this
    harvested package's couplings.

    Only packages whose 'resource-type' extra is 'service' are handled
    here; for any other resource type the function returns without
    touching the table.

    For a service record, the couples listed in its 'coupled-resource'
    extra (a JSON string) are reconciled with the HarvestCoupledResource
    rows:
      * rows already linking this service to a listed ref are kept;
      * a listed ref that has rows with no service gets this service
        filled in;
      * a listed ref with no such row gets a new row (with no dataset);
      * rows linking this service to a ref that is no longer listed have
        their service cleared.

    :param package: the Package object containing extra fields with couples
                    to update in the table.
    :param harvest_source_reference: the ref of this package being harvested.
                    This is not relevant if it is a service record, but
                    essential if it is a dataset. It is not read by the
                    service-record handling below.
    '''
    resource_type = package.extras['resource-type']
    if resource_type == 'service':
        # When a service record is harvested, ensure the couples listed
        # in it match the couples in the HarvestCoupledResource objects,
        # ignoring their dataset values (they might be filled in or not).
        pkg_couples_str = package.extras['coupled-resource']
        pkg_couples = json.loads(pkg_couples_str)
        log.info('Service Record %s has %i coupled resources to update',
                 package.name, len(pkg_couples))

        table_couples_matching_service = \
            HarvestCoupledResource.get_by_service_record(package)
        # cross them off as we go
        table_couples_not_matching_pkg = table_couples_matching_service.all()

        for pkg_couple in pkg_couples:
            try:
                ref = extract_harvest_source_reference_from_coupled_resource(
                    pkg_couple)
            except CoupledResourceParseError as e:
                log.warning('Error parsing couple: %s Ignoring couple=%s',
                            e, pkg_couple)
                continue

            # Match both service and ref
            matching_table_couples = table_couples_matching_service.filter_by(
                harvest_source_reference=ref)
            if matching_table_couples.count() > 0:
                # Test: test_02_reharvest_existing_service
                # Note down the matches so we don't delete them later
                for matching_table_couple in matching_table_couples:
                    log.info('Service couple is already there (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                    # The package may list the same ref more than once, in
                    # which case the row has already been crossed off and
                    # list.remove() would raise ValueError.
                    if matching_table_couple in table_couples_not_matching_pkg:
                        table_couples_not_matching_pkg.remove(
                            matching_table_couple)
                continue

            # Match just ref with blank service
            matching_table_couples = \
                HarvestCoupledResource.get_by_harvest_source_reference(ref)\
                .filter_by(service_record=None)
            if matching_table_couples.count() == 0:
                # Test: test_06_harvest_service_not_matching_a_dataset
                # create the row
                obj = HarvestCoupledResource(service_record=package,
                                             harvest_source_reference=ref)
                model.Session.add(obj)
                log.info('Ref is new for this service - adding (%s, %s, None)',
                         package.name, ref)
                model.Session.commit()
            else:
                # Test: test_04_harvest_service_to_match_existing_dataset
                for matching_table_couple in matching_table_couples:
                    # fill in the service value
                    matching_table_couple.service_record = package
                    log.info('Service filled into couple matching ref (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

        # Delete service value for any table_couples not matching the package
        # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
        for table_couple in table_couples_not_matching_pkg:
            # Log the row's own reference: the `ref` variable of the loop
            # above is unset when the package lists no parseable couples,
            # and otherwise holds an unrelated (the last processed) ref.
            log.info('Service couple not matched - deleted service (%s->None, %s, %s)',
                     _package_name(table_couple.service_record),
                     table_couple.harvest_source_reference,
                     _package_name(table_couple.dataset_record))
            table_couple.service_record = None
            model.Session.commit()
        return
def detect(cls):
    '''Finds datasets that are coupled and adds their
    harvest_source_reference to the HarvestObject and package extras.

    Every active package with an active 'resource-type' extra of value
    'service' is examined. Each entry of its 'coupled-resource' extra
    (a JSON list) is resolved as follows:
      * the entry's 'href' must be a list holding exactly one non-blank
        string;
      * if extract_guid() finds a guid in the href, the harvest object is
        looked up with cls.find_harvest_object_by_guid() and the couple is
        recorded with cls.add_coupling();
      * otherwise hrefs containing a known-bad fragment are skipped;
      * otherwise the href is downloaded and parsed as a Gemini document
        to read its guid. A failure at this stage stops processing the
        remaining couples of that service record.

    Outcomes are recorded in couple_stats / service_stats and logged.
    These names, `log`, `requests` and `FindError` are expected to be
    provided by the enclosing module.
    '''
    from ckan.lib.base import json
    from ckan import model
    from ckanext.harvest.model import HarvestObject
    from ckanext.spatial.model import GeminiDocument
    from ckanext.spatial.lib.coupled_resource import extract_guid

    # Known bad couples are weeded out. The tuple is constant, so it is
    # built once rather than once per coupled resource.
    bad_couples = ('GetCapabilities', 'CEH:EIDC', 'ceh:eidc',
                   'http://data.nbn.org.uk#',
                   'www.geostore.com/OGC/OGCInterface',
                   'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                   'Please enter a valid url',
                   )

    # Find service records
    for service_record in model.Session.query(model.Package).\
            filter_by(state='active').\
            join(model.PackageExtra).\
            filter_by(state='active').\
            filter_by(key='resource-type').\
            filter_by(value='service'):

        # Find coupled dataset records
        service_type = service_record.extras['resource-type']
        if 'coupled-resource' not in service_record.extras:
            # NOTE(review): the query above selects on resource-type ==
            # 'service', so this comparison with 'view'/'download' looks
            # like it can never match - confirm which extra was intended.
            if service_type in ('view', 'download'):
                # NOTE(review): every other stats call passes
                # (category, id) with the category pre-formatted; this one
                # passes three arguments and an unformatted '%s' - confirm
                # against the signature of service_stats.add.
                service_stats.add('No coupled-resource extra for %s type (where it is mandatory)', service_record.name, service_type)
            else:
                service_stats.add('No coupled-resource extra (but not mandatory for this service type)', service_record.name)
            continue
        coupled_resources_str = service_record.extras['coupled-resource']
        coupled_resources = json.loads(coupled_resources_str)
        log.info('%s has %i coupled resources',
                 service_record.name, len(coupled_resources))
        couples_all_detected = True
        couples_detected = False
        for i, coupled_resource in enumerate(coupled_resources):
            couple_id = '%s.%s' % (service_record.name, i)
            href = coupled_resource['href']

            # For tests only
            #if href != ['http://www.ordnancesurvey.co.uk/oswebsite/xml/products/Topo.xml']:
            #    break
            if len(href) != 1:
                log.error('Coupled resource href is not a list of 1: %r couple=%s',
                          href, couple_id)
                couple_stats.add('Couple href is length %i' % len(href), couple_id)
                couples_all_detected = False
                continue
            href = href[0]
            if not href.strip():
                log.error('Coupled resource href is blank. couple=%s', couple_id)
                couple_stats.add('Couple href is blank', couple_id)
                couples_all_detected = False
                continue

            # Look for the equivalent dataset resource

            # If it is CSW, we must extract the guid
            # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&REQUEST=GetRecordById&ID=9df8df52-d788-37a8-e044-0003ba9b0d98&elementSetName=full&OutputSchema=http://www.isotc211.org/2005/gmd
            guid = extract_guid(href)
            if guid:
                if not guid.strip():
                    couple_stats.add('Guid was blank', couple_id)
                    # Both values need a placeholder, otherwise logging
                    # fails to format the record and the message is lost.
                    log.error('Guid was blank. href=%s couple=%s',
                              href, couple_id)

                try:
                    harvest_object = cls.find_harvest_object_by_guid(guid)
                except FindError as e:
                    log.error('%s guid=%s couple=%s', e, guid, couple_id)
                    couple_stats.add(str(e), couple_id)
                    couples_all_detected = False
                    continue

                dataset_record = harvest_object.package  #res.resource_group.package
                couple_stats.add('Couple completed', couple_id)
                log.info('Couple completed %s <-> %s',
                         service_record.name, dataset_record.name)

                cls.add_coupling(service_record, dataset_record,
                                 harvest_object, guid)
                couples_detected = True
                continue

            # No guid in the href: check it against the known bad couples.
            # Every matching fragment is recorded, not just the first.
            bad_couple_detected = False
            for bad_couple in bad_couples:
                if bad_couple in href:
                    couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                    log.info('Invalid couple (%s): %s couple=%s',
                             bad_couple, href, couple_id)
                    bad_couple_detected = True
            if bad_couple_detected:
                couples_all_detected = False
                continue

            # Try as a WAF
            # Try the URL to download the gemini again, to find the
            # GUID of the dataset
            log.info('Trying possible WAF href: %s', href)
            try:
                res = requests.get(href, timeout=10)
            except Exception as e:
                # Deliberately broad: any failure to fetch the href is
                # recorded as a failed couple rather than aborting the run.
                couple_stats.add('Connecting to href failed: %s' % e,
                                 couple_id)
                log.warning('Connecting to href failed: %s href:"%s"',
                            e, href)
                couples_all_detected = False
                break
            if not res.ok:
                couple_stats.add('Resolving href failed: %s' % res.reason,
                                 couple_id)
                log.warning('Resolving href failed: %s %s href:"%s"',
                            res.status_code, res.reason, href)
                couples_all_detected = False
                break
            gemini = GeminiDocument(res.content)
            try:
                guid = gemini.read_value('guid')
            except KeyError:
                # The category string has no placeholder, so it must not be
                # %-formatted (doing so raises TypeError); the href goes in
                # the log message instead.
                couple_stats.add('Could not get GUID from Gemini downloaded',
                                 couple_id)
                log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                            href)
                couples_all_detected = False
                break
def validate_json(self, received_data):
    '''Report whether received_data is a parseable JSON document.

    :param received_data: the text to check.
    :returns: True if json.loads() accepts received_data; False if it is
              malformed JSON or is not something json.loads() can read at
              all (e.g. None).
    '''
    try:
        json.loads(received_data)
    except (ValueError, TypeError):
        # ValueError: malformed JSON. TypeError: not a string-like value.
        return False
    # An explicit True lets callers test the result by truthiness; falling
    # off the end would return None, which is as falsy as the failure case.
    return True
def update_coupled_resources(package, harvest_source_reference):
    '''Update the harvest_coupled_resource_table with the details of this
    harvested package's couplings.

    Only packages whose 'resource-type' extra is 'service' are handled
    here; for any other resource type the function returns without
    touching the table.

    For a service record, the couples listed in its 'coupled-resource'
    extra (a JSON string) are reconciled with the HarvestCoupledResource
    rows:
      * rows already linking this service to a listed ref are kept;
      * a listed ref that has rows with no service gets this service
        filled in;
      * a listed ref with no such row gets a new row (with no dataset);
      * rows linking this service to a ref that is no longer listed have
        their service cleared.

    :param package: the Package object containing extra fields with couples
                    to update in the table.
    :param harvest_source_reference: the ref of this package being harvested.
                    This is not relevant if it is a service record, but
                    essential if it is a dataset. It is not read by the
                    service-record handling below.
    '''
    resource_type = package.extras['resource-type']
    if resource_type == 'service':
        # When a service record is harvested, ensure the couples listed
        # in it match the couples in the HarvestCoupledResource objects,
        # ignoring their dataset values (they might be filled in or not).
        pkg_couples_str = package.extras['coupled-resource']
        pkg_couples = json.loads(pkg_couples_str)
        log.info('Service Record %s has %i coupled resources to update',
                 package.name, len(pkg_couples))

        table_couples_matching_service = \
            HarvestCoupledResource.get_by_service_record(package)
        # cross them off as we go
        table_couples_not_matching_pkg = table_couples_matching_service.all()

        for pkg_couple in pkg_couples:
            try:
                ref = extract_harvest_source_reference_from_coupled_resource(
                    pkg_couple)
            except CoupledResourceParseError as e:
                log.warning('Error parsing couple: %s Ignoring couple=%s',
                            e, pkg_couple)
                continue

            # Match both service and ref
            matching_table_couples = table_couples_matching_service.filter_by(
                harvest_source_reference=ref)
            if matching_table_couples.count() > 0:
                # Test: test_02_reharvest_existing_service
                # Note down the matches so we don't delete them later
                for matching_table_couple in matching_table_couples:
                    log.info('Service couple is already there (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                    # The package may list the same ref more than once, in
                    # which case the row has already been crossed off and
                    # list.remove() would raise ValueError.
                    if matching_table_couple in table_couples_not_matching_pkg:
                        table_couples_not_matching_pkg.remove(
                            matching_table_couple)
                continue

            # Match just ref with blank service
            matching_table_couples = \
                HarvestCoupledResource.get_by_harvest_source_reference(ref)\
                .filter_by(service_record=None)
            if matching_table_couples.count() == 0:
                # Test: test_06_harvest_service_not_matching_a_dataset
                # create the row
                obj = HarvestCoupledResource(service_record=package,
                                             harvest_source_reference=ref)
                model.Session.add(obj)
                log.info('Ref is new for this service - adding (%s, %s, None)',
                         package.name, ref)
                model.Session.commit()
            else:
                # Test: test_04_harvest_service_to_match_existing_dataset
                for matching_table_couple in matching_table_couples:
                    # fill in the service value
                    matching_table_couple.service_record = package
                    log.info('Service filled into couple matching ref (%s, %s, %s)',
                             package.name, ref,
                             _package_name(matching_table_couple.dataset_record))
                model.Session.commit()

        # Delete service value for any table_couples not matching the package
        # Test: test_08_reharvest_existing_service_to_delete_and_add_couples
        for table_couple in table_couples_not_matching_pkg:
            # Log the row's own reference: the `ref` variable of the loop
            # above is unset when the package lists no parseable couples,
            # and otherwise holds an unrelated (the last processed) ref.
            log.info('Service couple not matched - deleted service (%s->None, %s, %s)',
                     _package_name(table_couple.service_record),
                     table_couple.harvest_source_reference,
                     _package_name(table_couple.dataset_record))
            table_couple.service_record = None
            model.Session.commit()
        return
def validate_json(self, received_data):
    '''Report whether received_data is a parseable JSON document.

    :param received_data: the text to check.
    :returns: True if json.loads() accepts received_data; False if it is
              malformed JSON or is not something json.loads() can read at
              all (e.g. None).
    '''
    try:
        json.loads(received_data)
    except (ValueError, TypeError):
        # ValueError: malformed JSON. TypeError: not a string-like value.
        return False
    # An explicit True lets callers test the result by truthiness; falling
    # off the end would return None, which is as falsy as the failure case.
    return True