def test_multiplicity_warning():
    # This dataset lacks a value for Metadata Date and should
    # produce a log.warning, but not raise an exception.
    xml_string = open_xml_fixture('FCSConservancyPolygons.xml')
    gemini_document = GeminiDocument(xml_string)
    gemini_values = gemini_document.read_values()
    assert_equal(gemini_values['guid'],
                 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
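# The open_xml_fixture helper used by these tests is not shown here. Below is a
# minimal sketch of what such a helper might look like, assuming the GEMINI
# fixture files live in an 'xml/gemini' directory next to the test module (the
# directory layout and helper name are assumptions, not the project's code).
import codecs
import os

def open_xml_fixture_example(xml_filename):
    # Read the fixture as UTF-8 and return a unicode string for GeminiDocument
    xml_filepath = os.path.join(os.path.dirname(__file__),
                                'xml', 'gemini', xml_filename)
    with codecs.open(xml_filepath, encoding='utf-8') as f:
        return f.read()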
def get_gemini_string_and_guid(self, content, url=None):
    '''From a string buffer containing Gemini XML, return the tree
    under gmd:MD_Metadata and the GUID for it.

    If it cannot parse the XML it will raise lxml.etree.XMLSyntaxError.
    If it cannot find the GUID element, then gemini_guid will be None.

    :param content: string containing Gemini XML
    :param url: string giving info about the location of the XML to be
                used only in validation errors
    :returns: (gemini_string, gemini_guid)
    '''
    xml = etree.fromstring(content)

    # The validator and GeminiDocument don't like the container
    metadata_tag = '{http://www.isotc211.org/2005/gmd}MD_Metadata'
    if xml.tag == metadata_tag:
        gemini_xml = xml
    else:
        gemini_xml = xml.find(metadata_tag)

    if gemini_xml is None:
        self._save_gather_error('Content is not a valid Gemini document without the gmd:MD_Metadata element',
                                self.harvest_job)

    gemini_string = etree.tostring(gemini_xml)
    gemini_document = GeminiDocument(gemini_string)
    try:
        gemini_guid = gemini_document.read_value('guid')
    except KeyError:
        gemini_guid = None

    return gemini_string, gemini_guid
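# A minimal standalone sketch of the gmd:MD_Metadata unwrapping done above,
# assuming a CSW-style response that wraps the record in a container element.
# Only lxml is used; this is an illustration, not the harvester method itself.
from lxml import etree

GMD_MD_METADATA = '{http://www.isotc211.org/2005/gmd}MD_Metadata'

def example_unwrap_md_metadata(content):
    # Return the MD_Metadata element itself, whether or not it is wrapped
    xml = etree.fromstring(content)
    if xml.tag == GMD_MD_METADATA:
        return xml
    return xml.find(GMD_MD_METADATA)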
def get_record_type(cls, xml):
    '''
    For a given ISO19139 record, returns the "type"
    e.g. "dataset", "series", "service"

    xml - etree of the ISO19139 XML record
    '''
    gemini = GeminiDocument(xml_tree=xml)
    return gemini.read_value('resource-type')
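# Example of how get_record_type might be used to branch harvesting logic on
# the ISO 19139 hierarchyLevel ('dataset', 'series' or 'service'). The function
# name and the branching itself are illustrative, not part of the harvester.
def example_dispatch_on_record_type(harvester_cls, xml):
    record_type = harvester_cls.get_record_type(xml)
    if record_type == 'service':
        return 'service record - e.g. look for coupled resources'
    elif record_type in ('dataset', 'series'):
        return 'dataset/series record'
    return 'unknown resource type: %s' % record_type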
def detect(cls):
    '''Finds datasets that are coupled and adds their
    harvest_source_reference to the HarvestObject and package extras.
    '''
    from ckan.lib.base import json
    from ckan import model
    from ckanext.harvest.model import HarvestObject
    from ckanext.spatial.model import GeminiDocument
    from ckanext.spatial.lib.coupled_resource import extract_guid

    # Find service records
    for service_record in model.Session.query(model.Package).\
            filter_by(state='active').\
            join(model.PackageExtra).\
            filter_by(state='active').\
            filter_by(key='resource-type').\
            filter_by(value='service'):
        # Find coupled dataset records
        service_type = service_record.extras['resource-type']
        if 'coupled-resource' not in service_record.extras:
            if service_type in ('view', 'download'):
                service_stats.add('No coupled-resource extra for %s type (where it is mandatory)' % service_type,
                                  service_record.name)
            else:
                service_stats.add('No coupled-resource extra (but not mandatory for this service type)',
                                  service_record.name)
            continue
        coupled_resources_str = service_record.extras['coupled-resource']
        coupled_resources = json.loads(coupled_resources_str)
        log.info('%s has %i coupled resources',
                 service_record.name, len(coupled_resources))
        couples_all_detected = True
        couples_detected = False
        for i, coupled_resource in enumerate(coupled_resources):
            couple_id = '%s.%s' % (service_record.name, i)
            href = coupled_resource['href']
            # For tests only
            #if href != ['http://www.ordnancesurvey.co.uk/oswebsite/xml/products/Topo.xml']:
            #    break
            if len(href) != 1:
                log.error('Coupled resource href is not a list of 1: %r couple=%s',
                          href, couple_id)
                couple_stats.add('Couple href is length %i' % len(href), couple_id)
                couples_all_detected = False
                continue
            href = href[0]
            if not href.strip():
                log.error('Coupled resource href is blank. couple=%s', couple_id)
                couple_stats.add('Couple href is blank', couple_id)
                couples_all_detected = False
                continue

            # Look for the equivalent dataset resource
            # If it is CSW, we must extract the guid
            # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&REQUEST=GetRecordById&ID=9df8df52-d788-37a8-e044-0003ba9b0d98&elementSetName=full&OutputSchema=http://www.isotc211.org/2005/gmd
            guid = extract_guid(href)
            if guid:
                if not guid.strip():
                    couple_stats.add('Guid was blank', couple_id)
                    log.error('Guid was blank. href=%s couple=%s', href, couple_id)
                try:
                    harvest_object = cls.find_harvest_object_by_guid(guid)
                except FindError, e:
                    log.error('%s guid=%s couple=%s', e, guid, couple_id)
                    couple_stats.add(str(e), couple_id)
                    couples_all_detected = False
                    continue

                dataset_record = harvest_object.package  #res.resource_group.package
                couple_stats.add('Couple completed', couple_id)
                log.info('Couple completed %s <-> %s',
                         service_record.name, dataset_record.name)
                cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                couples_detected = True
                continue

            # Known bad couples are weeded out
            bad_couples = ('GetCapabilities', 'CEH:EIDC', 'ceh:eidc',
                           'http://data.nbn.org.uk#',
                           'www.geostore.com/OGC/OGCInterface',
                           'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                           'Please enter a valid url',
                           )
            bad_couple_detected = False
            for bad_couple in bad_couples:
                if bad_couple in href:
                    couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                    log.info('Invalid couple (%s): %s couple=%s',
                             bad_couple, href, couple_id)
                    bad_couple_detected = True
            if bad_couple_detected:
                couples_all_detected = False
                continue

            # Try as a WAF
            # Try the URL to download the gemini again, to find the
            # GUID of the dataset
            log.info('Trying possible WAF href: %s' % href)
            try:
                res = requests.get(href, timeout=10)
            except Exception, e:
                couple_stats.add('Connecting to href failed: %s' % e, couple_id)
                log.warning('Connecting to href failed: %s href:"%s"', e, href)
                couples_all_detected = False
                break
            if not res.ok:
                couple_stats.add('Resolving href failed: %s' % res.reason, couple_id)
                log.warning('Resolving href failed: %s %s href:"%s"',
                            res.status_code, res.reason, href)
                couples_all_detected = False
                break
            gemini = GeminiDocument(res.content)
            try:
                guid = gemini.read_value('guid')
            except KeyError, e:
                couple_stats.add('Could not get GUID from Gemini downloaded: %s' % href,
                                 couple_id)
                log.warning('Could not get GUID from Gemini downloaded href:"%s"', href)
                couples_all_detected = False
                break
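# Illustration of the CSW case handled above: pulling the record identifier out
# of a GetRecordById URL. This is a hedged sketch of what extract_guid is
# expected to do for CSW hrefs, not the ckanext.spatial.lib.coupled_resource
# implementation itself.
from urlparse import urlparse, parse_qs

def example_extract_csw_guid(href):
    # e.g. ...csw?SERVICE=CSW&REQUEST=GetRecordById&ID=9df8df52-...&elementSetName=full
    query = parse_qs(urlparse(href).query)
    # Parameter names in CSW URLs vary in case, so check common spellings
    for key in ('ID', 'id', 'Id'):
        if key in query:
            return query[key][0]
    return None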
def test_simple():
    xml_string = open_xml_fixture('gemini_dataset.xml')
    gemini_document = GeminiDocument(xml_string)
    gemini_values = gemini_document.read_values()
    assert_equal(gemini_values['guid'], 'test-dataset-1')
    assert_equal(gemini_values['metadata-date'], '2011-09-23T10:06:08')
def write_package_from_gemini_string(self, content):
    '''Create or update a Package based on some content that has
    come from a URL.

    Returns the package_dict of the result.
    If there is an error, it returns None or raises Exception.
    '''
    log = logging.getLogger(__name__ + '.import')
    package = None
    gemini_document = GeminiDocument(content)
    gemini_values = gemini_document.read_values()
    gemini_guid = gemini_values['guid']

    # Save the metadata reference date in the Harvest Object
    try:
        metadata_modified_date = datetime.strptime(
            gemini_values['metadata-date'], '%Y-%m-%d')
    except ValueError:
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
        except:
            raise Exception('Could not extract reference date for GUID %s (%s)'
                            % (gemini_guid, gemini_values['metadata-date']))

    self.obj.metadata_modified_date = metadata_modified_date
    self.obj.save()

    last_harvested_object = Session.query(HarvestObject) \
        .filter(HarvestObject.guid==gemini_guid) \
        .filter(HarvestObject.current==True) \
        .all()

    if len(last_harvested_object) == 1:
        last_harvested_object = last_harvested_object[0]
    elif len(last_harvested_object) > 1:
        raise Exception(
            'Application Error: more than one current record for GUID %s' %
            gemini_guid)

    reactivate_package = False
    if last_harvested_object:
        # We've previously harvested this (i.e. it's an update)

        # Use metadata modified date instead of content to determine if the package
        # needs to be updated
        if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date
                    and last_harvested_object.source.active is False):

            if self.force_import:
                log.info('Import forced for object %s with GUID %s' %
                         (self.obj.id, gemini_guid))
            else:
                log.info('Package for object with GUID %s needs to be created or updated' %
                         gemini_guid)

            package = last_harvested_object.package

            # If the package has a deleted state, we will only update it and reactivate it if the
            # new document has a more recent modified date
            if package.state == u'deleted':
                if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                    log.info('Package for object with GUID %s will be re-activated' %
                             gemini_guid)
                    reactivate_package = True
                else:
                    log.info('Remote record with GUID %s is not more recent than a deleted package, skipping... ' %
                             gemini_guid)
                    return None

        else:
            if last_harvested_object.content != self.obj.content and \
                    last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                diff_generator = difflib.unified_diff(
                    last_harvested_object.content.split('\n'),
                    self.obj.content.split('\n'))
                diff = '\n'.join([line for line in diff_generator])
                raise Exception(
                    'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                    % (gemini_guid, diff))
            else:
                # The content hasn't changed, no need to update the package
                log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
            return None
    else:
        log.info('No package with GEMINI guid %s found, let\'s create one' % gemini_guid)

    extras = {
        'UKLP': 'True',
        'harvest_object_id': self.obj.id
    }

    # Just add some of the metadata as extras, not the whole lot
    for name in [
        # Essentials
        'spatial-reference-system',
        'guid',
        # Usefuls
        'dataset-reference-date',
        'metadata-language',  # Language
        'metadata-date',  # Released
        'coupled-resource',
        'contact-email',
        'frequency-of-update',
        'spatial-data-service-type',
    ]:
        extras[name] = gemini_values[name]

    if len(gemini_values.get('progress', [])):
        extras['progress'] = gemini_values['progress'][0]
    else:
        extras['progress'] = ''

    extras['resource-type'] = gemini_values['resource-type'][0]

    # Use-constraints can contain values which are:
    #  * free text
    #  * licence URL
    # Store all values in extra['licence'] and if there is a
    # URL in there, store that in extra['licence-url']
    extras['licence'] = gemini_values.get('use-constraints', '')
    if len(extras['licence']):
        licence_url_extracted = self._extract_first_licence_url(extras['licence'])
        if licence_url_extracted:
            extras['licence_url'] = licence_url_extracted

    extras['access_constraints'] = gemini_values.get('limitations-on-public-access', '')

    if 'temporal-extent-begin' in gemini_values:
        #gemini_values['temporal-extent-begin'].sort()
        extras['temporal_coverage-from'] = gemini_values['temporal-extent-begin']
    if 'temporal-extent-end' in gemini_values:
        #gemini_values['temporal-extent-end'].sort()
        extras['temporal_coverage-to'] = gemini_values['temporal-extent-end']

    # Save responsible organization roles
    provider, responsible_parties = self._process_responsible_organisation(
        gemini_values['responsible-organisation'])
    extras['provider'] = provider
    extras['responsible-party'] = '; '.join(responsible_parties)

    if len(gemini_values['bbox']) > 0:
        extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
        extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
        extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
        extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

        # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
        extent_string = self.extent_template.substitute(
            xmin=extras['bbox-west-long'],
            ymin=extras['bbox-south-lat'],
            xmax=extras['bbox-east-long'],
            ymax=extras['bbox-north-lat'])

        extras['spatial'] = extent_string.strip()

    tags = []
    for tag in gemini_values['tags']:
        tag = tag[:50] if len(tag) > 50 else tag
        tags.append({'name': tag})

    package_dict = {
        'title': gemini_values['title'],
        'notes': gemini_values['abstract'],
        'tags': tags,
        'resources': []
    }

    if self.obj.source.publisher_id:
        package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

    if reactivate_package:
        package_dict['state'] = u'active'

    if package is None or package.title != gemini_values['title']:
        name = self.gen_new_name(gemini_values['title'])
        if not name:
            name = self.gen_new_name(six.text_type(gemini_guid))
        if not name:
            raise Exception(
                'Could not generate a unique name from the title or the GUID. Please choose a more unique title.')
        package_dict['name'] = name
    else:
        package_dict['name'] = package.name

    resource_locators = gemini_values.get('resource-locator', [])

    if len(resource_locators):
        for resource_locator in resource_locators:
            url = resource_locator.get('url', '')
            if url:
                resource_format = ''
                resource = {}
                if extras['resource-type'] == 'service':
                    # Check if the service is a view service
                    test_url = url.split('?')[0] if '?' in url else url
                    if self._is_wms(test_url):
                        resource['verified'] = True
                        resource['verified_date'] = datetime.now().isoformat()
                        resource_format = 'WMS'
                resource.update({
                    'url': url,
                    'name': resource_locator.get('name', ''),
                    'description': resource_locator.get('description') if resource_locator.get('description') else 'Resource locator',
                    'format': resource_format or None,
                    'resource_locator_protocol': resource_locator.get('protocol', ''),
                    'resource_locator_function': resource_locator.get('function', '')
                })
                package_dict['resources'].append(resource)

        # Guess the best view service to use in WMS preview
        verified_view_resources = [r for r in package_dict['resources']
                                   if 'verified' in r and r['format'] == 'WMS']
        if len(verified_view_resources):
            verified_view_resources[0]['ckan_recommended_wms_preview'] = True
        else:
            view_resources = [r for r in package_dict['resources']
                              if r['format'] == 'WMS']
            if len(view_resources):
                view_resources[0]['ckan_recommended_wms_preview'] = True

    extras_as_dict = []
    for key, value in extras.items():
        if isinstance(value, six.string_types + (Number,)):
            extras_as_dict.append({'key': key, 'value': value})
        else:
            extras_as_dict.append({'key': key, 'value': json.dumps(value)})

    package_dict['extras'] = extras_as_dict

    if package == None:
        # Create new package from data.
        package = self._create_package_from_data(package_dict)
        log.info('Created new package ID %s with GEMINI guid %s',
                 package['id'], gemini_guid)
    else:
        package = self._create_package_from_data(package_dict, package=package)
        log.info('Updated existing package ID %s with existing GEMINI guid %s',
                 package['id'], gemini_guid)

    # Flag the other objects of this source as not current anymore
    from ckanext.harvest.model import harvest_object_table
    u = update(harvest_object_table) \
        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
        .values(current=False)
    Session.execute(u, params={'b_package_id': package['id']})
    Session.commit()

    # Refresh current object from session, otherwise the
    # import paster command fails
    Session.remove()
    Session.add(self.obj)
    Session.refresh(self.obj)

    # Set reference to package in the HarvestObject and flag it as
    # the current one
    if not self.obj.package_id:
        self.obj.package_id = package['id']

    self.obj.current = True
    self.obj.save()

    return package
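# The extent_template used above is defined elsewhere on the harvester. Below is
# a minimal sketch of a compatible GeoJSON polygon template, assuming the same
# xmin/ymin/xmax/ymax placeholder names; the exact template text is an
# assumption, not the ckanext-spatial original.
from string import Template

example_extent_template = Template(
    '{"type": "Polygon", "coordinates": '
    '[[[$xmin, $ymin], [$xmax, $ymin], [$xmax, $ymax], [$xmin, $ymax], [$xmin, $ymin]]]}'
)

# e.g. example_extent_template.substitute(xmin=-8.0, ymin=49.0, xmax=2.0, ymax=61.0).strip()
# yields a closed polygon ring running west->east along the south edge and back
# along the north edge, which is what the 'spatial' extra expects.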
def setup_class(cls):
    xml_string = open_xml_fixture('gemini_dataset.xml')
    cls.gemini_document = GeminiDocument(xml_string)
class Validation(CkanCommand):
    '''Validation commands

    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or the one specified.

        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata in the db and
            writes a report in CSV format to the given filepath.

        validation file <filename>.xml
            Performs validation on the given metadata file.
    '''
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 3
    min_args = 0

    def command(self):
        if not self.args or self.args[0] in ['--help', '-h', 'help']:
            print self.usage
            sys.exit(1)

        self._load_config()

        cmd = self.args[0]
        if cmd == 'report':
            self.report()
        elif cmd == 'report-csv':
            self.report_csv()
        elif cmd == 'file':
            self.validate_file()
        else:
            print 'Command %s not recognized' % cmd

    def report(self):
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.lib.reports import validation_report

        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print 'Package ref "%s" not recognised' % package_ref
                sys.exit(1)
        else:
            pkg = None

        report = validation_report(package_id=pkg.id if pkg else None)
        for row in report.get_rows_html_formatted():
            print
            for i, col_name in enumerate(report.column_names):
                print ' %s: %s' % (col_name, row[i])

    def validate_file(self):
        from ckanext.spatial.harvesters import SpatialHarvester
        from ckanext.spatial.model import GeminiDocument

        if len(self.args) > 2:
            print 'Too many parameters %i' % len(self.args)
            sys.exit(1)
        if len(self.args) < 2:
            print 'Not enough parameters %i' % len(self.args)
            sys.exit(1)

        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print 'Filepath %s not found' % metadata_filepath
            sys.exit(1)
        with open(metadata_filepath, 'rb') as f:
            metadata_xml = f.read()

        # this is still encoded - hopefully as UTF8. If not, then it needs
        # decoding and recoding as UTF8.
        # Check it is UTF8, as that's what etree expects.
        try:
            decoded = metadata_xml.decode("utf-8")
            reencoded = decoded.encode("utf-8")
        except UnicodeDecodeError, e:
            print 'ERROR: File was not UTF8 \'%s\': %s' % \
                  (metadata_filepath, e)
            sys.exit(1)

        # etree.fromstring accepts either a unicode string or the encoding is
        # expressed in the <xml> tag. NB 'UTF-8' is correct, 'UTF8' is wrong.
        xml = etree.fromstring(metadata_xml)

        # XML validation
        validators = SpatialHarvester()._get_validator()
        print 'Validators: %r' % validators.profiles
        valid, errors = validators.is_valid(xml)

        # CKAN read of values
        if valid:
            try:
                gemini_document = GeminiDocument(metadata_xml)
                gemini_values = gemini_document.read_values()
            except Exception, e:
                valid = False
                errors.append('CKAN exception reading values from GeminiDocument: %s' % e)
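# Typical invocation of the paster command above. The config file name and
# plugin flag are examples only; adjust them to the local deployment:
#
#   paster --plugin=ckanext-spatial validation report some-dataset -c development.ini
#   paster --plugin=ckanext-spatial validation report-csv report.csv -c development.ini
#   paster --plugin=ckanext-spatial validation file some_record.xml -c development.ini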