def test_multiplicity_warning():
    """Reading a document without a Metadata Date must not raise.

    The fixture dataset lacks a value for Metadata Date; reading it should
    produce a log.warning rather than an exception, and the GUID must still
    be available.
    """
    document = GeminiDocument(open_xml_fixture('FCSConservancyPolygons.xml'))
    values = document.read_values()
    expected_guid = 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28'
    assert_equal(values['guid'], expected_guid)
def test_multiplicity_warning():
    """Check that a missing Metadata Date only warns.

    This dataset lacks a value for Metadata Date and should produce a
    log.warning, but not raise an exception; the GUID is still read.
    """
    fixture_xml = open_xml_fixture('FCSConservancyPolygons.xml')
    values = GeminiDocument(fixture_xml).read_values()
    assert_equal(values['guid'], 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
Exemple #3
0
    def get_gemini_string_and_guid(self, content, url=None):
        '''From a string buffer containing Gemini XML, return the tree
        under gmd:MD_Metadata and the GUID for it.

        If it cannot parse the XML it will raise lxml.etree.XMLSyntaxError.
        If there is no gmd:MD_Metadata element, a gather error is saved
        against self.harvest_job and (None, None) is returned.
        If reading the GUID raises KeyError, gemini_guid will be None.

        :param content: string containing Gemini XML
        :param url: string giving info about the location of the XML to be
                    used only in validation errors (not read by this method)
        :returns: (gemini_string, gemini_guid)
        '''
        xml = etree.fromstring(content)

        # The validator and GeminiDocument don't like the container, so
        # work on the gmd:MD_Metadata element itself.
        metadata_tag = '{http://www.isotc211.org/2005/gmd}MD_Metadata'
        if xml.tag == metadata_tag:
            gemini_xml = xml
        else:
            gemini_xml = xml.find(metadata_tag)

        if gemini_xml is None:
            self._save_gather_error(
                'Content is not a valid Gemini document without the gmd:MD_Metadata element',
                self.harvest_job)
            # Nothing to serialise: stop here rather than passing None to
            # etree.tostring(), which would fail with an unrelated error.
            return None, None

        gemini_string = etree.tostring(gemini_xml)
        gemini_document = GeminiDocument(gemini_string)
        try:
            gemini_guid = gemini_document.read_value('guid')
        except KeyError:
            gemini_guid = None

        return gemini_string, gemini_guid
    def get_record_type(cls, xml):
        '''
        Return the "resource-type" value read from an ISO19139 record,
        e.g. "dataset", "series", "service".

        xml - etree of the ISO19139 XML record
        '''
        return GeminiDocument(xml_tree=xml).read_value('resource-type')
Exemple #5
0
    def get_record_type(cls, xml):
        '''
        Look up the "type" of an ISO19139 record
        (e.g. "dataset", "series", "service").

        xml - etree of the ISO19139 XML record
        '''
        document = GeminiDocument(xml_tree=xml)
        record_type = document.read_value('resource-type')
        return record_type
Exemple #6
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Every active package whose active 'resource-type' extra is
        'service' is examined. Its 'coupled-resource' extra is parsed as
        JSON and each coupled resource href is resolved in turn:

        1. If extract_guid() finds a GUID in the href, the matching harvest
           object is looked up and the coupling is recorded with
           cls.add_coupling().
        2. Otherwise, unless the href contains a known-bad fragment, the
           href is downloaded and the GUID is read from the returned
           Gemini document.

        Outcomes are recorded in the service_stats / couple_stats
        collectors (defined outside this method) and written to the log.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Fragments that identify hrefs known not to point at a dataset
        # record. Constant, so built once rather than per coupled resource.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        for service_record in model.Session.query(model.Package).\
            filter_by(state='active').\
            join(model.PackageExtra).\
            filter_by(state='active').\
            filter_by(key='resource-type').\
            filter_by(value='service'):

            # Find coupled dataset records
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    # Format the category first so the call has the same
                    # (category, name) shape as every other stats entry.
                    service_stats.add('No coupled-resource extra for %s type (where it is mandatory)' % service_type, service_record.name)
                else:
                    service_stats.add('No coupled-resource extra (but not mandatory for this service type)', service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # For tests only
                #if href != ['http://www.ordnancesurvey.co.uk/oswebsite/xml/products/Topo.xml']:
                #    break

                # href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&amp;REQUEST=GetRecordById&amp;ID=9df8df52-d788-37a8-e044-0003ba9b0d98&amp;elementSetName=full&amp;OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        couple_stats.add('Guid was blank', couple_id)
                        # One placeholder per argument, otherwise logging
                        # fails to format the record.
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package #res.resource_group.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # Known bad couples are weeded out. Every matching fragment
                # is recorded, so the loop does not stop at the first hit.
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' %
                                     e, couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' %
                                     res.reason, couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError); the
                    # href is reported in the log line instead.
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break
def test_simple():
    """Read the basic dataset fixture and check its GUID and metadata date."""
    document = GeminiDocument(open_xml_fixture('gemini_dataset.xml'))
    values = document.read_values()
    assert_equal(values['guid'], 'test-dataset-1')
    assert_equal(values['metadata-date'], '2011-09-23T10:06:08')
Exemple #8
0
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.

        Every active package whose active 'resource-type' extra is
        'service' is examined. Its 'coupled-resource' extra is parsed as
        JSON and each coupled resource href is resolved in turn:

        1. If extract_guid() finds a GUID in the href, the matching harvest
           object is looked up and the coupling is recorded with
           cls.add_coupling().
        2. Otherwise, unless the href contains a known-bad fragment, the
           href is downloaded and the GUID is read from the returned
           Gemini document.

        Outcomes are recorded in the service_stats / couple_stats
        collectors (defined outside this method) and written to the log.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.model import GeminiDocument
        from ckanext.spatial.lib.coupled_resource import extract_guid

        # Fragments that identify hrefs known not to point at a dataset
        # record. Constant, so built once rather than per coupled resource.
        bad_couples = ('GetCapabilities', 'CEH:EIDC',
                       'ceh:eidc',
                       'http://data.nbn.org.uk#',
                       'www.geostore.com/OGC/OGCInterface',
                       'spatialni.gov.uk/arcgis/services/LPS/CadastreNI/MapServer/WMSServer',
                       'Please enter a valid url',
                       )

        # Find service records
        for service_record in model.Session.query(model.Package).\
            filter_by(state='active').\
            join(model.PackageExtra).\
            filter_by(state='active').\
            filter_by(key='resource-type').\
            filter_by(value='service'):

            # Find coupled dataset records
            service_type = service_record.extras['resource-type']
            if 'coupled-resource' not in service_record.extras:
                if service_type in ('view', 'download'):
                    # Format the category first so the call has the same
                    # (category, name) shape as every other stats entry.
                    service_stats.add('No coupled-resource extra for %s type (where it is mandatory)' % service_type, service_record.name)
                else:
                    service_stats.add('No coupled-resource extra (but not mandatory for this service type)', service_record.name)
                continue
            coupled_resources_str = service_record.extras['coupled-resource']
            coupled_resources = json.loads(coupled_resources_str)
            log.info('%s has %i coupled resources',
                     service_record.name, len(coupled_resources))
            couples_all_detected = True
            couples_detected = False
            for i, coupled_resource in enumerate(coupled_resources):
                couple_id = '%s.%s' % (service_record.name, i)
                href = coupled_resource['href']

                # For tests only
                #if href != ['http://www.ordnancesurvey.co.uk/oswebsite/xml/products/Topo.xml']:
                #    break

                # href is expected to be a list holding exactly one URL
                if len(href) != 1:
                    log.error('Coupled resource href is not a list of 1: %r couple=%s',
                              href, couple_id)
                    couple_stats.add('Couple href is length %i' % len(href), couple_id)
                    couples_all_detected = False
                    continue
                href = href[0]
                if not href.strip():
                    log.error('Coupled resource href is blank. couple=%s',
                              couple_id)
                    couple_stats.add('Couple href is blank', couple_id)
                    couples_all_detected = False
                    continue

                # Look for the equivalent dataset resource

                # If it is CSW, we must extract the guid
                # Example CSW url: http://ogcdev.bgs.ac.uk/geonetwork/srv/en/csw?SERVICE=CSW&amp;REQUEST=GetRecordById&amp;ID=9df8df52-d788-37a8-e044-0003ba9b0d98&amp;elementSetName=full&amp;OutputSchema=http://www.isotc211.org/2005/gmd
                guid = extract_guid(href)
                if guid:
                    if not guid.strip():
                        couple_stats.add('Guid was blank', couple_id)
                        # One placeholder per argument, otherwise logging
                        # fails to format the record.
                        log.error('Guid was blank. href=%s couple=%s',
                                  href, couple_id)

                    try:
                        harvest_object = cls.find_harvest_object_by_guid(guid)
                    except FindError as e:
                        log.error('%s guid=%s couple=%s', e, guid, couple_id)
                        couple_stats.add(str(e), couple_id)
                        couples_all_detected = False
                        continue

                    dataset_record = harvest_object.package #res.resource_group.package
                    couple_stats.add('Couple completed', couple_id)
                    log.info('Couple completed %s <-> %s',
                             service_record.name, dataset_record.name)

                    cls.add_coupling(service_record, dataset_record, harvest_object, guid)
                    couples_detected = True
                    continue

                # Known bad couples are weeded out. Every matching fragment
                # is recorded, so the loop does not stop at the first hit.
                bad_couple_detected = False
                for bad_couple in bad_couples:
                    if bad_couple in href:
                        couple_stats.add('Invalid couple (%s)' % bad_couple, couple_id)
                        log.info('Invalid couple (%s): %s couple=%s', bad_couple, href, couple_id)
                        bad_couple_detected = True
                if bad_couple_detected:
                    couples_all_detected = False
                    continue

                # Try as a WAF
                # Try the URL to download the gemini again, to find the
                # GUID of the dataset
                log.info('Trying possible WAF href: %s', href)
                try:
                    res = requests.get(href, timeout=10)
                except Exception as e:
                    couple_stats.add('Connecting to href failed: %s' %
                                     e, couple_id)
                    log.warning('Connecting to href failed: %s href:"%s"',
                                e, href)
                    couples_all_detected = False
                    break
                if not res.ok:
                    couple_stats.add('Resolving href failed: %s' %
                                     res.reason, couple_id)
                    log.warning('Resolving href failed: %s %s href:"%s"',
                                res.status_code, res.reason, href)
                    couples_all_detected = False
                    break
                gemini = GeminiDocument(res.content)
                try:
                    guid = gemini.read_value('guid')
                except KeyError:
                    # The category has no placeholder, so it must not be
                    # %-formatted with href (that raised TypeError); the
                    # href is reported in the log line instead.
                    couple_stats.add('Could not get GUID from Gemini downloaded',
                                     couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"',
                                href)
                    couples_all_detected = False
                    break
Exemple #9
0
    def write_package_from_gemini_string(self, content):
        '''Create or update a Package based on some content that has
        come from a URL.

        The content is parsed as a Gemini document. Its metadata date is
        stored on self.obj and compared with the current harvest object for
        the same GUID (if any) to decide whether a package must be created,
        updated, re-activated or left alone. When the package is written,
        all harvest objects of that package are flagged as not current and
        self.obj is then flagged as the current one.

        :param content: string containing the Gemini XML
        :returns: the package_dict of the result, or None when the document
                  is skipped (unchanged, or not newer than a deleted
                  package)
        :raises Exception: if the metadata date cannot be parsed, if there
                  is more than one current record for the GUID, if the
                  content changed without the metadata date changing, or
                  if no unique name can be generated
        '''
        log = logging.getLogger(__name__ + '.import')
        package = None
        gemini_document = GeminiDocument(content)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        # Save the metadata reference date in the Harvest Object.
        # Two formats are accepted: date only, then date and time.
        try:
            metadata_modified_date = datetime.strptime(
                gemini_values['metadata-date'], '%Y-%m-%d')
        except ValueError:
            try:
                metadata_modified_date = datetime.strptime(
                    gemini_values['metadata-date'], '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                # Only a format mismatch is translated; anything else
                # (e.g. KeyboardInterrupt) must not be swallowed here.
                raise Exception('Could not extract reference date for GUID %s (%s)' \
                        % (gemini_guid,gemini_values['metadata-date']))

        self.obj.metadata_modified_date = metadata_modified_date
        self.obj.save()

        last_harvested_object = Session.query(HarvestObject) \
                            .filter(HarvestObject.guid==gemini_guid) \
                            .filter(HarvestObject.current==True) \
                            .all()

        # Collapse the query result: one hit becomes the object itself, no
        # hit leaves an empty (falsy) list, several hits are an error.
        if len(last_harvested_object) == 1:
            last_harvested_object = last_harvested_object[0]
        elif len(last_harvested_object) > 1:
            raise Exception(
                'Application Error: more than one current record for GUID %s' %
                gemini_guid)

        reactivate_package = False
        if last_harvested_object:
            # We've previously harvested this (i.e. it's an update)

            # Use metadata modified date instead of content to determine if the package
            # needs to be updated
            if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date and
                    last_harvested_object.source.active is False):

                if self.force_import:
                    log.info('Import forced for object %s with GUID %s' %
                             (self.obj.id, gemini_guid))
                else:
                    log.info(
                        'Package for object with GUID %s needs to be created or updated'
                        % gemini_guid)

                package = last_harvested_object.package

                # If the package has a deleted state, we will only update it and reactivate it if the
                # new document has a more recent modified date
                if package.state == u'deleted':
                    previous_date = last_harvested_object.metadata_modified_date
                    # A missing previous date counts as older. Checked
                    # explicitly because None cannot be ordered against a
                    # datetime on Python 3.
                    if previous_date is None \
                            or previous_date < self.obj.metadata_modified_date:
                        log.info(
                            'Package for object with GUID %s will be re-activated'
                            % gemini_guid)
                        reactivate_package = True
                    else:
                        log.info(
                            'Remote record with GUID %s is not more recent than a deleted package, skipping... '
                            % gemini_guid)
                        return None

            else:
                if last_harvested_object.content != self.obj.content and \
                 last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                    diff_generator = difflib.unified_diff(
                        last_harvested_object.content.split('\n'),
                        self.obj.content.split('\n'))
                    diff = '\n'.join(diff_generator)
                    raise Exception(
                        'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                        % (gemini_guid, diff))
                else:
                    # The content hasn't changed, no need to update the package
                    log.info('Document with GUID %s unchanged, skipping...' %
                             (gemini_guid))
                return None
        else:
            log.info(
                'No package with GEMINI guid %s found, let\'s create one' %
                gemini_guid)

        extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

        # Just add some of the metadata as extras, not the whole lot
        for name in [
                # Essentials
                'spatial-reference-system',
                'guid',
                # Usefuls
                'dataset-reference-date',
                'metadata-language',  # Language
                'metadata-date',  # Released
                'coupled-resource',
                'contact-email',
                'frequency-of-update',
                'spatial-data-service-type',
        ]:
            extras[name] = gemini_values[name]

        # Only the first 'progress' value is kept
        if len(gemini_values.get('progress', [])):
            extras['progress'] = gemini_values['progress'][0]
        else:
            extras['progress'] = ''

        extras['resource-type'] = gemini_values['resource-type'][0]

        # Use-constraints can contain values which are:
        #  * free text
        #  * licence URL
        # Store all values in extras['licence'] and if there is a
        # URL in there, store that in extras['licence_url']
        extras['licence'] = gemini_values.get('use-constraints', '')
        if len(extras['licence']):
            licence_url_extracted = self._extract_first_licence_url(
                extras['licence'])
            if licence_url_extracted:
                extras['licence_url'] = licence_url_extracted

        extras['access_constraints'] = gemini_values.get(
            'limitations-on-public-access', '')
        if 'temporal-extent-begin' in gemini_values:
            #gemini_values['temporal-extent-begin'].sort()
            extras['temporal_coverage-from'] = gemini_values[
                'temporal-extent-begin']
        if 'temporal-extent-end' in gemini_values:
            #gemini_values['temporal-extent-end'].sort()
            extras['temporal_coverage-to'] = gemini_values[
                'temporal-extent-end']

        # Save responsible organization roles
        provider, responsible_parties = self._process_responsible_organisation(
            gemini_values['responsible-organisation'])
        extras['provider'] = provider
        extras['responsible-party'] = '; '.join(responsible_parties)

        # Only the first bounding box is used
        if len(gemini_values['bbox']) > 0:
            extras['bbox-east-long'] = gemini_values['bbox'][0]['east']
            extras['bbox-north-lat'] = gemini_values['bbox'][0]['north']
            extras['bbox-south-lat'] = gemini_values['bbox'][0]['south']
            extras['bbox-west-long'] = gemini_values['bbox'][0]['west']

            # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
            # NOTE(review): xmin is fed the east value and xmax the west
            # value; kept as-is, confirm against extent_template.
            extent_string = self.extent_template.substitute(
                xmin=extras['bbox-east-long'],
                ymin=extras['bbox-south-lat'],
                xmax=extras['bbox-west-long'],
                ymax=extras['bbox-north-lat'])

            extras['spatial'] = extent_string.strip()

        # Tag names are truncated to 50 characters
        tags = [{'name': tag[:50]} for tag in gemini_values['tags']]

        package_dict = {
            'title': gemini_values['title'],
            'notes': gemini_values['abstract'],
            'tags': tags,
            'resources': []
        }

        if self.obj.source.publisher_id:
            package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

        if reactivate_package:
            package_dict['state'] = u'active'

        # A new name is generated for new packages and when the title has
        # changed; otherwise the existing name is kept.
        if package is None or package.title != gemini_values['title']:
            name = self.gen_new_name(gemini_values['title'])
            if not name:
                name = self.gen_new_name(six.text_type(gemini_guid))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
                )
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        resource_locators = gemini_values.get('resource-locator', [])

        if len(resource_locators):
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '')
                if url:
                    resource_format = ''
                    resource = {}
                    if extras['resource-type'] == 'service':
                        # Check if the service is a view service
                        test_url = url.split('?')[0] if '?' in url else url
                        if self._is_wms(test_url):
                            resource['verified'] = True
                            resource['verified_date'] = datetime.now(
                            ).isoformat()
                            resource_format = 'WMS'
                    resource.update({
                        'url':
                        url,
                        'name':
                        resource_locator.get('name', ''),
                        'description':
                        resource_locator.get('description')
                        if resource_locator.get('description') else
                        'Resource locator',
                        'format':
                        resource_format or None,
                        'resource_locator_protocol':
                        resource_locator.get('protocol', ''),
                        'resource_locator_function':
                        resource_locator.get('function', '')
                    })
                    package_dict['resources'].append(resource)

            # Guess the best view service to use in WMS preview
            verified_view_resources = [
                r for r in package_dict['resources']
                if 'verified' in r and r['format'] == 'WMS'
            ]
            if len(verified_view_resources):
                verified_view_resources[0][
                    'ckan_recommended_wms_preview'] = True
            else:
                view_resources = [
                    r for r in package_dict['resources']
                    if r['format'] == 'WMS'
                ]
                if len(view_resources):
                    view_resources[0]['ckan_recommended_wms_preview'] = True

        # Strings and numbers are stored as they are; any other value is
        # serialised to JSON.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, six.string_types + (Number, )):
                extras_as_dict.append({'key': key, 'value': value})
            else:
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})

        package_dict['extras'] = extras_as_dict

        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_dict)
            log.info('Created new package ID %s with GEMINI guid %s',
                     package['id'], gemini_guid)
        else:
            package = self._create_package_from_data(package_dict,
                                                     package=package)
            log.info(
                'Updated existing package ID %s with existing GEMINI guid %s',
                package['id'], gemini_guid)

        # Flag the other objects of this source as not current anymore
        from ckanext.harvest.model import harvest_object_table
        u = update(harvest_object_table) \
                .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                .values(current=False)
        Session.execute(u, params={'b_package_id': package['id']})
        Session.commit()

        # Refresh current object from session, otherwise the
        # import paster command fails
        Session.remove()
        Session.add(self.obj)
        Session.refresh(self.obj)

        # Set reference to package in the HarvestObject and flag it as
        # the current one
        if not self.obj.package_id:
            self.obj.package_id = package['id']

        self.obj.current = True
        self.obj.save()

        return package
 def setup_class(cls):
     '''Load the gemini_dataset.xml fixture into cls.gemini_document.'''
     cls.gemini_document = GeminiDocument(
         open_xml_fixture('gemini_dataset.xml'))
def test_simple():
    """Check the GUID and metadata date read from the dataset fixture."""
    fixture_xml = open_xml_fixture('gemini_dataset.xml')
    values = GeminiDocument(fixture_xml).read_values()
    assert_equal(values['guid'], 'test-dataset-1')
    assert_equal(values['metadata-date'], '2011-09-23T10:06:08')
Exemple #12
0
class Validation(CkanCommand):
    '''Validation commands

    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or the one specified.

        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata in the db and
            writes a report in CSV format to the given filepath.
      
        validation file <filename>.xml
            Performs validation on the given metadata file.
    '''
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 3
    min_args = 0

    def command(self):
        '''Dispatch the sub-command given as the first argument.

        With no arguments, or with a help flag, the usage text is printed
        and the process exits with status 1. Otherwise the configuration
        is loaded and 'report', 'report-csv' or 'file' is run; any other
        word only prints a "not recognized" message.
        '''
        wants_help = (not self.args
                      or self.args[0] in ('--help', '-h', 'help'))
        if wants_help:
            print(self.usage)
            sys.exit(1)

        self._load_config()

        # Sub-command word -> name of the method implementing it. The
        # method is only looked up once the word is known to be valid.
        handler_names = {
            'report': 'report',
            'report-csv': 'report_csv',
            'file': 'validate_file',
        }
        cmd = self.args[0]
        handler_name = handler_names.get(cmd)
        if handler_name is None:
            print('Command %s not recognized' % cmd)
            return
        getattr(self, handler_name)()

    def report(self):
        '''Print a validation report for one package or for all packages.

        If a second command-line argument is given it is used as the
        package reference; an unknown reference prints an error and exits
        with status 1. Without it the report is requested with no package
        filter. Each report row is printed as a blank line followed by one
        "column: value" line per report column.
        '''
        from ckan import model
        from ckanext.harvest.model import HarvestObject
        from ckanext.spatial.lib.reports import validation_report

        # Stays None when no package is named, so the all-packages case
        # no longer dereferences a missing package.
        # NOTE(review): assumes validation_report treats package_id=None
        # as "all packages" - confirm against its definition.
        package_id = None
        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print('Package ref "%s" not recognised' % package_ref)
                sys.exit(1)
            package_id = pkg.id

        report = validation_report(package_id=package_id)
        for row in report.get_rows_html_formatted():
            # Blank separator line between rows
            print('')
            for i, col_name in enumerate(report.column_names):
                print('  %s: %s' % (col_name, row[i]))

    def validate_file(self):
        from ckanext.spatial.harvesters import SpatialHarvester
        from ckanext.spatial.model import GeminiDocument

        if len(self.args) > 2:
            print 'Too many parameters %i' % len(self.args)
            sys.exit(1)
        if len(self.args) < 2:
            print 'Not enough parameters %i' % len(self.args)
            sys.exit(1)
        metadata_filepath = self.args[1]
        if not os.path.exists(metadata_filepath):
            print 'Filepath %s not found' % metadata_filepath
            sys.exit(1)

        with open(metadata_filepath, 'rb') as f:
            metadata_xml = f.read()

        # this is still encoded - hopefully as UTF8. If not, then it needs
        # decoding and recoding as UTF8.

        # Check it is UTF8, as that's what etree expects.
        try:
            decoded = metadata_xml.decode("utf-8")
            reencoded = decoded.encode("utf-8")
        except UnicodeDecodeError, e:
            print 'ERROR: File was not UTF8 \'%s\': %s' % \
                  (metadata_filepath, e)
            sys.exit(1)

        # etree.fromstring accepts either a unicode string or the encoding is
        # expressed in the <xml> tag. NB 'UTF-8' is correct, 'UTF8' is wrong.
        xml = etree.fromstring(metadata_xml)

        # XML validation
        validators = SpatialHarvester()._get_validator()
        print 'Validators: %r' % validators.profiles
        valid, errors = validators.is_valid(xml)

        # CKAN read of values
        if valid:
            try:
                gemini_document = GeminiDocument(metadata_xml)
                gemini_values = gemini_document.read_values()
            except Exception, e:
                valid = False
                errors.append(
                    'CKAN exception reading values from GeminiDocument: %s' %
                    e)