def test_multiplicity_warning():
    """Check that a record without a Metadata Date can still be read.

    The fixture has no value for Metadata Date; reading it is expected to
    emit a log.warning rather than raise, so the GUID must still come back.
    """
    document = GeminiDocument(open_xml_fixture('FCSConservancyPolygons.xml'))
    parsed = document.read_values()
    assert_equal(parsed['guid'], 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28')
def test_multiplicity_warning():
    """Read a fixture that has no Metadata Date and verify its GUID.

    The missing value should only be reported through a log.warning; no
    exception is expected while the values are read.
    """
    expected_guid = 'B8A22DF4-B0DC-4F0B-A713-0CF5F8784A28'
    fixture_xml = open_xml_fixture('FCSConservancyPolygons.xml')
    values = GeminiDocument(fixture_xml).read_values()
    assert_equal(values['guid'], expected_guid)
def test_simple():
    """Read the basic GEMINI fixture and check its GUID and metadata date."""
    document = GeminiDocument(open_xml_fixture('gemini_dataset.xml'))
    parsed = document.read_values()
    # Checked in a fixed order so the first failure reported is stable.
    expectations = (
        ('guid', 'test-dataset-1'),
        ('metadata-date', '2011-09-23T10:06:08'),
    )
    for key, expected in expectations:
        assert_equal(parsed[key], expected)
Example #4
0
    def write_package_from_gemini_string(self, content):
        '''Create or update a Package based on some content that has
        come from a URL.

        The content is parsed with GeminiDocument, its metadata date is
        stored on the current harvest object (``self.obj``) and compared
        with the previously harvested object for the same GUID to decide
        whether the package is created, updated, re-activated or skipped.

        :param content: the GEMINI document, handed as-is to GeminiDocument.
        :returns: the package returned by ``_create_package_from_data``, or
            None when the record is skipped (document unchanged, or not
            more recent than a deleted package).
        :raises Exception: if the metadata date cannot be parsed, if more
            than one current harvest object exists for the GUID, if the
            content changed while the metadata date did not, or if no
            unique package name can be generated.
        '''
        log = logging.getLogger(__name__ + '.import')
        package = None
        gemini_document = GeminiDocument(content)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        # Save the metadata reference date in the Harvest Object. The value
        # may be either a plain date or a full timestamp.
        raw_metadata_date = gemini_values['metadata-date']
        metadata_modified_date = None
        for date_format in ('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S'):
            try:
                metadata_modified_date = datetime.strptime(
                    raw_metadata_date, date_format)
                break
            except (ValueError, TypeError):
                # ValueError: the string does not match this format.
                # TypeError: the value is not a string at all (e.g. None
                # when the document carries no metadata date).
                continue
        if metadata_modified_date is None:
            raise Exception(
                'Could not extract reference date for GUID %s (%s)'
                % (gemini_guid, raw_metadata_date))

        self.obj.metadata_modified_date = metadata_modified_date
        self.obj.save()

        # "== True" is required here: it builds a SQL expression.
        current_objects = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == gemini_guid) \
            .filter(HarvestObject.current == True) \
            .all()  # noqa: E712

        if len(current_objects) > 1:
            raise Exception(
                'Application Error: more than one current record for GUID %s' %
                gemini_guid)
        last_harvested_object = current_objects[0] if current_objects else None

        reactivate_package = False
        if last_harvested_object is not None:
            # We've previously harvested this (i.e. it's an update)
            previous_date = last_harvested_object.metadata_modified_date
            current_date = self.obj.metadata_modified_date

            # A missing previous date counts as "older". Comparing None
            # with a datetime raises TypeError on Python 3, so the None
            # check must come first.
            is_newer = previous_date is None or previous_date < current_date

            # Use metadata modified date instead of content to determine if
            # the package needs to be updated
            if is_newer \
                    or self.force_import \
                    or (previous_date == current_date and
                        last_harvested_object.source.active is False):

                if self.force_import:
                    log.info('Import forced for object %s with GUID %s',
                             self.obj.id, gemini_guid)
                else:
                    log.info(
                        'Package for object with GUID %s needs to be created or updated',
                        gemini_guid)

                package = last_harvested_object.package

                # If the package has a deleted state, we will only update it
                # and reactivate it if the new document has a more recent
                # modified date
                if package.state == u'deleted':
                    if is_newer:
                        log.info(
                            'Package for object with GUID %s will be re-activated',
                            gemini_guid)
                        reactivate_package = True
                    else:
                        log.info(
                            'Remote record with GUID %s is not more recent than a deleted package, skipping... ',
                            gemini_guid)
                        return None

            else:
                if last_harvested_object.content != self.obj.content and \
                        previous_date == current_date:
                    diff = '\n'.join(difflib.unified_diff(
                        last_harvested_object.content.split('\n'),
                        self.obj.content.split('\n')))
                    raise Exception(
                        'The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s'
                        % (gemini_guid, diff))

                # The content hasn't changed, no need to update the package
                log.info('Document with GUID %s unchanged, skipping...',
                         gemini_guid)
                return None
        else:
            log.info(
                'No package with GEMINI guid %s found, let\'s create one',
                gemini_guid)

        extras = {'UKLP': 'True', 'harvest_object_id': self.obj.id}

        # Just add some of the metadata as extras, not the whole lot
        for name in [
                # Essentials
                'spatial-reference-system',
                'guid',
                # Usefuls
                'dataset-reference-date',
                'metadata-language',  # Language
                'metadata-date',  # Released
                'coupled-resource',
                'contact-email',
                'frequency-of-update',
                'spatial-data-service-type',
        ]:
            extras[name] = gemini_values[name]

        progress = gemini_values.get('progress', [])
        extras['progress'] = progress[0] if progress else ''

        extras['resource-type'] = gemini_values['resource-type'][0]

        # Use-constraints can contain values which are:
        #  * free text
        #  * licence URL
        # Store all values in extras['licence'] and if there is a
        # URL in there, store that in extras['licence_url']
        extras['licence'] = gemini_values.get('use-constraints', '')
        if extras['licence']:
            licence_url_extracted = self._extract_first_licence_url(
                extras['licence'])
            if licence_url_extracted:
                extras['licence_url'] = licence_url_extracted

        extras['access_constraints'] = gemini_values.get(
            'limitations-on-public-access', '')
        if 'temporal-extent-begin' in gemini_values:
            extras['temporal_coverage-from'] = gemini_values[
                'temporal-extent-begin']
        if 'temporal-extent-end' in gemini_values:
            extras['temporal_coverage-to'] = gemini_values[
                'temporal-extent-end']

        # Save responsible organization roles
        provider, responsible_parties = self._process_responsible_organisation(
            gemini_values['responsible-organisation'])
        extras['provider'] = provider
        extras['responsible-party'] = '; '.join(responsible_parties)

        bounding_boxes = gemini_values['bbox']
        if bounding_boxes:
            # Only the first bounding box is used.
            first_bbox = bounding_boxes[0]
            extras['bbox-east-long'] = first_bbox['east']
            extras['bbox-north-lat'] = first_bbox['north']
            extras['bbox-south-lat'] = first_bbox['south']
            extras['bbox-west-long'] = first_bbox['west']

            # Construct a GeoJSON extent so ckanext-spatial can register the
            # extent geometry.
            # NOTE(review): xmin is fed from the east bound and xmax from
            # the west bound. Left unchanged because extent_template is not
            # visible from here - confirm the mapping is intended.
            extent_string = self.extent_template.substitute(
                xmin=extras['bbox-east-long'],
                ymin=extras['bbox-south-lat'],
                xmax=extras['bbox-west-long'],
                ymax=extras['bbox-north-lat'])

            extras['spatial'] = extent_string.strip()

        # Tag names are truncated to at most 50 characters.
        tags = [{'name': tag[:50]} for tag in gemini_values['tags']]

        package_dict = {
            'title': gemini_values['title'],
            'notes': gemini_values['abstract'],
            'tags': tags,
            'resources': []
        }

        if self.obj.source.publisher_id:
            package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

        if reactivate_package:
            package_dict['state'] = u'active'

        if package is None or package.title != gemini_values['title']:
            # New package, or the title changed: generate a name from the
            # title, falling back to the GUID.
            name = self.gen_new_name(gemini_values['title'])
            if not name:
                name = self.gen_new_name(six.text_type(gemini_guid))
            if not name:
                raise Exception(
                    'Could not generate a unique name from the title or the GUID. Please choose a more unique title.'
                )
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        resource_locators = gemini_values.get('resource-locator', [])

        if resource_locators:
            for resource_locator in resource_locators:
                url = resource_locator.get('url', '')
                if not url:
                    # Locators without a URL produce no resource.
                    continue

                resource_format = ''
                resource = {}
                if extras['resource-type'] == 'service':
                    # Check if the service is a view service; the query
                    # string is dropped before testing the URL.
                    test_url = url.split('?')[0]
                    if self._is_wms(test_url):
                        resource['verified'] = True
                        resource['verified_date'] = datetime.now().isoformat()
                        resource_format = 'WMS'
                resource.update({
                    'url': url,
                    'name': resource_locator.get('name', ''),
                    'description':
                    resource_locator.get('description') or 'Resource locator',
                    'format': resource_format or None,
                    'resource_locator_protocol':
                    resource_locator.get('protocol', ''),
                    'resource_locator_function':
                    resource_locator.get('function', '')
                })
                package_dict['resources'].append(resource)

            # Guess the best view service to use in WMS preview
            verified_view_resources = [
                r for r in package_dict['resources']
                if 'verified' in r and r['format'] == 'WMS'
            ]
            if verified_view_resources:
                verified_view_resources[0][
                    'ckan_recommended_wms_preview'] = True
            else:
                view_resources = [
                    r for r in package_dict['resources']
                    if r['format'] == 'WMS'
                ]
                if view_resources:
                    view_resources[0]['ckan_recommended_wms_preview'] = True

        # Strings and numbers are stored as they are; any other value is
        # serialised to JSON.
        extras_as_dict = []
        for key, value in extras.items():
            if isinstance(value, six.string_types + (Number, )):
                extras_as_dict.append({'key': key, 'value': value})
            else:
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})

        package_dict['extras'] = extras_as_dict

        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_dict)
            log.info('Created new package ID %s with GEMINI guid %s',
                     package['id'], gemini_guid)
        else:
            package = self._create_package_from_data(package_dict,
                                                     package=package)
            log.info(
                'Updated existing package ID %s with existing GEMINI guid %s',
                package['id'], gemini_guid)

        # Flag the other objects of this source as not current anymore
        from ckanext.harvest.model import harvest_object_table
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
            .values(current=False)
        Session.execute(u, params={'b_package_id': package['id']})
        Session.commit()

        # Refresh current object from session, otherwise the
        # import paster command fails
        Session.remove()
        Session.add(self.obj)
        Session.refresh(self.obj)

        # Set reference to package in the HarvestObject and flag it as
        # the current one
        if not self.obj.package_id:
            self.obj.package_id = package['id']

        self.obj.current = True
        self.obj.save()

        return package
def test_simple():
    """Parse the basic GEMINI fixture and verify two of the values read."""
    fixture_xml = open_xml_fixture('gemini_dataset.xml')
    values = GeminiDocument(fixture_xml).read_values()
    guid = values['guid']
    metadata_date = values['metadata-date']
    assert_equal(guid, 'test-dataset-1')
    assert_equal(metadata_date, '2011-09-23T10:06:08')