def test_form_submit_success_xml_file_from_upload(self, app, monkeypatch,
                                                  tmpdir, ckan_config):
    _patch_storage_path(monkeypatch, tmpdir, ckan_config)
    files = {'upload': 'ddi_test.xml'}
    resp = _post_request(app, '/dataset/import', {}, files,
                         self.extra_environ, status=302)
    expected_id = 'ddi-test-1'
    try:
        toolkit.requires_ckan_version("2.9")
        assert ('/dataset/{}/resource/new'.format(expected_id)
                in resp.headers['location'])
    except toolkit.CkanVersionException:
        assert ('/dataset/new_resource/{}'.format(expected_id)
                in resp.headers['location'])
    dataset = toolkit.get_action('package_show')(
        {'ignore_auth': True},
        {'id': expected_id}
    )
    assert dataset
    assert len(dataset['resources']) == 1
    assert 'ddi_test.xml' in dataset['resources'][0]['url']
def gather_stage(self, harvest_job):
    log.debug('In SixodpHarvester gather_stage (%s)',
              harvest_job.source.url)
    toolkit.requires_ckan_version(min_version='2.0')
    get_all_packages = True

    self._set_config(harvest_job.source.config)

    # Get source URL
    remote_ckan_base_url = harvest_job.source.url.rstrip('/')

    # Filter in/out datasets from particular organizations
    fq_terms = []
    org_filter_include = self.config.get('organizations_filter_include', [])
    org_filter_exclude = self.config.get('organizations_filter_exclude', [])
    if org_filter_include:
        fq_terms.append(' OR '.join(
            'organization:%s' % org_name for org_name in org_filter_include))
    elif org_filter_exclude:
        fq_terms.extend(
            '-organization:%s' % org_name for org_name in org_filter_exclude)

    force_all = self.config.get('force_all', False)
    delete_missing = self.config.get('delete_missing', False)

    # Ideally we can request from the remote CKAN only those datasets
    # modified since the last completely successful harvest.
    last_error_free_job = self.last_error_free_job(harvest_job)
    log.debug('Last error-free job: %r', last_error_free_job)
    if last_error_free_job and not (force_all or delete_missing):
        get_all_packages = False

        # Request only the datasets modified since
        last_time = last_error_free_job.gather_started
        # Note: SOLR works in UTC, and gather_started is also UTC, so
        # this should work as long as local and remote clocks are
        # relatively accurate. Going back a little earlier, just in case.
        get_changes_since = \
            (last_time - datetime.timedelta(hours=1)).isoformat()
        log.info('Searching for datasets modified since: %s UTC',
                 get_changes_since)

        fq_since_last_time = 'metadata_modified:[{since}Z TO *]' \
            .format(since=get_changes_since)

        try:
            pkg_dicts = self._search_for_datasets(
                remote_ckan_base_url,
                fq_terms + [fq_since_last_time])
        except SearchError as e:
            log.info('Searching for datasets changed since last time '
                     'gave an error: %s', e)
            get_all_packages = True

        if not get_all_packages and not pkg_dicts:
            log.info('No datasets have been updated on the remote '
                     'CKAN instance since the last harvest job %s',
                     last_time)
            return []
def gather_stage(self, harvest_job):
    log.debug('In MetarepoHarvester gather_stage (%s)',
              harvest_job.source.url)
    toolkit.requires_ckan_version(min_version='2.0')
    get_all_packages = True

    self._set_config(harvest_job.source.config)

    # Get source URL
    remote_ckan_base_url = harvest_job.source.url.rstrip('/')

    # Filter in/out datasets from particular organizations
    fq_terms = []
    org_filter_include = self.config.get('organizations_filter_include', [])
    org_filter_exclude = self.config.get('organizations_filter_exclude', [])
    if org_filter_include:
        fq_terms.append(' OR '.join(
            'organization:%s' % org_name for org_name in org_filter_include))
    elif org_filter_exclude:
        fq_terms.extend(
            '-organization:%s' % org_name for org_name in org_filter_exclude)

    # Ideally we can request from the remote Metarepo only those datasets
    # modified since the last completely successful harvest.
    last_error_free_job = self._last_error_free_job(harvest_job)
    log.debug('Last error-free job: %r', last_error_free_job)
    if (last_error_free_job and
            not self.config.get('force_all', False)):
        get_all_packages = False

        # Request only the datasets modified since
        last_time = last_error_free_job.gather_started
        # Note: SOLR works in UTC, and gather_started is also UTC, so
        # this should work as long as local and remote clocks are
        # relatively accurate. Going back a little earlier, just in case.
        get_changes_since = \
            (last_time - datetime.timedelta(hours=1)).isoformat()
        log.info('Searching for datasets modified since: %s UTC',
                 get_changes_since)

        fq_since_last_time = 'metadata_modified:[{since}Z TO *]' \
            .format(since=get_changes_since)

        try:
            pkg_dicts = self._search_for_datasets(
                remote_ckan_base_url,
                fq_terms + [fq_since_last_time])
        except SearchError as e:
            log.info('Searching for datasets changed since last time '
                     'gave an error: %s', e)
            get_all_packages = True

        if not get_all_packages and not pkg_dicts:
            log.info('No datasets have been updated on the remote '
                     'Metarepo instance since the last harvest job %s',
                     last_time)
            return []
def gather_stage(self, harvest_job): log.debug("In CKANHarvester gather_stage (%s)", harvest_job.source.url) toolkit.requires_ckan_version(min_version="2.0") get_all_packages = True self._set_config(harvest_job.source.config) # Get source URL remote_ckan_base_url = harvest_job.source.url.rstrip("/") # Filter in/out datasets from particular organizations fq_terms = [] org_filter_include = self.config.get("organizations_filter_include", []) org_filter_exclude = self.config.get("organizations_filter_exclude", []) if org_filter_include: fq_terms.append(" OR ".join("organization:%s" % org_name for org_name in org_filter_include)) elif org_filter_exclude: fq_terms.extend("-organization:%s" % org_name for org_name in org_filter_exclude) # Ideally we can request from the remote CKAN only those datasets # modified since the last completely successful harvest. last_error_free_job = self._last_error_free_job(harvest_job) log.debug("Last error-free job: %r", last_error_free_job) if last_error_free_job and not self.config.get("force_all", False): get_all_packages = False # Request only the datasets modified since last_time = last_error_free_job.gather_started # Note: SOLR works in UTC, and gather_started is also UTC, so # this should work as long as local and remote clocks are # relatively accurate. Going back a little earlier, just in case. get_changes_since = (last_time - datetime.timedelta(hours=1)).isoformat() log.info("Searching for datasets modified since: %s UTC", get_changes_since) fq_since_last_time = "metadata_modified:[{since}Z TO *]".format(since=get_changes_since) try: pkg_dicts = self._search_for_datasets(remote_ckan_base_url, fq_terms + [fq_since_last_time]) except SearchError, e: log.info("Searching for datasets changed since last time " "gave an error: %s", e) get_all_packages = True if not get_all_packages and not pkg_dicts: log.info( "No datasets have been updated on the remote " "CKAN instance since the last harvest job %s", last_time, ) return None
def test_no_raise(self):
    tk.ckan.__version__ = '2'
    tk.requires_ckan_version(min_version='2')
import logging

from ckan.lib.base import BaseController, c, render, request
from . import dbutil

import ckan.logic as logic
import hashlib
from . import plugin
from pylons import config

from paste.util.multidict import MultiDict

from ckan.exceptions import CkanVersionException
import ckan.plugins.toolkit as tk

try:
    tk.requires_ckan_version("2.9")
except CkanVersionException:
    pass
else:
    from builtins import str

log = logging.getLogger("ckanext.googleanalytics")


class GAController(BaseController):
    def view(self):
        # get package objects corresponding to popular GA content
        c.top_resources = dbutil.get_top_resources(limit=10)
        return render("summary.html")
def gather_stage(self, harvest_job):
    log.debug('In CKANHarvester gather_stage (%s)',
              harvest_job.source.url)
    toolkit.requires_ckan_version(min_version='2.0')
    get_all_packages = True

    self._set_config(harvest_job.source.config)

    # Get source URL
    remote_ckan_base_url = harvest_job.source.url.rstrip('/')

    # Filter in/out datasets from particular organizations
    fq_terms = []
    custom_filter = self.config.get('custom_filter')
    if custom_filter:
        fq_terms += [custom_filter]
    else:
        org_filter_include = self.config.get(
            'organizations_filter_include', [])
        org_filter_exclude = self.config.get(
            'organizations_filter_exclude', [])
        if org_filter_include:
            fq_terms.append(' OR '.join(
                'organization:%s' % org_name
                for org_name in org_filter_include))
        elif org_filter_exclude:
            fq_terms.extend('-organization:%s' % org_name
                            for org_name in org_filter_exclude)

        groups_filter_include = self.config.get('groups_filter_include', [])
        groups_filter_exclude = self.config.get('groups_filter_exclude', [])
        if groups_filter_include:
            fq_terms.append(' OR '.join(
                'groups:%s' % group_name
                for group_name in groups_filter_include))
        elif groups_filter_exclude:
            fq_terms.extend('-groups:%s' % group_name
                            for group_name in groups_filter_exclude)

    # Ideally we can request from the remote CKAN only those datasets
    # modified since the last completely successful harvest.
    last_error_free_job = self.last_error_free_job(harvest_job)
    log.debug('Last error-free job: %r', last_error_free_job)
    if (last_error_free_job and
            not self.config.get('force_all', False)):
        get_all_packages = False

        # Request only the datasets modified since
        last_time = last_error_free_job.gather_started
        # Note: SOLR works in UTC, and gather_started is also UTC, so
        # this should work as long as local and remote clocks are
        # relatively accurate. Going back a little earlier, just in case.
        get_changes_since = \
            (last_time - datetime.timedelta(hours=1)).isoformat()
        log.info('Searching for datasets modified since: %s UTC',
                 get_changes_since)

        fq_since_last_time = 'metadata_modified:[{since}Z TO *]' \
            .format(since=get_changes_since)

        try:
            pkg_dicts = self._search_for_datasets(
                remote_ckan_base_url,
                fq_terms + [fq_since_last_time])
        except SearchError as e:
            log.info('Searching for datasets changed since last time '
                     'gave an error: %s', e)
            get_all_packages = True

        if not get_all_packages and not pkg_dicts:
            log.info('No datasets have been updated on the remote '
                     'CKAN instance since the last harvest job %s',
                     last_time)
            return []

    # Fall-back option - request all the datasets from the remote CKAN
    if get_all_packages:
        # Request all remote packages
        try:
            pkg_dicts = self._search_for_datasets(remote_ckan_base_url,
                                                  fq_terms)
        except SearchError as e:
            log.info('Searching for all datasets gave an error: %s', e)
            self._save_gather_error(
                'Unable to search remote CKAN for datasets:%s url:%s '
                'terms:%s' % (e, remote_ckan_base_url, fq_terms),
                harvest_job)
            return None

        if not pkg_dicts:
            self._save_gather_error(
                'No datasets found at CKAN: %s' % remote_ckan_base_url,
                harvest_job)
            return []

    # Create harvest objects for each dataset
    try:
        package_ids = set()
        object_ids = []
        for pkg_dict in pkg_dicts:
            if pkg_dict['id'] in package_ids:
                log.info('Discarding duplicate dataset %s - probably due '
                         'to datasets being changed at the same time as '
                         'when the harvester was paging through',
                         pkg_dict['id'])
                continue
            package_ids.add(pkg_dict['id'])

            log.debug('Creating HarvestObject for %s %s',
                      pkg_dict['name'], pkg_dict['id'])
            obj = HarvestObject(guid=pkg_dict['id'],
                                job=harvest_job,
                                content=json.dumps(pkg_dict))
            obj.save()
            object_ids.append(obj.id)

        return object_ids
    except Exception as e:
        self._save_gather_error('%r' % e, harvest_job)
import ckan.plugins as plugins
import ckan.plugins.toolkit as toolkit
from ckan.exceptions import CkanVersionException

import ckanext.opendata_theme.opengov_custom_homepage.helpers as helper
from ckanext.opendata_theme.opengov_custom_homepage.constants import CUSTOM_NAMING, CUSTOM_STYLE

try:
    toolkit.requires_ckan_version("2.9")
except CkanVersionException:
    from ckanext.opendata_theme.opengov_custom_homepage.plugin.pylons_plugin import MixinPlugin
else:
    from ckanext.opendata_theme.opengov_custom_homepage.plugin.flask_plugin import MixinPlugin

from ckanext.opendata_theme.base.template_helpers import version_builder


class Opendata_ThemePlugin(MixinPlugin):
    plugins.implements(plugins.IConfigurable, inherit=True)
    plugins.implements(plugins.IConfigurer)
    plugins.implements(plugins.ITemplateHelpers)

    # IConfigurer
    def update_config(self, ckan_config):
        toolkit.add_template_directory(ckan_config, '../templates')
        if toolkit.check_ckan_version(min_version='2.4', max_version='2.9'):
            toolkit.add_ckan_admin_tab(ckan_config, 'custom_home_page',
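# The plugin above uses both toolkit version helpers:
# requires_ckan_version raises CkanVersionException when the running
# CKAN is too old, while check_ckan_version merely returns a boolean.
# A hedged sketch contrasting the two (the version strings are
# illustrative):
import ckan.plugins.toolkit as toolkit

# Boolean probe: choose a code path without raising.
use_flask_routes = toolkit.check_ckan_version(min_version='2.9')

# Hard requirement: fail fast on an unsupported CKAN.
try:
    toolkit.requires_ckan_version(min_version='2.0')
except toolkit.CkanVersionException:
    raise  # the extension cannot run on this CKAN at all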
def test_raise(monkeypatch):
    monkeypatch.setattr(tk.ckan, u"__version__", u"2")
    with pytest.raises(tk.CkanVersionException):
        tk.requires_ckan_version(min_version=u"3")
def test_no_raise(monkeypatch):
    monkeypatch.setattr(tk.ckan, u"__version__", u"2")
    tk.requires_ckan_version(min_version=u"2")
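# The two tests above can be collapsed into a single parametrized test;
# a minimal sketch under the same assumptions (tk aliases
# ckan.plugins.toolkit and a fake version string "2" is patched in):
import pytest
import ckan.plugins.toolkit as tk


@pytest.mark.parametrize(u"min_version,should_raise", [
    (u"3", True),   # running "2" < required "3": must raise
    (u"2", False),  # running "2" satisfies required "2": must pass
])
def test_requires_ckan_version(monkeypatch, min_version, should_raise):
    monkeypatch.setattr(tk.ckan, u"__version__", u"2")
    if should_raise:
        with pytest.raises(tk.CkanVersionException):
            tk.requires_ckan_version(min_version=min_version)
    else:
        tk.requires_ckan_version(min_version=min_version)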
def post(self, package_type):
    self._check_auth()

    pkg_id = None
    file_path = None
    data = self._clean_request_form()
    try:
        user = toolkit.c.user
        importer = ddiimporter.DdiImporter(username=user)

        if isinstance(data.get('upload'), FileStorage):
            log.debug('upload: %s' % data['upload'])
            file_path = self._save_temp_file(data['upload'].stream)
            log.debug('file_path: %s' % file_path)
            pkg_id = importer.run(
                file_path=file_path,
                upload=data['upload'],
                data=data,
            )
        elif data.get('url'):
            log.debug('url: %s' % data['url'])
            pkg_id = importer.run(
                url=data['url'],
                data=data,
            )
        else:
            raise PackageImportError(
                'An XML file (uploaded file or URL) is required')

        registry = ckanapi.LocalCKAN(username=user)
        resource_dict = {
            'package_id': pkg_id,
            'name': 'DDI RDF',
            'format': 'rdf',
            'url': '',
            'type': 'attachment',
            'file_type': 'other',
            'visibility': data.get('visibility', 'restricted'),
        }
        if isinstance(data.get('rdf_upload'), FileStorage):
            resource_dict['upload'] = data['rdf_upload']
            registry.call_action('resource_create', resource_dict)
        elif data.get('rdf_url'):
            resource_dict['url'] = data['rdf_url']
            registry.call_action('resource_create', resource_dict)

        toolkit.h.flash_success(
            toolkit._('Dataset import from XML successfully completed. ' +
                      'You can now add data files to it.'))
    except toolkit.ValidationError as e:
        errors = e.error_dict
        error_summary = e.error_summary
        return self.get(package_type, data, errors, error_summary)
    except Exception as e:
        errors = {
            'import': toolkit._('Dataset import from XML failed: %s'
                                % str(e))
        }
        return self.get(package_type, data, errors)
    finally:
        if file_path is not None:
            os.remove(file_path)

    if pkg_id is not None:
        try:
            toolkit.requires_ckan_version("2.9")
            url = toolkit.h.url_for(
                u'{}_resource.new'.format(package_type),
                id=pkg_id,
            )
        except toolkit.CkanVersionException:
            url = toolkit.h.url_for(
                controller='package',
                action='new_resource',
                id=pkg_id,
            )
        return toolkit.redirect_to(url)
    else:
        return toolkit.redirect_to(toolkit.h.url_for('ddi_import.import'))
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch in a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its job. The
          HarvestObjects need a reference date with the last modified
          date for the resource, this may need to be set in a different
          stage depending on the type of source.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created
          HarvestObjects.
        - to abort the harvest, create a HarvestGatherError and raise
          an exception. Any created HarvestObjects will be deleted.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.debug('In DataNorgeHarvester gather_stage (%s)',
              harvest_job.source.url)
    toolkit.requires_ckan_version(min_version='2.0')
    get_all_packages = True

    self._set_config(harvest_job.source.config)

    # Get source URL
    remote_datanorge_base_url = harvest_job.source.url.rstrip('/')

    pkg_dicts = []

    # Ideally we can request from the remote Datanorge only those datasets
    # modified since the last completely successful harvest.
    last_error_free_job = self._last_error_free_job(harvest_job)
    if (last_error_free_job and
            not self.config.get('force_all', False)):
        get_all_packages = False

        # Request only the datasets modified since
        last_time = last_error_free_job.gather_started
        # Note: SOLR works in UTC, and gather_started is also UTC, so
        # this should work as long as local and remote clocks are
        # relatively accurate. Going back a little earlier, just in case.
        get_changes_since = \
            (str(last_time - datetime.timedelta(hours=1)).split(' '))[0]
        log.info('Searching for datasets modified since: %s UTC',
                 get_changes_since)

        try:
            # Add the result from the search to pkg_dicts.
            pkg_dicts.extend(
                self._search_for_datasets(remote_datanorge_base_url,
                                          get_changes_since))
        except SearchError as e:
            log.info('Searching for datasets changed since last time '
                     'gave an error: %s', e)
            get_all_packages = True

        if not get_all_packages and not pkg_dicts:
            log.info('No datasets have been updated on the remote '
                     'DataNorge instance since the last harvest job %s',
                     last_time)
            return None
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch in a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its job. The
          HarvestObjects need a reference date with the last modified
          date for the resource, this may need to be set in a different
          stage depending on the type of source.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created
          HarvestObjects.
        - to abort the harvest, create a HarvestGatherError and raise
          an exception. Any created HarvestObjects will be deleted.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    log.debug('In GeonorgeHarvester gather_stage (%s)',
              harvest_job.source.url)
    toolkit.requires_ckan_version(min_version='2.0')
    get_all_packages = True

    self._set_config(harvest_job.source.config)

    # Get source URL
    remote_geonorge_base_url = harvest_job.source.url.rstrip('/')

    pkg_dicts = []

    # Retrieves the element at index 'index' from the iterable '_list'
    # (when passed a dict, this yields the nth key).
    def get_item_from_list(_list, index):
        counter = 0
        for item in _list:
            if counter == index:
                return item
            counter += 1

    # This makes a list with dicts of all possible search-combinations
    # needed to search for everything specified in the config:
    filter_include = {}
    fq_terms_list_length = 1
    for filter_item in self.config:
        if filter_item in ['text', 'title', 'uuid']:
            config_item = self.config[filter_item]
            if isinstance(config_item, basestring):  # Py2: str or unicode
                config_item = [config_item]
            filter_include[filter_item] = config_item
            fq_terms_list_length *= len(filter_include[filter_item])
        elif filter_item in ['datatypes', 'organizations', 'themes']:
            # There was a KeyError when filter_item was 'type'. This is
            # fixed by naming it 'datatypes' in the config and mapping
            # it back to 'type' here (the same goes for 'organizations'
            # and 'themes'):
            config_item = self.config[filter_item]
            if isinstance(config_item, basestring):
                config_item = [config_item]
            if filter_item == 'datatypes':
                filter_include['type'] = config_item
                fq_terms_list_length *= len(filter_include['type'])
            elif filter_item == 'organizations':
                filter_include['organization'] = config_item
                fq_terms_list_length *= len(filter_include['organization'])
            elif filter_item == 'themes':
                filter_include['theme'] = config_item
                fq_terms_list_length *= len(filter_include['theme'])

    # Set type to be 'dataset' by default:
    if 'type' not in filter_include:
        filter_include['type'] = ['dataset']

    fq_terms_list = [{} for i in range(fq_terms_list_length)]
    switchnum_max = 1
    filter_counter = 0
    for filter_item in filter_include:
        switchnum_counter = 0
        search_counter = 0
        for search in range(fq_terms_list_length):
            if switchnum_counter == switchnum_max:
                search_counter += 1
                switchnum_counter = 0
            switchnum_counter += 1
            fq_terms_list[search][filter_item] = \
                filter_include[filter_item][search_counter
                                            % len(filter_include[filter_item])]
        temp_filter_item = get_item_from_list(filter_include, filter_counter)
        if temp_filter_item is not None:
            switchnum_max *= len(filter_include[temp_filter_item])
        filter_counter += 1
    # End of search-combination making. All combinations of search
    # parameters are now stored in fq_terms_list, a list of
    # dictionaries. Each dictionary in the list describes one search
    # to be made.

    # Ideally we can request from the remote Geonorge only those datasets
    # modified since the last completely successful harvest.
    last_error_free_job = self._last_error_free_job(harvest_job)
    if (last_error_free_job and
            not self.config.get('force_all', False)):
        get_all_packages = False

        # Request only the datasets modified since
        last_time = last_error_free_job.gather_started
        # Note: SOLR works in UTC, and gather_started is also UTC, so
        # this should work as long as local and remote clocks are
        # relatively accurate. Going back a little earlier, just in case.
        get_changes_since = \
            (last_time - datetime.timedelta(hours=1)).isoformat()
        log.info('Searching for datasets modified since: %s UTC',
                 get_changes_since)

        try:
            # For every dictionary of search parameters in fq_terms_list:
            # add the result from the search to pkg_dicts.
            for fq_terms in fq_terms_list:
                pkg_dicts.extend(
                    self._search_for_datasets(remote_geonorge_base_url,
                                              fq_terms))
            pkg_dicts = \
                self._get_modified_datasets(pkg_dicts,
                                            remote_geonorge_base_url,
                                            get_changes_since)
        except SearchError as e:
            log.info('Searching for datasets changed since last time '
                     'gave an error: %s', e)
            get_all_packages = True

        if not get_all_packages and not pkg_dicts:
            log.info('No datasets have been updated on the remote '
                     'Geonorge instance since the last harvest job %s',
                     last_time)
            return None
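# The hand-rolled combination builder in the GeonorgeHarvester above
# enumerates the Cartesian product of all configured filter values.
# The same set of search dicts can be produced with itertools.product;
# a sketch under that assumption (the sample filter_include values are
# illustrative, and the ordering may differ from the original loop):
import itertools

filter_include = {
    'type': ['dataset', 'service'],
    'organization': ['kartverket'],
    'theme': ['inspire', 'norge-digitalt'],
}

# One dict per combination of filter values, like fq_terms_list above.
keys = list(filter_include)
fq_terms_list = [
    dict(zip(keys, values))
    for values in itertools.product(*(filter_include[k] for k in keys))
]
# 2 types x 1 organization x 2 themes -> 4 search dicts
assert len(fq_terms_list) == 4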