def create_new_version_of_subset(self, subset_id, orig_id):
    """Kick off creation of a new version of an existing subset package.

    Verifies the caller may update the subset, derives the next version
    name from the original package's version counter, and refuses to
    proceed if a package with that name already exists. Otherwise a
    background job is enqueued and the user is redirected back to the
    subset's read page.
    """
    context = {
        'model': model,
        'session': model.Session,
        'user': c.user,
        'ignore_capacity_check': True
    }
    h.check_access('package_update', {'id': subset_id})

    subset = tk.get_action('package_show')(context, {'id': subset_id})
    orig_pkg = tk.get_action('package_show')(context, {'id': orig_id})

    # Keep everything up to and including the "-v" marker, then append the
    # next version number, zero-padded to two digits.
    stem = subset['name'][:subset['name'].rfind("-v") + 2]
    next_version = str(helpers.get_version_number(orig_pkg)).zfill(2)
    new_ver_name = stem + next_version

    # add include_private for newer CKAN versions
    # ATTENTION deleted but not purged datasets cannot be found!
    search_results = tk.get_action('package_search')(context, {
        'rows': 10000,
        'fq': "name:%s" % (new_ver_name),
        'include_versions': True
    })

    if search_results['count'] > 0:
        h.flash_error(
            'The new version could not be created as another package already has the name "%s". Please create a new subset from the original package.'
            % (new_ver_name))
    else:
        # tk.enqueue_job exists on newer CKANs; fall back to ckanext-rq.
        try:
            enqueue_job = tk.enqueue_job
        except AttributeError:
            from ckanext.rq.jobs import enqueue as enqueue_job
        enqueue_job(create_new_version_of_subset_job,
                    [c.user, subset, orig_pkg])
        h.flash_notice(
            'Your version is being created. This might take a while, you will receive an E-Mail when your version is available.'
        )
    redirect(
        h.url_for(controller='package', action='read', id=subset['name']))
def xloader_submit(context, data_dict):
    ''' Submit a job to be Express Loaded. The Express Loader / 'xloader' is a
    service that imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data should
        be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the xloader will reload the file
        even if it haven't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when ckanext-xloader is not configured.

    :rtype: bool
    '''
    schema = context.get('schema',
                         ckanext.xloader.schema.xloader_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']
    p.toolkit.check_access('xloader_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        return False

    site_url = config['ckan.site_url']
    callback_url = site_url + '/api/3/action/xloader_hook'

    site_user = p.toolkit.get_action('get_site_user')(
        {'ignore_auth': True}, {})

    # Give IXloader plugins a chance to veto (or take over) the upload.
    for plugin in p.PluginImplementations(xloader_interfaces.IXloader):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    # Check if this resource is already in the process of being xloadered
    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'xloader',
        'last_updated': six.text_type(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'xloader',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'xloader',
            'key': 'xloader'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckanext.xloader.assume_task_stale_after', 3600)))
        assume_task_stillborn_after = \
            datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
        if existing_task.get('state') == 'pending':
            import re  # here because it takes a moment to load
            # Scrape resource ids out of the queued jobs' reprs; the u? makes
            # the pattern work on both Python 2 and Python 3 string reprs.
            queued_res_ids = [
                re.search(r"'resource_id': u?'([^']+)'",
                          job.description).groups()[0]
                for job in get_queue().get_jobs()
                if 'xloader_to_datastore' in six.text_type(job)  # filter out test_job etc
            ]
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if (res_id not in queued_res_ids and
                    time_since_last_updated > assume_task_stillborn_after):
                # it's not on the queue (and if it had just been started then
                # its taken too long to update the task_status from pending -
                # the first thing it should do in the xloader job).
                # Let it be restarted.
                log.info('A pending task was found %r, but its not found in '
                         'the queue %r and is %s hours old',
                         existing_task['id'], queued_res_ids,
                         time_since_last_updated)
            elif time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                log.info('A pending task was found %r, but it is only %s hours'
                         ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False

        task['id'] = existing_task['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    context['user'] = ''  # benign - needed for ckan 2.5

    # Use a throwaway session for the task-status write so it can be
    # committed independently of the caller's transaction.
    model = context['model']
    original_session = model.Session
    model.Session = model.meta.create_local_session()
    p.toolkit.get_action('task_status_update')(context, task)

    data = {
        'api_key': site_user['apikey'],
        'job_type': 'xloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': data_dict.get('ignore_hash', False),
            'ckan_url': site_url,
            'resource_id': res_id,
            'set_url_type': data_dict.get('set_url_type', False),
            'task_created': task['last_updated'],
            'original_url': resource_dict.get('url'),
        }
    }
    timeout = config.get('ckanext.xloader.job_timeout', '3600')
    try:
        try:
            job = enqueue_job(jobs.xloader_data_into_datastore, [data],
                              timeout=timeout)
        except TypeError:
            # older ckans didn't allow the timeout keyword
            job = _enqueue(jobs.xloader_data_into_datastore, [data],
                           timeout=timeout)
    except Exception:
        log.exception('Unable to enqueue xloader res_id=%s', res_id)
        model.Session = original_session
        return False
    log.debug('Enqueued xloader job=%s res_id=%s', job.id, res_id)

    value = json.dumps({'job_id': job.id})
    task['value'] = value
    task['state'] = 'pending'
    # FIX: removed a stray trailing comma that turned this into a 1-tuple.
    # The stale-task check above parses this field with strptime, which
    # would crash on the tuple's repr on the next submit for this resource.
    task['last_updated'] = six.text_type(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)
    model.Session = original_session

    return True
def can_upload(self, res_id):
    """IXloader hook: take over loading of a resource with xspatialloader.

    Mirrors ``xloader_submit``: records a task_status row, de-duplicates
    against pending/stale tasks, and enqueues an
    ``xspatialloader_to_datastore`` job for the resource.

    NOTE(review): this always returns ``False`` — presumably intentional,
    so the stock xloader job is suppressed and the spatial loader handles
    the resource instead; confirm against the IXloader contract.
    """
    context = {}
    data_dict = {}
    log.debug(" xspatial sees %s", res_id)
    try:
        resource_dict = plugins.toolkit.get_action('resource_show')(
            context, {
                'id': res_id,
            })
    except logic.NotFound:
        return False

    site_url = config['ckan.site_url']
    callback_url = site_url + '/api/3/action/xloader_hook'

    site_user = plugins.toolkit.get_action('get_site_user')(
        {
            'ignore_auth': True
        }, {})

    # Check if this resource is already in the process of being xloadered
    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'xloader',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'xloader',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = plugins.toolkit.get_action('task_status_show')(
            context, {
                'entity_id': res_id,
                'task_type': 'xloader',
                'key': 'xloader'
            })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckanext.xloader.assume_task_stale_after', 3600)))
        assume_task_stillborn_after = \
            datetime.timedelta(seconds=int(
                config.get('ckanext.xloader.assume_task_stillborn_after', 5)))
        if existing_task.get('state') == 'pending':
            import re  # here because it takes a moment to load
            # FIX: pattern now matches both Python 2 (u'...') and Python 3
            # ('...') job reprs, consistent with xloader_submit; the old
            # u'-only pattern never matched on Python 3, so pending jobs
            # were never detected as queued.
            queued_res_ids = [
                re.search(r"'resource_id': u?'([^']+)'",
                          job.description).groups()[0]
                for job in get_queue().get_jobs()
                if 'xspatialloader_to_datastore' in str(
                    job)  # filter out test_job etc
            ]
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if (res_id not in queued_res_ids and
                    time_since_last_updated > assume_task_stillborn_after):
                # it's not on the queue (and if it had just been started then
                # its taken too long to update the task_status from pending -
                # the first thing it should do in the xloader job).
                # Let it be restarted.
                log.info(
                    'A pending task was found %r, but its not found in '
                    'the queue %r and is %s hours old', existing_task['id'],
                    queued_res_ids, time_since_last_updated)
            elif time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                log.info(
                    'A pending task was found %r, but it is only %s hours'
                    ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info(
                    'A pending task was found %s for this resource, so '
                    'skipping this duplicate task', existing_task['id'])
                return False

        task['id'] = existing_task['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    context['user'] = ''  # benign - needed for ckan 2.5

    plugins.toolkit.get_action('task_status_update')(context, task)
    data = {
        'api_key': site_user['apikey'],
        'job_type': 'xspatialloader_to_datastore',
        'result_url': callback_url,
        'metadata': {
            'ignore_hash': data_dict.get('ignore_hash', False),
            'ckan_url': site_url,
            'resource_id': res_id,
            'set_url_type': data_dict.get('set_url_type', False),
            'task_created': task['last_updated'],
            'original_url': resource_dict.get('url'),
        }
    }
    timeout = config.get('ckanext.xloader.job_timeout', '3600')
    try:
        try:
            job = enqueue_job(jobs.xspatialloader_data_into_datastore, [data],
                              timeout=timeout)
        except TypeError:
            # older ckans didn't allow the timeout keyword
            job = _enqueue(jobs.xspatialloader_data_into_datastore, [data],
                           timeout=timeout)
    except Exception:
        log.exception('Unable to enqueue xspatialloader res_id=%s', res_id)
        return False
    log.debug('Enqueued xspatialloader job=%s res_id=%s', job.id, res_id)

    value = json.dumps({'job_id': job.id})
    task['value'] = value
    task['state'] = 'pending'
    # FIX: removed a stray trailing comma that stored a 1-tuple instead of a
    # string; strptime above would crash on it on the next call.
    task['last_updated'] = str(datetime.datetime.utcnow())
    plugins.toolkit.get_action('task_status_update')(context, task)

    # Returning False tells the xloader core NOT to run its own job for this
    # resource (the xspatialloader job enqueued above handles it).
    return False
def extractor_extract(context, data_dict):
    """
    Extract and store metadata for a resource.

    Metadata extraction is done in an asynchronous background job, so
    this function may return before extraction is complete.

    :param string id: The ID or name of the resource

    :param boolean force: Extract metadata even if the resource hasn't
        changed, or if an extraction task is already scheduled for the
        resource (optional).

    :rtype: A dict with the following keys:

        :status: A string describing the state of the metadata. This
            can be one of the following:

                :new: if no metadata for the resource existed before

                :update: if metadata existed but is going to be updated

                :unchanged: if metadata existed but won't get updated
                    (for example because the resource's URL did not
                    change since the last extraction)

                :inprogress: if a background extraction task for this
                    resource is already in progress

                :ignored: if the resource format is configured to be
                    ignored

            Note that if ``force`` is true then an extraction job will
            be scheduled regardless of the status reported, unless that
            status is ``ignored``.

        :task_id: The ID of the background task. If ``state`` is ``new``
            or ``update`` then this is the ID of a newly created task.
            If ``state`` is ``inprogress`` then it's the ID of the
            existing task. Otherwise it is ``null``. If ``force`` is
            true then this is the ID of the new extraction task.
    """
    log.debug('extractor_extract {}'.format(data_dict['id']))
    force = data_dict.get('force', False)
    resource = toolkit.get_action('resource_show')(context, data_dict)
    task_id = None
    metadata = None

    # Classify the resource's extraction state.
    try:
        metadata = ResourceMetadata.one(resource_id=resource['id'])
    except NoResultFound:
        # Never extracted before: schedule it unless the format is ignored.
        status = 'new' if is_format_indexed(resource['format']) else 'ignored'
    else:
        if metadata.task_id:
            # A background task is already running for this resource.
            status = 'inprogress'
            task_id = metadata.task_id
        elif not is_format_indexed(resource['format']):
            # Format no longer indexed: drop the stale metadata record.
            metadata.delete()
            metadata.commit()
            metadata = None
            status = 'ignored'
        elif (metadata.last_url != resource['url']
                or metadata.last_format != resource['format']):
            status = 'update'
        else:
            status = 'unchanged'

    # Schedule extraction for new/changed resources, or whenever forced
    # (except for ignored formats).
    if status in ('new', 'update') or (force and status != 'ignored'):
        args = (config['__file__'], resource)
        title = 'Metadata extraction for resource {}'.format(resource['id'])
        if metadata is None:
            metadata = ResourceMetadata.create(resource_id=resource['id'])
        job = enqueue_job(extract, args, title=title)
        task_id = metadata.task_id = job.id
        metadata.save()

    return {
        'status': status,
        'task_id': task_id,
    }