def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the
        data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        # Resource no longer exists, nothing to push.
        return False

    datapusher_url = config.get('ckan.datapusher.url')

    callback_url_base = config.get('ckan.datapusher.callback_url_base')
    if callback_url_base:
        # A dedicated callback base is configured (e.g. an internal address
        # the datapusher service can reach); build the hook URL from it.
        site_url = callback_url_base
        callback_url = urljoin(
            callback_url_base.rstrip('/'), '/api/3/action/datapusher_hook')
    else:
        site_url = h.url_for('/', qualified=True)
        callback_url = h.url_for(
            '/api/3/action/datapusher_hook', qualified=True)

    # Give IDataPusher plugins a chance to veto this upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckan.datapusher.assume_task_stale_after', 3600)))
        if existing_task.get('state') == 'pending':
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                log.info(
                    'A pending task was found %r, but it is only %s hours'
                    ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info(
                    'A pending task was found %s for this resource, so '
                    'skipping this duplicate task', existing_task['id'])
                return False
        # Reuse the existing task record instead of creating a duplicate.
        task['id'] = existing_task['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    # Use local session for task_status_update, so it can commit its own
    # results without messing up with the parent session that contains pending
    # updats of dataset/resource/etc.
    context['session'] = context['model'].meta.create_local_session()
    p.toolkit.get_action('task_status_update')(context, task)

    # The datapusher job authenticates back to CKAN as the site user.
    site_user = p.toolkit.get_action('get_site_user')({
        'ignore_auth': True
    }, {})
    try:
        r = requests.post(
            urljoin(datapusher_url, 'job'),
            headers={'Content-Type': 'application/json'},
            data=json.dumps({
                'api_key': site_user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False),
                    'task_created': task['last_updated'],
                    'original_url': resource_dict.get('url'),
                }
            }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: a trailing comma previously made this a 1-tuple instead of
        # the timestamp string that the stale-task check parses via strptime.
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
    except requests.exceptions.HTTPError as e:
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
            if body.get('error'):
                m += ' ' + body['error']
        except ValueError:
            # Response body was not JSON; keep the raw text for the record.
            body = e.response.text
        error = {'message': m, 'details': body, 'status_code': r.status_code}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma (was a 1-tuple, not a string).
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    value = json.dumps({
        'job_id': r.json()['job_id'],
        'job_key': r.json()['job_key']
    })

    task['value'] = value
    task['state'] = 'pending'
    # BUG FIX: removed trailing comma (was a 1-tuple, not a string).
    task['last_updated'] = str(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)

    return True
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the
        data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    datapusher_url = pylons.config.get('ckan.datapusher.url')

    site_url = pylons.config['ckan.site_url']
    # The datapusher reports its progress/result back to this action.
    callback_url = site_url.rstrip('/') + '/api/3/action/datapusher_hook'

    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse an existing task record for this resource, if any.
        task_id = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
        task['id'] = task_id
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    task_id = result['id']

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={'Content-Type': 'application/json'},
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False)
                }
            }))
        r.raise_for_status()
    # FIX: `except X, e` is Python-2-only; `as e` works on 2.6+ and 3.
    except requests.exceptions.ConnectionError as e:
        error = {'message': 'Could not connect to DataPusher.',
                 'details': str(e)}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma that made this a 1-tuple instead of
        # the timestamp string used everywhere else in the task dict.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the
        data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        # Resource no longer exists, nothing to push.
        return False

    datapusher_url = pylons.config.get('ckan.datapusher.url')

    site_url = pylons.config['ckan.site_url']
    # The datapusher reports its progress/result back to this action.
    callback_url = site_url.rstrip('/') + '/api/3/action/datapusher_hook'

    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    # Give IDataPusher plugins a chance to veto this upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse an existing task record for this resource, if any.
        task_id = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
        task['id'] = task_id
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    task_id = result['id']

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={'Content-Type': 'application/json'},
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False),
                    'task_created': task['last_updated'],
                    'original_url': resource_dict.get('url'),
                }
            }))
        r.raise_for_status()
    # FIX: `except X, e` is Python-2-only; `as e` works on 2.6+ and 3.
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma that made this a 1-tuple instead of
        # the timestamp string used everywhere else in the task dict.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the
        data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    datapusher_url = pylons.config.get('ckan.datapusher.url')

    # The datapusher reports its progress/result back to this action.
    callback_url = p.toolkit.url_for(
        controller='api', action='action',
        logic_function='datapusher_hook', ver=3, qualified=True)

    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse an existing task record for this resource, if any.
        task_id = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
        task['id'] = task_id
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    task_id = result['id']

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={'Content-Type': 'application/json'},
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ckan_url': pylons.config['ckan.site_url'],
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False)
                }
            }))
        r.raise_for_status()
    # FIX: `except X, e` is Python-2-only; `as e` works on 2.6+ and 3.
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma that made this a 1-tuple instead of
        # the timestamp string used everywhere else in the task dict.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the
        data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        # Resource no longer exists, nothing to push.
        return False

    datapusher_url = config.get('ckan.datapusher.url')

    site_url = h.url_for('/', qualified=True)
    callback_url = h.url_for('/api/3/action/datapusher_hook', qualified=True)

    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    # Give IDataPusher plugins a chance to veto this upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckan.datapusher.assume_task_stale_after', 3600)))
        if existing_task.get('state') == 'pending':
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than its still in progress. Let it be restarted.
                log.info('A pending task was found %r, but it is only %s hours'
                         ' old', existing_task['id'], time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False
        # Reuse the existing task record instead of creating a duplicate.
        task['id'] = existing_task['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    p.toolkit.get_action('task_status_update')(context, task)

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={'Content-Type': 'application/json'},
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False),
                    'task_created': task['last_updated'],
                    'original_url': resource_dict.get('url'),
                }
            }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {'message': 'Could not connect to DataPusher.',
                 'details': str(e)}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma that made this a 1-tuple instead of
        # the timestamp string that the stale-task check parses via strptime.
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
    except requests.exceptions.HTTPError as e:
        # BUG FIX: `e.message` was removed in Python 3 (deprecated since
        # 2.6); str(e) gives the same text portably.
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
        except ValueError:
            # Response body was not JSON; keep the raw text for the record.
            body = e.response.text
        error = {'message': m, 'details': body, 'status_code': r.status_code}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # BUG FIX: removed trailing comma (was a 1-tuple, not a string).
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    value = json.dumps({'job_id': r.json()['job_id'],
                        'job_key': r.json()['job_key']})

    task['value'] = value
    task['state'] = 'pending'
    # BUG FIX: removed trailing comma (was a 1-tuple, not a string).
    task['last_updated'] = str(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)

    return True