Example #1
0
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured, the
    resource does not exist, a plugin rejected the upload or a recent
    pending task already exists for the resource.

    :raises ValidationError: if ``data_dict`` does not validate, or if the
        request to the datapusher fails (connection or HTTP error).

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        return False

    datapusher_url = config.get('ckan.datapusher.url')
    if not datapusher_url:
        # Documented contract: nothing is submitted when the datapusher is
        # not configured.
        return False

    callback_url_base = config.get('ckan.datapusher.callback_url_base')
    if callback_url_base:
        site_url = callback_url_base
        callback_url = urljoin(callback_url_base.rstrip('/'),
                               '/api/3/action/datapusher_hook')
    else:
        site_url = h.url_for('/', qualified=True)
        callback_url = h.url_for('/api/3/action/datapusher_hook',
                                 qualified=True)

    # Any IDataPusher plugin may veto the upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }

    try:
        existing_task = p.toolkit.get_action('task_status_show')(
            context, {
                'entity_id': res_id,
                'task_type': 'datapusher',
                'key': 'datapusher'
            })
    except logic.NotFound:
        existing_task = None

    if existing_task is not None:
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckan.datapusher.assume_task_stale_after', 3600)))
        if existing_task.get('state') == 'pending':
            updated = datetime.datetime.strptime(existing_task['last_updated'],
                                                 '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than it's still in progress. Let it be restarted.
                log.info(
                    'A pending task was found %r, but it is %s old, so it '
                    'is assumed to be stale and will be resubmitted',
                    existing_task['id'], time_since_last_updated)
            else:
                log.info(
                    'A pending task was found %s for this resource, so '
                    'skipping this duplicate task', existing_task['id'])
                return False

        # Reuse the existing task record instead of creating a new one.
        task['id'] = existing_task['id']

    context['ignore_auth'] = True
    # Use local session for task_status_update, so it can commit its own
    # results without messing up with the parent session that contains pending
    # updates of dataset/resource/etc.
    context['session'] = context['model'].meta.create_local_session()
    p.toolkit.get_action('task_status_update')(context, task)

    site_user = p.toolkit.get_action('get_site_user')({
        'ignore_auth': True
    }, {})

    try:
        r = requests.post(urljoin(datapusher_url, 'job'),
                          headers={'Content-Type': 'application/json'},
                          data=json.dumps({
                              'api_key': site_user['apikey'],
                              'job_type': 'push_to_datastore',
                              'result_url': callback_url,
                              'metadata': {
                                  'ignore_hash':
                                  data_dict.get('ignore_hash', False),
                                  'ckan_url':
                                  site_url,
                                  'resource_id':
                                  res_id,
                                  'set_url_type':
                                  data_dict.get('set_url_type', False),
                                  'task_created':
                                  task['last_updated'],
                                  'original_url':
                                  resource_dict.get('url'),
                              }
                          }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # NOTE: no trailing comma here - it would store a 1-tuple instead of
        # the timestamp string.
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    except requests.exceptions.HTTPError as e:
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
            # The error body is not guaranteed to be a JSON object.
            if isinstance(body, dict) and body.get('error'):
                m += ' ' + body['error']
        except ValueError:
            body = e.response.text
        error = {
            'message': m,
            'details': body,
            'status_code': e.response.status_code
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    # Parse the response once and remember the job handle on the task.
    job = r.json()
    task['value'] = json.dumps({
        'job_id': job['job_id'],
        'job_key': job['job_key']
    })
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)

    return True
Example #2
0
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :raises ValidationError: if ``data_dict`` does not validate, or if the
        request to the datapusher fails (connection or HTTP error).

    :rtype: bool
    '''

    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    datapusher_url = pylons.config.get('ckan.datapusher.url')
    if not datapusher_url:
        # Documented contract: nothing is submitted when the datapusher is
        # not configured.
        return False

    site_url = pylons.config['ckan.site_url']
    callback_url = site_url.rstrip('/') + '/api/3/action/datapusher_hook'

    # The job is submitted with the API key of the calling user.
    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse the existing task record for this resource, if there is one.
        task['id'] = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    # Keep the id returned by the update so that the later status updates
    # below address the same task record.
    task['id'] = result['id']

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={
                'Content-Type': 'application/json'
            },
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False)
                }
            }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {'message': 'Could not connect to DataPusher.',
                 'details': str(e)}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # NOTE: no trailing comma here - it would store a 1-tuple instead of
        # the timestamp string.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
    except requests.exceptions.HTTPError as e:
        # raise_for_status() signals 4xx/5xx responses with HTTPError; record
        # the failure instead of leaving the task stuck in 'submitting'.
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
        except ValueError:
            body = e.response.text
        error = {'message': m,
                 'details': body,
                 'status_code': e.response.status_code}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    # Remember the job handle returned by the datapusher and mark the task
    # as pending.
    job = r.json()
    task['value'] = json.dumps({'job_id': job['job_id'],
                                'job_key': job['job_key']})
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.now())
    p.toolkit.get_action('task_status_update')(context, task)

    return True
Example #3
0
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured, the
    resource does not exist or a plugin rejected the upload.

    :raises ValidationError: if ``data_dict`` does not validate, or if the
        request to the datapusher fails (connection or HTTP error).

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        return False

    datapusher_url = pylons.config.get('ckan.datapusher.url')
    if not datapusher_url:
        # Documented contract: nothing is submitted when the datapusher is
        # not configured.
        return False

    site_url = pylons.config['ckan.site_url']
    callback_url = site_url.rstrip('/') + '/api/3/action/datapusher_hook'

    # The job is submitted with the API key of the calling user.
    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    # Any IDataPusher plugin may veto the upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse the existing task record for this resource, if there is one.
        task['id'] = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    # Keep the id returned by the update so that the later status updates
    # below address the same task record.
    task['id'] = result['id']

    try:
        r = requests.post(urlparse.urljoin(datapusher_url, 'job'),
                          headers={'Content-Type': 'application/json'},
                          data=json.dumps({
                              'api_key': user['apikey'],
                              'job_type': 'push_to_datastore',
                              'result_url': callback_url,
                              'metadata': {
                                  'ignore_hash':
                                  data_dict.get('ignore_hash', False),
                                  'ckan_url':
                                  site_url,
                                  'resource_id':
                                  res_id,
                                  'set_url_type':
                                  data_dict.get('set_url_type', False),
                                  'task_created':
                                  task['last_updated'],
                                  'original_url':
                                  resource_dict.get('url'),
                              }
                          }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # NOTE: no trailing comma here - it would store a 1-tuple instead of
        # the timestamp string.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
    except requests.exceptions.HTTPError as e:
        # raise_for_status() signals 4xx/5xx responses with HTTPError; record
        # the failure instead of leaving the task stuck in 'submitting'.
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
        except ValueError:
            body = e.response.text
        error = {
            'message': m,
            'details': body,
            'status_code': e.response.status_code
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    # Remember the job handle returned by the datapusher and mark the task
    # as pending.
    job = r.json()
    task['value'] = json.dumps({
        'job_id': job['job_id'],
        'job_key': job['job_key']
    })
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.now())
    p.toolkit.get_action('task_status_update')(context, task)

    return True
Example #4
0
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured.

    :raises ValidationError: if ``data_dict`` does not validate, or if the
        request to the datapusher fails (connection or HTTP error).

    :rtype: bool
    '''

    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    datapusher_url = pylons.config.get('ckan.datapusher.url')
    if not datapusher_url:
        # Documented contract: nothing is submitted when the datapusher is
        # not configured.
        return False

    # Fully qualified URL of the datapusher_hook API action, passed to the
    # datapusher as the job's result_url.
    callback_url = p.toolkit.url_for(controller='api',
                                     action='action',
                                     logic_function='datapusher_hook',
                                     ver=3,
                                     qualified=True)

    # The job is submitted with the API key of the calling user.
    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.now()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }
    try:
        # Reuse the existing task record for this resource, if there is one.
        task['id'] = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })['id']
    except logic.NotFound:
        pass

    context['ignore_auth'] = True
    result = p.toolkit.get_action('task_status_update')(context, task)
    # Keep the id returned by the update so that the later status updates
    # below address the same task record.
    task['id'] = result['id']

    try:
        r = requests.post(urlparse.urljoin(datapusher_url, 'job'),
                          headers={'Content-Type': 'application/json'},
                          data=json.dumps({
                              'api_key': user['apikey'],
                              'job_type': 'push_to_datastore',
                              'result_url': callback_url,
                              'metadata': {
                                  'ckan_url':
                                  pylons.config['ckan.site_url'],
                                  'resource_id':
                                  res_id,
                                  'set_url_type':
                                  data_dict.get('set_url_type', False)
                              }
                          }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {
            'message': 'Could not connect to DataPusher.',
            'details': str(e)
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # NOTE: no trailing comma here - it would store a 1-tuple instead of
        # the timestamp string.
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)
    except requests.exceptions.HTTPError as e:
        # raise_for_status() signals 4xx/5xx responses with HTTPError; record
        # the failure instead of leaving the task stuck in 'submitting'.
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
        except ValueError:
            body = e.response.text
        error = {
            'message': m,
            'details': body,
            'status_code': e.response.status_code
        }
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        task['last_updated'] = str(datetime.datetime.now())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    # Remember the job handle returned by the datapusher and mark the task
    # as pending.
    job = r.json()
    task['value'] = json.dumps({
        'job_id': job['job_id'],
        'job_key': job['job_key']
    })
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.now())
    p.toolkit.get_action('task_status_update')(context, task)

    return True
Example #5
0
def datapusher_submit(context, data_dict):
    ''' Submit a job to the datapusher. The datapusher is a service that
    imports tabular data into the datastore.

    :param resource_id: The resource id of the resource that the data
        should be imported in. The resource's URL will be used to get the data.
    :type resource_id: string
    :param set_url_type: If set to True, the ``url_type`` of the resource will
        be set to ``datastore`` and the resource URL will automatically point
        to the :ref:`datastore dump <dump>` URL. (optional, default: False)
    :type set_url_type: bool
    :param ignore_hash: If set to True, the datapusher will reload the file
        even if it hasn't changed. (optional, default: False)
    :type ignore_hash: bool

    Returns ``True`` if the job has been submitted and ``False`` if the job
    has not been submitted, i.e. when the datapusher is not configured, the
    resource does not exist, a plugin rejected the upload or a recent
    pending task already exists for the resource.

    :raises ValidationError: if ``data_dict`` does not validate, or if the
        request to the datapusher fails (connection or HTTP error).

    :rtype: bool
    '''
    schema = context.get('schema', dpschema.datapusher_submit_schema())
    data_dict, errors = _validate(data_dict, schema, context)
    if errors:
        raise p.toolkit.ValidationError(errors)

    res_id = data_dict['resource_id']

    p.toolkit.check_access('datapusher_submit', context, data_dict)

    try:
        resource_dict = p.toolkit.get_action('resource_show')(context, {
            'id': res_id,
        })
    except logic.NotFound:
        return False

    datapusher_url = config.get('ckan.datapusher.url')
    if not datapusher_url:
        # Documented contract: nothing is submitted when the datapusher is
        # not configured.
        return False

    site_url = h.url_for('/', qualified=True)
    callback_url = h.url_for('/api/3/action/datapusher_hook', qualified=True)

    # The job is submitted with the API key of the calling user.
    user = p.toolkit.get_action('user_show')(context, {'id': context['user']})

    # Any IDataPusher plugin may veto the upload.
    for plugin in p.PluginImplementations(interfaces.IDataPusher):
        upload = plugin.can_upload(res_id)
        if not upload:
            msg = "Plugin {0} rejected resource {1}"\
                .format(plugin.__class__.__name__, res_id)
            log.info(msg)
            return False

    task = {
        'entity_id': res_id,
        'entity_type': 'resource',
        'task_type': 'datapusher',
        'last_updated': str(datetime.datetime.utcnow()),
        'state': 'submitting',
        'key': 'datapusher',
        'value': '{}',
        'error': '{}',
    }

    try:
        existing_task = p.toolkit.get_action('task_status_show')(context, {
            'entity_id': res_id,
            'task_type': 'datapusher',
            'key': 'datapusher'
        })
    except logic.NotFound:
        existing_task = None

    if existing_task is not None:
        assume_task_stale_after = datetime.timedelta(seconds=int(
            config.get('ckan.datapusher.assume_task_stale_after', 3600)))
        if existing_task.get('state') == 'pending':
            updated = datetime.datetime.strptime(
                existing_task['last_updated'], '%Y-%m-%dT%H:%M:%S.%f')
            time_since_last_updated = datetime.datetime.utcnow() - updated
            if time_since_last_updated > assume_task_stale_after:
                # it's been a while since the job was last updated - it's more
                # likely something went wrong with it and the state wasn't
                # updated than it's still in progress. Let it be restarted.
                log.info('A pending task was found %r, but it is %s old, so '
                         'it is assumed to be stale and will be resubmitted',
                         existing_task['id'], time_since_last_updated)
            else:
                log.info('A pending task was found %s for this resource, so '
                         'skipping this duplicate task', existing_task['id'])
                return False

        # Reuse the existing task record instead of creating a new one.
        task['id'] = existing_task['id']

    context['ignore_auth'] = True
    p.toolkit.get_action('task_status_update')(context, task)

    try:
        r = requests.post(
            urlparse.urljoin(datapusher_url, 'job'),
            headers={
                'Content-Type': 'application/json'
            },
            data=json.dumps({
                'api_key': user['apikey'],
                'job_type': 'push_to_datastore',
                'result_url': callback_url,
                'metadata': {
                    'ignore_hash': data_dict.get('ignore_hash', False),
                    'ckan_url': site_url,
                    'resource_id': res_id,
                    'set_url_type': data_dict.get('set_url_type', False),
                    'task_created': task['last_updated'],
                    'original_url': resource_dict.get('url'),
                }
            }))
        r.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        error = {'message': 'Could not connect to DataPusher.',
                 'details': str(e)}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        # NOTE: no trailing comma here - it would store a 1-tuple instead of
        # the timestamp string.
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    except requests.exceptions.HTTPError as e:
        # str(e) rather than e.message: exceptions have no ``message``
        # attribute on Python 3.
        m = 'An Error occurred while sending the job: {0}'.format(str(e))
        try:
            body = e.response.json()
        except ValueError:
            body = e.response.text
        error = {'message': m,
                 'details': body,
                 'status_code': e.response.status_code}
        task['error'] = json.dumps(error)
        task['state'] = 'error'
        task['last_updated'] = str(datetime.datetime.utcnow())
        p.toolkit.get_action('task_status_update')(context, task)
        raise p.toolkit.ValidationError(error)

    # Parse the response once and remember the job handle on the task.
    job = r.json()
    task['value'] = json.dumps({'job_id': job['job_id'],
                                'job_key': job['job_key']})
    task['state'] = 'pending'
    task['last_updated'] = str(datetime.datetime.utcnow())
    p.toolkit.get_action('task_status_update')(context, task)

    return True