Ejemplo n.º 1
0
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except DownloadError, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except ChooseNotToDownload, e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
    except Exception, e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id):
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), e, e.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            extra_args = [e.url_redirected_to
                          ] if 'url_redirected_to' in e else []
            _save(download_status_id, e, resource, *extra_args)
Ejemplo n.º 2
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource to process (checked with is_id)
      queue - name of the celery queue
      log - logger used for progress and error messages

    Should only raise on a fundamental error:
      ArchiverError
      CkanError
    An unexpected exception from the download step is re-raised only when
    the DEBUG environment variable is set; otherwise it is logged and saved
    with the 'Download failure' status.

    Returns a JSON string (serialized for Celery; only used by tests) of the
    download result dict merged with the archive result dict, e.g. the keys
    built for a locally uploaded file are:
        mimetype, size, hash, headers, saved_file, url_redirected_to,
        request_type, cache_filepath, cache_url
    Returns None if the download or archival failed (the failure is saved
    via save_archival) or if the remote content has not changed.
    """

    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        try:
            os.mkdir(settings.ARCHIVE_DIR)
        except OSError:
            # Another worker may have created the directory between the
            # existence check and mkdir; only fail if it is still missing.
            if not os.path.exists(settings.ARCHIVE_DIR):
                raise

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        """Record the archival outcome and broadcast the notification."""
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        # NOTE(review): the archive result built in this function uses the
        # key 'cache_filepath', not 'cache_filename', so this lookup yields
        # None for it. Left unchanged - confirm against archive_resource()
        # and notify_resource() before switching keys.
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    # Relative URLs are resolved against the site URL.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        # The upload counts as external when its URL is outside this site or
        # the uploader's path carries a URL scheme instead of a local path.
        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s" %
                     resource['url'])

            try:
                file_hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, _ = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            # The file is already on local storage, so build the result
            # dicts directly instead of downloading and archiving it.
            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': file_hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }

            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext-archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }

    # Each handled exception maps to a status; `err` keeps the exception for
    # logging and saving after the try block.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except (DownloadException, DownloadError) as e:
        # A plain GET failed; the URL may still answer as an API endpoint.
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err is not None:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # `err.args` is a tuple and has no `url_redirected_to` attribute,
            # so read the redirect target from the exception object itself
            # and forward it only when the exception carries one.
            url_redirected_to = getattr(err, 'url_redirected_to', None)
            extra_args = ([url_redirected_to]
                          if url_redirected_to is not None else [])
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
Ejemplo n.º 3
0
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except DownloadError, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except ChooseNotToDownload, e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
    except Exception, e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id):
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), e, e.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            extra_args = [e.url_redirected_to] if 'url_redirected_to' in e else []
            _save(download_status_id, e, resource, *extra_args)
            return