Esempio n. 1
0
 def test_not_available_any_more(self):
     """Score drops to 0 when the archiver now reports the file is gone.

     A cache of the data still exists from the previous run, but this
     time, the archiver found the file gave a 404.  The previously
     stored QA format ('CSV') should still be reported.
     """
     # The record of the previous (successful) run of QA.
     res = self._test_resource(license_id=None, format=None)
     qa = qa_model.QA.create(res.id)
     qa.format = 'CSV'
     model.Session.add(qa)
     model.Session.commit()
     # cache still exists from the previous run, but this time, the archiver
     # found the file gave a 404.
     archival = Archival.get_for_resource(res.id)
     archival.cache_filepath = __file__  # any file that exists will do as the cache
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 404 error'
     archival.last_success = datetime.datetime(year=2008, month=10, day=1)
     archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
     archival.failure_count = 1
     archival.is_broken = True
     result = resource_score(res, log)
     assert result['openness_score'] == 0, result
     assert_equal(result['format'], 'CSV')
     # in preference it should report that it is not available
     # NOTE(review): the 'Attempted on 10/10/2008' date presumably comes from
     # archival.updated set by the test fixture - it is not set here; confirm.
     assert_equal(
         result['openness_score_reason'],
         'File could not be downloaded. Reason: Download error. Error details: Server returned 404 error. Attempted on 10/10/2008. This URL last worked on: 01/10/2008.'
     )
Esempio n. 2
0
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.
    '''
    model = context['model']
    session = context['session']
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    # Start with the identifying fields, then fold in whatever archival/QA
    # information exists; missing records yield None for every field.
    info = {'name': pkg.name,
            'title': pkg.title,
            'id': res.id}
    if archival:
        info['archival_updated'] = (archival.updated.isoformat()
                                    if archival.updated else None)
        info['archival_is_broken'] = archival.is_broken
        info['archival_reason'] = archival.reason
        info['archival_url_redirected_to'] = archival.url_redirected_to
    else:
        info['archival_updated'] = None
        info['archival_is_broken'] = None
        info['archival_reason'] = None
        info['archival_url_redirected_to'] = None
    if qa:
        info['openness_score'] = qa.openness_score
        info['openness_score_reason'] = qa.openness_score_reason
        info['updated'] = qa.updated.isoformat() if qa.updated else None
        info['format'] = qa.format
    else:
        info['openness_score'] = None
        info['openness_score_reason'] = None
        info['updated'] = None
        info['format'] = None
    return info
Esempio n. 3
0
def save_archival(resource, status_id, reason, url_redirected_to,
                  download_result, archive_result, log):
    '''Writes to the archival table the result of an attempt to download
    the resource.

    :param resource: resource dict (must contain 'id' and 'revision_id')
    :param status_id: Status id describing the outcome of the attempt
    :param reason: human-readable detail accompanying the status
    :param url_redirected_to: final URL if the download was redirected
    :param download_result: dict with 'size', 'mimetype', 'hash', 'headers'
        - only read when the attempt succeeded
    :param archive_result: dict with 'cache_filepath', 'cache_url'
        - only read when the attempt succeeded
    :param log: logger to report progress to

    May propagate a CkanError.
    '''
    now = datetime.datetime.now()

    from ckanext.archiver.model import Archival, Status
    from ckan import model

    archival = Archival.get_for_resource(resource['id'])
    first_archival = not archival
    previous_archival_was_broken = None
    if not archival:
        archival = Archival.create(resource['id'])
        model.Session.add(archival)
    else:
        log.info('Archival from before: %r', archival)
        previous_archival_was_broken = archival.is_broken

    # Record which revision of the resource this archival applies to.
    revision = model.Session.query(model.Revision).get(resource['revision_id'])
    archival.resource_timestamp = revision.timestamp

    # Details of the latest archival attempt
    archival.status_id = status_id
    archival.is_broken = Status.is_status_broken(status_id)
    archival.reason = reason
    archival.url_redirected_to = url_redirected_to

    # Details of successful archival
    if archival.is_broken is False:
        archival.cache_filepath = archive_result['cache_filepath']
        archival.cache_url = archive_result['cache_url']
        archival.size = download_result['size']
        archival.mimetype = download_result['mimetype']
        archival.hash = download_result['hash']
        # 'headers' may be None (e.g. the mocked download result built for
        # locally-uploaded files when no content type could be guessed), so
        # guard before calling .get() on it.
        headers = download_result['headers'] or {}
        archival.etag = headers.get('etag')
        archival.last_modified = headers.get('last-modified')

    # History: track the success/failure streak across runs.
    if archival.is_broken is False:
        archival.last_success = now
        archival.first_failure = None
        archival.failure_count = 0
    else:
        log.info('First_archival=%r Previous_broken=%r Failure_count=%r' %
                 (first_archival, previous_archival_was_broken,
                  archival.failure_count))
        if first_archival or previous_archival_was_broken is False:
            # i.e. this is the first failure (or the first archival)
            archival.first_failure = now
            archival.failure_count = 1
        else:
            archival.failure_count += 1

    archival.updated = now
    log.info('Archival saved: %r', archival)
    model.repo.commit_and_remove()
Esempio n. 4
0
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.

    :param id: the id of the resource
    :raises ObjectNotFound: if the resource does not exist
    :rtype: dictionary
    '''
    model = context['model']
    session = context['session']
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package
    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id
        }
    # Either record may be missing if the archiver/QA has not yet run for
    # this resource - do not call as_dict() on None.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.

    :param id: the id of the resource
    :raises ObjectNotFound: if the resource does not exist
    :rtype: dictionary
    '''
    model = context['model']
    session = context['session']
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package
    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id
        }
    # get_for_resource() can return None when no archival/QA record exists
    # yet - guard before serialising.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
Esempio n. 6
0
 def test_not_available_any_more(self):
     """Score drops to 0 when the archiver now reports the file is gone.

     A cache of the data still exists from the previous run, but this
     time, the archiver found the file gave a 404.  The previously
     stored QA format ('CSV') should still be reported.
     """
     # The record of the previous (successful) run of QA.
     res = self._test_resource(license_id=None, format=None)
     qa = qa_model.QA.create(res.id)
     qa.format = 'CSV'
     model.Session.add(qa)
     model.Session.commit()
     # cache still exists from the previous run, but this time, the archiver
     # found the file gave a 404.
     archival = Archival.get_for_resource(res.id)
     archival.cache_filepath = __file__  # any file that exists will do as the cache
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 404 error'
     archival.last_success = datetime.datetime(year=2008, month=10, day=1)
     archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
     archival.failure_count = 1
     archival.is_broken = True
     result = resource_score(res)
     assert result['openness_score'] == 0, result
     assert_equal(result['format'], 'CSV')
     # in preference it should report that it is not available
     # NOTE(review): the 'Attempted on 10/10/2008' date presumably comes from
     # archival.updated set by the test fixture - it is not set here; confirm.
     assert_equal(result['openness_score_reason'], 'File could not be downloaded. '
                                                   'Reason: Download error. Error details: Server returned 404 error.'
                                                   ' Attempted on 10/10/2008. This URL last worked on: 01/10/2008.')
Esempio n. 7
0
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is based on (iso string)

    Raises QAError for reasonable errors

    NOTE(review): no return statement is visible in this snippet even though
    the docstring promises a dict - the tail of the function is presumably
    truncated here; confirm against the complete source.
    """
    score = 0
    score_reason = ''
    format_ = None

    register_translator()

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        archival = Archival.get_for_resource(resource_id=resource.id)
        # NOTE(review): this check comes after resource.id has already been
        # dereferenced above, so it can never trigger usefully - confirm intent.
        if not resource:
            raise QAError('Could not find resource "%s"' % resource.id)

        # Scoring strategy, in priority order:
        # 1. broken link => score 0
        score, format_ = score_if_link_broken(archival, resource,
                                              score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the link
            # is only to a landing page, so highest priority is the sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(
                    resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(
                        resource, score_reasons)
                    if score is None:
                        log.warning(
                            'Could not score resource: "%s" with url: "%s"',
                            resource.id, resource.url)
                        score_reasons.append(
                            _('Could not understand the file format, therefore score is 1.'
                              ))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception, e:
        # Log full details, then re-raise so the caller sees the failure.
        log.error(
            'Unexpected error while calculating openness score %s: %s\nException: %s',
            e.__class__.__name__, unicode(e), traceback.format_exc())
        score_reason = _("Unknown error: %s") % str(e)
        raise
class DataPreviewController(BaseController):
    """Serves JSON previews of resource data via a proxy query.

    Pylons-style controller: relies on the framework globals ``c``,
    ``request``, ``response`` and ``config``.
    """
    def index(self, id):
        """Return a JSON (optionally JSONP-wrapped) preview of resource *id*."""
        resource = model.Resource.get(id)
        if not resource or resource.state != 'active':
            abort(404, "Resource not found")

        context = {'model': model, 'session': model.Session, 'user': c.user}
        try:
            check_access("resource_show", context, {'id': resource.id})
        except NotAuthorized, e:
            abort(403, "You are not permitted access to this resource")

        # Default preview size limit: 5242880 bytes = 5 MB.
        size_limit = config.get('ckan.datapreview.limit', 5242880)

        # Prefer the QA-detected format; fall back to the user-supplied one.
        qa = QA.get_for_resource(resource.id)
        format_ = qa.format if qa else None
        log.debug('File format (according to QA): %r' % format_)
        if not format_:
            format_ = resource.format.lower() if resource.format else ''
            log.debug('File format (resource.format): %r' % format_)

        query = dict(type=format_, size_limit=size_limit, length=None)
        archival = Archival.get_for_resource(resource.id)
        if archival and archival.size:
            query['length'] = archival.size

        # Add the extra fields if they are set
        for k in ['max-results', 'encoding', 'type']:
            if k in request.params:
                query[k] = request.params[k]

        # _get_url is defined elsewhere on this controller - presumably
        # returns (url, archived_flag); confirm against the full class.
        url, archived = self._get_url(resource, query)
        query['archived'] = archived
        if url:
            try:
                response.content_type = 'application/json'
                result = proxy_query(resource, url, query)
            except ProxyError as e:
                log.warn("Request {0} failed : {1}".format(
                    identify_resource(resource), e))
                result = _error(title=e.title, message=e.message)
        else:
            result = _error(
                title="Remote resource not downloadable",
                message="Unable to find the remote resource for download")

        # JSONP support: re-uses format_ for the callback name, shadowing
        # the file-format value computed above.
        format_ = request.params.get('callback')
        if format_:
            return "%s(%s)" % (format_, result)

        return result
Esempio n. 9
0
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is based on (iso string)

    Raises QAError for reasonable errors

    NOTE(review): no return statement is visible in this snippet even though
    the docstring promises a dict - the tail of the function is presumably
    truncated here; confirm against the complete source.
    """
    score = 0
    score_reason = ''
    format_ = None

    register_translator()

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        archival = Archival.get_for_resource(resource_id=resource.id)
        # NOTE(review): this check comes after resource.id has already been
        # dereferenced above, so it can never trigger usefully - confirm intent.
        if not resource:
            raise QAError('Could not find resource "%s"' % resource.id)

        # Scoring strategy, in priority order: broken link first.
        score, format_ = score_if_link_broken(archival, resource, score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the link
            # is only to a landing page, so highest priority is the sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(resource, score_reasons)
                    if score is None:
                        log.warning('Could not score resource: "%s" with url: "%s"',
                                    resource.id, resource.url)
                        score_reasons.append(_('Could not understand the file format, therefore score is 1.'))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception, e:
        # Log full details, then re-raise so the caller sees the failure.
        log.error('Unexpected error while calculating openness score %s: %s\nException: %s',
                  e.__class__.__name__,  unicode(e), traceback.format_exc())
        score_reason = _("Unknown error: %s") % str(e)
        raise
Esempio n. 10
0
 def test_not_available_and_not_open(self):
     """Score is 0 (and format None) for a never-archived, repeatedly broken URL."""
     res = self._test_resource(license_id=None, format=None, cached=False)
     archival = Archival.get_for_resource(res.id)
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 500 error'
     archival.last_success = None  # this URL has never worked
     archival.first_failure = datetime.datetime(year=2008, month=10, day=1, hour=6, minute=30)
     archival.failure_count = 16
     archival.is_broken = True
     model.Session.commit()
     result = resource_score(res, log)
     assert result['openness_score'] == 0, result
     assert_equal(result['format'], None)
     # in preference it should report that it is not available
     # NOTE(review): the 'Attempted on 10/10/2008' date presumably comes from
     # archival.updated set by the test fixture - it is not set here; confirm.
     assert_equal(result['openness_score_reason'], 'File could not be downloaded. Reason: Download error. Error details: Server returned 500 error. Attempted on 10/10/2008. Tried 16 times since 01/10/2008. This URL has not worked in the history of this tool.')
Esempio n. 11
0
 def test_not_available_and_not_open(self):
     """Score is 0 (and format None) for a never-archived, repeatedly broken URL."""
     res = self._test_resource(license_id=None, format=None, cached=False)
     archival = Archival.get_for_resource(res.id)
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 500 error'
     archival.last_success = None  # this URL has never worked
     archival.first_failure = datetime.datetime(year=2008, month=10, day=1, hour=6, minute=30)
     archival.failure_count = 16
     archival.is_broken = True
     model.Session.commit()
     result = resource_score(res, log)
     assert result['openness_score'] == 0, result
     assert_equal(result['format'], None)
     # in preference it should report that it is not available
     # NOTE(review): the 'Attempted on 10/10/2008' date presumably comes from
     # archival.updated set by the test fixture - it is not set here; confirm.
     assert_equal(result['openness_score_reason'], 'File could not be downloaded. Reason: Download error. Error details: Server returned 500 error. Attempted on 10/10/2008. Tried 16 times since 01/10/2008. This URL has not worked in the history of this tool.')
Esempio n. 12
0
def archiver_resource_show(context, data_dict=None):
    '''Return the details of the archival of a resource

    :param id: the id of the resource
    :type id: string

    :rtype: dictionary
    '''
    # Authorize before touching any data, so unauthorized callers can
    # neither trigger the lookup nor probe which ids exist via the
    # ObjectNotFound/NotAuthorized distinction.
    p.toolkit.check_access('archiver_resource_show', context, data_dict)
    id_ = _get_or_bust(data_dict, 'id')
    archival = Archival.get_for_resource(id_)
    if archival is None:
        raise ObjectNotFound
    return archival.as_dict()
Esempio n. 13
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource to archive
      queue - name of the celery queue
      log - logger to report progress to

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """

    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    # Helper shared by every exit path: persist the outcome of this attempt
    # and broadcast a notification for the resource.
    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    # Make relative URLs absolute against the site URL.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        # Locally-uploaded file: hash it in place instead of re-downloading
        # over HTTP, then record a mocked-up download/archive result.
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s" %
                     resource['url'])

            try:
                hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            # Guess the content type from the URL; headers stay None when
            # no type can be guessed.
            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }

            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext-archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }

    # Map each known download failure onto a Status; try_as_api marks the
    # failures that are worth retrying as an API request.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except DownloadException as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except DownloadError as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # NOTE(review): err.args is normally a tuple, so both the
            # membership test and the attribute access below look suspect -
            # confirm the exception classes give args mapping-like behaviour.
            extra_args = [err.args.url_redirected_to
                          ] if 'url_redirected_to' in err.args else []
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
Esempio n. 14
0
 def assert_archival_error(self, error_message_fragment, resource_id):
     """Assert the resource's archival failure reason contains the fragment.

     Prints the full reason/status for debugging before raising.
     """
     archival = Archival.get_for_resource(resource_id)
     if error_message_fragment not in archival.reason:
         # Single-argument print() form works under both Python 2 and 3;
         # the rest of the file already mixes py2/py3 syntax.
         print('ERROR: %s (%s)' % (archival.reason, archival.status))
         raise AssertionError(archival.reason)
Esempio n. 15
0
            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'], download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock, **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
        }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
    except DownloadException, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except DownloadError, e:
        download_status_id = Status.by_text('Download error')
Esempio n. 16
0
        "license": LICENSE_LOOKUP.get(pkg["license_id"], ""),
        "resources": [],
    }

    if pkg["notes"]:
        datapackage["description"] = pkg["notes"]

    try:
        package_zip = PackageZip.get_for_package(pkg["id"])
        datapackage["filepath"] = package_zip.filepath
    except Exception, ex:
        pass

    fd = FilenameDeduplicator()
    for res in pkg["resources"]:
        archival = Archival.get_for_resource(res["id"])
        if archival and archival.cache_filepath:
            # We have archived it, and we have a path.
            _, resource_id, filename = archival.cache_filepath.rsplit("/", 2)
            cache_filepath = archival.cache_filepath
        else:
            # Try and work out the filename from the URL.
            try:
                _, filename = res["url"].rsplit("/", 1)
            except ValueError:
                filename = res["id"]
            cache_filepath = ""

        filename = fd.deduplicate(filename)
        resource_json = {
            "url": res["url"],
Esempio n. 17
0
        'license': LICENSE_LOOKUP.get(pkg['license_id'], ''),
        'resources': [],
    }

    if pkg['notes']:
        datapackage['description'] = pkg['notes']

    try:
        package_zip = PackageZip.get_for_package(pkg['id'])
        datapackage['filepath'] = package_zip.filepath
    except Exception, ex:
        pass

    fd = FilenameDeduplicator()
    for res in pkg['resources']:
        archival = Archival.get_for_resource(res['id'])
        if archival and archival.cache_filepath:
            # We have archived it, and we have a path.
            _, resource_id, filename = archival.cache_filepath.rsplit('/', 2)
            cache_filepath = archival.cache_filepath
        else:
            # Try and work out the filename from the URL.
            try:
                _, filename = res['url'].rsplit('/', 1)
            except ValueError:
                filename = res['id']
            cache_filepath = ''

        filename = fd.deduplicate(filename)
        resource_json = {
            'url': res['url'],
Esempio n. 18
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext.archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
Esempio n. 19
0
def _update_resource(ckan_ini_filepath, resource_id, queue):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      ckan_ini_filepath - path to the CKAN config file (used to load config)
      resource_id - id of the resource to link-check and archive
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # Use the celery task's logger so log lines are attributed to the task.
    log = update_resource.get_logger()

    load_config(ckan_ini_filepath)
    register_translator()

    # Imported after load_config so the CKAN environment is set up first.
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    # ignore_auth: this runs as a background task with no user session.
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Record the outcome in the archival table, then broadcast a
        # notification carrying the cached filename when one was produced.
        reason = '%s' % exception
        save_archival(resource, status_id,
                      reason, url_redirected_to,
                      download_result, archive_result,
                      log)
        notify_resource(
            resource,
            queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    from ckanext.archiver.model import Status, Archival
    download_status_id = Status.by_text('Archived successfully')
    # NOTE(review): another copy of this code in this file reads the dotted
    # key 'ckanext.archiver.cache_url_root' - confirm which config key the
    # deployment actually sets.
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
        }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        # Not an error: content is identical to the previous archival, so
        # there is nothing new to archive.
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
Esempio n. 20
0
 def assert_archival_error(self, error_message_fragment, resource_id):
     """Assert that the resource's archival failure reason contains the
     given fragment.

     Raises AssertionError (including both the expected fragment and the
     actual reason, for a useful failure message) when it does not.
     """
     archival = Archival.get_for_resource(resource_id)
     if error_message_fragment not in archival.reason:
         # Include the expected fragment in the failure message, matching
         # the sibling implementation of this helper elsewhere in the file.
         print('ERROR: %s (%s)' % (archival.reason, archival.status))
         raise AssertionError("Expected error containing: {}, but was: {}".format(error_message_fragment, archival.reason))
def migrate(options):
    """Migrate archival data from the legacy TaskStatus table into the
    dedicated Archival table.

    For each active resource (filtered by the options), details are
    gathered from its 'archiver' TaskStatus row and the resource columns,
    then written (only if options.write is set) to a new or existing
    Archival row.  A stats summary is printed at the end.
    """
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            # The TaskStatus 'error' column holds a JSON blob of details.
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row.  If the resource carries no archive-related
            # data either, there is nothing to migrate.
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Derive created/resource_timestamp from the revision history,
        # using only revisions where a content hash was recorded.
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            # No hashed revisions: fall back to the earliest/latest known
            # archival dates (the sentinels stop None values winning).
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Esempio n. 22
0
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:

        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is based on (iso string)

    Raises QAError for reasonable errors
    """
    score = 0
    score_reason = ''
    format_ = None

    try:
        register_translator()
    except ImportError:
        # if we can't import Pylons, we don't need to
        pass

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        # BUG FIX: check the resource exists *before* dereferencing
        # resource.id - previously the guard came after
        # Archival.get_for_resource(resource_id=resource.id), so a missing
        # resource crashed with AttributeError instead of raising QAError.
        if not resource:
            raise QAError('Could not find resource "%s"' % resource)
        archival = Archival.get_for_resource(resource_id=resource.id)

        # A broken link trumps everything else.
        score, format_ = score_if_link_broken(archival, resource, score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the link
            # is only to a landing page, so highest priority is the sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(resource, score_reasons)
                    if score is None:
                        log.warning('Could not score resource: "%s" with url: "%s"',
                                    resource.id, resource.url)
                        score_reasons.append(_('Could not understand the file format, therefore score is 1.'))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception as e:
        log.error('Unexpected error while calculating openness score %s: %s\nException: %s',
                  e.__class__.__name__, e, traceback.format_exc())
        score_reason = _("Unknown error: %s") % e
        raise

    # Even if we can get the link, we should still treat the resource
    # as having a score of 0 if the license isn't open.
    #
    # It is important we do this check after the link check, otherwise
    # the link checker won't get the chance to see if the resource
    # is broken.
    if toolkit.check_ckan_version(max_version='2.2.99'):
        package = resource.resource_group.package
    else:
        package = resource.package
    if score > 0 and not package.isopen():
        score_reason = _('License not open')
        score = 0

    log.info('Score: %s Reason: %s', score, score_reason)

    archival_updated = archival.updated.isoformat() \
        if archival and archival.updated else None
    result = {
        'openness_score': score,
        'openness_score_reason': score_reason,
        'format': format_,
        'archival_timestamp': archival_updated
    }

    # Allow a plugin hook to override the computed result.
    custom_result = custom_resource_score(resource, result)

    return custom_result or result
    def _get_url(self, resource, query):
        '''
        Given a resource, return the URL for the data and a flag denoting whether
        the URL is to a local file (and therefore can ignore size limit checks.)

        This allows a local cache to be used in preference to the
        resource.url.

        If we are going to use an external URL, then we can do a HEAD request
        to check it works and record the mimetype & length in the query dict.

        :param resource: resource object
        :param query: dict describing the properties of the data;
            'mimetype' (and, on a successful HEAD, 'length') are set here
        :returns: presumably (url, archived) - the only return visible here
            is the (None, False) error path; confirm against the full method
        '''
        from requests.exceptions import InvalidURL

        url = None
        archived = False
        query['mimetype'] = None
        archival = Archival.get_for_resource(resource.id)

        if archival:
            # Look for a local cache of the data file
            # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
            if archival.cache_filepath:
                if os.path.exists(archival.cache_filepath.encode('utf8')):
                    log.debug('Previewing local cached data: %s',
                              archival.cache_filepath)
                    url = archival.cache_filepath
                    archived = True
                else:
                    log.debug('Local cached data file missing: %s',
                              archival.cache_filepath)
            else:
                log.debug('No cache_filepath for resource %s',
                          identify_resource(resource))

            # Otherwise try the cache_url
            # This works well when running on a database copied from another
            # machine - all the cached files are missing locally, but it can use
            # them from the original machine using the cache_url.
            if not url:
                if archival.cache_url:
                    try:
                        u = fix_url(archival.cache_url)
                    except InvalidURL:
                        log.error("Unable to fix the URL for resource: %s" %
                                  identify_resource(resource))
                        return None, False

                    # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                    try:
                        # HEAD request: check reachability and capture
                        # metadata without downloading the body.
                        req = urllib2.Request(u)
                        req.get_method = lambda: 'HEAD'

                        r = urllib2.urlopen(req)
                        if r.getcode() == 200:
                            url = u
                            query['length'] = r.info().get("content-length", 0)
                            query['mimetype'] = r.info().get(
                                'content-type', None)
                            log.debug('Previewing cache URL: %s', url)
                    except Exception, e:
                        log.error(
                            u"Request {0} with cache url {1}, {2}".format(
                                identify_resource(resource), u, e))
                else:
                    log.debug('No cache_url for resource %s',
                              identify_resource(resource))
Esempio n. 24
0
 def is_res_broken(resource):
     """Return the archival broken-link flag for *resource*, or None when
     no archival record exists for it."""
     archival = Archival.get_for_resource(resource.id)
     return archival.is_broken if archival else None
Esempio n. 25
0
    def _get_url(self, resource, query):
        '''
        Given a resource, return the URL for the data and a flag denoting whether
        the URL is to a local file (and therefore can ignore size limit checks.)

        This allows a local cache to be used in preference to the
        resource.url.

        If we are going to use an external URL, then we can do a HEAD request
        to check it works and record the mimetype & length in the query dict.

        :param resource: resource object
        :param query: dict describing the properties of the data
        '''
        from requests.exceptions import InvalidURL

        url = None
        archived = False
        query['mimetype'] = None
        archival = Archival.get_for_resource(resource.id)

        if archival:
            # First preference: a cached copy of the data on local disk,
            # e.g. "cache_filepath":
            # "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
            if archival.cache_filepath:
                if os.path.exists(archival.cache_filepath.encode('utf8')):
                    log.debug('Previewing local cached data: %s',
                              archival.cache_filepath)
                    url = archival.cache_filepath
                    archived = True
                else:
                    log.debug('Local cached data file missing: %s',
                              archival.cache_filepath)
            else:
                log.debug('No cache_filepath for resource %s',
                          identify_resource(resource))

            # Second preference: the remote cache_url.  Handy when running
            # against a database copied from another machine - local cache
            # files are absent, but the originals are still reachable.
            if not url:
                if archival.cache_url:
                    try:
                        fixed_url = fix_url(archival.cache_url)
                    except InvalidURL:
                        log.error("Unable to fix the URL for resource: %s" %
                                  identify_resource(resource))
                        return None, False

                    # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                    try:
                        # HEAD request: verify reachability and record
                        # metadata without fetching the body.
                        head_req = urllib2.Request(fixed_url)
                        head_req.get_method = lambda: 'HEAD'

                        response = urllib2.urlopen(head_req)
                        if response.getcode() == 200:
                            url = fixed_url
                            query['length'] = response.info().get(
                                "content-length", 0)
                            query['mimetype'] = response.info().get(
                                'content-type', None)
                            log.debug('Previewing cache URL: %s', url)
                    except Exception as e:
                        log.error(
                            u"Request {0} with cache url {1}, {2}".format(
                                identify_resource(resource), fixed_url, e))
                else:
                    log.debug('No cache_url for resource %s',
                              identify_resource(resource))
def migrate(options):
    """Migrate QA results from the legacy TaskStatus table into the
    dedicated QA table.

    For each active resource (filtered by the options), the legacy 'qa'
    TaskStatus row is read and, only if options.write is set, copied into a
    new or updated QA row.  A stats summary is printed at the end.
    """
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='qa')\
                                    .filter_by(key='status')\
                                    .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        # The TaskStatus 'error' column holds a JSON blob of QA details.
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the revision
        # time, so some timezone nonsense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update but we only know when the latest archival was. So
        # if it was archived before the QA update then we know that was the
        # archival, otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        # NOTE(review): res_rev can be None when no revision predates
        # get_resource_as_at, which would raise AttributeError on the next
        # line - confirm whether that can occur in practice.
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the QA table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Esempio n. 27
0
 def assert_archival_error(self, error_message_fragment, resource_id):
     """Assert that the resource's archival failure reason contains the
     given fragment; raise AssertionError with full context otherwise."""
     archival = Archival.get_for_resource(resource_id)
     if error_message_fragment in archival.reason:
         return
     print('ERROR: %s (%s)' % (archival.reason, archival.status))
     raise AssertionError("Expected error containing: {}, but was: {}".format(error_message_fragment, archival.reason))
Esempio n. 28
0
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to', 'updated',
                               'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect by not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME, fields['first_failure']
                or START_OF_TIME, fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res,
                         stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Esempio n. 29
0
            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext-archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
    except DownloadException, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except DownloadError, e:
        download_status_id = Status.by_text('Download error')