Example #1
0
 def test_not_available_any_more(self):
     """Archiver hit a 404 this run; a cache from an earlier run remains."""
     # The previous (successful) QA run recorded the format as CSV.
     resource = self._test_resource(license_id=None, format=None)
     qa = qa_model.QA.create(resource.id)
     qa.format = 'CSV'
     model.Session.add(qa)
     model.Session.commit()
     # The cache file still exists from the previous run, but this time the
     # archiver found that the file gave a 404.
     archival = Archival.get_for_resource(resource.id)
     archival.cache_filepath = __file__
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 404 error'
     archival.last_success = datetime.datetime(year=2008, month=10, day=1)
     archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
     archival.failure_count = 1
     archival.is_broken = True

     result = resource_score(resource)

     assert result['openness_score'] == 0, result
     assert_equal(result['format'], 'CSV')
     # The unavailability should take precedence in the reported reason.
     expected_reason = (
         'File could not be downloaded. '
         'Reason: Download error. Error details: Server returned 404 error.'
         ' Attempted on 10/10/2008. This URL last worked on: 01/10/2008.')
     assert_equal(result['openness_score_reason'], expected_reason)
Example #2
0
 def test_not_available_any_more(self):
     """A 404 on the latest archiver run trumps the cached copy."""
     # Seed the QA table with the result of the earlier, successful run.
     res = self._test_resource(license_id=None, format=None)
     previous_qa = qa_model.QA.create(res.id)
     previous_qa.format = 'CSV'
     model.Session.add(previous_qa)
     model.Session.commit()
     # The cached file is still on disk, yet the archiver now reports that
     # downloading the URL gave a 404.
     archival = Archival.get_for_resource(res.id)
     archival.cache_filepath = __file__
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 404 error'
     archival.last_success = datetime.datetime(year=2008, month=10, day=1)
     archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
     archival.failure_count = 1
     archival.is_broken = True

     result = resource_score(res, log)

     assert result['openness_score'] == 0, result
     assert_equal(result['format'], 'CSV')
     # Unavailability should be reported in preference to anything else.
     assert_equal(
         result['openness_score_reason'],
         'File could not be downloaded. Reason: Download error. '
         'Error details: Server returned 404 error. '
         'Attempted on 10/10/2008. This URL last worked on: 01/10/2008.')
Example #3
0
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Looks inside a data file's contents to determine its format and score.

    It adds strings to score_reasons list about how it came to the conclusion.

    :param archival: Archival record for the resource (may be None)
    :param resource: the resource being scored
    :param score_reasons: list of reason strings, appended to in place

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    delete_file = False
    if not os.path.exists(filepath):
        # The cached file is not on this machine's disk - fetch a temporary
        # copy from the archiver's cache URL so it can still be sniffed.
        log.debug("%s not found on disk, retrieving from URL %s",
                  filepath, archival.cache_url)
        try:
            filepath = _download_url(archival.cache_url).name
            delete_file = True
        except Exception as e:
            score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e)
            return (None, None)

    if filepath:
        try:
            sniffed_format = sniff_format.sniff_file_format(filepath)
        finally:
            # Always clean up the temporary download, even if sniffing raised.
            if delete_file:
                try:
                    os.remove(filepath)
                except OSError as e:
                    # Fix: Logger.warn() is a deprecated alias - use warning().
                    log.warning("Unable to remove temporary file %s: %s", filepath, e)
        score = lib.resource_format_scores().get(sniffed_format['format']) \
            if sniffed_format else None
        if sniffed_format:
            score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.')
                                 % (sniffed_format['format'], score))
            return score, sniffed_format['format']
        else:
            score_reasons.append(_('The format of the file was not recognized from its contents.'))
            return (None, None)
    else:
        # No cache_url
        # NOTE(review): this branch is unreachable - the guard at the top of
        # the function already returned when archival.cache_filepath was
        # falsy. Kept for parity with other versions of this function.
        if archival.status_id == Status.by_text('Chose not to download'):
            score_reasons.append(_('File was not downloaded deliberately') + '. '
                                 + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
            return (None, None)
        elif archival.is_broken is None and archival.status_id:
            # i.e. 'Download failure' or 'System error during archival'
            score_reasons.append(_('A system error occurred during downloading this file') + '. '
                                 + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
            return (None, None)
        else:
            score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
            return (None, None)
Example #4
0
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Inspects the cached data file's contents to work out its format and the
    corresponding openness score.

    Appends human-readable reasons to score_reasons as it goes.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    not_downloaded = _('This file had not been downloaded at the time of scoring it.')
    if not archival or not archival.cache_filepath:
        score_reasons.append(not_downloaded)
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(_('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)
    if filepath:
        sniffed = sniff_file_format(filepath)
        if sniffed:
            fmt = sniffed['format']
            score = lib.resource_format_scores().get(fmt)
            score_reasons.append(
                _('Content of file appeared to be format "%s" which receives openness score: %s.')
                % (fmt, score))
            return score, fmt
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)
    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            _('File was not downloaded deliberately') + '. ' + _('Reason')
            + ': %s. ' % archival.reason
            + _('Using other methods to determine file openness.'))
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            _('A system error occurred during downloading this file') + '. '
            + _('Reason') + ': %s. ' % archival.reason
            + _('Using other methods to determine file openness.'))
        return (None, None)
    score_reasons.append(not_downloaded)
    return (None, None)
Example #5
0
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Determine the file's format and score by examining its cached contents.

    Appends explanatory strings to score_reasons along the way.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    """
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            "This file had not been downloaded at the time of scoring it.")
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Cache filepath does not exist: "%s".' % filepath)
        return (None, None)
    if filepath:
        detected = sniff_file_format(filepath, log)
        if not detected:
            score_reasons.append(
                "The format of the file was not recognized from its contents.")
            return (None, None)
        fmt_name = detected["format"]
        fmt_score = lib.resource_format_scores().get(fmt_name)
        score_reasons.append(
            'Content of file appeared to be format "%s" which receives '
            "openness score: %s." % (fmt_name, fmt_score))
        return fmt_score, fmt_name
    # No cache_url
    if archival.status_id == Status.by_text("Chose not to download"):
        score_reasons.append(
            "File was not downloaded deliberately. Reason: %s. Using other "
            "methods to determine file openness." % archival.reason)
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            "A system error occurred during downloading this file. Reason: "
            "%s. Using other methods to determine file openness."
            % archival.reason)
        return (None, None)
    score_reasons.append(
        "This file had not been downloaded at the time of scoring it.")
    return (None, None)
Example #6
0
 def test_not_available_and_not_open(self):
     """An uncached resource whose every download attempt has failed."""
     res = self._test_resource(license_id=None, format=None, cached=False)
     # Record a long history of download failures with no success ever.
     archival = Archival.get_for_resource(res.id)
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 500 error'
     archival.last_success = None
     archival.first_failure = datetime.datetime(
         year=2008, month=10, day=1, hour=6, minute=30)
     archival.failure_count = 16
     archival.is_broken = True
     model.Session.commit()

     result = resource_score(res, log)

     assert result['openness_score'] == 0, result
     assert_equal(result['format'], None)
     # Unavailability is reported in preference to other reasons.
     expected = (
         'File could not be downloaded. Reason: Download error. '
         'Error details: Server returned 500 error. Attempted on 10/10/2008. '
         'Tried 16 times since 01/10/2008. '
         'This URL has not worked in the history of this tool.')
     assert_equal(result['openness_score_reason'], expected)
Example #7
0
 def test_not_available_and_not_open(self):
     """An uncached, persistently failing resource scores zero."""
     resource = self._test_resource(license_id=None, format=None, cached=False)
     archival = Archival.get_for_resource(resource.id)
     # 16 failed attempts, never a successful download.
     archival.status_id = Status.by_text('Download error')
     archival.reason = 'Server returned 500 error'
     archival.last_success = None
     archival.first_failure = datetime.datetime(year=2008, month=10, day=1,
                                                hour=6, minute=30)
     archival.failure_count = 16
     archival.is_broken = True
     model.Session.commit()
     result = resource_score(resource, log)
     assert result['openness_score'] == 0, result
     assert_equal(result['format'], None)
     # in preference it should report that it is not available
     assert_equal(
         result['openness_score_reason'],
         'File could not be downloaded. Reason: Download error. Error details: '
         'Server returned 500 error. Attempted on 10/10/2008. Tried 16 times '
         'since 01/10/2008. This URL has not worked in the history of this tool.')
Example #8
0
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Examines the cached data file's contents to identify its format and the
    openness score that format earns.

    Appends user-facing reason strings (Croatian) to the score_reasons list.

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    no_download_msg = 'Datoteka nije preuzeta u vrijeme ocijenjivanja.'
    if not archival or not archival.cache_filepath:
        score_reasons.append(no_download_msg)
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".'
                             % filepath)
        return (None, None)
    if filepath:
        detected = sniff_file_format(filepath, log)
        if not detected:
            score_reasons.append('Format je nepoznat.')
            return (None, None)
        score_reasons.append(
            'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.'
            % (detected['display_name'], detected['openness']))
        return detected['openness'], detected['display_name']
    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            'Datoteka nije preuzeta namjerno. Razlog: %s.' % archival.reason)
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            'Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.'
            % archival.reason)
        return (None, None)
    score_reasons.append(no_download_msg)
    return (None, None)
Example #9
0
File: tasks.py Project: tbalaz/test
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Works out a format and openness score by looking inside the cached data
    file.

    Reason strings (Croatian, user-facing) are appended to score_reasons.

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    undetermined = (None, None)
    if not archival or not archival.cache_filepath:
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return undetermined
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % filepath)
        return undetermined
    if filepath:
        fmt = sniff_file_format(filepath, log)
        if fmt:
            score_reasons.append(
                'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.'
                % (fmt['display_name'], fmt['openness']))
            return fmt['openness'], fmt['display_name']
        score_reasons.append('Format je nepoznat.')
        return undetermined
    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            'Datoteka nije preuzeta namjerno. Razlog: %s.' % archival.reason)
        return undetermined
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            'Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.'
            % archival.reason)
        return undetermined
    score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
    return undetermined
Example #10
0
    def test_trigger_on_archival(cls):
        """A successful archival notification should queue a QA task."""
        # Create a package with a single CSV resource.
        context = {'model': model, 'ignore_auth': True,
                   'session': model.Session, 'user': '******'}
        pkg = {
            'name': 'testpkg',
            'owner_org': _test_org().id,
            'license_id': 'uk-ogl',
            'resources': [
                {'url': 'http://test.com/', 'format': 'CSV',
                 'description': 'Test'},
            ],
        }
        pkg = get_action('package_create')(context, pkg)
        res_id = pkg['resources'][0]['id']
        # Record a successful archival for that resource.
        archival = Archival.create(res_id)
        archival.cache_filepath = __file__  # just needs to exist
        archival.updated = TODAY
        archival.status_id = Status.by_text('Archived successfully')
        model.Session.add(archival)
        model.Session.commit()
        # TODO show that QA hasn't run yet

        # Fire the archiver's notification, which ckanext-qa picks up and
        # turns into a task on the queue.
        ckanext.archiver.tasks.notify_package(pkg, 'priority')
Example #11
0
def _test_resource(url='anything', format='TXT', archived=True, cached=True,
                   license_id='uk-ogl'):
    """Create a dataset with one resource and, optionally, an archival record.

    Returns the Resource model object.
    """
    dataset_dict = {
        'owner_org': _test_org().id,
        'license_id': license_id,
        'resources': [
            {'url': url, 'format': format, 'description': 'Test'},
        ],
    }
    pkg = ckan_factories.Dataset(**dataset_dict)
    res_id = pkg['resources'][0]['id']
    if archived:
        archival = Archival.create(res_id)
        # __file__ is used simply because it is a file known to exist.
        archival.cache_filepath = __file__ if cached else None
        archival.updated = TODAY
        archival.status_id = Status.by_text('Archived successfully')
        model.Session.add(archival)
        model.Session.commit()
    return model.Resource.get(res_id)
Example #12
0
def _update_resource(ckan_ini_filepath, resource_id, queue):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      ckan_ini_filepath - path to the CKAN ini file, used to load config
      resource_id - id of the resource to check and archive
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): Python 2 only ('except X, e' syntax below). This block
    # also appears truncated - the try/except chain at the end is incomplete.
    log = update_resource.get_logger()

    load_config(ckan_ini_filepath)
    register_translator()

    # Imports deferred until after the CKAN config has been loaded.
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    # Ensure the directory that archived files are written into exists.
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Record the outcome in the archival table, then broadcast it.
        reason = '%s' % exception
        save_archival(resource, status_id,
                      reason, url_redirected_to,
                      download_result, archive_result,
                      log)
        notify_resource(
            resource,
            queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    from ckanext.archiver.model import Status
    # Assume success until a download exception says otherwise.
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        }
    try:
        download_result = download(context, resource)
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
Example #13
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource to check and archive
      queue - name of the celery queue
      log - logger that progress and errors are reported to

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """

    # Imports deferred so they happen after CKAN is configured.
    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    # Ensure the directory that archived files are written into exists.
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        # Record the outcome in the archival table, then broadcast it.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    # Make a relative URL absolute against the site URL.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        # Locally uploaded file: read it straight from disk rather than
        # downloading it over HTTP.
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            # NOTE(review): "attemp" typo in this log message.
            log.info("Won't attemp to archive resource uploaded locally: %s" %
                     resource['url'])

            try:
                hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            # Guess the mimetype from the URL's file extension.
            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            # Fabricate results equivalent to a real download/archive so the
            # same _save path can be used.
            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }

            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    # Assume success until a download exception says otherwise.
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext-archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }

    # Map each failure mode to a status; try_as_api marks the errors worth
    # retrying as an API request (e.g. WMS) below.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except DownloadException as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except DownloadError as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        # Unexpected failure: record it and give up (re-raise when debugging).
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # NOTE(review): 'err.args.url_redirected_to' reads an attribute
            # off a tuple ('args' is normally a tuple) - looks suspect;
            # confirm the exception class used here defines args specially.
            extra_args = [err.args.url_redirected_to
                          ] if 'url_redirected_to' in err.args else []
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
Example #14
0
def _update(ckan_ini_filepath, resource_id, queue):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      ckan_ini_filepath - path to the CKAN ini file, used to load config
      resource_id - id of the resource to check and archive
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): Python 2 only ('except X, e' syntax below). This block
    # also appears truncated - the try/except chain at the end is incomplete.
    log = update.get_logger()

    load_config(ckan_ini_filepath)
    register_translator()

    # Imports deferred until after the CKAN config has been loaded.
    from ckan import model
    from ckan.logic import get_action
    from pylons import config

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    # Ensure the directory that archived files are written into exists.
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        # Record the outcome in the archival table, then broadcast it.
        reason = '%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    from ckanext.archiver.model import Status
    # Assume success until a download exception says otherwise.
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckan.cache_url_root'),
    }
    try:
        download_result = download(context, resource)
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
Example #15
0
    # NOTE(review): fragment - this code starts mid-function (names such as
    # 'resource', 'log', 'config' and '_save' are defined earlier, outside
    # this view) and is cut off after the final 'if'. Python 2 'except X, e'
    # syntax throughout.
    download_result = None
    from ckanext.archiver.model import Status
    # Assume success until a download exception says otherwise.
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckan.cache_url_root'),
    }
    # Map each failure mode to a status; try_as_api marks errors worth
    # retrying as an API request later.
    try:
        download_result = download(context, resource)
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
    except DownloadException, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except DownloadError, e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
    except ChooseNotToDownload, e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
    except Exception, e:
        # Unexpected failure: record it and give up (re-raise when debugging).
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id):
Example #16
0
def _update_resource(ckan_ini_filepath, resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    load_config(ckan_ini_filepath)

    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        reason = u'%s' % exception
        save_archival(resource, status_id,
                      reason, url_redirected_to,
                      download_result, archive_result,
                      log)
        notify_resource(
            resource,
            queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        hosted_externally = not url.startswith(config['ckan.site_url']) or urlparse.urlparse(filepath).scheme is not ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s" % resource['url'])

            try:
                hash, length = _file_hashnlength(filepath)
            except IOError, e:
                log.error('Error while accessing local resource %s: %s', filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            download_result_mock = {'mimetype': mimetype,
                                    'size': length,
                                    'hash': hash,
                                    'headers': headers,
                                    'saved_file': filepath,
                                    'url_redirected_to': url,
                                    'request_type': 'GET'}

            archive_result_mock = {'cache_filepath': filepath,
                                   'cache_url': url}

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'], download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock, **archive_result_mock))
Example #17
0
                                    'request_type': 'GET'}

            archive_result_mock = {'cache_filepath': filepath,
                                   'cache_url': url}

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'], download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock, **archive_result_mock))
            # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
        }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
    except DownloadException, e:
Example #18
0
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource to link-check/archive
      queue - name of the celery queue
      log - logger

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    # Make sure the directory we archive into exists.
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id,
              exception,
              resource,
              url_redirected_to=None,
              download_result=None,
              archive_result=None):
        # Record the outcome in the archival table and notify listeners.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    # Assume success until an exception from download() says otherwise.
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
        config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root':
        config.get('ckanext.archiver.cache_url_root'),
        'previous':
        Archival.get_for_resource(resource_id)
    }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        # Content unchanged since the previous archival; no re-archive needed.
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
def migrate(options):
    """Migrate archival data from the legacy TaskStatus rows (and the
    Resource record itself) into the Archival table.

    Params:
      options - parsed command-line options; .publisher/.resource/.dataset
                select which resources to process, and .write controls
                whether changes are committed (otherwise it is a dry run).
    """
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            # The old archiver stored its result details as a JSON blob in
            # the TaskStatus.error column.
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row. If the Resource carries no archive-related
            # data either, there is nothing to migrate.
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to', 'updated',
                               'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Estimate created/resource_timestamp from revision history where a
        # hash was recorded.
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect by not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            # Fall back to the earliest/latest of the known timestamps.
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME, fields['first_failure']
                or START_OF_TIME, fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res,
                         stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect by not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'