except DownloadError, e: download_status_id = Status.by_text('Download error') try_as_api = True except ChooseNotToDownload, e: download_status_id = Status.by_text('Chose not to download') try_as_api = False except Exception, e: if os.environ.get('DEBUG'): raise log.error('Uncaught download failure: %r, %r', e, e.args) _save(Status.by_text('Download failure'), e, resource) return if not Status.is_ok(download_status_id): log.info('GET error: %s - %r, %r "%s"', Status.by_id(download_status_id), e, e.args, resource.get('url')) if try_as_api: download_result = api_request(context, resource) if download_result: download_status_id = Status.by_text('Archived successfully') # else the download_status_id (i.e. an error) is left what it was # from the previous download (i.e. not when we tried it as an API) if not try_as_api or not Status.is_ok(download_status_id): extra_args = [e.url_redirected_to ] if 'url_redirected_to' in e else [] _save(download_status_id, e, resource, *extra_args) return
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.

    The resource is looked up with the ``resource_show`` action. Every
    outcome that is recorded goes through the local ``_save`` helper, which
    writes the archival record (``save_archival``) and then broadcasts a
    notification (``notify_resource``).

    Params:
      resource_id - id of the resource to archive (must satisfy ``is_id``)
      queue - name of the celery queue, forwarded to ``notify_resource``
      log - logger used for progress and error messages

    Intended to raise only on a fundamental error (the original notes name
    ArchiverError and CkanError). An unexpected exception during download is
    logged and saved as 'Download failure', unless the ``DEBUG`` environment
    variable is set, in which case it is re-raised.

    Returns a JSON string (``json.dumps``) of the download result merged
    with the archive result when the resource was archived, or when it is a
    locally hosted upload that was hashed in place. The return value is only
    used by tests.

    Returns None when the download or archival failed (the failure has been
    saved) or when no archival was needed.
    """
    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival

    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the outcome, then tell listeners about it. The exception
        # (or '' on success) is stringified to become the stored reason.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    # Relative URLs are resolved against the site URL.
    url = resource['url']
    if not url.startswith('http'):
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        # An upload counts as external when its URL is not under the site
        # URL, or when the uploader's path carries a URL scheme.
        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s"
                     % resource['url'])

            try:
                file_hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)

                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            # Build stand-ins for the download/archive results so the local
            # file is recorded the same way as a downloaded one.
            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': file_hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }

            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }

            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)

            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
        # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally')
        or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
    }

    # The caught exception is copied to ``err`` because Python 3 unbinds the
    # ``as`` name when the except clause ends.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except (DownloadException, DownloadError) as e:
        # Both are recorded identically and are worth retrying as an API.
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # The redirect target, when known, is an attribute of the
            # exception itself. ``err.args`` is a plain tuple, so testing it
            # for the attribute name never found the URL. Passing None is
            # the same as omitting the argument (it is _save's default).
            url_redirected_to = getattr(err, 'url_redirected_to', None)
            _save(download_status_id, err, resource, url_redirected_to)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
except DownloadError, e: download_status_id = Status.by_text('Download error') try_as_api = True except ChooseNotToDownload, e: download_status_id = Status.by_text('Chose not to download') try_as_api = False except Exception, e: if os.environ.get('DEBUG'): raise log.error('Uncaught download failure: %r, %r', e, e.args) _save(Status.by_text('Download failure'), e, resource) return if not Status.is_ok(download_status_id): log.info('GET error: %s - %r, %r "%s"', Status.by_id(download_status_id), e, e.args, resource.get('url')) if try_as_api: download_result = api_request(context, resource) if download_result: download_status_id = Status.by_text('Archived successfully') # else the download_status_id (i.e. an error) is left what it was # from the previous download (i.e. not when we tried it as an API) if not try_as_api or not Status.is_ok(download_status_id): extra_args = [e.url_redirected_to] if 'url_redirected_to' in e else [] _save(download_status_id, e, resource, *extra_args) return if not requires_archive: