def test_not_available_any_more(self):
    """The resource was archived successfully in the past, but the latest
    archiver run got a 404: the openness score must be 0 and the reason
    must report the failure, preferring 'not available' over format info."""
    # A cache of the data still exists from the previous run, but this
    # time, the archiver found the file gave a 404.
    # The record of the previous (successful) run of QA.
    res = self._test_resource(license_id=None, format=None)
    qa = qa_model.QA.create(res.id)
    qa.format = 'CSV'
    model.Session.add(qa)
    model.Session.commit()
    # cache still exists from the previous run, but this time, the archiver
    # found the file gave a 404.
    archival = Archival.get_for_resource(res.id)
    archival.cache_filepath = __file__
    archival.status_id = Status.by_text('Download error')
    archival.reason = 'Server returned 404 error'
    archival.last_success = datetime.datetime(year=2008, month=10, day=1)
    archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
    archival.failure_count = 1
    archival.is_broken = True
    # NOTE(review): the archival mutations above are not committed -
    # presumably resource_score reads them from the live session; confirm.
    # NOTE(review): other variants of this test call resource_score(res, log);
    # confirm which signature this file's resource_score actually has.
    result = resource_score(res)
    assert result['openness_score'] == 0, result
    assert_equal(result['format'], 'CSV')
    # in preference it should report that it is not available
    assert_equal(result['openness_score_reason'],
                 'File could not be downloaded. '
                 'Reason: Download error. Error details: Server returned 404 error.'
                 ' Attempted on 10/10/2008. This URL last worked on: 01/10/2008.')
def test_not_available_any_more(self):
    """Score a resource whose cached copy still exists but whose latest
    archiver run hit a 404: the openness score drops to 0 and the reason
    reports the download failure in preference to anything else."""
    # Record of the previous (successful) run of QA.
    res = self._test_resource(license_id=None, format=None)
    previous_qa = qa_model.QA.create(res.id)
    previous_qa.format = 'CSV'
    model.Session.add(previous_qa)
    model.Session.commit()

    # The cache still exists from the previous run, but this time the
    # archiver found the file gave a 404.
    arch = Archival.get_for_resource(res.id)
    arch.cache_filepath = __file__
    arch.status_id = Status.by_text('Download error')
    arch.reason = 'Server returned 404 error'
    arch.last_success = datetime.datetime(year=2008, month=10, day=1)
    arch.first_failure = datetime.datetime(year=2008, month=10, day=2)
    arch.failure_count = 1
    arch.is_broken = True

    result = resource_score(res, log)

    assert result['openness_score'] == 0, result
    assert_equal(result['format'], 'CSV')
    # In preference it should report that it is not available.
    expected_reason = (
        'File could not be downloaded. '
        'Reason: Download error. '
        'Error details: Server returned 404 error. '
        'Attempted on 10/10/2008. '
        'This URL last worked on: 01/10/2008.'
    )
    assert_equal(result['openness_score_reason'], expected_reason)
def score_by_sniffing_data(archival, resource, score_reasons):
    '''
    Looks inside a data file's contents to determine its format and score.

    It adds strings to the score_reasons list about how it came to the
    conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    delete_file = False
    if not os.path.exists(filepath):
        # The cached copy is missing locally (e.g. scoring runs on a
        # different host) - fetch a temporary copy from the cache URL.
        log.debug("%s not found on disk, retrieving from URL %s",
                  filepath, archival.cache_url)
        try:
            filepath = _download_url(archival.cache_url).name
            delete_file = True
        except Exception as e:
            score_reasons.append(_('A system error occurred during downloading this file') + '. %s' % e)
            return (None, None)
    if filepath:
        try:
            sniffed_format = sniff_format.sniff_file_format(filepath)
        finally:
            # Always clean up a temporary download, even if sniffing raised.
            if delete_file:
                try:
                    os.remove(filepath)
                except OSError as e:
                    # Logger.warn() is a deprecated alias - use warning().
                    log.warning("Unable to remove temporary file %s: %s",
                                filepath, e)
        score = lib.resource_format_scores().get(sniffed_format['format']) \
            if sniffed_format else None
        if sniffed_format:
            score_reasons.append(_('Content of file appeared to be format "%s" which receives openness score: %s.') % (sniffed_format['format'], score))
            return score, sniffed_format['format']
        else:
            score_reasons.append(_('The format of the file was not recognized from its contents.'))
            return (None, None)
    else:
        # No cache_url
        # NOTE(review): this branch looks unreachable - filepath is always
        # truthy here because of the cache_filepath guard above. Kept as-is.
        if archival.status_id == Status.by_text('Chose not to download'):
            score_reasons.append(_('File was not downloaded deliberately') + '. ' + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
            return (None, None)
        elif archival.is_broken is None and archival.status_id:
            # i.e. 'Download failure' or 'System error during archival'
            score_reasons.append(_('A system error occurred during downloading this file') + '. ' + _('Reason') + ': %s. ' % archival.reason + _('Using other methods to determine file openness.'))
            return (None, None)
        else:
            score_reasons.append(_('This file had not been downloaded at the time of scoring it.'))
            return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons):
    """Look inside a data file's contents to determine its format and score.

    Appends explanation strings to ``score_reasons`` describing how the
    conclusion was reached.

    Returns a tuple ``(score, format_string)``:
      * ``format_string`` is None when the format cannot be worked out
      * ``score`` is None when it cannot be scored
    """
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    if filepath:
        fmt = sniff_file_format(filepath)
        if fmt:
            score = lib.resource_format_scores().get(fmt['format'])
            score_reasons.append(
                _('Content of file appeared to be format "%s" which receives openness score: %s.')
                % (fmt['format'], score))
            return score, fmt['format']
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'))
        return (None, None)

    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            _('File was not downloaded deliberately') + '. ' +
            _('Reason') + ': %s. ' % archival.reason +
            _('Using other methods to determine file openness.'))
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            _('A system error occurred during downloading this file') + '. ' +
            _('Reason') + ': %s. ' % archival.reason +
            _('Using other methods to determine file openness.'))
        return (None, None)
    score_reasons.append(
        _('This file had not been downloaded at the time of scoring it.'))
    return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Looks inside a data file's contents to determine its format and score.

    It adds strings to the score_reasons list about how it came to the
    conclusion.

    Return values:
      * It returns a tuple: (score, format_string)
      * If it cannot work out the format then format_string is None
      * If it cannot score it, then score is None
    """
    # Without a cached copy of the file there is nothing to sniff.
    if not archival or not archival.cache_filepath:
        score_reasons.append("This file had not been downloaded at the time of scoring it.")
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Cache filepath does not exist: "%s".' % filepath)
        return (None, None)
    else:
        # NOTE(review): filepath is always truthy here (guarded above), so
        # the 'No cache_url' branch below looks unreachable - confirm.
        if filepath:
            sniffed_format = sniff_file_format(filepath, log)
            # Map the sniffed format onto the configured openness score.
            score = lib.resource_format_scores().get(sniffed_format["format"]) if sniffed_format else None
            if sniffed_format:
                score_reasons.append(
                    'Content of file appeared to be format "%s" which receives openness score: %s.'
                    % (sniffed_format["format"], score)
                )
                return score, sniffed_format["format"]
            else:
                score_reasons.append("The format of the file was not recognized from its contents.")
                return (None, None)
        else:
            # No cache_url
            if archival.status_id == Status.by_text("Chose not to download"):
                score_reasons.append(
                    "File was not downloaded deliberately. Reason: %s. Using other methods to determine file openness."
                    % archival.reason
                )
                return (None, None)
            elif archival.is_broken is None and archival.status_id:
                # i.e. 'Download failure' or 'System error during archival'
                score_reasons.append(
                    "A system error occurred during downloading this file. Reason: %s. Using other methods to determine file openness."
                    % archival.reason
                )
                return (None, None)
            else:
                score_reasons.append("This file had not been downloaded at the time of scoring it.")
                return (None, None)
def test_not_available_and_not_open(self):
    """A resource that never archived successfully (500 errors from the
    start) scores 0, and the reason reports the full failure history."""
    res = self._test_resource(license_id=None, format=None, cached=False)
    arch = Archival.get_for_resource(res.id)
    arch.status_id = Status.by_text('Download error')
    arch.reason = 'Server returned 500 error'
    arch.last_success = None
    arch.first_failure = datetime.datetime(year=2008, month=10, day=1,
                                           hour=6, minute=30)
    arch.failure_count = 16
    arch.is_broken = True
    model.Session.commit()

    result = resource_score(res, log)

    assert result['openness_score'] == 0, result
    assert_equal(result['format'], None)
    # In preference it should report that it is not available.
    expected_reason = (
        'File could not be downloaded. '
        'Reason: Download error. '
        'Error details: Server returned 500 error. '
        'Attempted on 10/10/2008. '
        'Tried 16 times since 01/10/2008. '
        'This URL has not worked in the history of this tool.'
    )
    assert_equal(result['openness_score_reason'], expected_reason)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Looks inside a data file's contents to determine its format and score.

    It adds strings to the score_reasons list about how it came to the
    conclusion. (The appended reason strings are user-facing Croatian
    text - they are runtime data and must not be translated in code.)

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If it cannot work out the format then format_display_name is None
      * If it cannot score it, then score is None
    '''
    if not archival or not archival.cache_filepath:
        # "The file had not been downloaded at the time of scoring."
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return (None, None)
    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        # "Cache path does not exist."
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % filepath)
        return (None, None)
    else:
        # NOTE(review): filepath is always truthy here (guarded above), so
        # the 'No cache_url' branch below looks unreachable - confirm.
        if filepath:
            sniffed_format = sniff_file_format(filepath, log)
            if sniffed_format:
                # "Data is in format X with openness score Y."
                score_reasons.append(
                    'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.' %
                    (sniffed_format['display_name'],
                     sniffed_format['openness']))
                return sniffed_format['openness'], sniffed_format[
                    'display_name']
            else:
                # "The format is unknown."
                score_reasons.append('Format je nepoznat.')
                return (None, None)
        else:
            # No cache_url
            if archival.status_id == Status.by_text('Chose not to download'):
                # "The file was deliberately not downloaded."
                score_reasons.append('Datoteka nije preuzeta namjerno. Razlog: %s.' %
                                     archival.reason)
                return (None, None)
            elif archival.is_broken is None and archival.status_id:
                # i.e. 'Download failure' or 'System error during archival'
                # "A problem occurred while downloading the file."
                score_reasons.append('Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.' %
                                     archival.reason)
                return (None, None)
            else:
                score_reasons.append(
                    'Datoteka nije preuzeta u vrijeme ocijenjivanja.')
                return (None, None)
def score_by_sniffing_data(archival, resource, score_reasons, log):
    """Determine format and openness score by sniffing the cached file.

    Appends user-facing (Croatian) explanation strings to
    ``score_reasons``.

    Returns ``(score, format_display_name)``; either element is None
    when it cannot be determined.
    """
    if not archival or not archival.cache_filepath:
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return (None, None)

    # Analyse the cached file.
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % filepath)
        return (None, None)

    if filepath:
        sniffed = sniff_file_format(filepath, log)
        if not sniffed:
            score_reasons.append('Format je nepoznat.')
            return (None, None)
        score_reasons.append(
            'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.'
            % (sniffed['display_name'], sniffed['openness']))
        return sniffed['openness'], sniffed['display_name']

    # No cache_url
    if archival.status_id == Status.by_text('Chose not to download'):
        score_reasons.append(
            'Datoteka nije preuzeta namjerno. Razlog: %s.' % archival.reason)
        return (None, None)
    if archival.is_broken is None and archival.status_id:
        # i.e. 'Download failure' or 'System error during archival'
        score_reasons.append(
            'Dogodio se problem prilikom preuzimanja datoteke. Razlog: %s.'
            % archival.reason)
        return (None, None)
    score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
    return (None, None)
def test_trigger_on_archival(cls):
    """End-to-end trigger: after a resource is archived, the archiver's
    notify hook should be picked up by ckanext-qa to queue a QA task."""
    # create package
    context = {
        'model': model,
        'ignore_auth': True,
        'session': model.Session,
        'user': '******'
    }
    pkg = {
        'name': 'testpkg',
        'owner_org': _test_org().id,
        'license_id': 'uk-ogl',
        'resources': [{
            'url': 'http://test.com/',
            'format': 'CSV',
            'description': 'Test'
        }]
    }
    pkg = get_action('package_create')(context, pkg)
    resource_dict = pkg['resources'][0]
    res_id = resource_dict['id']
    # create record of archival
    archival = Archival.create(res_id)
    cache_filepath = __file__  # just needs to exist
    archival.cache_filepath = cache_filepath
    archival.updated = TODAY
    archival.status_id = Status.by_text('Archived successfully')
    model.Session.add(archival)
    model.Session.commit()
    # TODO show that QA hasn't run yet
    # create a send_data from ckanext-archiver, that gets picked up by
    # ckanext-qa to put a task on the queue
    ckanext.archiver.tasks.notify_package(pkg, 'priority')
def _test_resource(url='anything', format='TXT', archived=True, cached=True,
                   license_id='uk-ogl'):
    """Create a test dataset with one resource and (optionally) a
    successful archival record for it. Returns the Resource model object."""
    dataset_dict = {
        'owner_org': _test_org().id,
        'license_id': license_id,
        'resources': [{'url': url,
                       'format': format,
                       'description': 'Test'}],
    }
    dataset_dict = ckan_factories.Dataset(**dataset_dict)
    res_id = dataset_dict['resources'][0]['id']
    if archived:
        arch = Archival.create(res_id)
        # Any existing path will do for the cache file; this module's own
        # filename is convenient.
        arch.cache_filepath = __file__ if cached else None
        arch.updated = TODAY
        arch.status_id = Status.by_text('Archived successfully')
        model.Session.add(arch)
        model.Session.commit()
    return model.Resource.get(res_id)
def _update_resource(ckan_ini_filepath, resource_id, queue):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task
    giving a success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): legacy celery/pylons variant using Python 2
    # 'except X, e' syntax; this chunk appears truncated - the rest of the
    # download exception handling is not visible here.
    log = update_resource.get_logger()
    load_config(ckan_ini_filepath)
    register_translator()
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    get_action = toolkit.get_action
    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome and broadcast it to listeners.
        reason = '%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    from ckanext.archiver.model import Status
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
    }
    try:
        download_result = download(context, resource)
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task
    giving a success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival
    get_action = toolkit.get_action
    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome and broadcast it to listeners.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    url = resource['url']
    if not url.startswith('http'):
        # Relative URL: assume it is hosted by this CKAN site.
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])

        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            # Locally uploaded file: no HTTP download needed - hash the file
            # on disk and record a synthetic (mock) download/archive result.
            log.info("Won't attemp to archive resource uploaded locally: %s" % resource['url'])

            try:
                hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)
                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return

            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}

            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }
            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }
            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)
            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
        # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])

    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
    }
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except DownloadException as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except DownloadError as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            # A plain GET failed - some portals only serve data via an API,
            # so retry the URL as an API request.
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # NOTE(review): err.args is a tuple, so the membership test and
            # attribute access on it look wrong - presumably this meant to
            # read an attribute on err itself; confirm.
            extra_args = [err.args.url_redirected_to
                          ] if 'url_redirected_to' in err.args else []
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '',
          resource, download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
def _update(ckan_ini_filepath, resource_id, queue):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task
    giving a success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): legacy celery/pylons variant using Python 2
    # 'except X, e' syntax; this chunk appears truncated - the rest of the
    # download exception handling is not visible here.
    log = update.get_logger()
    load_config(ckan_ini_filepath)
    register_translator()
    from ckan import model
    from ckan.logic import get_action
    from pylons import config
    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome and broadcast it to listeners.
        reason = '%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    from ckanext.archiver.model import Status
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckan.cache_url_root'),
    }
    try:
        download_result = download(context, resource)
    except LinkInvalidError, e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
# NOTE(review): truncated chunk - this repeats the download section of the
# legacy (celery/pylons, Python 2) _update task and is cut off at both ends;
# 'resource', '_save' and 'log' are defined in the enclosing (unseen) scope.
download_result = None
from ckanext.archiver.model import Status
download_status_id = Status.by_text('Archived successfully')
context = {
    'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
    'cache_url_root': config.get('ckan.cache_url_root'),
}
try:
    download_result = download(context, resource)
except LinkInvalidError, e:
    download_status_id = Status.by_text('URL invalid')
    try_as_api = False
except DownloadException, e:
    download_status_id = Status.by_text('Download error')
    try_as_api = True
except DownloadError, e:
    download_status_id = Status.by_text('Download error')
    try_as_api = True
except ChooseNotToDownload, e:
    download_status_id = Status.by_text('Chose not to download')
    try_as_api = False
except Exception, e:
    if os.environ.get('DEBUG'):
        raise
    log.error('Uncaught download failure: %r, %r', e, e.args)
    _save(Status.by_text('Download failure'), e, resource)
    return
if not Status.is_ok(download_status_id):
def _update_resource(ckan_ini_filepath, resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task
    giving a success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): legacy Python 2 variant ('except X, e'); this chunk
    # appears truncated - the remote-download path is not visible here.
    load_config(ckan_ini_filepath)
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival
    get_action = toolkit.get_action
    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome and broadcast it to listeners.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True
    url = resource['url']
    if not url.startswith('http'):
        # Relative URL: assume it is hosted by this CKAN site.
        url = config['ckan.site_url'].rstrip('/') + url
    if resource.get('url_type') == 'upload':
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])
        # NOTE(review): "is not ''" is an identity comparison that relies on
        # string interning - it should be "!= ''" (SyntaxWarning on modern
        # Pythons); confirm and fix when this legacy path is touched.
        hosted_externally = not url.startswith(config['ckan.site_url']) or urlparse.urlparse(filepath).scheme is not ''
        # if resource.get('resource_type') == 'file.upload' and not hosted_externally:
        if not hosted_externally:
            # Locally uploaded file: no HTTP download needed - hash the file
            # on disk and record a synthetic (mock) download/archive result.
            log.info("Won't attemp to archive resource uploaded locally: %s" % resource['url'])
            try:
                hash, length = _file_hashnlength(filepath)
            except IOError, e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)
                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return
            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}
            download_result_mock = {'mimetype': mimetype,
                                    'size': length,
                                    'hash': hash,
                                    'headers': headers,
                                    'saved_file': filepath,
                                    'url_redirected_to': url,
                                    'request_type': 'GET'}
            archive_result_mock = {'cache_filepath': filepath,
                                   'cache_url': url}
            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)
            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
# NOTE(review): truncated chunk - starts mid-way through building
# download_result_mock in the legacy (Python 2) _update_resource task and is
# cut off after the first download exception handlers.
                        'request_type': 'GET'}
archive_result_mock = {'cache_filepath': filepath,
                       'cache_url': url}
# Success
_save(Status.by_text('Archived successfully'), '', resource,
      download_result_mock['url_redirected_to'],
      download_result_mock, archive_result_mock)
# The return value is only used by tests. Serialized for Celery.
return json.dumps(dict(download_result_mock, **archive_result_mock))
# endif: processing locally uploaded resource
log.info("Attempting to download resource: %s" % resource['url'])
download_result = None
download_status_id = Status.by_text('Archived successfully')
context = {
    'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
    'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
    'previous': Archival.get_for_resource(resource_id)
}
try:
    download_result = download(context, resource)
except NotChanged, e:
    download_status_id = Status.by_text('Content has not changed')
    try_as_api = False
    requires_archive = False
except LinkInvalidError, e:
    download_status_id = Status.by_text('URL invalid')
    try_as_api = False
except DownloadException, e:
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource - resource dict
      queue - name of the celery queue

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task
    giving a success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    # NOTE(review): legacy Python 2 variant ('except X, e'); this chunk is
    # truncated after the first download exception handler.
    from ckan import model
    from pylons import config
    from ckan.plugins import toolkit
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival
    get_action = toolkit.get_action
    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})
    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Persist the archival outcome and broadcast it to listeners.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True
    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    # NOTE(review): the config key 'ckanext.archiver.cache_url_root' is
    # spelled 'ckanext-archiver.cache_url_root' in the other variants -
    # confirm which one the deployment actually sets.
    context = {
        'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext.archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
    }
    try:
        download_result = download(context, resource)
    except NotChanged, e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
def migrate(options):
    """
    Copy archival information out of the legacy TaskStatus rows (and the
    Resource itself) into the Archival table, one row per active resource.

    options - parsed CLI options; uses .publisher/.resource/.dataset to
              filter resources and .write to decide whether to persist.
    """
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            # The legacy row stores most details as a JSON blob in .error.
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row: only migrate if the Resource itself carries
            # some archive evidence; otherwise there is nothing to record.
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count',
                               'url_redirected_to', 'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Derive created/resource_timestamp from revision history where a
        # hash was recorded (i.e. revisions after a successful archival).
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            # Fall back to the earliest/latest known event timestamps; the
            # sentinels keep min()/max() well-defined when values are None.
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    # Only mutate the row when --write was given; 'changed'
                    # still tracks that a difference exists (dry-run report).
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)
    print 'Summary\n', stats.report()
    # Nothing is committed unless --write was given.
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def migrate(options):
    """
    Copy archival information out of the legacy TaskStatus rows (and the
    Resource itself) into the Archival table, one row per active resource.

    options - parsed CLI options; uses .publisher/.resource/.dataset to
              filter resources and .write to decide whether to persist.
    """
    # NOTE(review): this definition duplicates an earlier migrate() in this
    # file; if both live in the same module the later one silently shadows
    # the earlier — confirm this duplication is intentional.
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            # The legacy row stores most details as a JSON blob in .error.
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row: only migrate if the Resource itself carries
            # some archive evidence; otherwise there is nothing to record.
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count',
                               'url_redirected_to', 'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Derive created/resource_timestamp from revision history where a
        # hash was recorded (i.e. revisions after a successful archival).
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            # Fall back to the earliest/latest known event timestamps; the
            # sentinels keep min()/max() well-defined when values are None.
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    # Only mutate the row when --write was given; 'changed'
                    # still tracks that a difference exists (dry-run report).
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)
    print 'Summary\n', stats.report()
    # Nothing is committed unless --write was given.
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'