def test_not_available_any_more(self):
    """A resource that previously scored well is re-scored after the
    archiver finds the URL now 404s: the score must drop to 0, while the
    format detected on the earlier successful run is retained.
    """
    # A cache of the data still exists from the previous run, but this
    # time, the archiver found the file gave a 404.

    # The record of the previous (successful) run of QA.
    res = self._test_resource(license_id=None, format=None)
    qa = qa_model.QA.create(res.id)
    qa.format = 'CSV'
    model.Session.add(qa)
    model.Session.commit()

    # cache still exists from the previous run, but this time, the archiver
    # found the file gave a 404.
    archival = Archival.get_for_resource(res.id)
    archival.cache_filepath = __file__  # any file that exists stands in for the cache
    archival.status_id = Status.by_text('Download error')
    archival.reason = 'Server returned 404 error'
    archival.last_success = datetime.datetime(year=2008, month=10, day=1)
    archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
    archival.failure_count = 1
    archival.is_broken = True

    result = resource_score(res, log)

    assert result['openness_score'] == 0, result
    # Format from the earlier successful QA run should be preserved.
    assert_equal(result['format'], 'CSV')
    # in preference it should report that it is not available
    # NOTE(review): 'Attempted on 10/10/2008' is not set in this test body -
    # presumably it comes from archival.updated set by the fixture; confirm.
    assert_equal(
        result['openness_score_reason'],
        'File could not be downloaded. Reason: Download error. Error details: Server returned 404 error. Attempted on 10/10/2008. This URL last worked on: 01/10/2008.'
    )
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.
    '''
    model = context['model']
    session = context['session']
    # Auth check currently disabled:
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)

    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound

    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package

    # Start with everything defaulted to None, then fill in whichever
    # records actually exist for this resource.
    info = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id,
        'archival_updated': None,
        'archival_is_broken': None,
        'archival_reason': None,
        'archival_url_redirected_to': None,
        'openness_score': None,
        'openness_score_reason': None,
        'updated': None,
        'format': None,
    }
    if archival:
        info['archival_updated'] = (archival.updated.isoformat()
                                    if archival.updated else None)
        info['archival_is_broken'] = archival.is_broken
        info['archival_reason'] = archival.reason
        info['archival_url_redirected_to'] = archival.url_redirected_to
    if qa:
        info['openness_score'] = qa.openness_score
        info['openness_score_reason'] = qa.openness_score_reason
        info['updated'] = qa.updated.isoformat() if qa.updated else None
        info['format'] = qa.format
    return info
def save_archival(resource, status_id, reason, url_redirected_to,
                  download_result, archive_result, log):
    '''Writes to the archival table the result of an attempt to download
    the resource.

    :param resource: resource dict being archived (reads 'id' and
        'revision_id')
    :param status_id: id of the Status row describing the outcome
    :param reason: human-readable detail for the status
    :param url_redirected_to: final URL if the request was redirected
    :param download_result: downloader output dict (size/mimetype/hash/
        headers) - only read on successful archival
    :param archive_result: archiver output dict (cache_filepath/cache_url)
        - only read on successful archival
    :param log: logger for progress messages

    May propagate a CkanError.
    '''
    now = datetime.datetime.now()

    from ckanext.archiver.model import Archival, Status
    from ckan import model

    archival = Archival.get_for_resource(resource['id'])
    first_archival = not archival
    previous_archival_was_broken = None
    if not archival:
        # First time this resource has been archived - create the row.
        archival = Archival.create(resource['id'])
        model.Session.add(archival)
    else:
        log.info('Archival from before: %r', archival)
        previous_archival_was_broken = archival.is_broken

    revision = model.Session.query(model.Revision).get(resource['revision_id'])
    archival.resource_timestamp = revision.timestamp

    # Details of the latest archival attempt
    archival.status_id = status_id
    archival.is_broken = Status.is_status_broken(status_id)
    archival.reason = reason
    archival.url_redirected_to = url_redirected_to

    # Details of successful archival. 'is False' rather than 'not': is_broken
    # can be None (unknown), which must not count as a success here.
    if archival.is_broken is False:
        archival.cache_filepath = archive_result['cache_filepath']
        archival.cache_url = archive_result['cache_url']
        archival.size = download_result['size']
        archival.mimetype = download_result['mimetype']
        archival.hash = download_result['hash']
        archival.etag = download_result['headers'].get('etag')
        archival.last_modified = download_result['headers'].get('last-modified')

    # History: track when the resource last worked and how long / how many
    # times it has been failing.
    if archival.is_broken is False:
        archival.last_success = now
        archival.first_failure = None
        archival.failure_count = 0
    else:
        log.info('First_archival=%r Previous_broken=%r Failure_count=%r' %
                 (first_archival, previous_archival_was_broken,
                  archival.failure_count))
        if first_archival or previous_archival_was_broken is False:
            # i.e. this is the first failure (or the first archival)
            archival.first_failure = now
            archival.failure_count = 1
        else:
            archival.failure_count += 1

    archival.updated = now
    log.info('Archival saved: %r', archival)
    model.repo.commit_and_remove()
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.

    :param context: CKAN action context (reads 'model' and 'session')
    :param data_dict: must contain 'id' - the resource id
    :raises ObjectNotFound: if the resource does not exist
    :returns: dict of package name/title, resource id, plus archival and
        QA details (None/absent when no record exists yet)
    '''
    model = context['model']
    session = context['session']
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)
    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound
    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package
    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id
    }
    # Fix: a resource may have no archival/QA record yet (get_for_resource
    # returns None) - calling .as_dict() on None raised AttributeError.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
def qa_resource_show(context, data_dict):
    '''
    Returns the QA and Archival information for a package or resource.

    :param context: CKAN action context (reads 'model' and 'session')
    :param data_dict: must contain 'id' - the resource id
    :raises ObjectNotFound: if the resource does not exist
    :returns: dict of package name/title, resource id, plus archival and
        QA details (None/absent when no record exists yet)
    '''
    model = context['model']
    session = context['session']
    # user = context.get('user')
    # p.toolkit.check_access('qa_resource_show', context, data_dict)
    res_id = p.toolkit.get_or_bust(data_dict, 'id')
    res = session.query(model.Resource).get(res_id)
    if not res:
        raise p.toolkit.ObjectNotFound
    archival = Archival.get_for_resource(res_id)
    qa = QA.get_for_resource(res_id)
    pkg = res.resource_group.package
    return_dict = {
        'name': pkg.name,
        'title': pkg.title,
        'id': res.id
    }
    # Fix: a resource may have no archival/QA record yet (get_for_resource
    # returns None) - calling .as_dict() on None raised AttributeError.
    return_dict['archival'] = archival.as_dict() if archival else None
    if qa:
        return_dict.update(qa.as_dict())
    return return_dict
def test_not_available_any_more(self):
    """Re-scoring after the archiver finds a previously-working URL now
    404s: score drops to 0, previously-detected format is kept.
    """
    # A cache of the data still exists from the previous run, but this
    # time, the archiver found the file gave a 404.

    # The record of the previous (successful) run of QA.
    res = self._test_resource(license_id=None, format=None)
    qa = qa_model.QA.create(res.id)
    qa.format = 'CSV'
    model.Session.add(qa)
    model.Session.commit()

    # cache still exists from the previous run, but this time, the archiver
    # found the file gave a 404.
    archival = Archival.get_for_resource(res.id)
    archival.cache_filepath = __file__  # any file that exists stands in for the cache
    archival.status_id = Status.by_text('Download error')
    archival.reason = 'Server returned 404 error'
    archival.last_success = datetime.datetime(year=2008, month=10, day=1)
    archival.first_failure = datetime.datetime(year=2008, month=10, day=2)
    archival.failure_count = 1
    archival.is_broken = True

    result = resource_score(res)

    assert result['openness_score'] == 0, result
    # Format from the earlier successful QA run should be preserved.
    assert_equal(result['format'], 'CSV')
    # in preference it should report that it is not available
    # NOTE(review): 'Attempted on 10/10/2008' is not set in this test body -
    # presumably from archival.updated set by the fixture; confirm.
    assert_equal(result['openness_score_reason'],
                 'File could not be downloaded. '
                 'Reason: Download error. Error details: Server returned 404 error.'
                 ' Attempted on 10/10/2008. This URL last worked on: 01/10/2008.')
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is
            based on (iso string)

    Raises QAError for reasonable errors
    """
    score = 0
    score_reason = ''
    format_ = None

    register_translator()

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        archival = Archival.get_for_resource(resource_id=resource.id)
        if not resource:
            raise QAError('Could not find resource "%s"' % resource.id)

        # Scoring strategies in descending priority; each returns
        # (None, None) to pass on to the next fall-back.
        score, format_ = score_if_link_broken(archival, resource,
                                              score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the
            # link is only to a landing page, so highest priority is the
            # sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(
                    resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(
                        resource, score_reasons)
                    if score is None:
                        log.warning(
                            'Could not score resource: "%s" with url: "%s"',
                            resource.id, resource.url)
                        score_reasons.append(
                            _('Could not understand the file format, therefore score is 1.'))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for
                            # this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception, e:  # Python 2 except syntax
        log.error(
            'Unexpected error while calculating openness score %s: %s\nException: %s',
            e.__class__.__name__, unicode(e), traceback.format_exc())
        score_reason = _("Unknown error: %s") % str(e)
        raise
    # NOTE(review): as shown, the success path falls off the end returning
    # None - the tail of this function (license check + result dict, cf. the
    # complete version elsewhere in this file) appears truncated in this chunk.
class DataPreviewController(BaseController):
    """Serves JSON previews of resource data, preferring the archiver's
    local cache over the remote URL."""

    def index(self, id):
        # Look up the resource and refuse anything deleted/draft.
        resource = model.Resource.get(id)
        if not resource or resource.state != 'active':
            abort(404, "Resource not found")

        context = {'model': model, 'session': model.Session, 'user': c.user}
        try:
            check_access("resource_show", context, {'id': resource.id})
        except NotAuthorized, e:  # Python 2 except syntax
            abort(403, "You are not permitted access to this resource")

        # Default preview size limit: 5 MiB, overridable in config.
        size_limit = config.get('ckan.datapreview.limit', 5242880)

        # Prefer the format QA sniffed over the publisher-supplied one.
        qa = QA.get_for_resource(resource.id)
        format_ = qa.format if qa else None
        log.debug('File format (according to QA): %r' % format_)
        if not format_:
            format_ = resource.format.lower() if resource.format else ''
            log.debug('File format (resource.format): %r' % format_)

        query = dict(type=format_, size_limit=size_limit, length=None)

        # If the archiver recorded the file size, pass it along.
        archival = Archival.get_for_resource(resource.id)
        if archival and archival.size:
            query['length'] = archival.size

        # Add the extra fields if they are set
        for k in ['max-results', 'encoding', 'type']:
            if k in request.params:
                query[k] = request.params[k]

        # _get_url decides between the local cache and the remote URL.
        url, archived = self._get_url(resource, query)
        query['archived'] = archived
        if url:
            try:
                response.content_type = 'application/json'
                result = proxy_query(resource, url, query)
            except ProxyError as e:
                log.warn("Request {0} failed : {1}".format(
                    identify_resource(resource), e))
                result = _error(title=e.title, message=e.message)
        else:
            result = _error(
                title="Remote resource not downloadable",
                message="Unable to find the remote resource for download")

        # JSONP support: wrap the result if a 'callback' param was given.
        format_ = request.params.get('callback')
        if format_:
            return "%s(%s)" % (format_, result)

        return result
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is
            based on (iso string)

    Raises QAError for reasonable errors
    """
    score = 0
    score_reason = ''
    format_ = None

    register_translator()

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        archival = Archival.get_for_resource(resource_id=resource.id)
        if not resource:
            raise QAError('Could not find resource "%s"' % resource.id)

        # Scoring strategies in descending priority; each returns
        # (None, None) to pass on to the next fall-back.
        score, format_ = score_if_link_broken(archival, resource, score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the link
            # is only to a landing page, so highest priority is the sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(resource, score_reasons)
                    if score is None:
                        log.warning('Could not score resource: "%s" with url: "%s"',
                                    resource.id, resource.url)
                        score_reasons.append(_('Could not understand the file format, therefore score is 1.'))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception, e:  # Python 2 except syntax
        log.error('Unexpected error while calculating openness score %s: %s\nException: %s',
                  e.__class__.__name__, unicode(e), traceback.format_exc())
        score_reason = _("Unknown error: %s") % str(e)
        raise
    # NOTE(review): as shown, the success path falls off the end returning
    # None - the tail of this function (license check + result dict, cf. the
    # complete version elsewhere in this file) appears truncated in this chunk.
def test_not_available_and_not_open(self):
    """A resource that has never downloaded successfully and has no open
    licence: score must be 0 with the 'not available' reason preferred.
    """
    res = self._test_resource(license_id=None, format=None, cached=False)
    archival = Archival.get_for_resource(res.id)
    archival.status_id = Status.by_text('Download error')
    archival.reason = 'Server returned 500 error'
    archival.last_success = None  # never worked
    archival.first_failure = datetime.datetime(year=2008, month=10, day=1, hour=6, minute=30)
    archival.failure_count = 16
    archival.is_broken = True
    model.Session.commit()

    result = resource_score(res, log)

    assert result['openness_score'] == 0, result
    assert_equal(result['format'], None)
    # in preference it should report that it is not available
    # NOTE(review): 'Attempted on 10/10/2008' is not set in this test body -
    # presumably from archival.updated set by the fixture; confirm.
    assert_equal(result['openness_score_reason'], 'File could not be downloaded. Reason: Download error. Error details: Server returned 500 error. Attempted on 10/10/2008. Tried 16 times since 01/10/2008. This URL has not worked in the history of this tool.')
def archiver_resource_show(context, data_dict=None):
    '''Return a details of the archival of a resource

    :param id: the id of the resource
    :type id: string

    :rtype: dictionary
    '''
    id_ = _get_or_bust(data_dict, 'id')
    # Fix: check authorization BEFORE fetching and serializing the archival
    # record - previously the data was queried and converted to a dict and
    # only then the access check ran, so unauthorized callers triggered a
    # needless query (and got ObjectNotFound instead of an auth error for
    # missing ids).
    p.toolkit.check_access('archiver_resource_show', context, data_dict)
    archival = Archival.get_for_resource(id_)
    if archival is None:
        raise ObjectNotFound
    return archival.as_dict()
def _update_resource(resource_id, queue, log):
    """
    Link check and archive the given resource.
    If successful, updates the archival table with the cache_url & hash etc.
    Finally, a notification of the archival is broadcast.

    Params:
      resource_id - id of the resource to archive
      queue - name of the celery queue
      log - logger to record progress on

    Should only raise on a fundamental error:
      ArchiverError
      CkanError

    Returns a JSON dict, ready to be returned from the celery task giving a
    success status:
        {
            'resource': the updated resource dict,
            'file_path': path to archived file (if archive successful), or None
        }
    If not successful, returns None.
    """
    from ckan import model
    from ckan.plugins.toolkit import config
    from ckanext.archiver import default_settings as settings
    from ckanext.archiver.model import Status, Archival
    get_action = toolkit.get_action

    assert is_id(resource_id), resource_id
    context_ = {'model': model, 'ignore_auth': True, 'session': model.Session}
    resource = get_action('resource_show')(context_, {'id': resource_id})

    if not os.path.exists(settings.ARCHIVE_DIR):
        log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR)
        os.mkdir(settings.ARCHIVE_DIR)

    def _save(status_id, exception, resource, url_redirected_to=None,
              download_result=None, archive_result=None):
        # Record the outcome in the archival table and notify listeners.
        reason = u'%s' % exception
        save_archival(resource, status_id, reason, url_redirected_to,
                      download_result, archive_result, log)
        notify_resource(
            resource, queue,
            archive_result.get('cache_filename') if archive_result else None)

    # Download
    try_as_api = False
    requires_archive = True

    url = resource['url']
    if not url.startswith('http'):
        # Relative URL - make it absolute against the site URL.
        url = config['ckan.site_url'].rstrip('/') + url

    if resource.get('url_type') == 'upload':
        # Locally uploaded file: hash it directly instead of downloading.
        upload = uploader.get_resource_uploader(resource)
        filepath = upload.get_path(resource['id'])
        hosted_externally = not url.startswith(
            config['ckan.site_url']) or urlparse(filepath).scheme != ''
        # if resource.get('resource_type') == 'file.upload' and not
        # hosted_externally:
        if not hosted_externally:
            log.info("Won't attemp to archive resource uploaded locally: %s" % resource['url'])
            try:
                hash, length = _file_hashnlength(filepath)
            except IOError as e:
                log.error('Error while accessing local resource %s: %s',
                          filepath, e)
                download_status_id = Status.by_text('URL request failed')
                _save(download_status_id, e, resource)
                return
            mimetype = None
            headers = None
            content_type, content_encoding = mimetypes.guess_type(url)
            if content_type:
                mimetype = _clean_content_type(content_type)
                headers = {'Content-Type': content_type}
            # Build fake download/archive results mirroring what the real
            # downloader and archiver would return.
            download_result_mock = {
                'mimetype': mimetype,
                'size': length,
                'hash': hash,
                'headers': headers,
                'saved_file': filepath,
                'url_redirected_to': url,
                'request_type': 'GET'
            }
            archive_result_mock = {
                'cache_filepath': filepath,
                'cache_url': url
            }
            # Success
            _save(Status.by_text('Archived successfully'), '', resource,
                  download_result_mock['url_redirected_to'],
                  download_result_mock, archive_result_mock)
            # The return value is only used by tests. Serialized for Celery.
            return json.dumps(dict(download_result_mock,
                                   **archive_result_mock))
    # endif: processing locally uploaded resource

    log.info("Attempting to download resource: %s" % resource['url'])
    download_result = None
    download_status_id = Status.by_text('Archived successfully')
    context = {
        'site_url':
            config.get('ckan.site_url_internally') or config['ckan.site_url'],
        'cache_url_root': config.get('ckanext-archiver.cache_url_root'),
        'previous': Archival.get_for_resource(resource_id)
    }
    # Map each download failure mode to a status; try_as_api marks the
    # errors where falling back to an API request is worth attempting.
    err = None
    try:
        download_result = download(context, resource)
    except NotChanged as e:
        download_status_id = Status.by_text('Content has not changed')
        try_as_api = False
        requires_archive = False
        err = e
    except LinkInvalidError as e:
        download_status_id = Status.by_text('URL invalid')
        try_as_api = False
        err = e
    except DownloadException as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except DownloadError as e:
        download_status_id = Status.by_text('Download error')
        try_as_api = True
        err = e
    except ChooseNotToDownload as e:
        download_status_id = Status.by_text('Chose not to download')
        try_as_api = False
        err = e
    except ForbiddenError as e:
        download_status_id = Status.by_text('Forbidden error')
        try_as_api = False
        err = e
    except Exception as e:
        if os.environ.get('DEBUG'):
            raise
        log.error('Uncaught download failure: %r, %r', e, e.args)
        _save(Status.by_text('Download failure'), e, resource)
        return

    if not Status.is_ok(download_status_id) and err:
        log.info('GET error: %s - %r, %r "%s"',
                 Status.by_id(download_status_id), err, err.args,
                 resource.get('url'))

        if try_as_api:
            download_result = api_request(context, resource)
            if download_result:
                download_status_id = Status.by_text('Archived successfully')
            # else the download_status_id (i.e. an error) is left what it was
            # from the previous download (i.e. not when we tried it as an API)

        if not try_as_api or not Status.is_ok(download_status_id):
            # NOTE(review): err.args is normally a tuple, so the attribute
            # access 'err.args.url_redirected_to' looks suspect - confirm
            # the custom exception type actually exposes this.
            extra_args = [err.args.url_redirected_to
                          ] if 'url_redirected_to' in err.args else []
            _save(download_status_id, err, resource, *extra_args)
            return

    if not requires_archive:
        # We don't need to archive if the remote content has not changed
        return None

    # Archival
    log.info('Attempting to archive resource')
    try:
        archive_result = archive_resource(context, resource, log,
                                          download_result)
    except ArchiveError as e:
        log.error('System error during archival: %r, %r', e, e.args)
        _save(Status.by_text('System error during archival'), e, resource,
              download_result['url_redirected_to'])
        return

    # Success
    _save(Status.by_text('Archived successfully'), '', resource,
          download_result['url_redirected_to'], download_result,
          archive_result)

    # The return value is only used by tests. Serialized for Celery.
    return json.dumps(dict(download_result, **archive_result))
def assert_archival_error(self, error_message_fragment, resource_id): archival = Archival.get_for_resource(resource_id) if error_message_fragment not in archival.reason: print 'ERROR: %s (%s)' % (archival.reason, archival.status) raise AssertionError(archival.reason)
# Success _save(Status.by_text('Archived successfully'), '', resource, download_result_mock['url_redirected_to'], download_result_mock, archive_result_mock) # The return value is only used by tests. Serialized for Celery. return json.dumps(dict(download_result_mock, **archive_result_mock)) # endif: processing locally uploaded resource log.info("Attempting to download resource: %s" % resource['url']) download_result = None download_status_id = Status.by_text('Archived successfully') context = { 'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'], 'cache_url_root': config.get('ckanext-archiver.cache_url_root'), 'previous': Archival.get_for_resource(resource_id) } try: download_result = download(context, resource) except NotChanged, e: download_status_id = Status.by_text('Content has not changed') try_as_api = False requires_archive = False except LinkInvalidError, e: download_status_id = Status.by_text('URL invalid') try_as_api = False except DownloadException, e: download_status_id = Status.by_text('Download error') try_as_api = True except DownloadError, e: download_status_id = Status.by_text('Download error')
"license": LICENSE_LOOKUP.get(pkg["license_id"], ""), "resources": [], } if pkg["notes"]: datapackage["description"] = pkg["notes"] try: package_zip = PackageZip.get_for_package(pkg["id"]) datapackage["filepath"] = package_zip.filepath except Exception, ex: pass fd = FilenameDeduplicator() for res in pkg["resources"]: archival = Archival.get_for_resource(res["id"]) if archival and archival.cache_filepath: # We have archived it, and we have a path. _, resource_id, filename = archival.cache_filepath.rsplit("/", 2) cache_filepath = archival.cache_filepath else: # Try and work out the filename from the URL. try: _, filename = res["url"].rsplit("/", 1) except ValueError: filename = res["id"] cache_filepath = "" filename = fd.deduplicate(filename) resource_json = { "url": res["url"],
'license': LICENSE_LOOKUP.get(pkg['license_id'], ''), 'resources': [], } if pkg['notes']: datapackage['description'] = pkg['notes'] try: package_zip = PackageZip.get_for_package(pkg['id']) datapackage['filepath'] = package_zip.filepath except Exception, ex: pass fd = FilenameDeduplicator() for res in pkg['resources']: archival = Archival.get_for_resource(res['id']) if archival and archival.cache_filepath: # We have archived it, and we have a path. _, resource_id, filename = archival.cache_filepath.rsplit('/', 2) cache_filepath = archival.cache_filepath else: # Try and work out the filename from the URL. try: _, filename = res['url'].rsplit('/', 1) except ValueError: filename = res['id'] cache_filepath = '' filename = fd.deduplicate(filename) resource_json = { 'url': res['url'],
def _update_resource(resource_id, queue, log): """ Link check and archive the given resource. If successful, updates the archival table with the cache_url & hash etc. Finally, a notification of the archival is broadcast. Params: resource - resource dict queue - name of the celery queue Should only raise on a fundamental error: ArchiverError CkanError Returns a JSON dict, ready to be returned from the celery task giving a success status: { 'resource': the updated resource dict, 'file_path': path to archived file (if archive successful), or None } If not successful, returns None. """ from ckan import model from pylons import config from ckan.plugins import toolkit from ckanext.archiver import default_settings as settings from ckanext.archiver.model import Status, Archival get_action = toolkit.get_action assert is_id(resource_id), resource_id context_ = {'model': model, 'ignore_auth': True, 'session': model.Session} resource = get_action('resource_show')(context_, {'id': resource_id}) if not os.path.exists(settings.ARCHIVE_DIR): log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR) os.mkdir(settings.ARCHIVE_DIR) def _save(status_id, exception, resource, url_redirected_to=None, download_result=None, archive_result=None): reason = u'%s' % exception save_archival(resource, status_id, reason, url_redirected_to, download_result, archive_result, log) notify_resource( resource, queue, archive_result.get('cache_filename') if archive_result else None) # Download try_as_api = False requires_archive = True log.info("Attempting to download resource: %s" % resource['url']) download_result = None download_status_id = Status.by_text('Archived successfully') context = { 'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'], 'cache_url_root': config.get('ckanext.archiver.cache_url_root'), 'previous': Archival.get_for_resource(resource_id) } try: download_result = download(context, resource) except NotChanged, e: download_status_id = 
Status.by_text('Content has not changed') try_as_api = False requires_archive = False
def _update_resource(ckan_ini_filepath, resource_id, queue): """ Link check and archive the given resource. If successful, updates the archival table with the cache_url & hash etc. Finally, a notification of the archival is broadcast. Params: resource - resource dict queue - name of the celery queue Should only raise on a fundamental error: ArchiverError CkanError Returns a JSON dict, ready to be returned from the celery task giving a success status: { 'resource': the updated resource dict, 'file_path': path to archived file (if archive successful), or None } If not successful, returns None. """ log = update_resource.get_logger() load_config(ckan_ini_filepath) register_translator() from ckan import model from pylons import config from ckan.plugins import toolkit get_action = toolkit.get_action assert is_id(resource_id), resource_id context_ = {'model': model, 'ignore_auth': True, 'session': model.Session} resource = get_action('resource_show')(context_, {'id': resource_id}) if not os.path.exists(settings.ARCHIVE_DIR): log.info("Creating archive directory: %s" % settings.ARCHIVE_DIR) os.mkdir(settings.ARCHIVE_DIR) def _save(status_id, exception, resource, url_redirected_to=None, download_result=None, archive_result=None): reason = '%s' % exception save_archival(resource, status_id, reason, url_redirected_to, download_result, archive_result, log) notify_resource( resource, queue, archive_result.get('cache_filename') if archive_result else None) # Download try_as_api = False requires_archive = True log.info("Attempting to download resource: %s" % resource['url']) download_result = None from ckanext.archiver.model import Status, Archival download_status_id = Status.by_text('Archived successfully') context = { 'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'], 'cache_url_root': config.get('ckanext-archiver.cache_url_root'), 'previous': Archival.get_for_resource(resource_id) } try: download_result = download(context, resource) except 
NotChanged, e: download_status_id = Status.by_text('Content has not changed') try_as_api = False requires_archive = False
def migrate(options):
    """Migrate archival info from the legacy TaskStatus/Resource storage
    into the Archival table.

    Reads options.publisher / options.resource / options.dataset to select
    resources; only writes to the database when options.write is set.
    """
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)

    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            # The legacy task status stores its detail as JSON in .error.
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No task status: only migrate if the Resource row itself holds
            # any archive data; otherwise there is nothing to record.
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Derive created/resource_timestamp from revision history where a
        # hash was recorded; otherwise fall back to the min/max of the known
        # dates, using END/START_OF_TIME sentinels for missing values.
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table',
                         res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def resource_score(resource):
    """
    Score resource on Sir Tim Berners-Lee's five stars of openness.

    Returns a dict with keys:
        'openness_score': score (int)
        'openness_score_reason': the reason for the score (string)
        'format': format of the data (string)
        'archival_timestamp': time of the archival that this result is
            based on (iso string)

    Raises QAError for reasonable errors
    """
    score = 0
    score_reason = ''
    format_ = None

    try:
        register_translator()
    except ImportError:
        # if we can't import Pylons, we don't need to
        pass

    try:
        score_reasons = []  # a list of strings detailing how we scored it
        archival = Archival.get_for_resource(resource_id=resource.id)
        if not resource:
            raise QAError('Could not find resource "%s"' % resource.id)

        # Scoring strategies in descending priority; each returns
        # (None, None) to pass on to the next fall-back.
        score, format_ = score_if_link_broken(archival, resource, score_reasons)
        if score is None:
            # we don't want to take the publisher's word for it, in case the link
            # is only to a landing page, so highest priority is the sniffed type
            score, format_ = score_by_sniffing_data(archival, resource,
                                                    score_reasons)
            if score is None:
                # Fall-backs are user-given data
                score, format_ = score_by_url_extension(resource, score_reasons)
                if score is None:
                    score, format_ = score_by_format_field(resource, score_reasons)
                    if score is None:
                        log.warning('Could not score resource: "%s" with url: "%s"',
                                    resource.id, resource.url)
                        score_reasons.append(_('Could not understand the file format, therefore score is 1.'))
                        score = 1
                        if format_ is None:
                            # use any previously stored format value for this resource
                            format_ = get_qa_format(resource.id)
        score_reason = ' '.join(score_reasons)
        format_ = format_ or None
    except Exception as e:
        log.error('Unexpected error while calculating openness score %s: %s\nException: %s',
                  e.__class__.__name__, e, traceback.format_exc())
        score_reason = _("Unknown error: %s") % e
        raise

    # Even if we can get the link, we should still treat the resource
    # as having a score of 0 if the license isn't open.
    #
    # It is important we do this check after the link check, otherwise
    # the link checker won't get the chance to see if the resource
    # is broken.
    if toolkit.check_ckan_version(max_version='2.2.99'):
        # Pre-2.3 CKAN keeps the package on the resource_group.
        package = resource.resource_group.package
    else:
        package = resource.package
    if score > 0 and not package.isopen():
        score_reason = _('License not open')
        score = 0

    log.info('Score: %s Reason: %s', score, score_reason)

    archival_updated = archival.updated.isoformat() \
        if archival and archival.updated else None
    result = {
        'openness_score': score,
        'openness_score_reason': score_reason,
        'format': format_,
        'archival_timestamp': archival_updated
    }

    # Allow a site-specific hook to override/adjust the computed result.
    custom_result = custom_resource_score(resource, result)

    return custom_result or result
def _get_url(self, resource, query):
    '''
    Given a resource, return the URL for the data and a flag denoting whether
    the URL is to a local file (and therefore can ignore size limit checks.)

    This allows a local cache to be used in preference to the resource.url.

    If we are going to use an external URL, then we can do a HEAD request
    to check it works and record the mimetype & length in the query dict.

    :param resource: resource object
    :param query: dict describing the properties of the data
    '''
    from requests.exceptions import InvalidURL
    url = None
    archived = False
    query['mimetype'] = None
    archival = Archival.get_for_resource(resource.id)
    if archival:
        # Look for a local cache of the data file
        # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
        if archival.cache_filepath:
            # encode('utf8') — presumably to get a byte path for the py2
            # filesystem API; TODO confirm behaviour with non-ASCII paths
            if os.path.exists(archival.cache_filepath.encode('utf8')):
                log.debug('Previewing local cached data: %s',
                          archival.cache_filepath)
                url = archival.cache_filepath
                archived = True
            else:
                log.debug('Local cached data file missing: %s',
                          archival.cache_filepath)
        else:
            log.debug('No cache_filepath for resource %s',
                      identify_resource(resource))
        # Otherwise try the cache_url
        # This works well when running on a database copied from another
        # machine - all the cached files are missing locally, but it can use
        # them from the original machine using the cache_url.
        if not url:
            if archival.cache_url:
                try:
                    u = fix_url(archival.cache_url)
                except InvalidURL:
                    log.error("Unable to fix the URL for resource: %s"
                              % identify_resource(resource))
                    return None, False
                # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                try:
                    # Issue a HEAD request (not GET) just to validate the URL
                    # and capture content-length/content-type for the query.
                    req = urllib2.Request(u)
                    req.get_method = lambda: 'HEAD'
                    r = urllib2.urlopen(req)
                    if r.getcode() == 200:
                        url = u
                        query['length'] = r.info().get("content-length", 0)
                        query['mimetype'] = r.info().get(
                            'content-type', None)
                        log.debug('Previewing cache URL: %s', url)
                except Exception, e:
                    # Best-effort: a failed HEAD just means no cache URL used.
                    log.error(
                        u"Request {0} with cache url {1}, {2}".format(
                            identify_resource(resource), u, e))
            else:
                log.debug('No cache_url for resource %s',
                          identify_resource(resource))
    # NOTE(review): no explicit `return url, archived` is visible here even
    # though the docstring promises a (url, flag) pair and an earlier branch
    # returns (None, False) — confirm the function body is not truncated.
def is_res_broken(resource):
    """Return the broken-link flag from the resource's Archival record.

    Returns None when the resource has no archival record at all.
    """
    record = Archival.get_for_resource(resource.id)
    return record.is_broken if record else None
def _get_url(self, resource, query):
    '''
    Given a resource, return the URL for the data and a flag denoting whether
    the URL is to a local file (and therefore can ignore size limit checks.)

    This allows a local cache to be used in preference to the resource.url.

    If we are going to use an external URL, then we can do a HEAD request to
    check it works and record the mimetype & length in the query dict.

    :param resource: resource object
    :param query: dict describing the properties of the data
    '''
    from requests.exceptions import InvalidURL
    url = None
    archived = False
    query['mimetype'] = None
    archival = Archival.get_for_resource(resource.id)
    if archival:
        # Look for a local cache of the data file
        # e.g. "cache_filepath": "/mnt/shared/ckan_resource_cache/63/63b159d7-90c5-443b-846d-f700f74ea062/bian-anal-mca-2005-dols-eng-1011-0312-tab2.csv"
        if archival.cache_filepath:
            # encode('utf8') — presumably to get a byte path for the py2
            # filesystem API; TODO confirm behaviour with non-ASCII paths
            if os.path.exists(archival.cache_filepath.encode('utf8')):
                log.debug('Previewing local cached data: %s',
                          archival.cache_filepath)
                url = archival.cache_filepath
                archived = True
            else:
                log.debug('Local cached data file missing: %s',
                          archival.cache_filepath)
        else:
            log.debug('No cache_filepath for resource %s',
                      identify_resource(resource))
        # Otherwise try the cache_url
        # This works well when running on a database copied from another
        # machine - all the cached files are missing locally, but it can use
        # them from the original machine using the cache_url.
        if not url:
            if archival.cache_url:
                try:
                    u = fix_url(archival.cache_url)
                except InvalidURL:
                    log.error("Unable to fix the URL for resource: %s"
                              % identify_resource(resource))
                    return None, False
                # e.g. resource.cache_url = "http://data.gov.uk/data/resource_cache/07/0791d492-8ab9-4aae-b7e6-7ecae561faa3/bian-anal-mca-2005-dols-eng-1011-0312-qual.pdf"
                try:
                    # HEAD request only: validates the URL and records
                    # content-length/content-type without downloading data.
                    req = urllib2.Request(u)
                    req.get_method = lambda: 'HEAD'
                    r = urllib2.urlopen(req)
                    if r.getcode() == 200:
                        url = u
                        query['length'] = r.info().get("content-length", 0)
                        query['mimetype'] = r.info().get('content-type', None)
                        log.debug('Previewing cache URL: %s', url)
                except Exception, e:
                    # Best-effort: failure just means we don't use the cache URL.
                    log.error(u"Request {0} with cache url {1}, {2}".format(identify_resource(resource), u, e))
            else:
                log.debug('No cache_url for resource %s',
                          identify_resource(resource))
    # NOTE(review): no explicit `return url, archived` is visible here even
    # though the docstring promises a (url, flag) pair and an earlier branch
    # returns (None, False) — confirm the function body is not truncated.
def migrate(options):
    # One-off migration: copy legacy QA results stored in TaskStatus rows
    # into the dedicated QA table, resource by resource.
    # `options` provides: publisher, resource, dataset (filters) and
    # write (bool — actually persist changes).
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    # progressbar widgets for console feedback over the full resource set
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)

    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='qa')\
            .filter_by(key='status')\
            .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        # Legacy format: score in .value, details JSON-encoded in .error
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the
        # revision time, so some timezone nonsense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update but we only know when the latest archival was. So
        # if it was archived before the QA update then we know that was the
        # archival, otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        # Pick the resource revision that was current at the chosen time
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the Archival table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    # only mutate the row when --write was given
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)
    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def assert_archival_error(self, error_message_fragment, resource_id):
    """Assert that the resource's archival failure reason contains the fragment.

    Prints the full reason/status before raising, to aid test debugging.
    """
    record = Archival.get_for_resource(resource_id)
    if error_message_fragment in record.reason:
        return
    print('ERROR: %s (%s)' % (record.reason, record.status))
    raise AssertionError(
        "Expected error containing: {}, but was: {}".format(
            error_message_fragment, record.reason))
def migrate(options):
    # One-off migration: copy legacy archiver results held in TaskStatus and
    # on the Resource itself into the dedicated Archival table.
    # `options` provides: publisher, resource, dataset (filters) and
    # write (bool — actually persist changes).
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    # progressbar widgets for console feedback over the full resource set
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)

    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            # Legacy format: status text in .value, details JSON in .error
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row: only worth migrating if the Resource itself
            # carries some archive evidence (cache/hash/size/mimetype)
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Use revision history to estimate created/resource timestamps
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            # No hashed revisions: fall back to the earliest/latest of the
            # known archival dates (END/START_OF_TIME are sentinel bounds)
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    # only mutate the row when --write was given
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table',
                         res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)
    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
# The return value is only used by tests. Serialized for Celery. return json.dumps(dict(download_result_mock, **archive_result_mock)) # endif: processing locally uploaded resource log.info("Attempting to download resource: %s" % resource['url']) download_result = None download_status_id = Status.by_text('Archived successfully') context = { 'site_url': config.get('ckan.site_url_internally') or config['ckan.site_url'], 'cache_url_root': config.get('ckanext-archiver.cache_url_root'), 'previous': Archival.get_for_resource(resource_id) } try: download_result = download(context, resource) except NotChanged, e: download_status_id = Status.by_text('Content has not changed') try_as_api = False requires_archive = False except LinkInvalidError, e: download_status_id = Status.by_text('URL invalid') try_as_api = False except DownloadException, e: download_status_id = Status.by_text('Download error') try_as_api = True except DownloadError, e: download_status_id = Status.by_text('Download error')