def save_archival(resource, status_id, reason, url_redirected_to, download_result, archive_result, log):
    '''Writes to the archival table the result of an attempt to download
    the resource.

    May propagate a CkanError.
    '''
    now = datetime.datetime.now()

    from ckanext.archiver.model import Archival, Status
    from ckan import model

    archival = Archival.get_for_resource(resource['id'])
    is_first_archival = not archival
    was_broken_before = None
    if archival:
        log.info('Archival from before: %r', archival)
        was_broken_before = archival.is_broken
    else:
        archival = Archival.create(resource['id'])
        model.Session.add(archival)

    # Pin the archival to the resource revision it was taken against
    revision = model.Session.query(model.Revision).get(resource['revision_id'])
    archival.resource_timestamp = revision.timestamp

    # Record the latest attempt, successful or not
    archival.status_id = status_id
    archival.is_broken = Status.is_status_broken(status_id)
    archival.reason = reason
    archival.url_redirected_to = url_redirected_to

    # Cache/download details only apply when the download succeeded
    if archival.is_broken is False:
        archival.cache_filepath = archive_result['cache_filepath']
        archival.cache_url = archive_result['cache_url']
        archival.size = download_result['size']
        archival.mimetype = download_result['mimetype']
        archival.hash = download_result['hash']
        archival.etag = download_result['headers'].get('etag')
        archival.last_modified = download_result['headers'].get('last-modified')

    # Maintain the success/failure history counters
    if archival.is_broken is False:
        archival.last_success = now
        archival.first_failure = None
        archival.failure_count = 0
    else:
        log.info('First_archival=%r Previous_broken=%r Failure_count=%r' %
                 (is_first_archival, was_broken_before, archival.failure_count))
        if is_first_archival or was_broken_before is False:
            # i.e. this is the first failure (or the first archival)
            archival.first_failure = now
            archival.failure_count = 1
        else:
            archival.failure_count += 1

    archival.updated = now
    log.info('Archival saved: %r', archival)
    model.repo.commit_and_remove()
def migrate(options): from ckan import model from ckanext.archiver.model import Archival, Status resources = common.get_resources(state='active', publisher_ref=options.publisher, resource_id=options.resource, dataset_name=options.dataset) stats = StatsList() widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()] progress = ProgressBar(widgets=widgets) for res in progress(resources): # Gather the details of archivals from TaskStatus and Resource # to fill all properties of Archival apart from: # * package_id # * resource_id fields = {} archiver_task_status = model.Session.query(model.TaskStatus)\ .filter_by(entity_id=res.id)\ .filter_by(task_type='archiver')\ .filter_by(key='status')\ .first() if archiver_task_status: ats_error = json.loads(archiver_task_status.error) fields['status_id'] = Status.by_text(archiver_task_status.value) fields['is_broken'] = Status.is_status_broken(fields['status_id']) fields['reason'] = ats_error['reason'] fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success']) fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure']) fields['failure_count'] = int(ats_error['failure_count']) fields['url_redirected_to'] = ats_error['url_redirected_to'] fields['updated'] = archiver_task_status.last_updated else: if not (res.cache_url or res.extras.get('cache_filepath') or res.hash or res.size or res.mimetype): add_stat('No archive data', res, stats) continue for field_name in ('status_id', 'is_broken', 'reason', 'last_success', 'first_failure', 'failure_count', 'url_redirected_to', 'updated', 'created'): fields[field_name] = None fields['cache_filepath'] = res.extras.get('cache_filepath') fields['cache_url'] = res.cache_url fields['hash'] = res.hash fields['size'] = res.size fields['mimetype'] = res.mimetype revisions_with_hash = model.Session.query(model.ResourceRevision)\ .filter_by(id=res.id)\ .order_by(model.ResourceRevision.revision_timestamp)\ .filter(model.ResourceRevision.hash != '').all() 
if revisions_with_hash: # these are not perfect by not far off fields['created'] = revisions_with_hash[0].revision_timestamp fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp else: fields['created'] = min(fields['updated'] or END_OF_TIME, fields['first_failure'] or END_OF_TIME, fields['last_success'] or END_OF_TIME) fields['resource_timestamp'] = max( fields['updated'] or START_OF_TIME, fields['first_failure'] or START_OF_TIME, fields['last_success'] or START_OF_TIME) # Compare with any existing data in the Archival table archival = Archival.get_for_resource(res.id) if archival: changed = None for field, value in fields.items(): if getattr(archival, field) != value: if options.write: setattr(archival, field, value) changed = True if not changed: add_stat('Already exists correctly in archival table', res, stats) continue add_stat('Updated in archival table', res, stats) else: archival = Archival.create(res.id) if options.write: for field, value in fields.items(): setattr(archival, field, value) model.Session.add(archival) add_stat('Added to archival table', res, stats) print 'Summary\n', stats.report() if options.write: model.repo.commit_and_remove() print 'Written'
def send_broken_link_notification_email(self):
    '''Email each dataset maintainer whose active resources have had
    broken links for more than 5 days.

    Does nothing unless the
    ``ckanext-archiver.send_notification_emails_to_maintainers`` config
    option is enabled; maintainers whose email domain appears in
    ``ckanext-archiver.exempt_domains_from_broken_link_notifications``
    are skipped.
    '''
    send_notification_emails_to_maintainers = asbool(config.get(
        'ckanext-archiver.send_notification_emails_to_maintainers',
        False))
    if not send_notification_emails_to_maintainers:
        self.log.info(
            "Notification to maintainers are disabled, no notifications sent."
        )
        return

    from ckan import model
    from ckanext.archiver.model import Archival, Status

    # Only links that have been broken for more than 5 days qualify
    cutoff = datetime.now() - timedelta(days=5)
    broken_query = (
        model.Session.query(Archival, model.Package, model.Resource)
        .filter(Archival.is_broken == True)  # noqa
        .filter(Archival.first_failure < cutoff)
        .join(model.Package, Archival.package_id == model.Package.id)
        .filter(model.Package.state == 'active')
        .join(model.Resource, Archival.resource_id == model.Resource.id)
        .filter(model.Resource.state == 'active'))

    # Group resources by maintainer so each maintainer gets a single
    # message listing all of their broken resources.
    grouped_by_maintainer = {}
    for archival, package, resource in broken_query.all():
        if not Status.is_status_broken(archival.status_id):
            continue
        maintainer = package.maintainer
        if maintainer not in grouped_by_maintainer:
            grouped_by_maintainer[maintainer] = {
                "email": package.maintainer_email,
                "broken": []
            }
        grouped_by_maintainer[maintainer]['broken'].append({
            "package_id": archival.package_id,
            "package_title": package.title,
            "resource_id": archival.resource_id,
            "status_id": archival.status_id,
            "first_failure": archival.first_failure,
            "failure_count": archival.failure_count,
            "broken_url": resource.url,
        })

    exempt_email_domains = config.get(
        'ckanext-archiver.exempt_domains_from_broken_link_notifications',
        [])

    # Compose and send one email per maintainer
    for maintainer_name, maintainer_details in \
            grouped_by_maintainer.iteritems():
        if not maintainer_details.get('email'):
            # Maintainers without an email address are skipped silently
            continue
        maintainer_domain = maintainer_details['email'].split('@')[1]
        if maintainer_domain in exempt_email_domains:
            self.log.info(
                'Maintainer in exempt domains, not sending email..')
            continue
        self.log.info('Sending broken link notification to %s'
                      % maintainer_details["email"])
        subject = email_template.subject.format(
            amount=len(maintainer_details["broken"]))
        body = email_template.message(maintainer_details["broken"])
        try:
            mail_recipient(maintainer_name, maintainer_details["email"],
                           subject, body)
        except MailerException as e:
            # Best-effort: log the failure and carry on with the rest
            self.log.warn(
                'Error sending broken link notification to "%s": %s'
                % (maintainer_details["email"], e))
    self.log.info('All broken link notifications sent')
def migrate(options): from ckan import model from ckanext.archiver.model import Archival, Status resources = common.get_resources(state='active', publisher_ref=options.publisher, resource_id=options.resource, dataset_name=options.dataset) stats = StatsList() widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()] progress = ProgressBar(widgets=widgets) for res in progress(resources): # Gather the details of archivals from TaskStatus and Resource # to fill all properties of Archival apart from: # * package_id # * resource_id fields = {} archiver_task_status = model.Session.query(model.TaskStatus)\ .filter_by(entity_id=res.id)\ .filter_by(task_type='archiver')\ .filter_by(key='status')\ .first() if archiver_task_status: ats_error = json.loads(archiver_task_status.error) fields['status_id'] = Status.by_text(archiver_task_status.value) fields['is_broken'] = Status.is_status_broken(fields['status_id']) fields['reason'] = ats_error['reason'] fields['last_success'] = date_str_to_datetime_or_none( ats_error['last_success']) fields['first_failure'] = date_str_to_datetime_or_none( ats_error['first_failure']) fields['failure_count'] = int(ats_error['failure_count']) fields['url_redirected_to'] = ats_error['url_redirected_to'] fields['updated'] = archiver_task_status.last_updated else: if not (res.cache_url or res.extras.get('cache_filepath') or res.hash or res.size or res.mimetype): add_stat('No archive data', res, stats) continue for field_name in ('status_id', 'is_broken', 'reason', 'last_success', 'first_failure', 'failure_count', 'url_redirected_to', 'updated', 'created'): fields[field_name] = None fields['cache_filepath'] = res.extras.get('cache_filepath') fields['cache_url'] = res.cache_url fields['hash'] = res.hash fields['size'] = res.size fields['mimetype'] = res.mimetype revisions_with_hash = model.Session.query(model.ResourceRevision)\ .filter_by(id=res.id)\ .order_by(model.ResourceRevision.revision_timestamp)\ .filter(model.ResourceRevision.hash != 
'').all() if revisions_with_hash: # these are not perfect by not far off fields['created'] = revisions_with_hash[0].revision_timestamp fields['resource_timestamp'] = revisions_with_hash[ -1].revision_timestamp else: fields['created'] = min(fields['updated'] or END_OF_TIME, fields['first_failure'] or END_OF_TIME, fields['last_success'] or END_OF_TIME) fields['resource_timestamp'] = max( fields['updated'] or START_OF_TIME, fields['first_failure'] or START_OF_TIME, fields['last_success'] or START_OF_TIME) # Compare with any existing data in the Archival table archival = Archival.get_for_resource(res.id) if archival: changed = None for field, value in fields.items(): if getattr(archival, field) != value: if options.write: setattr(archival, field, value) changed = True if not changed: add_stat('Already exists correctly in archival table', res, stats) continue add_stat('Updated in archival table', res, stats) else: archival = Archival.create(res.id) if options.write: for field, value in fields.items(): setattr(archival, field, value) model.Session.add(archival) add_stat('Added to archival table', res, stats) print 'Summary\n', stats.report() if options.write: model.repo.commit_and_remove() print 'Written'