def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME
    stats = StatsList()
    stats.report_value_limit = 1000
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        theme = True if test else False
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name: themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
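
# The functions in this module all read a handful of attributes off an
# `options` object supplied by a command-line wrapper. A minimal sketch of
# such an object, for driving e.g. categorize() from a shell or a test - the
# attribute names are taken from the reads in the code, but this helper
# itself is an illustration and not part of the original module:
def _example_options():
    from optparse import Values
    return Values(dict(dataset=None, publisher=None, uncategorized=False,
                       limit=None, write=False, theme=None, resource=None,
                       organization=None))
# e.g.  opts = _example_options(); opts.publisher = 'acas'; categorize(opts)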
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)
    stats = StatsList()
    stats.report_value_limit = 1000
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name: themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
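
# recategorize() only ever inspects themes[0], so categorize_package() is
# assumed to return theme names in confidence order, best first. A hedged
# illustration of the shapes this file relies on (not verified against
# ckanext.dgu.lib.theme):
#
#   categorize_package(pkg)  -> ['Health', 'Environment']   # best guess first
#   Themes.instance().data   -> a dict keyed by theme name, hence the
#                               membership tests and .keys() calls above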
def learn(options):
    '''Analyse datasets that are already categorised to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add('margin %.1f' % winning_margin,
                        '%s %s-%s' % (word, sorted_counts[0][1],
                                      sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
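
# learn() leans on NLTK FreqDist semantics (an assumption about what
# get_freq_dist() returns): fd.max() is the most frequent sample and
# fd[sample] its count, so fd[fd.max()] is the top count used to normalise
# each word's frequency into a 0..1 fraction per theme. Minimal sketch:
def _freq_fraction_example():
    '''Illustration only - mirrors the normalisation inside learn().'''
    from nltk import FreqDist
    fd = FreqDist(['health', 'health', 'school'])
    max_freq = fd[fd.max()]  # 2, since 'health' occurs twice
    # gives {'health': 1.0, 'school': 0.5}
    return dict((word, float(freq) / max_freq)
                for word, freq in fd.items())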
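
# wms_revisions() below calls an add_stat() helper defined elsewhere in this
# module. Reconstructed from its call sites, it behaves roughly like this
# hedged sketch (the identity formatting is an assumption):
#
#   def add_stat(outcome, res, stats, extra_info=None):
#       res_identity = '%s %s' % (res.id[:4], res.url)
#       if extra_info:
#           res_identity += ' %s' % extra_info
#       return '\n' + stats.add(outcome, res_identity)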
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions
    need removing.

    # Typical revision:
    id | timestamp | author | message | state | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike(
                '%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
                bad_res_revs[0].revision.author not in (
                    'co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue

        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print ' Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000
            # strings to append
            sql = ['''BEGIN;
ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                           % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision
                # if it is only connected to this one.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n"
                               % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n"
                       % (latest_good_res_rev.revision_id, res.id))
            sql.append('''
ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
COMMIT;''')
            print ' sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # process the revisions in chunks, to cope when there are very many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
                   Bar(), ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0)
                                or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print ' Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update(
            {'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print ' ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
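
# Design note on delete_bad_revisions() above: with the three revision_id
# foreign keys in place, bulk-deleting rows from `revision` would make
# postgres re-check every referencing table per deleted row. Dropping the
# constraints, deleting inside one transaction and re-adding them is the
# usual workaround. The chunking helper the deletions are fed through
# behaves like this (quick sanity check):
#
#   >>> list(chunks([1, 2, 3, 4, 5], 2))
#   [[1, 2], [3, 4], [5]]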
def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
    needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id == options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id == pkg.id)\
                         .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                     .join(model.Package,
                           Archival.package_id == model.Package.id)\
                     .filter(model.Package.state == 'active')\
                     .join(model.Resource,
                           Archival.resource_id == model.Resource.id)\
                     .filter(model.Resource.state == 'active')\
                     .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org == org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:
        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url))
                             .encode('latin7', 'ignore'))
        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and \
                not is_webarchive(archival.url_redirected_to):
            stats_add('Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
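
# The 410 Gone handling in fix_redirects() assumes gov.uk's "gone" pages
# carry a forwarding link to the National Archives webarchive inside
# <div id="detail">. A self-contained sketch of just that extraction step,
# handy for testing against a saved page (the helper name is illustrative,
# not part of the original module):
def _find_webarchive_link(html):
    '''Return the first webarchive link in div#detail, or None.'''
    import lxml.html
    root = lxml.html.fromstring(html)
    for href in root.xpath('//div[@id="detail"]//a'):
        url = href.attrib.get('href', '')
        if url.startswith('http://webarchive.nationalarchives.gov.uk/'):
            return url
    return None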