Example #1
def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        theme = test
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
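A minimal sketch of how categorize() might be driven, assuming an argparse-style options object; the attribute names mirror the ones the function reads above, while the publisher value is purely hypothetical (StatsList, model and get_packages come from the surrounding ckanext-dgu code):

from argparse import Namespace

options = Namespace(
    dataset=None,         # or a dataset name/id to process a single package
    publisher='acas',     # hypothetical publisher filter
    uncategorized=True,   # only consider packages without a theme yet
    limit=100,            # cap the number of packages fetched
    write=False,          # dry run: report only, write nothing
)
categorize(options)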
Example #2
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package2(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0]['name'] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]['name']:
            print stats.add('Theme unchanged %s' % themes[0]['name'],
                            pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0]['name'],
                        pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
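The --theme handling above (a comma-separated list validated against Themes.instance().data) can be exercised on its own; a toy sketch with a plain dict standing in for the Themes data and invented theme names:

themes_data = {'Health': [], 'Environment': [], 'Transport': []}

option_theme = 'Health,Transport'  # as parsed from a --theme option
theme_filter = set(option_theme.split(','))
for theme in theme_filter:
    assert theme in themes_data, '"%s" not in %r' % (theme, themes_data.keys())
print(sorted(theme_filter))  # ['Health', 'Transport']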
Example #3
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
Example #4
def learn(options):
    '''Analyse datasets that have already been categorised, to find out which
    words associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add(
            'margin %.1f' % winning_margin,
            '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
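The frequency-fraction step above normalises each word's count by the count of the theme's most frequent word. A toy illustration with collections.Counter standing in for the freq_dist object (whose .max() returns the most frequent sample); the words and counts are invented:

from collections import Counter

freq_dist = Counter({'health': 10, 'hospital': 5, 'trust': 1})
max_freq = freq_dist.most_common(1)[0][1]  # count of the most frequent word
for word, freq in freq_dist.most_common():
    print('%s: %.2f' % (word, float(freq) / max_freq))
# health: 1.00, hospital: 0.50, trust: 0.10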
Example #5
def learn(options):
    '''Analyse datasets that have already been categorised, to find out which
    words associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq)/max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add(
            'margin %.1f' % winning_margin,
            '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
Example #6
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad revisions are those whose URL was changed to the "?service=..."
    capability parameters; these revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(
            res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike(
                '%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the SQL as a list of strings: appending to a list is
            # faster than concatenating 1000-odd strings one by one
            sql = [
                '''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            '''
            ]
            for res_rev in res_revs:
                sql.append(
                    "DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                    % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision
                # if it is connected to this resource alone.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" %
                               res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # process the revisions in chunks, to cope when there are very many
        widgets = [
            'Creating SQL: ',
            Counter(),
            'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
            Bar(), ' ',
            ETA()
        ]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0)
                                or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update(
            {'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
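The SQL ilike pattern above does the actual identification of bad revisions. As a rough sketch, here is the same wildcard translated to a case-insensitive regex (SQL '%' becomes '.*', and the '?' is a literal character), run against the URLs from the docstring:

import re

bad = re.compile(r'.*\?service=W.*S&request=GetCapabilities&version=.*', re.I)
urls = [
    'http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0',
    'http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3',
    'http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0',
]
for url in urls:
    print('%s bad=%s' % (url, bool(bad.match(url))))
# only the GetCapabilities URLs (WMS and WFS) are flagged as bad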
Example #7
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad revisions are those whose URL was changed to the "?service=..."
    capability parameters; these revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
            resource_id=options.resource, dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(model.ResourceRevision.url.ilike('%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats, 'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' ' # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats, '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'
        def delete_bad_revisions(res_revs):
            # Build the SQL as a list of strings: appending to a list is
            # faster than concatenating 1000-odd strings one by one
            sql = ['''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            ''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n" % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to
                # other resources or a dataset, so only delete the revision
                # if it is connected to this resource alone.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()
        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i+n]
        # process the revisions in chunks, to cope when there are very many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs))/1000.0), Bar(),
                   ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets, maxval=int(float(len(bad_res_revs))/1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i+1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i+1].revision_timestamp
                res_rev.expired_id = res_revs[i+1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update({'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'


    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
Example #8
def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id==options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org==org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')
    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:
        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore'))
        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(archival.url_redirected_to):
            stats_add('Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
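The forward-link extraction for 410 Gone pages above boils down to an xpath over div#detail plus the webarchive prefix test. A self-contained sketch of that step, with an invented HTML snippet standing in for the fetched gov.uk page:

import lxml.html

html = '''<html><body><div id="detail">
<a href="https://www.gov.uk/browse">gov.uk</a>
<a href="http://webarchive.nationalarchives.gov.uk/20140401000000/http://example.gov.uk/data">archived copy</a>
</div></body></html>'''

root = lxml.html.fromstring(html)
for href in root.xpath('//div[@id="detail"]//a'):
    url = href.attrib['href']
    if url.startswith('http://webarchive.nationalarchives.gov.uk/'):
        break
else:
    url = None  # no forward link found
print(url)  # the webarchive URL the resource would be repointed to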
Example #9
def fix_redirects(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id == options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org == org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:

        def stats_add(msg):
            pkg = res.resource_group.package
            return stats.add(msg,
                             ('%s/%s %s' % (pkg.name, res.id, res.url)).encode(
                                 'latin7', 'ignore'))

        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(
                archival.url_redirected_to):
            stats_add(
                'Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'