Example #1
0
def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        if test:
            theme = True
        else:
            theme = False
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
Example #2
0
    def merge_duplicates(self):
        '''Find sets of duplicate ONSHUB packages (same title and first
        group) and merge each set into one package via self.do_merge.

        Prints a stats report at the end. Honours self.dry_run for the final
        message; do_merge presumably checks it too - confirm.
        '''
        merge_stats = StatsList()

        onshub_packages_search_options = {
            'external_reference': 'ONSHUB',
            'state': 'active'
        }
        res = self.loader._package_search(onshub_packages_search_options)
        log.info('ONSHUB records: %i', res['count'])
        # Names merged away as duplicates of an earlier package, so they are
        # skipped when their own turn comes round in the outer loop.
        pkgs_already_merged = set()
        for pkg_ref in res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in pkgs_already_merged:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue
            # Defensive re-check: the search index may be stale.
            if not self.loader._pkg_matches_search_options(
                    pkg, onshub_packages_search_options):
                log.error(
                    merge_stats.add('Did not match ONSHUB search after all',
                                    pkg['name']))
                continue
            # look for duplicates: same title and same first group
            dupe_search_options = {
                'title': pkg['title'],
                'groups': pkg['groups'][0] if pkg['groups'] else '',
                'external_reference': 'ONSHUB',
                'state': 'active'
            }
            # NB: rebinding `res` here is safe - the outer for loop already
            # holds an iterator over the original results list.
            res = self.loader._package_search(dupe_search_options)
            if not res['count']:
                # The duplicate search should at least find pkg itself.
                log.error(merge_stats.add('Could not find itself',
                                          pkg['name']))
                continue
            dupe_pkgs = []
            for dupe_pkg_ref in res['results']:
                dupe_pkg = self.loader._get_package(dupe_pkg_ref)
                if dupe_pkg['name'] == pkg['name']:
                    continue
                if not self.loader._pkg_matches_search_options(
                        dupe_pkg, dupe_search_options):
                    log.warn('Did not match duplicate search after all %s %s',
                             pkg['name'], dupe_pkg['name'])
                    continue
                dupe_pkgs.append(dupe_pkg)
            if dupe_pkgs:
                log.info('Found duplicates for %s: %r', pkg['name'],
                         [pkg_['name'] for pkg_ in dupe_pkgs])
                # Fix duplicates
                merge_stats.add(
                    '%i duplicates found and merged' % len(dupe_pkgs),
                    pkg['name'])
                for dupe_pkg in dupe_pkgs:
                    pkgs_already_merged.add(dupe_pkg['name'])
                self.do_merge(pkg, dupe_pkgs)
            else:
                log.info(merge_stats.add('No duplicates', pkg['name']))

        print merge_stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #3
0
def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        if test:
            theme = True
        else:
            theme = False
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes

    print 'Categorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
    def set_initial_value(self):
        '''Populate the 'last_major_modification' extra on every package,
        committing one revision per package that needs updating.
        '''
        log = global_log
        stats = StatsList()

        from ckan import model
        import ckan.plugins as p
        from ckan.logic import ActionError
        from ckanext.dgu.lib.helpers import upsert_extra

        # NOTE(review): ActionError and upsert_extra appear unused in this
        # method - confirm before removing.
        site_user = p.toolkit.get_action('get_site_user')({'model': model,'ignore_auth': True}, {})
        c = {'model': model, 'user': site_user['name']}
        packages = p.toolkit.get_action('package_list')(c, data_dict={})

        log.info('Processing %d packages', len(packages))

        for pkg_name in packages:
            pkg = model.Package.by_name(pkg_name)

            # ISO-format timestamp of the last major change to the package.
            last_mod = self.determine_last_major_modification(pkg).isoformat()
            log.info('%s: %s %s', pkg_name, pkg.extras.get('last_major_modification'), last_mod)
            if pkg.extras.get('last_major_modification') != last_mod:
                log.info(stats.add('Adding modification date', pkg.name))
                # One revision + commit per changed package.
                model.repo.new_revision()
                pkg.extras['last_major_modification'] = last_mod
                model.repo.commit_and_remove()
            else:
                log.info(stats.add('No change needed', pkg.name))
        print stats.report()
Example #5
0
def undelete(options):
    resources = _get_resources('deleted', options)
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'current_revision_fixer2'
    need_to_commit = False
    for res in resources:
        # when viewing old revision of the dataset, there is one where the
        # resources are not deleted but they don't show up. This is seen where resource_revision has an expired_timestamp that has no corresponding revision_timestamp - i.e. a gap between them (and that is not 9999-12-31).
        # e.g. select revision_timestamp,expired_timestamp,current from resource_revision where id='373bb814-7a49-4f53-8a0e-762002b2529c' order by revision_timestamp;
        #      revision_timestamp     |     expired_timestamp      | current
        # ----------------------------+----------------------------+---------
        # 2013-06-19 00:50:28.880058 | 2014-01-18 01:03:47.500041 | f
        # 2014-01-18 01:03:47.500041 | 2014-01-18 01:03:48.296204 | f
        # 2014-01-18 01:03:50.612196 | 9999-12-31 00:00:00        | t
        # Clearly there is a gap from the 2nd to the 3rd, indicating the problem.
        res_revs = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by('revision_timestamp').all()
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        if res_revs[-2].expired_timestamp == res_revs[-1].revision_timestamp:
            add_stat('Ok', res, stats)
            continue
        print add_stat('Timestamp gap', res, stats)
        if options.write:
            res.state = 'active'
            need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    def fetch(cls, site_url_filter, since_datetime):
        '''Fetch ODI certificate entries from the Atom feed and store each
        certificate's badge data in the matching package's 'odi-certificate'
        extra (as JSON), committing each change.

        site_url_filter -- object with a .search(str) method (e.g. a
                           compiled regex) selecting entries for this site.
        since_datetime  -- passed to the feed client to limit the entries.
        '''
        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not site_url_filter.search(about):
                log.debug(stats.add('Ignore - "about" field does not reference this site',
                                    '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                    '%s "%s" %s' % (about, entry['about'], entry['id'])))
                continue

            pkg = cls._get_package_from_url(entry.get('about'))
            if not pkg:
                log.error(stats.add('Unable to find the package',
                                    '%s "%s" %s %r' % (about, entry['about'], entry['id'], entry.get('about'))))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                log.info(stats.add('Error fetching badge data - skipped',
                                   '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            badge_data['cert_title'] = entry.get('content', '')  # e.g. 'Basic Level Certificate'

            # Compare serialized form against the stored extra to detect
            # changes.
            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                log.debug(stats.add('Certificate unchanged',
                                         badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras \
                    else 'added'
                model.repo.new_revision()
                # NOTE(review): re-serializes badge_data; could reuse
                # badge_json - confirm nothing mutates it in between.
                pkg.extras['odi-certificate'] = json.dumps(badge_data)
                log.debug(stats.add('Certificate %s' % operation,
                               '"%s" %s' % (badge_data['title'],
                                            badge_data['certificate_url'])))
                model.Session.commit()

        log.info('Summary:\n' + stats.report())
def reconcile_aliases_that_match_entities_exactly():
    '''When adding entities using this tool, they might also currently be in
    the recon queue. In cases there the alias name matches exactly the entity
    name, link them up.

    (Ideally we'd just delete the alias from the recon queue, but there is no
    delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        try:
            entity_or_alias = nk_dataset.lookup_detailed(alias.name,
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        if entity_or_alias and isinstance(entity_or_alias,
                                          nomenklatura.Entity):
            try:
                nk_dataset.match(alias_id=alias.id,
                                 entity_id=entity_or_alias.id)
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add(
                    'Server error linking the alias to an entity: %s' % e,
                    alias.name)
                continue
            print stats.add('Matched alias to an entity of the same name',
                            alias.name)
        else:
            print stats.add('No matching entity', alias.name)
def command(dry_run=False):
    '''Tidy the format field of every Resource, mapping known variants
    through the module-level res_type_map / canonise / tidy helpers and
    the resource_formats matcher. Prints and logs a stats report.

    dry_run -- when True, no revision is created and nothing is written.
    '''
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map
    for format_ in res_type_map.keys():
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))

    for res in res_query:
        # Prefer the canonical map; fall back to heuristic tidy().
        canonised_fmt = canonise(res.format or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(res.format)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (res.format or ''):
            if not dry_run:
                res.format = improved_fmt
            # NOTE(review): when not dry_run, res.format was just
            # reassigned, so the stat records the new value twice - confirm
            # whether the old value was intended here.
            stats.add(improved_fmt, res.format)
        else:
            stats.add('No change', res.format)

    if not dry_run:
        model.repo.commit_and_remove()

    log.info('Stats report: %r', stats.report())
    print stats.report()

    # `warnings` is presumably a module-level list populated by tidy() -
    # confirm.
    log.info('Warnings (%i): %r', len(warnings), warnings)
    def merge_duplicates(self):
        merge_stats = StatsList()

        onshub_packages_search_options = {'external_reference': 'ONSHUB',
                                          'state': 'active'}
        res = self.loader._package_search(onshub_packages_search_options)
        log.info('ONSHUB records: %i', res['count'])
        pkgs_already_merged = set()
        for pkg_ref in res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in pkgs_already_merged:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue                
            if not self.loader._pkg_matches_search_options(pkg, onshub_packages_search_options):
                log.error(merge_stats.add('Did not match ONSHUB search after all', pkg['name']))
                continue
            # look for duplicates
            dupe_search_options = {'title': pkg['title'],
                                   'groups': pkg['groups'][0] if pkg['groups'] else '',
                                   'external_reference': 'ONSHUB',
                                   'state': 'active'}
            res = self.loader._package_search(dupe_search_options)
            if not res['count']:
                log.error(merge_stats.add('Could not find itself', pkg['name']))
                continue
            dupe_pkgs = []
            for dupe_pkg_ref in res['results']:
                dupe_pkg = self.loader._get_package(dupe_pkg_ref)
                if dupe_pkg['name'] == pkg['name']:
                    continue
                if not self.loader._pkg_matches_search_options(dupe_pkg, dupe_search_options):
                    log.warn('Did not match duplicate search after all %s %s', pkg['name'], dupe_pkg['name'])
                    continue
                dupe_pkgs.append(dupe_pkg)
            if dupe_pkgs:
                log.info('Found duplicates for %s: %r',
                         pkg['name'],
                         [pkg_['name'] for pkg_ in dupe_pkgs])
                # Fix duplicates
                merge_stats.add('%i duplicates found and merged' % len(dupe_pkgs), pkg['name'])
                for dupe_pkg in dupe_pkgs:
                    pkgs_already_merged.add(dupe_pkg['name'])
                self.do_merge(pkg, dupe_pkgs)
            else:
                log.info(merge_stats.add('No duplicates', pkg['name']))
                

        print merge_stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
def reconcile_aliases_that_match_entities_exactly():
    '''When adding entities using this tool, they might also currently be in
    the recon queue. In cases there the alias name matches exactly the entity
    name, link them up.

    (Ideally we'd just delete the alias from the recon queue, but there is no
    delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        # Exact-name lookup; the result may be an Entity or an alias, or the
        # lookup may fail outright.
        try:
            entity_or_alias = nk_dataset.lookup_detailed(alias.name, readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        # Only link when the lookup returned a genuine Entity (not another
        # alias).
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            try:
                nk_dataset.match(alias_id=alias.id, entity_id=entity_or_alias.id)
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error linking the alias to an entity: %s' % e, alias.name)
                continue
            print stats.add('Matched alias to an entity of the same name', alias.name)
        else:
            print stats.add('No matching entity', alias.name)
Example #11
0
def refix(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        # the old uncommit command would set the wrong resource_revision to be current.
        # e.g. select revision_timestamp,expired_timestamp,current from resource_revision where id='b2972b35-b6ae-4096-b8cc-40dab3927a71' order by revision_timestamp;
        #     revision_timestamp     |     expired_timestamp      | current
        # ---------------------------+----------------------------+---------i
        # 2013-04-13 01:47:30.18897  | 2013-06-18 19:01:45.910899 | f
        # 2013-06-18 19:01:45.910899 | 2014-01-18 08:55:41.443349 | t
        # 2014-01-18 08:55:41.443349 | 2014-01-18 08:55:41.566383 | f
        # Clearly only the latest should be current.
        res_revs = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by('revision_timestamp').all()
        fix_needed = False
        if len(res_revs) < 2:
            print add_stat('Not enought revisions', res, stats)
            continue
        for res_rev in res_revs[:-1]:
            if res_rev.current:
                print add_stat('Early revision is current', res, stats)
                fix_needed = True
                if options.write:
                    res_rev.current = False
                    need_to_commit = True
        if not res_revs[-1].current:
            print add_stat('Last revision is not current', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].current = True
                need_to_commit = True
        if res_revs[-1].expired_timestamp != END_OF_TIME:
            print add_stat('Last revision is not 9999', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].expired_timestamp = END_OF_TIME
                need_to_commit = True
        if not fix_needed:
            add_stat('Ok', res, stats)
            continue
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
def command(dry_run=False):
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry=Registry()
    registry.prepare()
    translator_obj=MockTranslator() 
    registry.register(translator, translator_obj) 

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map
    for format_ in res_type_map.keys():
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')

    stats = StatsList()

    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))

    for res in res_query:
        canonised_fmt = canonise(res.format or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(res.format)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (res.format or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, res.format)
        else:
            stats.add('No change', res.format)

    if not dry_run:
        model.repo.commit_and_remove()

    log.info('Stats report: %r', stats.report())
    print stats.report()

    log.info('Warnings (%i): %r', len(warnings), warnings)
    def set_initial_value(self):
        '''Populate the 'last_major_modification' extra on every package,
        committing one revision per package that needs updating.
        '''
        log = global_log
        stats = StatsList()

        from ckan import model
        import ckan.plugins as p
        from ckan.logic import ActionError
        from ckanext.dgu.lib.helpers import upsert_extra

        # NOTE(review): ActionError and upsert_extra appear unused in this
        # method - confirm before removing.
        site_user = p.toolkit.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})
        c = {'model': model, 'user': site_user['name']}
        packages = p.toolkit.get_action('package_list')(c, data_dict={})

        log.info('Processing %d packages', len(packages))

        for pkg_name in packages:
            pkg = model.Package.by_name(pkg_name)

            # ISO-format timestamp of the last major change to the package.
            last_mod = self.determine_last_major_modification(pkg).isoformat()
            log.info('%s: %s %s', pkg_name,
                     pkg.extras.get('last_major_modification'), last_mod)
            if pkg.extras.get('last_major_modification') != last_mod:
                log.info(stats.add('Adding modification date', pkg.name))
                # One revision + commit per changed package.
                model.repo.new_revision()
                pkg.extras['last_major_modification'] = last_mod
                model.repo.commit_and_remove()
            else:
                log.info(stats.add('No change needed', pkg.name))
        print stats.report()
    def add_missing_publisher(self):
        '''Find active ONSHUB datasets that have no publisher group, derive
        the publisher from the "Source agency:" line in the notes, and write
        it back over the API (unless self.dry_run).
        '''
        stats = StatsList()

        # Solr query: ONSHUB datasets with no group at all.
        res = self.client.action('package_search', q='external_reference:ONSHUB !groups:["" TO *]', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                # Search result was stale - a publisher exists after all.
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.groups()[0]
            publisher_name = OnsImporter._source_to_publisher_(source_agency, self.client)
            if not publisher_name:
                log.error(stats.add('Could not map source agency %s' % source_agency, pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #15
0
def learn(options):
    '''Analyse datasets that are already categorised to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    # word -> [(freq_fraction, theme, freq), ...] across all themes.
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        # NOTE(review): stops after 29 themes - presumably a development
        # cap; confirm whether it should be removed or parameterized.
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        # Normalize each word's frequency against the theme's most frequent
        # word.
        max_freq = freq_dist[freq_dist.max()]
        # NOTE(review): with a threshold of 0.0 the break below never fires
        # (fractions are non-negative) - the cut-off is effectively disabled.
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            # Word appears in only one theme - a strong signal.
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        # Otherwise report how decisively the top theme wins for this word.
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add(
            'margin %.1f' % winning_margin,
            '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
Example #16
0
    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.
        '''
        stats = StatsList()

        # Solr query: datasets mentioning "Source agency" but lacking the
        # external_reference:ONSHUB extra.
        res = self.client.action(
            'package_search',
            q='!external_reference:ONSHUB \"Source agency\"',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            match = source_agency_re.search(pkg['notes'])
            if not match:
                # Search result was a false positive.
                log.error(
                    stats.add(
                        'Could not find "Source agency: " line after all',
                        pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #17
0
def no_current_resources(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        latest_res_rev = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp.desc()).first()
        if not latest_res_rev.current:
            print add_stat('No current revision', res, stats)
            if options.write:
                latest_res_rev.current = True
                need_to_commit = True
        else:
            add_stat('Ok', res, stats)
        if latest_res_rev.revision_id != res.revision_id:
            print add_stat('Revision ID of resource too old', res, stats)
            if options.write:
                res.revision_id = latest_res_rev.revision_id
                need_to_commit = True

    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
Example #18
0
    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". These cannot be added
        to by the ons_loader in the future because of this title change so
        remove the prefix.
        e.g. scientific_procedures_on_living_animals_great_britain
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '

        # Solr query for ONSHUB datasets whose title contains the prefix.
        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB \"%s\"' % prefix,
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if not pkg['title'].startswith(prefix):
                # Search matched the phrase elsewhere - not really prefixed.
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = pkg['title'][len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Remove prefix', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Example #19
0
def no_current_packages(options):
    pkgs = _get_packages('active', options)
    stats = StatsList()
    need_to_commit = False
    for pkg in pkgs:
        latest_pkg_rev = \
            model.Session.query(model.PackageRevision) \
            .filter_by(id=pkg.id) \
            .order_by(model.PackageRevision.revision_timestamp.desc()) \
            .first()
        # sometimes a revision_timestamp is null for some reason
        if latest_pkg_rev.revision_timestamp is None:
            # in which case, join them to the revision table and order by those
            # timestamps instead
            latest_pkg_rev = \
                model.Session.query(model.PackageRevision) \
                .filter_by(id=pkg.id) \
                .join(model.Revision) \
                .order_by(model.Revision.timestamp.desc()) \
                .first()

        if not latest_pkg_rev.current:
            print stats.add('No current revision', pkg.name)
            if options.write:
                latest_pkg_rev.current = True
                need_to_commit = True
        else:
            stats.add('Ok', pkg.name)
        if latest_pkg_rev.revision_id != pkg.revision_id:
            print stats.add('Revision ID of package too old', pkg.name)
            if options.write:
                pkg.revision_id = latest_pkg_rev.revision_id
                need_to_commit = True

    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    print
Exemple #20
0
def learn(options):
    '''Analyse datasets that are already categorise to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq)/max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add('margin %.1f' % winning_margin, '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". These cannot be added
        to by the ons_loader in the future because of this title change so
        remove the prefix.
        e.g. scientific_procedures_on_living_animals_great_britain
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '
        
        res = self.client.action('package_search', q='external_reference:ONSHUB \"%s\"' % prefix, sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False)
        
        log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if not pkg['title'].startswith(prefix):
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = pkg['title'][len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Remove prefix', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.
        '''
        stats = StatsList()

        res = self.client.action('package_search', q='!external_reference:ONSHUB \"Source agency\"', sort='name asc', fq=' +site_id:"dgu" +state:active', wt='json', rows=100, escape_q=False)
        
        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(stats.add('Could not find "Source agency: " line after all', pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing to publisher over API %s' % self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
def migrate(options):
    '''Copy archival information for each active resource from the old
    storage (archiver TaskStatus rows plus columns on Resource) into the
    Archival table. Nothing is committed unless options.write is set.
    '''
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            # TaskStatus.error holds a JSON blob with the archival details
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No archiver TaskStatus. If the Resource itself shows no trace
            # of ever having been archived, there is nothing to migrate.
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Estimate created/resource_timestamp from the revision history
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            # Fall back to the earliest/latest of the known archival dates;
            # the `or` defaults let min/max ignore missing (None) values
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME,
                fields['first_failure'] or START_OF_TIME,
                fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            # changed stays None (falsy) when every field already matches
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Exemple #24
0
            user_drupal_id = user.name.replace('user_d', '')
            try:
                user_properties = drupal.get_user_properties(user_drupal_id)
            except DrupalRequestError, e:
                user_emails[user.name] = user.email
            else:
                user_emails[user.name] = user_properties['mail']
        else:
            # not a drupal user
            user_emails[user.name] = user.email
    return user_emails[user.name]

# NHS publishers
nhs = model.Group.by_name('national-health-service')
assert nhs
pub_stats = StatsList()
pct_rows = []
non_pct_rows = []
for pub in publisher_lib.go_down_tree(nhs):
    # Filter to PCTs
    title = pub.title
    not_pct = ('NHS Choices', 'NHS Connecting for Health', 'NHS Connecting for Health and NHS Business Services Authority')
    is_pct = ('Care Trust' in title or 'PCT' in title or title.startswith('NHS ') or 'Care Tust' in title) \
              and title not in not_pct and 'Foundation' not in title
    # Get the admins & editors
    admins = pub.members_of_type(model.User, 'admin').all()
    editors = pub.members_of_type(model.User, 'editor').all()
    # Get their email addresses
    users_with_email = []
    users_without_email = []
    warnings = None
Exemple #25
0
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
            SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
Exemple #26
0
    def add_missing_publisher(self):
        stats = StatsList()

        res = self.client.action(
            'package_search',
            q='external_reference:ONSHUB !groups:["" TO *]',
            sort='name asc',
            fq=' +site_id:"dgu" +state:active',
            wt='json',
            rows=100,
            escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(
                    stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.groups()[0]
            publisher_name = OnsImporter._source_to_publisher_(
                source_agency, self.client)
            if not publisher_name:
                log.error(
                    stats.add('Could not map source agency %s' % source_agency,
                              pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' % \
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add(
                        'Error writing to publisher over API %s' %
                        self.client.last_status, pkg['name'])
                    continue
            stats.add('Added publisher %s' % publisher_name, pkg['name'])

        print stats.report()
        if self.dry_run:
            print 'NB: No packages changed - dry run.'
Exemple #27
0
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
            resource_id=options.resource, dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    # NOTE(review): need_to_commit is never set to True anywhere below, so
    # the final commit-on-write branch appears to be dead code - confirm.
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(model.ResourceRevision.url.ilike('%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats, 'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' ' # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats, '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'
        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            # NOTE(review): the FK constraints are dropped and re-added around
            # the bulk delete - presumably so revisions still referenced by
            # resource rows can be removed before the UPDATE below; confirm.
            sql = ['''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            ''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n" % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to other
                # resources or a dataset, so only delete the revision if only
                # connected to this one.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()
        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i+n]
        # chunk revisions in chunks to cope when there are so many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs))/1000.0), Bar(),
                   ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets, maxval=int(float(len(bad_res_revs))/1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        # each revision's expiry must point at the next revision's start
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i+1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i+1].revision_timestamp
                res_rev.expired_id = res_revs[i+1].revision_id
        # the newest revision never expires
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update({'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'


    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    def command(self):
        '''Fetch ODI certificate entries from the Atom feed and store each
        one in the matching dataset's "odi-certificate" extra, recording
        per-entry outcomes in a stats report.
        '''
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        full_site_url = site_url
        if not '//www.' in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.')

        from running_stats import StatsList
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be missing, if so
            # we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not about.startswith(site_url) and not about.startswith(full_site_url):
                self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            pkg = self._get_package_from_url(entry.get('about'))
            if not pkg:
                self.log.error(stats.add('Unable to find the package',
                                         '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(stats.add('Certificate unchanged',
                                         badge_data['certificate_url']))
            else:
                # BUG FIX: decide 'added' vs 'updated' BEFORE writing the
                # extra - the old code set the extra first, so the key was
                # always present and the stat always read 'updated'.
                operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
                model.repo.new_revision()
                # reuse badge_json rather than serializing badge_data again
                pkg.extras['odi-certificate'] = badge_json
                self.log.debug(stats.add('Certificate %s' % operation,
                               '"%s" %s' % (badge_data['title'],
                                            badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())
Exemple #29
0
    def command(self):
        '''Fetch ODI certificate entries from the Atom feed and store each
        one in the matching dataset's "odi-certificate" extra, recording
        per-entry outcomes in a stats report.
        '''
        # Load configuration
        self._load_config()

        # Initialise database access
        import ckan.model as model
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)

        # Logging, post-config
        self.setup_logging()

        from pylons import config

        site_url = config.get('ckan.site_url')

        # Handling of sites that support www. but don't use it.
        full_site_url = site_url
        if not '//www.' in full_site_url:
            full_site_url = full_site_url.replace('//', '//www.')

        from running_stats import StatsList
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        for entry in client.generate_entries(self.log):

            # We have to handle the case where the rel='about' might be missing, if so
            # we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                self.log.debug(
                    stats.add(
                        'Ignore - no rel="about" specifying the dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not about.startswith(site_url) and not about.startswith(
                    full_site_url):
                self.log.debug(
                    stats.add(
                        'Ignore - "about" field does not reference this site',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                self.log.debug(
                    stats.add(
                        'Ignore - is "about" DGU but not a dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            pkg = self._get_package_from_url(entry.get('about'))
            if not pkg:
                self.log.error(
                    stats.add(
                        'Unable to find the package',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(self.log, entry['alternate'])
            badge_data['cert_title'] = entry.get('content', '')

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                self.log.debug(
                    stats.add('Certificate unchanged',
                              badge_data['certificate_url']))
            else:
                # BUG FIX: decide 'added' vs 'updated' BEFORE writing the
                # extra - the old code set the extra first, so the key was
                # always present and the stat always read 'updated'.
                operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
                model.repo.new_revision()
                # reuse badge_json rather than serializing badge_data again
                pkg.extras['odi-certificate'] = badge_json
                self.log.debug(
                    stats.add(
                        'Certificate %s' % operation, '"%s" %s' %
                        (badge_data['title'], badge_data['certificate_url'])))
                model.Session.commit()

        self.log.info('Summary:\n' + stats.report())
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'],
                                                         readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias,
                                          nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID',
                                publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias,
                                            nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add(
                        'Matched an alias for an entity which already has an ID - ignoring',
                        publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'],
                                             data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id,
                                         entity_id=entity.id)
                        print stats.add(
                            'Matched an alias for an entity - swapped them over',
                            publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add(
                            'Matched an alias for an entity - overwrote the entity',
                            publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add(
                    'Matched an alias without a matching entity - created the entity'
                )
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
def migrate(options):
    """Migrate archival data out of the generic TaskStatus table (and the
    cache/hash columns of Resource) into the dedicated Archival table.

    Every resource's outcome is recorded in a stats summary; database
    changes are only committed when options.write is set.
    """
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='archiver')\
                                    .filter_by(key='status')\
                                    .first()
        if archiver_task_status:
            # The archiver stored its detail blob as JSON in the 'error'
            # column (even for successful archivals).
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(
                ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(
                ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            # No TaskStatus row. If the resource carries no archive residue
            # either, there is nothing to migrate for it.
            if not (res.cache_url or res.extras.get('cache_filepath')
                    or res.hash or res.size or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to', 'updated',
                               'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        # Use the revision history to estimate archival timestamps -
        # revisions only carry a hash once the resource has been archived.
        revisions_with_hash = model.Session.query(model.ResourceRevision)\
                .filter_by(id=res.id)\
                .order_by(model.ResourceRevision.revision_timestamp)\
                .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect by not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[
                -1].revision_timestamp
        else:
            # Fall back to the earliest/latest of the dates we do know.
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(
                fields['updated'] or START_OF_TIME, fields['first_failure']
                or START_OF_TIME, fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res,
                         stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
def bulk_action(action=None, filepath=None, entity_or_alias_names=None, entities=True, aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []
    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)
    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action=='invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplemented
Exemple #33
0
[u'Crime'] -> ['Crime']
'''

import os
import sys
from sqlalchemy import engine_from_config
from pylons import config
import common
import ast
import json
from optparse import OptionParser
from ckan import model

from running_stats import StatsList

stats = StatsList()


class FixSecondaryTheme(object):
    """Walks all packages and records which ones carry a 'theme-secondary'
    extra.

    NOTE(review): the visible body only adds a 'Fixing' stat for each such
    package; the actual fix and any use of the `write` flag are not shown
    here - this class may be truncated. Confirm against the full script.
    """
    @classmethod
    def command(cls, config_ini, write):
        # Load CKAN config and register a translator so logic-layer _()
        # calls work in this thread.
        common.load_config(config_ini)
        common.register_translator()

        # Group any changes under a single revision attributed to the script.
        rev = model.repo.new_revision()
        rev.author = 'fix_secondary_theme.py'

        for package in model.Session.query(model.Package):
            if 'theme-secondary' in package.extras:
                stats.add('Fixing', package.name)
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')

    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)

        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' % \
                        (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring', publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over', publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity', publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - created the entity')
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
def fix_redirects(options):
    """Repoint resource URLs that are redirected (or return 410 Gone) to
    gov.uk or the National Archives webarchive, based on the Archival table.

    Resources can be narrowed by options.resource / options.dataset /
    options.organization. Changes are only committed when options.write is
    set; every decision is recorded in a stats summary.
    """
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id==options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    # Restrict to active packages/resources that are broken or redirected.
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org==org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')
    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:
        def stats_add(msg):
            # Identify the resource in the stats by package name, id and url.
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore'))
        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                # No webarchive link amongst the page's anchors.
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(archival.url_redirected_to):
            stats_add('Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
def migrate(options):
    """Migrate QA (openness score) data out of the generic TaskStatus table
    into the dedicated QA table (ckanext-qa model).

    Timestamps are reconciled against the Archival table where possible.
    Every resource's outcome is recorded in a stats summary; changes are
    only committed when options.write is set.
    """
    from ckan import model
    from ckanext.archiver.model import Archival
    from ckanext.qa.model import QA

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of QA from TaskStatus
        # to fill all properties of QA apart from:
        # * package_id
        # * resource_id
        fields = {}
        qa_task_status = model.Session.query(model.TaskStatus)\
                                    .filter_by(entity_id=res.id)\
                                    .filter_by(task_type='qa')\
                                    .filter_by(key='status')\
                                    .first()
        if not qa_task_status:
            add_stat('No QA data', res, stats)
            continue
        # The QA task stored its detail blob as JSON in the 'error' column.
        qa_error = json.loads(qa_task_status.error)
        fields['openness_score'] = int(qa_task_status.value)
        fields['openness_score_reason'] = qa_error['reason']
        fields['format'] = qa_error['format']
        qa_date = qa_task_status.last_updated
        # NB qa_task_status.last_updated appears to be 1hr ahead of the revision
        # time, so some timezone nonesense going on. Can't do much.
        archival = Archival.get_for_resource(res.id)
        if not archival:
            print add_stat('QA but no Archival data', res, stats)
            continue
        archival_date = archival.updated
        # the state of the resource was as it was archived on the date of
        # the QA update but we only know when the latest archival was. So
        # if it was archived before the QA update thenwe know that was the
        # archival, otherwise we don't know when the relevant archival was.
        if archival_date and qa_date >= archival_date:
            fields['archival_timestamp'] = archival_date
            fields['updated'] = archival_date
            fields['created'] = archival_date
            # Assume the resource URL archived was the one when the
            # archival was done (it may not be if the URL was queued and
            # there was significant delay before it was archived)
            get_resource_as_at = archival_date
        else:
            # This is common for when a resource is created and qa runs just
            # before archiver and you get:
            # "This file had not been downloaded at the time of scoring it."
            # Just put sensible datetimes since we don't really know the exact
            # ones
            fields['archival_timestamp'] = qa_date
            fields['updated'] = qa_date
            fields['created'] = qa_date
            get_resource_as_at = qa_date
        # Find the resource revision that was current at the chosen time.
        res_rev = model.Session.query(model.ResourceRevision).\
            filter_by(id=res.id).\
            filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\
            order_by(model.ResourceRevision.revision_timestamp.desc()).\
            first()
        fields['resource_timestamp'] = res_rev.revision_timestamp

        # Compare with any existing data in the Archival table
        qa = QA.get_for_resource(res.id)
        if qa:
            changed = None
            for field, value in fields.items():
                if getattr(qa, field) != value:
                    if options.write:
                        setattr(qa, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in QA table', res, stats)
                continue
            add_stat('Updated in QA table', res, stats)
        else:
            qa = QA.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(qa, field, value)
                model.Session.add(qa)
            add_stat('Added to QA table', res, stats)

    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
Exemple #37
0
def fix_links(options):
    """Try to repair broken resource links by repointing them at the
    National Archives webarchive.

    Resources can be narrowed by options.resource / options.dataset /
    options.organization. Changes are only committed when options.write is
    set; every decision is recorded in a stats summary.
    """
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'script-fix-links-tna'
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id==options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    # Restrict to active packages/resources marked broken by the archiver.
    results = results.filter(Archival.is_broken == True)\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org==org.id)
    results = results.all()
    print '%i broken resources' % len(results)

    for archival, res in results:
        def stats_add(msg):
            # Identify the resource in the stats by package name, id and url.
            pkg = res.resource_group.package
            return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore'))

        if is_webarchive(res.url):
            stats_add('Webarchive already - ignore')
            continue
        if is_broken_api(res.url, archival):
            stats_add('It is an API error - ignore')
            continue
        # NOTE(review): the message says 'a month' but the threshold is 3
        # days - confirm which is intended.
        if archival.last_success and \
           datetime.datetime.now() - archival.last_success < datetime.timedelta(days=3):
            print stats_add('Not broken for at least a month yet - ignore')
            continue
        if archival.failure_count < 3:
            print stats_add('Not broken for at least 3 occasions yet - ignore')
            continue

        # see if it is on the webarchive
        url = "http://webarchive.nationalarchives.gov.uk/+/" + res.url
        print '%s' % res.url.encode('latin7', 'ignore')

        try:
            req = requests.head(url, headers=USER_AGENT, verify=False)
        except Exception, e:
            if 'ukgwacnf.html?url=' in str(e):
                print stats_add('Not in the webarchive, %s' % get_cache_status(archival))
                continue
            print stats_add('!! Problem with request %s' % e)
            continue
        if req.status_code == 200:
            print stats_add('On webarchive - fixed')
            if write:
                res.url = url
                needs_commit = True
            continue
        elif not is_webarchive(req.url):
            if res.url.startswith('http://www.dft.gov.uk/'):
                result_str, good_url = try_earlier_webarchivals(url)
                print stats_add('Trying earlier webarchivals - %s' % result_str)
                if good_url and write:
                    res.url = good_url
                    needs_commit = True
                continue
            if 'ukgwacnf.html?url=' in (req.url + ''.join((resp.url for resp in req.history))):
                # webarchive seems to add this to the url!
                print stats_add('Not in the webarchive, %s' % get_cache_status(archival))
                continue
            print stats_add('Redirected off webarchive to an error - check manually')
            continue
        print stats_add('Not on webarchive, %s' % get_cache_status(archival))
        # Be polite to the webarchive between requests.
        time.sleep(random.randint(1, 3))

        '''
def command(dry_run=False):
    '''Tidy package fields: merge legacy fields into their destination
    extras (per field_map), move pkg.url into an additional 'documentation'
    resource, and delete obsolete extras (per delete_fields).

    :param dry_run: when True, only report via stats - write nothing.
    '''
    from ckan import model

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    global_log.info('Tidying package fields')

    stats = StatsList()

    if not dry_run:
        rev = model.repo.new_revision()
        rev.message = 'Package fields migration'

    for pkg in model.Session.query(model.Package)\
            .filter_by(state='active')\
            .order_by(model.Package.name):
        # field map
        for existing_fields, destination_field in field_map.items():
            value = pkg.extras.get(destination_field)
            if value:
                # Destination already populated - leave it alone.
                continue
            for existing_field in existing_fields:
                if hasattr(pkg, existing_field):
                    value = getattr(pkg, existing_field)
                else:
                    value = pkg.extras.get(existing_field)
                if value:
                    value = value.strip()
                    if value:
                        # take the first hit
                        # Bug fix: was `continue`, which moved on to the next
                        # candidate field and clobbered `value`; `break`
                        # keeps the first non-blank hit as intended.
                        break
            if not dry_run:
                pkg.extras[destination_field] = value or ''
                # delete existing field values
                for existing_field in existing_fields:
                    if hasattr(pkg, existing_field):
                        setattr(pkg, existing_field, '')
                    elif existing_field in pkg.extras:
                        del pkg.extras[existing_field]
            if value:
                stats.add('Merged to field "%s"' % destination_field, pkg.name)
            else:
                stats.add('Not merged to field "%s"' % destination_field, pkg.name)

        # move url to additional resource
        if pkg.url:
            # (Bug fix: removed a stray duplicate stats.add() here that
            # recorded the last merged field `value` instead of the package
            # name and double-counted the move.)
            if not dry_run:
                if not pkg.resource_groups:
                    res_group = model.ResourceGroup(label="default")
                    pkg.resource_groups.append(res_group)
                res_group = pkg.resource_groups[0]
                res = model.Resource(format='HTML', resource_type='documentation',
                                     url=pkg.url, description='Web page about the data')
                res_group.resources.append(res)
                model.Session.add(res)
                #pkg.url = ''
            stats.add('URL moved to additional resource', pkg.name)
        else:
            stats.add('No URL to move to additional resource', pkg.name)

        # delete fields
        for field in delete_fields:
            if field in pkg.extras:
                if not dry_run:
                    del pkg.extras[field]
                stats.add('Deleted field "%s"' % field, pkg.name)
            else:
                stats.add('No field to delete "%s"' % field, pkg.name)

    if not dry_run:
        model.repo.commit_and_remove()

    global_log.info(stats.report())
            try:
                user_properties = drupal.get_user_properties(user_drupal_id)
            except DrupalRequestError, e:
                user_emails[user.name] = user.email
            else:
                user_emails[user.name] = user_properties['mail']
        else:
            # not a drupal user
            user_emails[user.name] = user.email
    return user_emails[user.name]


# NHS publishers
nhs = model.Group.by_name('national-health-service')
assert nhs
pub_stats = StatsList()
pct_rows = []
non_pct_rows = []
for pub in publisher_lib.go_down_tree(nhs):
    # Filter to PCTs
    title = pub.title
    not_pct = ('NHS Choices', 'NHS Connecting for Health',
               'NHS Connecting for Health and NHS Business Services Authority')
    is_pct = ('Care Trust' in title or 'PCT' in title or title.startswith('NHS ') or 'Care Tust' in title) \
              and title not in not_pct and 'Foundation' not in title
    # Get the admins & editors
    admins = pub.members_of_type(model.User, 'admin').all()
    editors = pub.members_of_type(model.User, 'editor').all()
    # Get their email addresses
    users_with_email = []
    users_without_email = []
Exemple #40
0
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government': 'level_of_government',
    }
    license_mapping = {
        'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis',
        'Crown Copyright': 'canada-crown',
    }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
def bulk_action(action=None,
                filepath=None,
                entity_or_alias_names=None,
                entities=True,
                aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(
                nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplemented
Exemple #42
0
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    stats = StatsList()
    stats.report_value_limit = 1000

    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)

    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (
                theme, themes.data.keys())
    else:
        theme_filter = themes.data

    themes_to_write = {}  # pkg_name:themes

    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes

    print 'Recategorize summary:'
    print stats.report()

    if options.write:
        write_themes(themes_to_write)
def wms_revisions(options):
    '''
    Deletes resource revisions whose URL was overwritten with a WMS/WFS
    GetCapabilities query string (created by the site-user over the API),
    then stitches the remaining revision history back together and restores
    the resource's URL.  Only writes when options.write is set.

    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions need removing.

    # Typical revision:
                     id                  |         timestamp          |           author           |                         message                          | state  | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    # NOTE(review): need_to_commit is never set True below - commits happen
    # per-resource inside the loop - so the final 'Written' branch appears
    # unreachable.  Confirm before relying on it.
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(
            res.id)  # as the session gets flushed during the loop
        # All revisions of this resource, oldest first.
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        # If even the oldest revision has the GetCapabilities URL there is
        # no earlier good URL to restore - skip.
        # (add_stat is presumably a helper defined elsewhere in this file;
        # not visible in this chunk.)
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike(
                '%?service=W%S&request=GetCapabilities&version=%')).all()
        # Only trust revisions made by the known site-user authors; any
        # other author means we probably misidentified the revisions.
        if bad_res_revs and \
           bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            # for-else: every revision was bad, nothing to restore to
            print add_stat('No good revisions', res, stats)
            continue
        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print '  Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            # FK constraints are dropped up-front so revision rows can be
            # deleted, then re-added; the whole batch runs in one
            # BEGIN...COMMIT transaction.
            sql = [
                '''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            '''
            ]
            for res_rev in res_revs:
                sql.append(
                    "DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                    % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connect to other
                # resources or a dataset, so only delete the revision if only
                # connected to this one.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" %
                               res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \
                (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print '  sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # chunk revisions in chunks to cope when there are so many
        widgets = [
            'Creating SQL: ',
            Counter(),
            'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
            Bar(), ' ',
            ETA()
        ]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0)
                                or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print '  Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision).filter_by(
            id=res.id).order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        # Each revision now expires exactly when its successor starts.
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        # The newest revision never expires.
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update(
            {'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print '  ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    def fetch(cls, site_url_filter, since_datetime):
        '''Fetch ODI certificate entries from the ODI Atom feed and store
        each certificate's details in the matching package's
        'odi-certificate' extra (as JSON), committing per change.

        site_url_filter - object with .search(); an entry's rel="about"
                          URL must match for the entry to be processed.
        since_datetime - only feed entries since this time are fetched.
        '''
        import ckan.model as model
        from running_stats import StatsList
        log = logging.getLogger(__name__)
        stats = StatsList()

        # Use the generate_entries generator to get all of
        # the entries from the ODI Atom feed.  This should
        # correctly handle all of the pages within the feed.
        import ckanext.certificates.client as client
        for entry in client.generate_entries(since=since_datetime):

            # We have to handle the case where the rel='about' might be
            # missing, if so we'll ignore it and catch it next time
            about = entry.get('about', '')
            if not about:
                log.debug(
                    stats.add(
                        'Ignore - no rel="about" specifying the dataset',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not site_url_filter.search(about):
                log.debug(
                    stats.add(
                        'Ignore - "about" field does not reference this site',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue

            if not '/dataset/' in entry['about']:
                log.debug(
                    stats.add(
                        'Ignore - is "about" DGU but not a dataset',
                        '%s "%s" %s' % (about, entry['about'], entry['id'])))
                continue

            pkg = cls._get_package_from_url(entry.get('about'))
            if not pkg:
                log.error(
                    stats.add(
                        'Unable to find the package',
                        '%s "%s" %s %r' % (about, entry['about'], entry['id'],
                                           entry.get('about'))))
                continue

            # Build the JSON subset we want to describe the certificate
            badge_data = client.get_badge_data(entry['alternate'])
            if not badge_data:
                log.info(
                    stats.add(
                        'Error fetching badge data - skipped',
                        '%s "%s" %s' % (about, entry['title'], entry['id'])))
                continue
            badge_data['cert_title'] = entry.get(
                'content', '')  # e.g. 'Basic Level Certificate'

            badge_json = json.dumps(badge_data)
            if pkg.extras.get('odi-certificate') == badge_json:
                log.debug(
                    stats.add('Certificate unchanged',
                              badge_data['certificate_url']))
            else:
                operation = 'updated' if 'odi-certificate' in pkg.extras \
                    else 'added'
                model.repo.new_revision()
                # Reuse the JSON serialized above for the comparison, rather
                # than calling json.dumps a second time - guarantees the
                # stored value equals the compared value by construction.
                pkg.extras['odi-certificate'] = badge_json
                log.debug(
                    stats.add(
                        'Certificate %s' % operation, '"%s" %s' %
                        (badge_data['title'], badge_data['certificate_url'])))
                model.Session.commit()

        log.info('Summary:\n' + stats.report())
Exemple #45
0
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government':'level_of_government',
        }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp':'geogratis',
        'Crown Copyright':'canada-crown',
        }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
def fix_redirects(options):
    '''Repoint resource URLs that are broken or redirected: gov.uk pages
    returning "410 Gone" are repointed at the webarchive link found in the
    page body, and redirects landing on gov.uk or the national webarchive
    are followed.  Only writes changes when options.write is set.'''
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Repoint 410 Gone to webarchive url'
        # needs_commit is only bound when write is True; the final
        # 'write and needs_commit' test short-circuits, so that is safe.
        needs_commit = False
    stats = StatsList()

    # Get resources
    results = model.Session.query(Archival, model.Resource)
    if options.resource:
        results = results.filter(Archival.resource_id == options.resource)
    elif options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        results = results.filter(Archival.package_id==pkg.id)\
                             .order_by(model.Resource.position)
    # Restrict to active packages/resources that are broken or redirected.
    results = results.filter(or_(Archival.is_broken == True,
                                 Archival.url_redirected_to != None))\
                    .join(model.Package, Archival.package_id == model.Package.id)\
                    .filter(model.Package.state == 'active')\
                    .join(model.Resource, Archival.resource_id == model.Resource.id)\
                    .filter(model.Resource.state == 'active')\
                    .order_by(model.Package.name)
    if options.organization:
        org = model.Group.get(options.organization)
        assert org
        results = results.filter(model.Package.owner_org == org.id)
    results = results.all()

    def is_gov_uk(url):
        return url.startswith('https://www.gov.uk/')

    def is_webarchive(url):
        return url.startswith('http://webarchive.nationalarchives.gov.uk/')

    for archival, res in results:

        def stats_add(msg):
            # Record msg against "pkg-name/resource-id url", dropping any
            # characters outside 'latin7' (an alias for ISO-8859-13) -
            # presumably to keep the report printable; confirm the codec
            # choice is deliberate.
            pkg = res.resource_group.package
            return stats.add(msg,
                             ('%s/%s %s' % (pkg.name, res.id, res.url)).encode(
                                 'latin7', 'ignore'))

        if archival.reason.endswith('410 Gone'):
            # Find out the redirect - it is in the html
            try:
                page = requests.get(res.url)
            except requests.exceptions.ConnectionError:
                print stats_add('410 Gone but connection error')
                continue
            if '<a href="https://www.gov.uk">' not in page.text:
                print stats_add('410 Gone but not gov.uk')
                continue
            root = lxml.html.fromstring(page.text)
            hrefs = root.xpath('//div[@id="detail"]//a')
            # Take the first link into the webarchive; the for-else fires
            # when no link matched (url is used below only after a break).
            for href in hrefs:
                url = href.attrib['href']
                if is_webarchive(url):
                    break
            else:
                print stats_add('410 Gone but no forward link')
                continue
            print stats_add('410 Gone and link found - change')
            if write:
                res.url = url
                needs_commit = True
            continue

        if not archival.url_redirected_to:
            # we've filtered for redirects and broken, so must be broken
            stats_add('Broken, but not a redirect - not interested')
            continue
        if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            stats_add('Internal gov.uk redirect - ignore')
            continue
        if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to):
            print stats_add('Redirect to gov.uk - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            stats_add('Internal webarchive redirect - ignore')
            continue
        if not is_webarchive(res.url) and is_webarchive(
                archival.url_redirected_to):
            print stats_add('Redirect to webarchive - change')
            if write:
                res.url = archival.url_redirected_to
                needs_commit = True
            continue
        if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(
                archival.url_redirected_to):
            stats_add(
                'Redirect nothing to do with gov.uk or webarchive - ignore')
            continue
        print stats_add('Dunno')

    stats.report_value_limit = 500
    print 'Summary', stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'