def categorize(options, test=False):
    from ckanext.dgu.lib.theme import categorize_package, PRIMARY_THEME
    stats = StatsList()
    stats.report_value_limit = 1000
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        if test:
            theme = True
        else:
            theme = False
        packages = get_packages(publisher=options.publisher,
                                theme=theme,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    themes_to_write = {}  # pkg_name: themes
    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg, stats)
        if options.write and not pkg.extras.get(PRIMARY_THEME) and themes:
            themes_to_write[pkg.name] = themes
    print 'Categorize summary:'
    print stats.report()
    if options.write:
        write_themes(themes_to_write)
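# Note: every snippet in this collection drives its reporting through
# running_stats.StatsList. The helper below is a minimal sketch of that
# pattern, written only to illustrate the API as it is used here (add()
# returning a printable message, report() returning a summary, and the
# report_value_limit attribute). The function name and the example
# categories are hypothetical, not part of the original code.
def _stats_pattern_sketch(items):
    from running_stats import StatsList
    stats = StatsList()
    stats.report_value_limit = 1000  # assumed to cap the values listed per category
    for item in items:
        if item:
            # add() records the item under a category and returns a line to print/log
            print stats.add('Processed', item)
        else:
            print stats.add('Skipped - blank', repr(item))
    print 'Summary:'
    print stats.report()  # one line per category with its count and example values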
def merge_duplicates(self):
    merge_stats = StatsList()
    onshub_packages_search_options = {'external_reference': 'ONSHUB',
                                      'state': 'active'}
    res = self.loader._package_search(onshub_packages_search_options)
    log.info('ONSHUB records: %i', res['count'])
    pkgs_already_merged = set()
    for pkg_ref in res['results']:
        pkg = self.loader._get_package(pkg_ref)
        if pkg['name'] in pkgs_already_merged:
            log.info(merge_stats.add('Already merged', pkg['name']))
            continue
        if not self.loader._pkg_matches_search_options(pkg, onshub_packages_search_options):
            log.error(merge_stats.add('Did not match ONSHUB search after all', pkg['name']))
            continue
        # look for duplicates
        dupe_search_options = {'title': pkg['title'],
                               'groups': pkg['groups'][0] if pkg['groups'] else '',
                               'external_reference': 'ONSHUB',
                               'state': 'active'}
        res = self.loader._package_search(dupe_search_options)
        if not res['count']:
            log.error(merge_stats.add('Could not find itself', pkg['name']))
            continue
        dupe_pkgs = []
        for dupe_pkg_ref in res['results']:
            dupe_pkg = self.loader._get_package(dupe_pkg_ref)
            if dupe_pkg['name'] == pkg['name']:
                continue
            if not self.loader._pkg_matches_search_options(dupe_pkg, dupe_search_options):
                log.warn('Did not match duplicate search after all %s %s',
                         pkg['name'], dupe_pkg['name'])
                continue
            dupe_pkgs.append(dupe_pkg)
        if dupe_pkgs:
            log.info('Found duplicates for %s: %r',
                     pkg['name'], [pkg_['name'] for pkg_ in dupe_pkgs])
            # Fix duplicates
            merge_stats.add('%i duplicates found and merged' % len(dupe_pkgs), pkg['name'])
            for dupe_pkg in dupe_pkgs:
                pkgs_already_merged.add(dupe_pkg['name'])
            self.do_merge(pkg, dupe_pkgs)
        else:
            log.info(merge_stats.add('No duplicates', pkg['name']))
    print merge_stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'
def set_initial_value(self):
    log = global_log
    stats = StatsList()
    from ckan import model
    import ckan.plugins as p
    from ckan.logic import ActionError
    from ckanext.dgu.lib.helpers import upsert_extra
    site_user = p.toolkit.get_action('get_site_user')(
        {'model': model, 'ignore_auth': True}, {})
    c = {'model': model, 'user': site_user['name']}
    packages = p.toolkit.get_action('package_list')(c, data_dict={})
    log.info('Processing %d packages', len(packages))
    for pkg_name in packages:
        pkg = model.Package.by_name(pkg_name)
        last_mod = self.determine_last_major_modification(pkg).isoformat()
        log.info('%s: %s %s', pkg_name,
                 pkg.extras.get('last_major_modification'), last_mod)
        if pkg.extras.get('last_major_modification') != last_mod:
            log.info(stats.add('Adding modification date', pkg.name))
            model.repo.new_revision()
            pkg.extras['last_major_modification'] = last_mod
            model.repo.commit_and_remove()
        else:
            log.info(stats.add('No change needed', pkg.name))
    print stats.report()
def undelete(options):
    resources = _get_resources('deleted', options)
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'current_revision_fixer2'
    need_to_commit = False
    for res in resources:
        # When viewing an old revision of the dataset, there is one where the
        # resources are not deleted but they don't show up. This is seen where
        # resource_revision has an expired_timestamp with no corresponding
        # revision_timestamp - i.e. a gap between them (and that is not
        # 9999-12-31). e.g.
        # select revision_timestamp,expired_timestamp,current from resource_revision where id='373bb814-7a49-4f53-8a0e-762002b2529c' order by revision_timestamp;
        #      revision_timestamp     |     expired_timestamp      | current
        # ----------------------------+----------------------------+---------
        #  2013-06-19 00:50:28.880058 | 2014-01-18 01:03:47.500041 | f
        #  2014-01-18 01:03:47.500041 | 2014-01-18 01:03:48.296204 | f
        #  2014-01-18 01:03:50.612196 | 9999-12-31 00:00:00        | t
        # Clearly there is a gap from the 2nd to the 3rd, indicating the problem.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id).order_by('revision_timestamp').all()
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        if res_revs[-2].expired_timestamp == res_revs[-1].revision_timestamp:
            add_stat('Ok', res, stats)
            continue
        print add_stat('Timestamp gap', res, stats)
        if options.write:
            res.state = 'active'
            need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
def fetch(cls, site_url_filter, since_datetime):
    import ckan.model as model
    from running_stats import StatsList
    log = logging.getLogger(__name__)
    stats = StatsList()

    # Use the generate_entries generator to get all of the entries from the
    # ODI Atom feed. This should correctly handle all of the pages within
    # the feed.
    import ckanext.certificates.client as client
    for entry in client.generate_entries(since=since_datetime):

        # We have to handle the case where the rel='about' might be missing;
        # if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not site_url_filter.search(about):
            log.debug(stats.add('Ignore - "about" field does not reference this site',
                                '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                '%s "%s" %s' % (about, entry['about'], entry['id'])))
            continue

        pkg = cls._get_package_from_url(entry.get('about'))
        if not pkg:
            log.error(stats.add('Unable to find the package',
                                '%s "%s" %s %r' % (about, entry['about'], entry['id'], entry.get('about'))))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(entry['alternate'])
        if not badge_data:
            log.info(stats.add('Error fetching badge data - skipped',
                               '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue
        badge_data['cert_title'] = entry.get('content', '')  # e.g. 'Basic Level Certificate'

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            log.debug(stats.add('Certificate unchanged',
                                badge_data['certificate_url']))
        else:
            operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = json.dumps(badge_data)
            log.debug(stats.add('Certificate %s' % operation,
                                '"%s" %s' % (badge_data['title'],
                                             badge_data['certificate_url'])))
        model.Session.commit()

    log.info('Summary:\n' + stats.report())
def reconcile_aliases_that_match_entities_exactly():
    '''When adding entities using this tool, they might also currently be in
    the recon queue. In cases where the alias name exactly matches the entity
    name, link them up. (Ideally we'd just delete the alias from the recon
    queue, but there is no delete_alias API.)
    '''
    stats = StatsList()
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)
    for alias in nomen_data.unmatched_aliases:
        try:
            entity_or_alias = nk_dataset.lookup_detailed(alias.name, readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None
        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            try:
                nk_dataset.match(alias_id=alias.id, entity_id=entity_or_alias.id)
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error linking the alias to an entity: %s' % e,
                                alias.name)
                continue
            print stats.add('Matched alias to an entity of the same name', alias.name)
        else:
            print stats.add('No matching entity', alias.name)
def command(dry_run=False):
    from ckan import model
    from ckanext.dgu.lib.resource_formats import match
    from running_stats import StatsList

    # Register a translator in this thread so that
    # the _() functions in logic layer can work
    from ckan.lib.cli import MockTranslator
    registry = Registry()
    registry.prepare()
    translator_obj = MockTranslator()
    registry.register(translator, translator_obj)

    if not dry_run:
        model.repo.new_revision()

    # Add canonised formats to map
    for format_ in res_type_map.keys():
        res_type_map[canonise(format_)] = res_type_map[format_]

    log.info('Tidying resource types')
    stats = StatsList()
    res_query = model.Session.query(model.Resource)
    log.info('Tidying formats. Resources=%i Canonised formats=%i',
             res_query.count(), len(set(res_type_map.values())))
    for res in res_query:
        canonised_fmt = canonise(res.format or '')
        if canonised_fmt in res_type_map:
            improved_fmt = res_type_map[canonised_fmt]
        else:
            improved_fmt = tidy(res.format)
        match_ = match(improved_fmt)
        if match_:
            improved_fmt = match_
        if (improved_fmt or '') != (res.format or ''):
            if not dry_run:
                res.format = improved_fmt
            stats.add(improved_fmt, res.format)
        else:
            stats.add('No change', res.format)
    if not dry_run:
        model.repo.commit_and_remove()
    log.info('Stats report: %r', stats.report())
    print stats.report()
    log.info('Warnings (%i): %r', len(warnings), warnings)
def refix(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        # The old uncommit command would set the wrong resource_revision to
        # be current. e.g.
        # select revision_timestamp,expired_timestamp,current from resource_revision where id='b2972b35-b6ae-4096-b8cc-40dab3927a71' order by revision_timestamp;
        #     revision_timestamp     |     expired_timestamp      | current
        # ---------------------------+----------------------------+---------
        #  2013-04-13 01:47:30.18897 | 2013-06-18 19:01:45.910899 | f
        #  2013-06-18 19:01:45.910899 | 2014-01-18 08:55:41.443349 | t
        #  2014-01-18 08:55:41.443349 | 2014-01-18 08:55:41.566383 | f
        # Clearly only the latest should be current.
        res_revs = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id).order_by('revision_timestamp').all()
        fix_needed = False
        if len(res_revs) < 2:
            print add_stat('Not enough revisions', res, stats)
            continue
        for res_rev in res_revs[:-1]:
            if res_rev.current:
                print add_stat('Early revision is current', res, stats)
                fix_needed = True
                if options.write:
                    res_rev.current = False
                    need_to_commit = True
        if not res_revs[-1].current:
            print add_stat('Last revision is not current', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].current = True
                need_to_commit = True
        if res_revs[-1].expired_timestamp != END_OF_TIME:
            print add_stat('Last revision is not 9999', res, stats)
            fix_needed = True
            if options.write:
                res_revs[-1].expired_timestamp = END_OF_TIME
                need_to_commit = True
        if not fix_needed:
            add_stat('Ok', res, stats)
            continue
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
def add_missing_publisher(self):
    stats = StatsList()
    res = self.client.action('package_search',
                             q='external_reference:ONSHUB !groups:["" TO *]',
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json', rows=100, escape_q=False)
    log.info('ONSHUB datasets missing publisher: %i', res['count'])
    source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        if pkg['groups']:
            log.error(stats.add('Package had a publisher', pkg['name']))
            continue
        match = source_agency_re.search(pkg['notes'])
        if not match:
            log.error(stats.add('Could not match source agency', pkg['name']))
            continue
        # Find equivalent publisher
        source_agency = match.groups()[0]
        publisher_name = OnsImporter._source_to_publisher_(source_agency, self.client)
        if not publisher_name:
            log.error(stats.add('Could not map source agency %s' % source_agency,
                                pkg['name']))
            continue
        pkg['groups'] = [publisher_name]
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        stats.add('Added publisher %s' % publisher_name, pkg['name'])
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'
def learn(options):
    '''Analyse datasets that are already categorized to find out which words
    associate with which theme.
    '''
    from ckanext.dgu.lib.theme import Themes
    level = 1
    freq_dists = {}
    fd_by_fraction = defaultdict(list)
    count = 0
    for theme in Themes.instance().data:
        count += 1
        if count == 30:
            break
        options.theme = theme
        freq_dist = get_freq_dist(options, level)
        print '%s: %r' % (theme, freq_dist)
        freq_dists[theme] = freq_dist
        if not len(freq_dist):
            continue
        max_freq = freq_dist[freq_dist.max()]
        freq_fraction_threshold = 0.0
        for word, freq in freq_dist.items():
            freq_fraction = float(freq) / max_freq
            if freq_fraction < freq_fraction_threshold:
                break
            fd_by_fraction[word].append((freq_fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    unique_words = defaultdict(list)  # theme: [word, ...]
    for word, counts in fd_by_fraction.items():
        if len(counts) == 1:
            print stats.add('unique', '%s %s' % (word, counts[0][1]))
            unique_words[counts[0][1]].append('%s (%s)' % (word, counts[0][2]))
            continue
        sorted_counts = sorted(counts, key=lambda tup: -tup[0])
        winning_margin = sorted_counts[0][0] - sorted_counts[1][0]
        print stats.add('margin %.1f' % winning_margin,
                        '%s %s-%s' % (word, sorted_counts[0][1], sorted_counts[1][1]))
    print 'Unique words:'
    for theme, words in unique_words.items():
        print '%s: %s' % (theme, ' '.join(words))
    print 'Summary:'
    print stats.report()
def add_missing_onshub_extra(self):
    '''Some ONSHUB datasets were edited manually and due to a bug, many of
    the extras got lost. Here we restore the external_reference=ONSHUB extra.
    '''
    stats = StatsList()
    res = self.client.action('package_search',
                             q='!external_reference:ONSHUB \"Source agency\"',
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json', rows=100, escape_q=False)
    log.info('ONSHUB datasets missing extras: %i', res['count'])
    source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        match = source_agency_re.search(pkg['notes'])
        if not match:
            log.error(stats.add('Could not find "Source agency: " line after all',
                                pkg['name']))
            continue
        # Add the extra
        pkg['extras']['external_reference'] = 'ONSHUB'
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        log.info(stats.add('Added extra', pkg['name']))
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'
def no_current_resources(options):
    resources = _get_resources('active', options)
    stats = StatsList()
    need_to_commit = False
    for res in resources:
        latest_res_rev = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp.desc()) \
            .first()
        if not latest_res_rev.current:
            print add_stat('No current revision', res, stats)
            if options.write:
                latest_res_rev.current = True
                need_to_commit = True
        else:
            add_stat('Ok', res, stats)
        if latest_res_rev.revision_id != res.revision_id:
            print add_stat('Revision ID of resource too old', res, stats)
            if options.write:
                res.revision_id = latest_res_rev.revision_id
                need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
def correct_home_office_titles(self):
    '''Home Office edited their ONSHUB titles to be prefixed with
    "UK National Statistics Publication Hub: ". These cannot be added to by
    the ons_loader in the future because of this title change so remove the
    prefix.

    e.g. scientific_procedures_on_living_animals_great_britain
    '''
    stats = StatsList()
    prefix = 'UK National Statistics Publication Hub: '
    res = self.client.action('package_search',
                             q='external_reference:ONSHUB \"%s\"' % prefix,
                             sort='name asc',
                             fq=' +site_id:"dgu" +state:active',
                             wt='json', rows=100, escape_q=False)
    log.info('ONSHUB datasets with HOME_OFFICE prefix: %i', res['count'])
    for pkg in res['results']:
        # solr data_dict is not the correct sort of pkg dictionary so
        # get it via the API
        pkg = self.loader._get_package(pkg['name'])
        if not pkg['title'].startswith(prefix):
            log.error(stats.add('Prefix not there after all', pkg['name']))
            continue
        # Remove the prefix
        pkg['title'] = pkg['title'][len(prefix):]
        if not self.dry_run:
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) adding publisher over API: %s' %
                          (self.client.last_status, self.client.last_message))
                stats.add('Error writing to publisher over API %s' % self.client.last_status,
                          pkg['name'])
                continue
        log.info(stats.add('Remove prefix', pkg['name']))
    print stats.report()
    if self.dry_run:
        print 'NB: No packages changed - dry run.'
def no_current_packages(options):
    pkgs = _get_packages('active', options)
    stats = StatsList()
    need_to_commit = False
    for pkg in pkgs:
        latest_pkg_rev = \
            model.Session.query(model.PackageRevision) \
                 .filter_by(id=pkg.id) \
                 .order_by(model.PackageRevision.revision_timestamp.desc()) \
                 .first()
        # sometimes a revision_timestamp is null for some reason
        if latest_pkg_rev.revision_timestamp is None:
            # in which case, join them to the revision table and order by
            # those timestamps instead
            latest_pkg_rev = \
                model.Session.query(model.PackageRevision) \
                     .filter_by(id=pkg.id) \
                     .join(model.Revision) \
                     .order_by(model.Revision.timestamp.desc()) \
                     .first()
        if not latest_pkg_rev.current:
            print stats.add('No current revision', pkg.name)
            if options.write:
                latest_pkg_rev.current = True
                need_to_commit = True
        else:
            stats.add('Ok', pkg.name)
        if latest_pkg_rev.revision_id != pkg.revision_id:
            print stats.add('Revision ID of package too old', pkg.name)
            if options.write:
                pkg.revision_id = latest_pkg_rev.revision_id
                need_to_commit = True
    print 'Summary', stats.report()
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
    print
def migrate(options):
    from ckan import model
    from ckanext.archiver.model import Archival, Status

    resources = common.get_resources(state='active',
                                     publisher_ref=options.publisher,
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        # Gather the details of archivals from TaskStatus and Resource
        # to fill all properties of Archival apart from:
        # * package_id
        # * resource_id
        fields = {}
        archiver_task_status = model.Session.query(model.TaskStatus)\
            .filter_by(entity_id=res.id)\
            .filter_by(task_type='archiver')\
            .filter_by(key='status')\
            .first()
        if archiver_task_status:
            ats_error = json.loads(archiver_task_status.error)
            fields['status_id'] = Status.by_text(archiver_task_status.value)
            fields['is_broken'] = Status.is_status_broken(fields['status_id'])
            fields['reason'] = ats_error['reason']
            fields['last_success'] = date_str_to_datetime_or_none(ats_error['last_success'])
            fields['first_failure'] = date_str_to_datetime_or_none(ats_error['first_failure'])
            fields['failure_count'] = int(ats_error['failure_count'])
            fields['url_redirected_to'] = ats_error['url_redirected_to']
            fields['updated'] = archiver_task_status.last_updated
        else:
            if not (res.cache_url
                    or res.extras.get('cache_filepath')
                    or res.hash
                    or res.size
                    or res.mimetype):
                add_stat('No archive data', res, stats)
                continue
            for field_name in ('status_id', 'is_broken', 'reason',
                               'last_success', 'first_failure',
                               'failure_count', 'url_redirected_to',
                               'updated', 'created'):
                fields[field_name] = None

        fields['cache_filepath'] = res.extras.get('cache_filepath')
        fields['cache_url'] = res.cache_url
        fields['hash'] = res.hash
        fields['size'] = res.size
        fields['mimetype'] = res.mimetype

        revisions_with_hash = model.Session.query(model.ResourceRevision)\
            .filter_by(id=res.id)\
            .order_by(model.ResourceRevision.revision_timestamp)\
            .filter(model.ResourceRevision.hash != '').all()
        if revisions_with_hash:
            # these are not perfect, but not far off
            fields['created'] = revisions_with_hash[0].revision_timestamp
            fields['resource_timestamp'] = revisions_with_hash[-1].revision_timestamp
        else:
            fields['created'] = min(fields['updated'] or END_OF_TIME,
                                    fields['first_failure'] or END_OF_TIME,
                                    fields['last_success'] or END_OF_TIME)
            fields['resource_timestamp'] = max(fields['updated'] or START_OF_TIME,
                                               fields['first_failure'] or START_OF_TIME,
                                               fields['last_success'] or START_OF_TIME)

        # Compare with any existing data in the Archival table
        archival = Archival.get_for_resource(res.id)
        if archival:
            changed = None
            for field, value in fields.items():
                if getattr(archival, field) != value:
                    if options.write:
                        setattr(archival, field, value)
                    changed = True
            if not changed:
                add_stat('Already exists correctly in archival table', res, stats)
                continue
            add_stat('Updated in archival table', res, stats)
        else:
            archival = Archival.create(res.id)
            if options.write:
                for field, value in fields.items():
                    setattr(archival, field, value)
                model.Session.add(archival)
            add_stat('Added to archival table', res, stats)
    print 'Summary\n', stats.report()
    if options.write:
        model.repo.commit_and_remove()
        print 'Written'
        user_drupal_id = user.name.replace('user_d', '')
        try:
            user_properties = drupal.get_user_properties(user_drupal_id)
        except DrupalRequestError, e:
            user_emails[user.name] = user.email
        else:
            user_emails[user.name] = user_properties['mail']
    else:
        # not a drupal user
        user_emails[user.name] = user.email
    return user_emails[user.name]

# NHS publishers
nhs = model.Group.by_name('national-health-service')
assert nhs
pub_stats = StatsList()
pct_rows = []
non_pct_rows = []
for pub in publisher_lib.go_down_tree(nhs):
    # Filter to PCTs
    title = pub.title
    not_pct = ('NHS Choices', 'NHS Connecting for Health',
               'NHS Connecting for Health and NHS Business Services Authority')
    is_pct = ('Care Trust' in title or 'PCT' in title or
              title.startswith('NHS ') or 'Care Tust' in title) \
        and title not in not_pct and 'Foundation' not in title
    # Get the admins & editors
    admins = pub.members_of_type(model.User, 'admin').all()
    editors = pub.members_of_type(model.User, 'editor').all()
    # Get their email addresses
    users_with_email = []
    users_without_email = []
    warnings = None
def recategorize(options):
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)
    stats = StatsList()
    stats.report_value_limit = 1000
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        assert pkg
        packages = [pkg]
    else:
        packages = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    # process the list of themes we are interested in setting on packages
    themes = Themes.instance()
    if options.theme:
        theme_filter = set(options.theme.split(','))
        for theme in theme_filter:
            assert theme in themes.data, '"%s" not in %r' % (theme, themes.data.keys())
    else:
        theme_filter = themes.data
    themes_to_write = {}  # pkg_name: themes
    for pkg in packages:
        print 'Dataset: %s' % pkg.name
        themes = categorize_package(pkg)
        existing_theme = pkg.extras.get(PRIMARY_THEME)
        pkg_identity = '%s (%s)' % (pkg.name, existing_theme)
        if not themes:
            print stats.add('Cannot decide theme', pkg_identity)
            continue
        if themes[0] not in theme_filter:
            print stats.add('Not interested in theme', pkg_identity)
            continue
        if existing_theme == themes[0]:
            print stats.add('Theme unchanged %s' % themes[0], pkg_identity)
            continue
        print stats.add('Recategorized to %s' % themes[0], pkg_identity)
        if options.write:
            themes_to_write[pkg.name] = themes
    print 'Recategorize summary:'
    print stats.report()
    if options.write:
        write_themes(themes_to_write)
def wms_revisions(options):
    '''
    These revisions look like this:

    # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp;
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1
    http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0
    http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3

    The bad ones have been changed to "?service=" params. These revisions
    need removing.

    # Typical revision:
    id | timestamp | author | message | state | approved_timestamp
    a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active |
    # i.e. author='co-prod3...' (site-user, via API)
    '''
    resources = common.get_resources(state='active',
                                     resource_id=options.resource,
                                     dataset_name=options.dataset)
    stats = StatsList()
    stats.report_value_limit = 1000
    total_bad_revisions = 0
    need_to_commit = False
    widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()]
    progress = ProgressBar(widgets=widgets)
    for res in progress(resources):
        res = model.Resource.get(res.id)  # as the session gets flushed during the loop
        res_rev_q = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        first_res_rev = res_revs[0]
        if 'request=GetCapabilities&version=' in first_res_rev.url:
            print add_stat('First revision already was WMS', res, stats)
            continue

        # Identify bad revisions by the WMS URL parameters and author
        bad_res_revs = res_rev_q.filter(
            model.ResourceRevision.url.ilike('%?service=W%S&request=GetCapabilities&version=%')).all()
        if bad_res_revs and \
                bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk',
                                                        'current_revision_fixer2'):
            print add_stat('Misidentified', res, stats,
                           'author=%r' % bad_res_revs[0].revision.author)
            continue
        if not bad_res_revs:
            add_stat('Resource ok', res, stats)
            continue
        print ' '  # don't overwrite progress bar
        print add_stat('Bad revisions', res, stats,
                       '(%d/%d)' % (len(bad_res_revs), len(res_revs)))
        total_bad_revisions += len(bad_res_revs)

        # Find the new latest (good) revision
        bad_res_revs_set = set(bad_res_revs)
        for res_rev_index in reversed(xrange(len(res_revs))):
            if res_revs[res_rev_index] not in bad_res_revs_set:
                latest_good_res_rev = res_revs[res_rev_index]
                break
        else:
            print add_stat('No good revisions', res, stats)
            continue

        if not options.write:
            continue

        # Delete the revisions and resource_revisions
        print ' Deleting bad revisions...'

        def delete_bad_revisions(res_revs):
            # Build the sql as a list, as it is faster when you have 1000 strings to append
            sql = ['''BEGIN;
            ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey;
            ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey;
            ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey;
            ''']
            for res_rev in res_revs:
                sql.append("DELETE from resource_revision where id='%s' and revision_id='%s';\n"
                           % (res.id, res_rev.revision_id))
                # a revision created (e.g. over the API) can be connected to other
                # resources or a dataset, so only delete the revision if it is only
                # connected to this one.
                if model.Session.query(model.ResourceRevision).\
                        filter_by(revision_id=res_rev.revision_id).\
                        count() == 1 and \
                        model.Session.query(model.PackageRevision).\
                        filter_by(revision_id=res_rev.revision_id).count() == 0:
                    sql.append("DELETE from revision where id='%s';\n" % res_rev.revision_id)
            sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" %
                       (latest_good_res_rev.revision_id, res.id))
            sql.append('''
            ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id);
            COMMIT;''')
            print ' sql..',
            model.Session.execute(''.join(sql))
            print '.committed'
            model.Session.remove()

        def chunks(l, n):
            '''Yield successive n-sized chunks from l.'''
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        # chunk revisions in chunks to cope when there are so many
        widgets = ['Creating SQL: ', Counter(),
                   'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0),
                   Bar(), ' ', ETA()]
        progress2 = ProgressBar(widgets=widgets,
                                maxval=int(float(len(bad_res_revs)) / 1000.0) or 1)
        for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)):
            delete_bad_revisions(chunk_of_bad_res_revs)

        # Knit together the remaining revisions again
        print ' Knitting existing revisions back together...'
        res_rev_q = model.Session.query(model.ResourceRevision) \
            .filter_by(id=res.id) \
            .order_by(model.ResourceRevision.revision_timestamp)
        res_revs = res_rev_q.all()
        latest_res_rev = res_revs[-1]
        if not latest_res_rev.current:
            latest_res_rev.current = True
        for i, res_rev in enumerate(res_revs[:-1]):
            if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp:
                res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp
                res_rev.expired_id = res_revs[i + 1].revision_id
        if latest_res_rev.expired_timestamp != END_OF_TIME:
            latest_res_rev.expired_timestamp = END_OF_TIME
        if latest_res_rev.expired_id is not None:
            latest_res_rev.expired_id = None

        # Correct the URL on the resource
        model.Session.query(model.Resource).filter_by(id=res.id).update({'url': latest_res_rev.url})
        model.repo.commit_and_remove()
        print ' ...done'

    print 'Summary\n', stats.report()
    print 'Total bad revs: %d' % total_bad_revisions
    if options.write and need_to_commit:
        model.repo.commit_and_remove()
        print 'Written'
def command(self):
    # Load configuration
    self._load_config()

    # Initialise database access
    import ckan.model as model
    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)

    # Logging, post-config
    self.setup_logging()

    from pylons import config
    site_url = config.get('ckan.site_url')

    # Handling of sites that support www. but don't use it.
    full_site_url = site_url
    if not '//www.' in full_site_url:
        full_site_url = full_site_url.replace('//', '//www.')

    from running_stats import StatsList
    stats = StatsList()

    # Use the generate_entries generator to get all of
    # the entries from the ODI Atom feed. This should
    # correctly handle all of the pages within the feed.
    for entry in client.generate_entries(self.log):

        # We have to handle the case where the rel='about' might be missing,
        # if so we'll ignore it and catch it next time
        about = entry.get('about', '')
        if not about:
            self.log.debug(stats.add('Ignore - no rel="about" specifying the dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not about.startswith(site_url) and not about.startswith(full_site_url):
            self.log.debug(stats.add('Ignore - "about" field does not reference this site',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        if not '/dataset/' in entry['about']:
            self.log.debug(stats.add('Ignore - is "about" DGU but not a dataset',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        pkg = self._get_package_from_url(entry.get('about'))
        if not pkg:
            self.log.error(stats.add('Unable to find the package',
                                     '%s "%s" %s' % (about, entry['title'], entry['id'])))
            continue

        # Build the JSON subset we want to describe the certificate
        badge_data = client.get_badge_data(self.log, entry['alternate'])
        badge_data['cert_title'] = entry.get('content', '')

        badge_json = json.dumps(badge_data)
        if pkg.extras.get('odi-certificate') == badge_json:
            self.log.debug(stats.add('Certificate unchanged',
                                     badge_data['certificate_url']))
        else:
            # Work out whether this is an update or a new certificate before
            # the extra is written, otherwise it always counts as 'updated'
            operation = 'updated' if 'odi-certificate' in pkg.extras else 'added'
            model.repo.new_revision()
            pkg.extras['odi-certificate'] = json.dumps(badge_data)
            self.log.debug(stats.add('Certificate %s' % operation,
                                     '"%s" %s' % (badge_data['title'],
                                                  badge_data['certificate_url'])))
        model.Session.commit()

    self.log.info('Summary:\n' + stats.report())
def update_entities_from_dgu(publishers=None):
    dgu_client = ckan_client()
    if not publishers:
        # Get list of publishers from DGU
        publishers = dgu_client.action('organization_list')
    stats = StatsList()
    nk_dataset = nk_connect(nk_dataset_name)
    for publisher_name in publishers:
        publisher = dgu_client.action('organization_show', id=publisher_name)
        # Match each publisher with a Nomen entity
        try:
            entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
        except (nk_dataset.Invalid, nk_dataset.NoMatch):
            entity_or_alias = None

        data = {'dgu_name': publisher_name}

        if entity_or_alias and isinstance(entity_or_alias, nomenklatura.Entity):
            # Matched an entity
            entity = entity_or_alias
            if entity.data.get('dgu_name') == publisher_name:
                # Matching ID, ensure Nomen still has the title as per DGU
                print stats.add('Matching ID. Title match: %s' %
                                (entity.name == publisher['title']), publisher_name)
            elif 'dgu_name' in entity.data:
                print stats.add('Wrong ID - ignoring', publisher_name)
            elif entity.name == publisher['title']:
                nk_dataset.update_entity(entity.id, entity.name, data)
                print stats.add('Matching title, just added ID', publisher_name)
            else:
                # The title differs because of canonization? Hasn't happened yet.
                print stats.add('Title differs - ignoring', publisher_name)
        elif entity_or_alias and isinstance(entity_or_alias, nomenklatura.Alias):
            # Matched an alias
            alias_ = entity_or_alias
            if alias_.is_matched:
                entity = nk_dataset.get_entity(id=alias_.entity['id'])
                if entity.data.get('dgu_name'):
                    print stats.add('Matched an alias for an entity which already has an ID - ignoring',
                                    publisher_name)
                else:
                    nk_dataset.update_entity(entity.id, publisher['title'], data)
                    # we can't delete the existing alias (that is now the same
                    # as the entity) but we can create a new alias for the old
                    # entity
                    try:
                        new_alias = nk_dataset.lookup(entity.name)
                    except nk_dataset.NoMatch:
                        nk_dataset.match(alias_id=new_alias.id, entity_id=entity.id)
                        print stats.add('Matched an alias for an entity - swapped them over',
                                        publisher_name)
                    except nk_dataset.Invalid:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity',
                                        publisher_name)
                    else:
                        # This is not expected, but still fine
                        print stats.add('Matched an alias for an entity - overwrote the entity',
                                        publisher_name)
            else:
                new_entity = nk_dataset.add_entity(publisher['title'], data)
                nk_dataset.match(alias_id=alias_.id, entity_id=new_entity.id)
                print stats.add('Matched an alias without a matching entity - created the entity',
                                publisher_name)
        else:
            # No match - create Nomen entity
            nk_dataset.add_entity(publisher['title'], data)
            print stats.add('No match - added to Nomen', publisher_name)
    print 'Summary'
    print stats.report()
def bulk_action(action=None, filepath=None, entity_or_alias_names=None,
                entities=True, aliases=True):
    nomen_data = NomenData()
    nk_dataset = nk_connect(nk_dataset_name)

    # Gather the list of entities & aliases from the file and command-line
    entities_or_aliases = []

    def find_name(name, stats):
        if not name.strip():
            print stats.add('blank', name)
        elif entities and name in nomen_data.entities_dict_by_name:
            entities_or_aliases.append(nomen_data.entities_dict_by_name[name]['entity'])
            print stats.add('Entity found', name)
        elif aliases and name in nomen_data.aliases_by_name:
            entities_or_aliases.append(nomen_data.aliases_by_name[name])
            print stats.add('Alias found', name)
        else:
            print stats.add('Not found', name)

    if entity_or_alias_names:
        stats = StatsList()
        for name in entity_or_alias_names:
            find_name(name, stats)
        print 'Given names:'
        print stats.report()
    if filepath:
        if not os.path.exists(filepath):
            raise Exception('Filepath not found: %s' % filepath)
        with open(filepath, 'r') as f:
            stats = StatsList()
            for line in f:
                name = line.rstrip('\n\r')
                find_name(name, stats)
                #try:
                #    entity_or_alias = nk_dataset.lookup_detailed(publisher['title'], readonly=True)
                #except nk_dataset.NoMatch:
                #    print stats.add('Not found', publisher['title'])
                #    continue
                #except nk_dataset.Invalid:
                #    pass
                #print stats.add('Found %s' % entity_or_alias.__class__.__name__, entity_or_alias.name)
                #entities_or_aliases.append(entity_or_alias)
        print 'File names:'
        print stats.report()

    # Do the action to each entity
    stats = StatsList()
    for entity_or_alias in entities_or_aliases:
        name = entity_or_alias.name
        if action == 'invalidate':
            if isinstance(entity_or_alias, nomenklatura.Entity):
                print stats.add('Cannot invalidate an Entity', name)
                continue
            alias = entity_or_alias
            if alias.is_invalid:
                print stats.add('Already invalid', name)
                continue
            try:
                nk_dataset.match(alias_id=alias.id, entity_id='INVALID')
            except requests.exceptions.HTTPError, e:
                # Seem to get occasional 502s due to overloading
                print stats.add('Server error: %s' % e, alias.name)
                continue
            print stats.add('Invalidated', name)
        else:
            raise NotImplementedError(action)
[u'Crime'] -> ['Crime']
'''
import os
import sys
from sqlalchemy import engine_from_config
from pylons import config
import common
import ast
import json
from optparse import OptionParser
from ckan import model
from running_stats import StatsList

stats = StatsList()


class FixSecondaryTheme(object):
    @classmethod
    def command(cls, config_ini, write):
        common.load_config(config_ini)
        common.register_translator()

        rev = model.repo.new_revision()
        rev.author = 'fix_secondary_theme.py'

        for package in model.Session.query(model.Package):
            if 'theme-secondary' in package.extras:
                stats.add('Fixing', package.name)
def fix_redirects(options): from ckan import model from ckanext.archiver.model import Archival write = options.write if write: rev = model.repo.new_revision() rev.author = 'Repoint 410 Gone to webarchive url' needs_commit = False stats = StatsList() # Get resources results = model.Session.query(Archival, model.Resource) if options.resource: results = results.filter(Archival.resource_id==options.resource) elif options.dataset: pkg = model.Package.get(options.dataset) assert pkg results = results.filter(Archival.package_id==pkg.id)\ .order_by(model.Resource.position) results = results.filter(or_(Archival.is_broken == True, Archival.url_redirected_to != None))\ .join(model.Package, Archival.package_id == model.Package.id)\ .filter(model.Package.state == 'active')\ .join(model.Resource, Archival.resource_id == model.Resource.id)\ .filter(model.Resource.state == 'active')\ .order_by(model.Package.name) if options.organization: org = model.Group.get(options.organization) assert org results = results.filter(model.Package.owner_org==org.id) results = results.all() def is_gov_uk(url): return url.startswith('https://www.gov.uk/') def is_webarchive(url): return url.startswith('http://webarchive.nationalarchives.gov.uk/') for archival, res in results: def stats_add(msg): pkg = res.resource_group.package return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore')) if archival.reason.endswith('410 Gone'): # Find out the redirect - it is in the html try: page = requests.get(res.url) except requests.exceptions.ConnectionError: print stats_add('410 Gone but connection error') continue if '<a href="https://www.gov.uk">' not in page.text: print stats_add('410 Gone but not gov.uk') continue root = lxml.html.fromstring(page.text) hrefs = root.xpath('//div[@id="detail"]//a') for href in hrefs: url = href.attrib['href'] if is_webarchive(url): break else: print stats_add('410 Gone but no forward link') continue print stats_add('410 Gone and link found - change') if write: res.url = url needs_commit = True continue if not archival.url_redirected_to: # we've filtered for redirects and broken, so must be broken stats_add('Broken, but not a redirect - not interested') continue if is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to): stats_add('Internal gov.uk redirect - ignore') continue if not is_gov_uk(res.url) and is_gov_uk(archival.url_redirected_to): print stats_add('Redirect to gov.uk - change') if write: res.url = archival.url_redirected_to needs_commit = True continue if is_webarchive(res.url) and is_webarchive(archival.url_redirected_to): stats_add('Internal webarchive redirect - ignore') continue if not is_webarchive(res.url) and is_webarchive(archival.url_redirected_to): print stats_add('Redirect to webarchive - change') if write: res.url = archival.url_redirected_to needs_commit = True continue if not is_gov_uk(archival.url_redirected_to) and not is_webarchive(archival.url_redirected_to): stats_add('Redirect nothing to do with gov.uk or webarchive - ignore') continue print stats_add('Dunno') stats.report_value_limit = 500 print 'Summary', stats.report() if write and needs_commit: print 'Writing...' model.repo.commit_and_remove() print '...done' elif write: print 'Nothing to write' else: print 'Not written'
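# The long if/elif chain in fix_redirects() boils down to a small decision
# table on where the resource URL and its redirect target live. A
# pure-function sketch of that table (the function name and return shape are
# illustrative, not part of the original script; the 410 Gone handling is a
# separate branch and is left out here):
def classify_redirect(url, redirected_to):
    '''Return (action, label) for a broken or redirected resource URL,
    mirroring the branches in fix_redirects() above.'''
    def is_gov_uk(u):
        return u.startswith('https://www.gov.uk/')

    def is_webarchive(u):
        return u.startswith('http://webarchive.nationalarchives.gov.uk/')

    if not redirected_to:
        return 'ignore', 'Broken, but not a redirect'
    if is_gov_uk(url) and is_gov_uk(redirected_to):
        return 'ignore', 'Internal gov.uk redirect'
    if not is_gov_uk(url) and is_gov_uk(redirected_to):
        return 'change', 'Redirect to gov.uk'
    if is_webarchive(url) and is_webarchive(redirected_to):
        return 'ignore', 'Internal webarchive redirect'
    if not is_webarchive(url) and is_webarchive(redirected_to):
        return 'change', 'Redirect to webarchive'
    if not is_gov_uk(redirected_to) and not is_webarchive(redirected_to):
        return 'ignore', 'Redirect nothing to do with gov.uk or webarchive'
    return 'check', 'Dunno'

# e.g. classify_redirect('http://example.gov.uk/report',
#                        'https://www.gov.uk/government/statistics/report')
# -> ('change', 'Redirect to gov.uk')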
def migrate(options): from ckan import model from ckanext.archiver.model import Archival from ckanext.qa.model import QA resources = common.get_resources(state='active', publisher_ref=options.publisher, resource_id=options.resource, dataset_name=options.dataset) stats = StatsList() widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()] progress = ProgressBar(widgets=widgets) for res in progress(resources): # Gather the details of QA from TaskStatus # to fill all properties of QA apart from: # * package_id # * resource_id fields = {} qa_task_status = model.Session.query(model.TaskStatus)\ .filter_by(entity_id=res.id)\ .filter_by(task_type='qa')\ .filter_by(key='status')\ .first() if not qa_task_status: add_stat('No QA data', res, stats) continue qa_error = json.loads(qa_task_status.error) fields['openness_score'] = int(qa_task_status.value) fields['openness_score_reason'] = qa_error['reason'] fields['format'] = qa_error['format'] qa_date = qa_task_status.last_updated # NB qa_task_status.last_updated appears to be 1hr ahead of the revision # time, so some timezone nonesense going on. Can't do much. archival = Archival.get_for_resource(res.id) if not archival: print add_stat('QA but no Archival data', res, stats) continue archival_date = archival.updated # the state of the resource was as it was archived on the date of # the QA update but we only know when the latest archival was. So # if it was archived before the QA update thenwe know that was the # archival, otherwise we don't know when the relevant archival was. if archival_date and qa_date >= archival_date: fields['archival_timestamp'] = archival_date fields['updated'] = archival_date fields['created'] = archival_date # Assume the resource URL archived was the one when the # archival was done (it may not be if the URL was queued and # there was significant delay before it was archived) get_resource_as_at = archival_date else: # This is common for when a resource is created and qa runs just # before archiver and you get: # "This file had not been downloaded at the time of scoring it." # Just put sensible datetimes since we don't really know the exact # ones fields['archival_timestamp'] = qa_date fields['updated'] = qa_date fields['created'] = qa_date get_resource_as_at = qa_date res_rev = model.Session.query(model.ResourceRevision).\ filter_by(id=res.id).\ filter(model.ResourceRevision.revision_timestamp < get_resource_as_at).\ order_by(model.ResourceRevision.revision_timestamp.desc()).\ first() fields['resource_timestamp'] = res_rev.revision_timestamp # Compare with any existing data in the Archival table qa = QA.get_for_resource(res.id) if qa: changed = None for field, value in fields.items(): if getattr(qa, field) != value: if options.write: setattr(qa, field, value) changed = True if not changed: add_stat('Already exists correctly in QA table', res, stats) continue add_stat('Updated in QA table', res, stats) else: qa = QA.create(res.id) if options.write: for field, value in fields.items(): setattr(qa, field, value) model.Session.add(qa) add_stat('Added to QA table', res, stats) print 'Summary\n', stats.report() if options.write: model.repo.commit_and_remove() print 'Written'
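# migrate() picks which timestamps to copy into the QA table depending on
# whether the archival happened before or after the QA run. That decision is
# easy to isolate; the sketch below restates it as a pure function (the name
# and return shape are illustrative, not from the original script).
def choose_qa_timestamps(qa_date, archival_date):
    '''Return (archival_timestamp, get_resource_as_at) as chosen in migrate().

    If the resource was archived before the QA run, the QA result refers to
    that archival; otherwise only the QA date is known, so use it for both.
    '''
    if archival_date and qa_date >= archival_date:
        return archival_date, archival_date
    return qa_date, qa_date

# e.g. when the QA ran after the archival, the archival date is used for both
# the archival_timestamp and the "resource as at" lookup; otherwise the QA
# date stands in for both.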
def fix_links(options): from ckan import model from ckanext.archiver.model import Archival write = options.write if write: rev = model.repo.new_revision() rev.author = 'script-fix-links-tna' needs_commit = False stats = StatsList() # Get resources results = model.Session.query(Archival, model.Resource) if options.resource: results = results.filter(Archival.resource_id==options.resource) elif options.dataset: pkg = model.Package.get(options.dataset) assert pkg results = results.filter(Archival.package_id==pkg.id)\ .order_by(model.Resource.position) results = results.filter(Archival.is_broken == True)\ .join(model.Package, Archival.package_id == model.Package.id)\ .filter(model.Package.state == 'active')\ .join(model.Resource, Archival.resource_id == model.Resource.id)\ .filter(model.Resource.state == 'active')\ .order_by(model.Package.name) if options.organization: org = model.Group.get(options.organization) assert org results = results.filter(model.Package.owner_org==org.id) results = results.all() print '%i broken resources' % len(results) for archival, res in results: def stats_add(msg): pkg = res.resource_group.package return stats.add(msg, ('%s/%s %s' % (pkg.name, res.id, res.url)).encode('latin7', 'ignore')) if is_webarchive(res.url): stats_add('Webarchive already - ignore') continue if is_broken_api(res.url, archival): stats_add('It is an API error - ignore') continue if archival.last_success and \ datetime.datetime.now() - archival.last_success < datetime.timedelta(days=3): print stats_add('Not broken for at least a month yet - ignore') continue if archival.failure_count < 3: print stats_add('Not broken for at least 3 occasions yet - ignore') continue # see if it is on the webarchive url = "http://webarchive.nationalarchives.gov.uk/+/" + res.url print '%s' % res.url.encode('latin7', 'ignore') try: req = requests.head(url, headers=USER_AGENT, verify=False) except Exception, e: if 'ukgwacnf.html?url=' in str(e): print stats_add('Not in the webarchive, %s' % get_cache_status(archival)) continue print stats_add('!! Problem with request %s' % e) continue if req.status_code == 200: print stats_add('On webarchive - fixed') if write: res.url = url needs_commit = True continue elif not is_webarchive(req.url): if res.url.startswith('http://www.dft.gov.uk/'): result_str, good_url = try_earlier_webarchivals(url) print stats_add('Trying earlier webarchivals - %s' % result_str) if good_url and write: res.url = good_url needs_commit = True continue if 'ukgwacnf.html?url=' in (req.url + ''.join((resp.url for resp in req.history))): # webarchive seems to add this to the url! print stats_add('Not in the webarchive, %s' % get_cache_status(archival)) continue print stats_add('Redirected off webarchive to an error - check manually') continue print stats_add('Not on webarchive, %s' % get_cache_status(archival)) time.sleep(random.randint(1, 3)) '''
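# fix_links() decides a URL is "on the webarchive" by HEAD-requesting the
# http://webarchive.nationalarchives.gov.uk/+/<url> form and checking for a
# 200 that has not bounced off to the ukgwacnf error page. A minimal sketch of
# that check, assuming the same module-level USER_AGENT headers dict the
# script above uses:
import requests


def is_on_webarchive(url):
    '''Return the webarchive URL if the National Archives appear to hold a
    copy of `url`, otherwise None. Sketch only - mirrors the checks in
    fix_links() without the retry/earlier-snapshot handling.'''
    wa_url = 'http://webarchive.nationalarchives.gov.uk/+/' + url
    try:
        req = requests.head(wa_url, headers=USER_AGENT, verify=False)
    except Exception:
        return None
    visited = req.url + ''.join(resp.url for resp in req.history)
    if req.status_code == 200 and 'ukgwacnf.html?url=' not in visited:
        return wa_url
    return None

# e.g. new_url = is_on_webarchive(res.url)
#      if new_url:
#          res.url = new_url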
def command(dry_run=False): from ckan import model # Register a translator in this thread so that # the _() functions in logic layer can work from ckan.lib.cli import MockTranslator registry=Registry() registry.prepare() translator_obj=MockTranslator() registry.register(translator, translator_obj) global_log.info('Tidying package fields') stats = StatsList() if not dry_run: rev = model.repo.new_revision() rev.message = 'Package fields migration' for pkg in model.Session.query(model.Package)\ .filter_by(state='active')\ .order_by(model.Package.name): # field map for existing_fields, destination_field in field_map.items(): value = pkg.extras.get(destination_field) if value: continue for existing_field in existing_fields: if hasattr(pkg, existing_field): value = getattr(pkg, existing_field) else: value = pkg.extras.get(existing_field) if value: value = value.strip() if value: # take the first hit continue if not dry_run: pkg.extras[destination_field] = value or '' # delete existing field values for existing_field in existing_fields: if hasattr(pkg, existing_field): setattr(pkg, existing_field, '') elif existing_field in pkg.extras: del pkg.extras[existing_field] if value: stats.add('Merged to field "%s"' % destination_field, pkg.name) else: stats.add('Not merged to field "%s"' % destination_field, pkg.name) # move url to additional resource if pkg.url: stats.add('Url moved to additional resource', value) if not dry_run: if not pkg.resource_groups: res_group = model.ResourceGroup(label="default") pkg.resource_groups.append(res_group) res_group = pkg.resource_groups[0] res = model.Resource(format='HTML', resource_type='documentation', url=pkg.url, description='Web page about the data') res_group.resources.append(res) model.Session.add(res) #pkg.url = '' stats.add('URL moved to additional resource', pkg.name) else: stats.add('No URL to move to additional resource', pkg.name) # delete fields for field in delete_fields: if field in pkg.extras: if not dry_run: del pkg.extras[field] stats.add('Deleted field "%s"' % field, pkg.name) else: stats.add('No field to delete "%s"' % field, pkg.name) if not dry_run: model.repo.commit_and_remove() global_log.info(stats.report())
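# The field_map loop in command() searches several legacy fields and is
# intended (per its "take the first hit" comment) to keep the first non-blank
# value for each destination field. That selection reads more clearly as a
# stand-alone helper; a sketch, assuming field_map maps a tuple of candidate
# fields to a destination field as the loop above does:
def first_existing_value(pkg, existing_fields):
    '''Return the first non-blank value found among the candidate fields,
    checking package attributes first and extras second, or None.'''
    for existing_field in existing_fields:
        if hasattr(pkg, existing_field):
            value = getattr(pkg, existing_field)
        else:
            value = pkg.extras.get(existing_field)
        if value and value.strip():
            return value.strip()
    return None

# e.g. value = first_existing_value(pkg, existing_fields)
#      if value and not pkg.extras.get(destination_field):
#          pkg.extras[destination_field] = value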
        try:
            user_properties = drupal.get_user_properties(user_drupal_id)
        except DrupalRequestError, e:
            user_emails[user.name] = user.email
        else:
            user_emails[user.name] = user_properties['mail']
    else:
        # not a drupal user
        user_emails[user.name] = user.email
    return user_emails[user.name]

# NHS publishers
nhs = model.Group.by_name('national-health-service')
assert nhs
pub_stats = StatsList()
pct_rows = []
non_pct_rows = []
for pub in publisher_lib.go_down_tree(nhs):
    # Filter to PCTs
    title = pub.title
    not_pct = ('NHS Choices', 'NHS Connecting for Health',
               'NHS Connecting for Health and NHS Business Services Authority')
    is_pct = ('Care Trust' in title or 'PCT' in title or
              title.startswith('NHS ') or 'Care Tust' in title) \
        and title not in not_pct and 'Foundation' not in title
    # Get the admins & editors
    admins = pub.members_of_type(model.User, 'admin').all()
    editors = pub.members_of_type(model.User, 'editor').all()
    # Get their email addresses
    users_with_email = []
    users_without_email = []
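# The PCT test above is a dense one-liner. The same rule as a named predicate
# (a sketch; the title lists are copied from the code above, and 'Care Tust'
# is kept as in the original, presumably to match a misspelled publisher
# title):
def looks_like_pct(title):
    '''True if a publisher title looks like a Primary Care Trust / care
    trust, excluding a few named NHS bodies and Foundation Trusts.'''
    not_pct = ('NHS Choices', 'NHS Connecting for Health',
               'NHS Connecting for Health and NHS Business Services Authority')
    if title in not_pct or 'Foundation' in title:
        return False
    return ('Care Trust' in title or 'PCT' in title or
            title.startswith('NHS ') or 'Care Tust' in title)

# e.g. looks_like_pct('Ashton, Leigh and Wigan PCT') -> True
#      looks_like_pct('NHS Choices') -> False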
def recategorize(options): from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME, SECONDARY_THEMES, Themes) stats = StatsList() stats.report_value_limit = 1000 if options.dataset: pkg = model.Package.get(options.dataset) assert pkg packages = [pkg] else: packages = get_packages(publisher=options.publisher, theme=None, uncategorized=options.uncategorized, limit=options.limit) # process the list of themes we are interested in setting on packages themes = Themes.instance() if options.theme: theme_filter = set(options.theme.split(',')) for theme in theme_filter: assert theme in themes.data, '"%s" not in %r' % ( theme, themes.data.keys()) else: theme_filter = themes.data themes_to_write = {} # pkg_name:themes for pkg in packages: print 'Dataset: %s' % pkg.name themes = categorize_package(pkg) existing_theme = pkg.extras.get(PRIMARY_THEME) pkg_identity = '%s (%s)' % (pkg.name, existing_theme) if not themes: print stats.add('Cannot decide theme', pkg_identity) continue if themes[0] not in theme_filter: print stats.add('Not interested in theme', pkg_identity) continue if existing_theme == themes[0]: print stats.add('Theme unchanged %s' % themes[0], pkg_identity) continue print stats.add('Recategorized to %s' % themes[0], pkg_identity) if options.write: themes_to_write[pkg.name] = themes print 'Recategorize summary:' print stats.report() if options.write: write_themes(themes_to_write)
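# recategorize() only applies themes listed in the --theme option (comma
# separated) and asserts each one is known. A sketch of that option handling
# on its own, assuming Themes.instance().data keys are the valid theme names
# as in the code above; the example theme names are illustrative:
def parse_theme_filter(theme_option, known_themes):
    '''Turn "Health,Crime" into a validated set; an empty option means all
    known themes.'''
    if not theme_option:
        return set(known_themes)
    theme_filter = set(theme_option.split(','))
    unknown = theme_filter - set(known_themes)
    if unknown:
        raise ValueError('Unknown themes %r - expected some of %r'
                         % (sorted(unknown), sorted(known_themes)))
    return theme_filter

# e.g. parse_theme_filter('Health,Crime', ['Health', 'Crime', 'Environment'])
#      -> set(['Health', 'Crime'])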
def wms_revisions(options): ''' These revisions look like this: # select url from resource_revision where id='3b157e17-cef2-43dc-b0ce-76de18549852' order by revision_timestamp; http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0 http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0 http://www.acas.org.uk/CHttpHandler.ashx?id=2918&p=0 http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3 http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.1.1 http://www.acas.org.uk/CHttpHandler.ashx?service=WFS&request=GetCapabilities&version=2.0 http://www.acas.org.uk/CHttpHandler.ashx?service=WMS&request=GetCapabilities&version=1.3 The bad ones have been changed to "?service=" params. These revisions need removing. # Typical revision: id | timestamp | author | message | state | approved_timestamp a2370bd1-b1b8-41b4-9fc1-d38b46d2fbda | 2014-02-22 04:34:56.634442 | co-prod3.dh.bytemark.co.uk | REST API: Update object financial-transactions-data-acas | active | # i.e. author='co-prod3...' (site-user, via API) ''' resources = common.get_resources(state='active', resource_id=options.resource, dataset_name=options.dataset) stats = StatsList() stats.report_value_limit = 1000 total_bad_revisions = 0 need_to_commit = False widgets = ['Resources: ', Percentage(), ' ', Bar(), ' ', ETA()] progress = ProgressBar(widgets=widgets) for res in progress(resources): res = model.Resource.get( res.id) # as the session gets flushed during the loop res_rev_q = model.Session.query(model.ResourceRevision).filter_by( id=res.id).order_by(model.ResourceRevision.revision_timestamp) res_revs = res_rev_q.all() first_res_rev = res_revs[0] if 'request=GetCapabilities&version=' in first_res_rev.url: print add_stat('First revision already was WMS', res, stats) continue # Identify bad revisions by the WMS URL parameters and author bad_res_revs = res_rev_q.filter( model.ResourceRevision.url.ilike( '%?service=W%S&request=GetCapabilities&version=%')).all() if bad_res_revs and \ bad_res_revs[0].revision.author not in ('co-prod3.dh.bytemark.co.uk', 'current_revision_fixer2'): print add_stat('Misidentified', res, stats, 'author=%r' % bad_res_revs[0].revision.author) continue if not bad_res_revs: add_stat('Resource ok', res, stats) continue print ' ' # don't overwrite progress bar print add_stat('Bad revisions', res, stats, '(%d/%d)' % (len(bad_res_revs), len(res_revs))) total_bad_revisions += len(bad_res_revs) # Find the new latest (good) revision bad_res_revs_set = set(bad_res_revs) for res_rev_index in reversed(xrange(len(res_revs))): if res_revs[res_rev_index] not in bad_res_revs_set: latest_good_res_rev = res_revs[res_rev_index] break else: print add_stat('No good revisions', res, stats) continue if not options.write: continue # Delete the revisions and resource_revisions print ' Deleting bad revisions...' def delete_bad_revisions(res_revs): # Build the sql as a list, as it is faster when you have 1000 strings to append sql = [ '''BEGIN; ALTER TABLE package_tag DROP CONSTRAINT package_tag_revision_id_fkey; ALTER TABLE package_extra DROP CONSTRAINT package_extra_revision_id_fkey; ALTER TABLE resource DROP CONSTRAINT resource_revision_id_fkey; ''' ] for res_rev in res_revs: sql.append( "DELETE from resource_revision where id='%s' and revision_id='%s';\n" % (res.id, res_rev.revision_id)) # a revision created (e.g. over the API) can be connect to other # resources or a dataset, so only delete the revision if only # connected to this one. 
if model.Session.query(model.ResourceRevision).\ filter_by(revision_id=res_rev.revision_id).\ count() == 1 and \ model.Session.query(model.PackageRevision).\ filter_by(revision_id=res_rev.revision_id).count() == 0: sql.append("DELETE from revision where id='%s';\n" % res_rev.revision_id) sql.append("UPDATE resource SET revision_id='%s' WHERE id='%s';\n" % \ (latest_good_res_rev.revision_id, res.id)) sql.append(''' ALTER TABLE package_tag ADD CONSTRAINT package_tag_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id); ALTER TABLE package_extra ADD CONSTRAINT package_extra_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id); ALTER TABLE resource ADD CONSTRAINT resource_revision_id_fkey FOREIGN KEY (revision_id) REFERENCES revision(id); COMMIT;''') print ' sql..', model.Session.execute(''.join(sql)) print '.committed' model.Session.remove() def chunks(l, n): '''Yield successive n-sized chunks from l.''' for i in xrange(0, len(l), n): yield l[i:i + n] # chunk revisions in chunks to cope when there are so many widgets = [ 'Creating SQL: ', Counter(), 'k/%sk ' % int(float(len(bad_res_revs)) / 1000.0), Bar(), ' ', ETA() ] progress2 = ProgressBar(widgets=widgets, maxval=int(float(len(bad_res_revs)) / 1000.0) or 1) for chunk_of_bad_res_revs in progress2(chunks(bad_res_revs, 1000)): delete_bad_revisions(chunk_of_bad_res_revs) # Knit together the remaining revisions again print ' Knitting existing revisions back together...' res_rev_q = model.Session.query(model.ResourceRevision).filter_by( id=res.id).order_by(model.ResourceRevision.revision_timestamp) res_revs = res_rev_q.all() latest_res_rev = res_revs[-1] if not latest_res_rev.current: latest_res_rev.current = True for i, res_rev in enumerate(res_revs[:-1]): if res_rev.expired_timestamp != res_revs[i + 1].revision_timestamp: res_rev.expired_timestamp = res_revs[i + 1].revision_timestamp res_rev.expired_id = res_revs[i + 1].revision_id if latest_res_rev.expired_timestamp != END_OF_TIME: latest_res_rev.expired_timestamp = END_OF_TIME if latest_res_rev.expired_id is not None: latest_res_rev.expired_id = None # Correct the URL on the resource model.Session.query(model.Resource).filter_by(id=res.id).update( {'url': latest_res_rev.url}) model.repo.commit_and_remove() print ' ...done' print 'Summary\n', stats.report() print 'Total bad revs: %d' % total_bad_revisions if options.write and need_to_commit: model.repo.commit_and_remove() print 'Written'
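# After deleting the bad ResourceRevision rows, wms_revisions() "knits" the
# survivors back together so each expired_timestamp matches the next row's
# revision_timestamp and the last row stays current until END_OF_TIME. The
# same repair expressed over a plain ordered list (a sketch; END_OF_TIME is
# the sentinel datetime the code above already uses):
def knit_revisions(res_revs, end_of_time):
    '''Fix up expired_timestamp/expired_id links on an ordered list of
    resource revisions and mark the last one current.'''
    for i, res_rev in enumerate(res_revs[:-1]):
        nxt = res_revs[i + 1]
        if res_rev.expired_timestamp != nxt.revision_timestamp:
            res_rev.expired_timestamp = nxt.revision_timestamp
            res_rev.expired_id = nxt.revision_id
    latest = res_revs[-1]
    latest.current = True
    latest.expired_timestamp = end_of_time
    latest.expired_id = None

# e.g. knit_revisions(res_rev_q.all(), END_OF_TIME)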
def fetch(cls, site_url_filter, since_datetime): import ckan.model as model from running_stats import StatsList log = logging.getLogger(__name__) stats = StatsList() # Use the generate_entries generator to get all of # the entries from the ODI Atom feed. This should # correctly handle all of the pages within the feed. import ckanext.certificates.client as client for entry in client.generate_entries(since=since_datetime): # We have to handle the case where the rel='about' might be # missing, if so we'll ignore it and catch it next time about = entry.get('about', '') if not about: log.debug( stats.add( 'Ignore - no rel="about" specifying the dataset', '%s "%s" %s' % (about, entry['title'], entry['id']))) continue if not site_url_filter.search(about): log.debug( stats.add( 'Ignore - "about" field does not reference this site', '%s "%s" %s' % (about, entry['title'], entry['id']))) continue if not '/dataset/' in entry['about']: log.debug( stats.add( 'Ignore - is "about" DGU but not a dataset', '%s "%s" %s' % (about, entry['about'], entry['id']))) continue pkg = cls._get_package_from_url(entry.get('about')) if not pkg: log.error( stats.add( 'Unable to find the package', '%s "%s" %s %r' % (about, entry['about'], entry['id'], entry.get('about')))) continue # Build the JSON subset we want to describe the certificate badge_data = client.get_badge_data(entry['alternate']) if not badge_data: log.info( stats.add( 'Error fetching badge data - skipped', '%s "%s" %s' % (about, entry['title'], entry['id']))) continue badge_data['cert_title'] = entry.get( 'content', '') # e.g. 'Basic Level Certificate' badge_json = json.dumps(badge_data) if pkg.extras.get('odi-certificate') == badge_json: log.debug( stats.add('Certificate unchanged', badge_data['certificate_url'])) else: operation = 'updated' if 'odi-certificate' in pkg.extras \ else 'added' model.repo.new_revision() pkg.extras['odi-certificate'] = json.dumps(badge_data) log.debug( stats.add( 'Certificate %s' % operation, '"%s" %s' % (badge_data['title'], badge_data['certificate_url']))) model.Session.commit() log.info('Summary:\n' + stats.report())
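# fetch() avoids creating a new revision when the stored certificate JSON is
# byte-for-byte identical to the freshly fetched badge data, so repeated runs
# are idempotent. That "only write on change" step can be factored out; a
# sketch (the helper name and signature are illustrative, not from the
# original code):
def set_extra_if_changed(pkg, key, value_json, model):
    '''Write pkg.extras[key] inside a new revision only if the value changed.
    Returns True if a write happened.'''
    if pkg.extras.get(key) == value_json:
        return False
    model.repo.new_revision()
    pkg.extras[key] = value_json
    model.Session.commit()
    return True

# e.g. set_extra_if_changed(pkg, 'odi-certificate',
#                           json.dumps(badge_data), model)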
def canada_extras():
    keys_changed = StatsCount()
    unmapped_keys = StatsList()
    licenses_changed = StatsCount()
    unmapped_licenses = StatsList()
    licenses = StatsList()
    key_mapping = {
        'Level of Government': 'level_of_government',
        }
    license_mapping = {
        # CS: bad_spelling ignore
        'http://geogratis.ca/geogratis/en/licence.jsp': 'geogratis',
        'Crown Copyright': 'canada-crown',
        }
    from ckan import model
    rev = RevisionManager('Standardize extra keys', 10)
    for pkg in model.Session.query(model.Package):
        for old_key, new_key in key_mapping.items():
            if pkg.extras.has_key(old_key):
                rev.before_change()
                pkg.extras[new_key] = pkg.extras[old_key]
                del pkg.extras[old_key]
                keys_changed.increment(old_key)
                rev.after_change()
        for license_key in ('License', 'License URL'):
            if pkg.extras.has_key(license_key):
                old_license = pkg.extras[license_key]
                if old_license in license_mapping:
                    rev.before_change()
                    pkg.license_id = unicode(license_mapping[old_license])
                    del pkg.extras[license_key]
                    licenses_changed.increment(old_license)
                    rev.after_change()
                else:
                    unmapped_licenses.add(old_license, pkg.name)
        licenses.add(pkg.license_id, pkg.name)
        for key in pkg.extras.keys():
            if key not in key_mapping.keys() and \
               key not in key_mapping.values():
                unmapped_keys.add(key, pkg.name)
    rev.finished()
    print 'Packages: %i' % model.Session.query(model.Package).count()
    print 'Changed keys:\n', keys_changed.report()
    print 'Unmapped keys:\n', unmapped_keys.report()
    print 'Changed licenses:\n', licenses_changed.report()
    print 'Unmapped licenses:\n', unmapped_licenses.report()
    print 'Licenses:\n', licenses.report()
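# canada_extras() renames extras via a plain old->new key mapping. The rename
# step itself, without the revision bookkeeping, is just this (a sketch; the
# real key_mapping is defined in the function above and the example value is
# illustrative):
def rename_extras(extras, key_mapping):
    '''Move values from old extra keys to new ones in place; return the list
    of old keys that were renamed.'''
    renamed = []
    for old_key, new_key in key_mapping.items():
        if old_key in extras:
            extras[new_key] = extras.pop(old_key)
            renamed.append(old_key)
    return renamed

# e.g. rename_extras({'Level of Government': 'Federal'},
#                    {'Level of Government': 'level_of_government'})
#      -> ['Level of Government']   (the dict now holds the new key)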