def command(self):
    from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
    from ckanext.dgu.ons.importer import OnsImporter
    from ckanext.dgu.ons.loader import OnsLoader

    ApiCommand.command(self)
    log = logging.getLogger(__name__)

    try:
        if self.options.days:
            self.options.days = int(self.options.days)
        if self.options.start_date:
            self.options.start_date = self.parse_date(self.options.start_date)
        if self.options.end_date:
            self.options.end_date = self.parse_date(self.options.end_date)
        if self.options.month:
            self.options.month = self.parse_month(self.options.month)
        if self.options.months_since:
            self.options.months_since = self.parse_month(self.options.months_since)
        if not self.options.ons_cache_dir:
            self.options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

        if self.options.days or \
                self.options.start_date or \
                self.options.end_date:
            data_filepaths = OnsData.download_flexible(
                days=self.options.days,
                start_date=self.options.start_date,
                end_date=self.options.end_date,
                ons_cache_dir=self.options.ons_cache_dir)
        elif self.options.month:
            data_filepaths = OnsData.download_month(
                year=self.options.month.year,
                month=self.options.month.month)
        elif self.options.months_since:
            data_filepaths = OnsData.download_months_since(
                year=self.options.months_since.year,
                month=self.options.months_since.month,
                force_download=self.options.force_download)
        elif self.options.all_time:
            data_filepaths = OnsData.download_all(
                force_download=self.options.force_download)
        else:
            self.parser.error('Please specify a time period')

        filter_ = {}
        if self.options.publisher:
            filter_['publisher'] = self.options.publisher

        stats = StatsList()
        importer = OnsImporter(filepaths=data_filepaths,
                               ckanclient=self.client,
                               stats=stats, filter_=filter_)
        loader = OnsLoader(self.client, stats)
        loader.load_packages(importer.pkg_dict())
        log.info('Summary:\n' + stats.report())
    except:
        # Any problem, make sure it gets logged
        log.exception('ONS Loader exception')
        raise
def change_publisher(from_publisher_name, to_publisher_name, options):
    from ckan import model
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script_dataset_change_publisher'
    needs_commit = False

    from_publisher = model.Group.get(from_publisher_name)
    to_publisher = model.Group.get(to_publisher_name)
    datasets = common.get_datasets(dataset_name=options.dataset,
                                   organization_ref=from_publisher_name)
    assert to_publisher

    for dataset in datasets:
        member = model.Session.query(model.Member) \
                      .filter_by(group_id=from_publisher.id) \
                      .filter_by(table_name='package') \
                      .filter_by(table_id=dataset.id) \
                      .first()
        if member:
            print stats.add('Change owner_id and Member', dataset.name)
        else:
            print stats.add('Change owner_id but no Member', dataset.name)
        if options.write:
            dataset.owner_org = to_publisher.id
            if member:
                member.group_id = to_publisher.id
            needs_commit = True

    print stats.report()
    if options.write and needs_commit:
        model.repo.commit_and_remove()
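# Note: the scripts collected here all rely on StatsList from
# ckanext.dgu.bin.running_stats to tally outcomes. Purely as an illustrative
# sketch (not the real implementation), a minimal stand-in that mirrors the
# interface these scripts rely on - add() records an item under a category and
# returns a printable line, report() summarises the counts, and categories can
# be read back dict-style - might look like this:

from collections import defaultdict


class MiniStatsList(defaultdict):
    '''Hypothetical stand-in for StatsList, for illustration only.'''

    def __init__(self):
        super(MiniStatsList, self).__init__(list)

    def add(self, category, item):
        # Record the item under its category and return a line suitable for
        # print/log.info, as the callers above and below do.
        self[category].append(item)
        return '%s: %s' % (category, item)

    def report(self):
        # One line per category with the number of items recorded against it.
        return '\n'.join('%s: %i' % (category, len(items))
                         for category, items in sorted(self.items()))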
'''
import os
import sys
import glob
from itertools import chain

import ckanext.dgu.bin.common as common
from optparse import OptionParser

from ckan import model
from ckanext.archiver.model import Archival
from ckanext.dgu.bin.running_stats import StatsList

DEFAULT_CACHE_DIR = '/media/hulk/ckan_resource_cache/'

stats = StatsList()


class CleanCachedResources(object):
    @classmethod
    def command(cls, config_ini, cache_dir, delete):
        common.load_config(config_ini)
        common.register_translator()

        #rev = model.repo.new_revision()
        #rev.author = 'fix_secondary_theme.py'

        no_archival = []
        deleted_res = []

        for f in glob.glob(os.path.join(cache_dir, '*/*/*')):
            a = model.Session.query(Archival).filter(
'''
import re
import os
import logging
import sys

import requests
from pylons import config
from nose.tools import assert_equal
from lxml import etree

from ckanext.dgu.bin import common
from ckanext.dgu.bin.running_stats import StatsList

service_stats = StatsList()
couple_stats = StatsList()
additional_couple_stats = StatsList()


class FindError(Exception):
    pass


class CoupledResources(object):
    @classmethod
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
def command(self):
    self._load_config()
    log = logging.getLogger('ckanext')

    import ckan.model as model
    from ckanext.dgu.bin.running_stats import StatsList
    from ckanext.dgu.lib.publisher_matcher import PublisherMatcher

    model.Session.remove()
    model.Session.configure(bind=model.meta.engine)
    model.repo.new_revision()
    log.info("Database access initialised")

    self.working_directory = self.args[0]
    log.info("Working directory set to %s" % self.working_directory)

    start = time.time()
    self.authorities_file = self._get_authorities_csv()

    # Read in the WDTK publishers and store in matcher
    wdtk_publishers = {}  # slug: name
    matcher = PublisherMatcher()
    with open(self.authorities_file, 'rU') as f:
        reader = csv.reader(f)
        for row in reader:
            name, short_name, slug = row[0:3]
            matcher.add_external_publisher(slug, name, short_name)
            wdtk_publishers[slug] = name.replace('\x92', "'").decode('utf8')

    # Match up DGU publishers
    publishers = model.Session.query(model.Group) \
        .filter(model.Group.type == 'publisher') \
        .filter(model.Group.state == 'active').all()
    log.info("Found %d publishers to process in DB" % len(publishers))

    match_stats = StatsList()
    for publisher in publishers:
        match = matcher.match_to_external_publisher(publisher.title)
        if not match:
            match = matcher.match_to_external_publisher(
                publisher.extras.get('abbreviation', ''))
        if not match:
            match = matcher.match_to_external_publisher(
                re.sub('[-_]+', ' ', publisher.name))
        if not match and publisher.name in direct_matches:
            match = direct_matches[publisher.name]
            log.info(match_stats.add('Direct match', publisher.name))
            continue

        # We don't want to write any details automatically if we have
        # any existing phone, email or web details for FOI.
        have_previous_details = any([
            publisher.extras.get('foi-phone'),
            publisher.extras.get('foi-email'),
            publisher.extras.get('foi-web')
        ])

        if not match:
            if have_previous_details:
                log.info(match_stats.add(
                    'No match but already have FOI details', publisher.name))
            else:
                log.info(match_stats.add(
                    'No match and still needs FOI details', publisher.name))
            continue

        # Save the publisher
        log.info('%s matches WDTK %s', publisher.name, match)

        # Store the match. Used for publisher_sync and publicbodies/nomen work.
        if not DRY_RUN and publisher.extras.get('wdtk-id') != match and \
                publisher.extras.get('wdtk-title') != wdtk_publishers[match]:
            publisher.extras['wdtk-id'] = match
            publisher.extras['wdtk-title'] = wdtk_publishers[match]
            model.Session.commit()

        # Check if previous WDTK details are still correct
        wdtk_url = WDTK_REQUEST_URL % match
        if 'whatdotheyknow' in publisher.extras.get('foi-web', ''):
            if publisher.extras['foi-web'] == wdtk_url:
                log.info(match_stats.add(
                    'Match, but already have WDTK FOI details',
                    publisher.name))
                continue
            else:
                log.info(match_stats.add(
                    'Match, and correcting WDTK FOI details', publisher.name))
        elif have_previous_details:
            log.info(match_stats.add(
                'Match, but already have FOI details', publisher.name))
            continue
        else:
            log.info(match_stats.add(
                'Match and added FOI details', publisher.name))

        if not DRY_RUN:
            publisher.extras['foi-web'] = wdtk_url
            model.Session.commit()

    print 'Full list of publishers not matched:'
    for name in match_stats['No match and still needs FOI details'] + \
            match_stats['No match but already have FOI details']:
        print name, repr(model.Group.by_name(name).title)

    end = time.time()
    took = str(datetime.timedelta(seconds=end - start))
    log.info('Time taken: %s' % took)

    print match_stats.report()

    if DRY_RUN:
        print 'NB: No changes made - this was a dry run'
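# The WDTK authorities CSV read above is only assumed (from the row unpacking)
# to carry the body name, short name and WDTK slug in its first three columns.
# A hypothetical sample file for local testing (filename and values invented
# for illustration) could be produced like this:

import csv

with open('wdtk_authorities_sample.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['Example Government Department',     # name
                     'EGD',                                # short_name
                     'example_government_department'])     # slug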
def fix_links(csv_filepath, write=False):
    from ckan import model
    stats = StatsList()
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Link fix from CSV'
    needs_commit = False

    with open(csv_filepath, 'rU') as f:
        reader = csv.reader(f)
        header = reader.next()
        assert header == ['NS Title', 'Bad link', 'Good link'], header
        for row in reader:
            ns_title, bad_link, good_link = row

            # Find the package and resource
            pkg_title = ns_title.split(' - ')[0]
            res_title = ' - '.join(ns_title.split(' - ')[1:])
            pkgs = model.Session.query(model.Package)\
                        .filter_by(title=pkg_title)\
                        .filter_by(state='active')\
                        .filter(model.Package.notes.like('%Source agency%'))\
                        .all()
            if not pkgs:
                print stats.add('Package title did not match', ns_title)
                continue
            if len(pkgs) > 1:
                print stats.add('Multiple package title matches', ns_title)
                continue
            pkg = pkgs[0]
            for res_ in pkg.resources:
                if res_.description[:len(res_title)] == res_title and \
                        'hub-id' in res_.extras:
                    res = res_
                    break
            else:
                print stats.add('Resource title did not match', ns_title)
                continue

            # Update the link
            if res.url == good_link:
                print stats.add('Resource URL already fixed', ns_title)
                continue
            if res.url != bad_link and res.url.startswith(
                    'http://webarchive.nationalarchives.gov.uk'):
                print stats.add(
                    'Resource is already pointing to the webarchive - leave it',
                    ns_title)
                continue
            if res.url != bad_link:
                print stats.add('Resource URL is not expected', ns_title)
                continue

            if write:
                print stats.add('Update link (written)', ns_title)
                res.url = good_link
                needs_commit = True
            else:
                print stats.add('Update link (not written)', ns_title)

    print stats.report()
    if write and needs_commit:
        model.repo.commit_and_remove()
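# fix_links() above asserts that the CSV header is exactly
# ['NS Title', 'Bad link', 'Good link'] and splits 'NS Title' on ' - ' into a
# package title and a resource description. A hypothetical input file for a
# dry run (filename and values invented for illustration) could be prepared
# like this:

import csv

with open('link_fixes_sample.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerow(['NS Title', 'Bad link', 'Good link'])
    writer.writerow(['Some Statistics - 2012 edition',       # pkg - resource
                     'http://example.com/old/broken.csv',    # bad_link
                     'http://example.com/new/working.csv'])  # good_link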
def fix_duplicates(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Fix duplicate resources'
    needs_commit = False
    stats = StatsList()

    pkgs = model.Session.query(model.Package)\
                .filter_by(state='active')\
                .join(model.PackageExtra)\
                .filter_by(state='active')\
                .filter_by(key='external_reference')\
                .filter_by(value='ONSHUB')\
                .order_by(model.Package.name)
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        pkgs = pkgs.filter(model.Package.id == pkg.id)
    pkgs = pkgs.all()

    for pkg in pkgs:
        previous_resources = {}

        def get_res_properties(resource):
            return {'url': resource.url,
                    'hub-id': resource.extras.get('hub-id'),
                    'date': resource.extras.get('date'),
                    'publish-date': resource.extras.get('publish-date')}

        def is_res_broken(resource):
            archival = Archival.get_for_resource(resource.id)
            if not archival:
                return None
            return archival.is_broken

        has_duplicates = False
        if not pkg.resources:
            print stats.add('No resources', pkg.name)
        for res in pkg.resources:
            res_properties = get_res_properties(res)
            res_identity = '%s %s' % (pkg.name, res.description)
            if res.description in previous_resources:
                has_duplicates = True
                prev_res = previous_resources[res.description]
                prev_res_properties = get_res_properties(prev_res)

                if res_properties == prev_res_properties:
                    needs_commit = True
                    print stats.add('Resource identical - dedupe', res_identity)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['date'] != res_properties['date']:
                    print stats.add('Resource same description, different date in timeseries - ok', res_identity)
                elif prev_res_properties['hub-id'] and res_properties['hub-id'] and \
                        prev_res_properties['hub-id'] != res_properties['hub-id']:
                    print stats.add('Resource same description, different hub-id - ok', res_identity)
                elif prev_res_properties['hub-id'] and \
                        prev_res_properties['hub-id'] == res_properties['hub-id']:
                    needs_commit = True
                    print stats.add('Resource with same hub-id - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['url'] == res_properties['url']:
                    needs_commit = True
                    print stats.add('Resource same description & url, different other properties - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif is_res_broken(prev_res) or is_res_broken(res):
                    print stats.add('Resource same description, different properties, some breakage - delete one', res_identity)
                    if is_res_broken(prev_res):
                        print 'BROKEN:'
                        pprint(prev_res_properties)
                    if is_res_broken(res):
                        print 'BROKEN:'
                        pprint(res_properties)
                else:
                    print stats.add('Resource same description, different properties - manual decision', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
            previous_resources[res.description] = res
        if not has_duplicates:
            print stats.add('Package without duplicates', pkg.name)

    print stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'