def change_publisher(from_publisher_name, to_publisher_name, options): from ckan import model stats = StatsList() if options.write: rev = model.repo.new_revision() rev.author = 'script_dataset_change_publisher' needs_commit = False from_publisher = model.Group.get(from_publisher_name) to_publisher = model.Group.get(to_publisher_name) datasets = common.get_datasets(dataset_name=options.dataset, organization_ref=from_publisher_name) assert to_publisher for dataset in datasets: member = model.Session.query(model.Member) \ .filter_by(group_id=from_publisher.id) \ .filter_by(table_name='package') \ .filter_by(table_id=dataset.id) \ .first() if member: print stats.add('Change owner_id and Member', dataset.name) else: print stats.add('Change owner_id but no Member', dataset.name) if options.write: dataset.owner_org = to_publisher.id if member: member.group_id = to_publisher.id needs_commit = True print stats.report() if options.write and needs_commit: model.repo.commit_and_remove()
log.warning('Could not get GUID from Gemini downloaded href:"%s"', \ href) couples_all_detected = False break except etree.XMLSyntaxError, e: couple_stats.add('Could not parse "Gemini" downloaded', couple_id) log.warning('Could not parse "Gemini" downloaded href:"%s"', \ href) couples_all_detected = False break try: harvest_object = cls.find_harvest_object_by_guid(guid) except FindError, e: log.error('%s href=%s couple=%s', e, href, couple_id) couple_stats.add(str(e), couple_id) couples_all_detected = False continue dataset_record = harvest_object.package #res.resource_group.package couple_stats.add('Couple completed', couple_id) log.info('Couple completed %s <-> %s', service_record.name, dataset_record.name) cls.add_coupling(service_record, dataset_record, harvest_object, href) couples_detected = True if coupled_resources: if couples_all_detected: service_stats.add('Service couples all completed', service_record.name) elif couples_detected:
def command(self): self._load_config() log = logging.getLogger('ckanext') import ckan.model as model from ckanext.dgu.bin.running_stats import StatsList from ckanext.dgu.lib.publisher_matcher import PublisherMatcher model.Session.remove() model.Session.configure(bind=model.meta.engine) model.repo.new_revision() log.info("Database access initialised") self.working_directory = self.args[0] log.info("Working directory set to %s" % self.working_directory) start = time.time() self.authorities_file = self._get_authorities_csv() # Read in the WDTK publishers and store in matcher wdtk_publishers = {} # slug: name matcher = PublisherMatcher() with open(self.authorities_file, 'rU') as f: reader = csv.reader(f) for row in reader: name, short_name, slug = row[0:3] matcher.add_external_publisher(slug, name, short_name) wdtk_publishers[slug] = name.replace('\x92', "'").decode('utf8') # Match up DGU publishers publishers = model.Session.query(model.Group) \ .filter(model.Group.type == 'publisher') \ .filter(model.Group.state == 'active').all() log.info("Found %d publishers to process in DB" % len(publishers)) match_stats = StatsList() for publisher in publishers: match = matcher.match_to_external_publisher(publisher.title) if not match: match = matcher.match_to_external_publisher( publisher.extras.get('abbreviation', '')) if not match: match = matcher.match_to_external_publisher( re.sub('[-_]+', ' ', publisher.name)) if not match and publisher.name in direct_matches: match = direct_matches[publisher.name] log.info(match_stats.add('Direct match', publisher.name)) continue # We don't want to write any details automatically if we have # any existing phone, email or web details for FOI. have_previous_details = any([ publisher.extras.get('foi-phone'), publisher.extras.get('foi-email'), publisher.extras.get('foi-web') ]) if not match: if have_previous_details: log.info( match_stats.add( 'No match but already have FOI details', publisher.name)) else: log.info( match_stats.add('No match and still needs FOI details', publisher.name)) continue # Save the publisher log.info('%s matches WDTK %s', publisher.name, match) # Store the match. Used for publisher_sync and publicbodies/nomen work. if not DRY_RUN and publisher.get('wdtk-id') != match and \ publisher.get('wdtk-title') != wdtk_publishers[match]: publisher.extras['wdtk-id'] = match publisher.extras['wdtk-title'] = wdtk_publishers[match] model.Session.commit() # Check if previous WDTK details are still correct wdtk_url = WDTK_REQUEST_URL % match if 'whatdotheyknow' in publisher.extras.get('foi-web', ''): if publisher.extras['foi-web'] == wdtk_url: log.info( match_stats.add( 'Match, but already have WDTK FOI details', publisher.name)) continue else: log.info( match_stats.add( 'Match, and correcting WDTK FOI details', publisher.name)) elif have_previous_details: log.info( match_stats.add('Match, but already have FOI details', publisher.name)) continue else: log.info( match_stats.add('Match and added FOI details', publisher.name)) if not DRY_RUN: publisher.extras['foi-web'] = wdtk_url model.Session.commit() print 'Full list of publishers not matched:' for name in match_stats[ 'No match and still needs FOI details'] + match_stats[ 'No match but already have FOI details']: print name, repr(model.Group.by_name(name).title) end = time.time() took = str(datetime.timedelta(seconds=end - start)) log.info('Time taken: %s' % took) print match_stats.report() if DRY_RUN: print 'NB: No changes made - this was a dry run'
def command(self): self._load_config() log = logging.getLogger('ckanext') import ckan.model as model from ckanext.dgu.bin.running_stats import StatsList from ckanext.dgu.lib.publisher_matcher import PublisherMatcher model.Session.remove() model.Session.configure(bind=model.meta.engine) model.repo.new_revision() log.info("Database access initialised") self.working_directory = self.args[0] log.info("Working directory set to %s" % self.working_directory) start = time.time() self.authorities_file = self._get_authorities_csv() # Read in the WDTK publishers and store in matcher wdtk_publishers = {} # slug: name matcher = PublisherMatcher() with open(self.authorities_file, 'rU') as f: reader = csv.reader(f) for row in reader: name, short_name, slug = row[0:3] matcher.add_external_publisher(slug, name, short_name) wdtk_publishers[slug] = name.replace('\x92', "'").decode('utf8') # Match up DGU publishers publishers = model.Session.query(model.Group) \ .filter(model.Group.type == 'publisher') \ .filter(model.Group.state == 'active').all() log.info("Found %d publishers to process in DB" % len(publishers)) match_stats = StatsList() for publisher in publishers: match = matcher.match_to_external_publisher(publisher.title) if not match: match = matcher.match_to_external_publisher(publisher.extras.get('abbreviation', '')) if not match: match = matcher.match_to_external_publisher(re.sub('[-_]+', ' ', publisher.name)) if not match and publisher.name in direct_matches: match = direct_matches[publisher.name] log.info(match_stats.add('Direct match', publisher.name)) continue # We don't want to write any details automatically if we have # any existing phone, email or web details for FOI. have_previous_details = any([publisher.extras.get('foi-phone'), publisher.extras.get('foi-email'), publisher.extras.get('foi-web')]) if not match: if have_previous_details: log.info(match_stats.add('No match but already have FOI details', publisher.name)) else: log.info(match_stats.add('No match and still needs FOI details', publisher.name)) continue # Save the publisher log.info('%s matches WDTK %s', publisher.name, match) # Store the match. Used for publisher_sync and publicbodies/nomen work. if not DRY_RUN and publisher.get('wdtk-id') != match and \ publisher.get('wdtk-title') != wdtk_publishers[match]: publisher.extras['wdtk-id'] = match publisher.extras['wdtk-title'] = wdtk_publishers[match] model.Session.commit() # Check if previous WDTK details are still correct wdtk_url = WDTK_REQUEST_URL % match if 'whatdotheyknow' in publisher.extras.get('foi-web', ''): if publisher.extras['foi-web'] == wdtk_url: log.info(match_stats.add('Match, but already have WDTK FOI details', publisher.name)) continue else: log.info(match_stats.add('Match, and correcting WDTK FOI details', publisher.name)) elif have_previous_details: log.info(match_stats.add('Match, but already have FOI details', publisher.name)) continue else: log.info(match_stats.add('Match and added FOI details', publisher.name)) if not DRY_RUN: publisher.extras['foi-web'] = wdtk_url model.Session.commit() print 'Full list of publishers not matched:' for name in match_stats['No match and still needs FOI details'] + match_stats['No match but already have FOI details']: print name, repr(model.Group.by_name(name).title) end = time.time() took = str(datetime.timedelta(seconds=end-start)) log.info('Time taken: %s' % took) print match_stats.report() if DRY_RUN: print 'NB: No changes made - this was a dry run'
def fix_links(csv_filepath, write=False): from ckan import model stats = StatsList() if write: rev = model.repo.new_revision() rev.author = 'Link fix from CSV' needs_commit = False with open(csv_filepath, 'rU') as f: reader = csv.reader(f) header = reader.next() assert header == ['NS Title', 'Bad link', 'Good link'], header for row in reader: ns_title, bad_link, good_link = row # Find the package and resource pkg_title = ns_title.split(' - ')[0] res_title = ' - '.join(ns_title.split(' - ')[1:]) pkgs = model.Session.query(model.Package)\ .filter_by(title=pkg_title)\ .filter_by(state='active')\ .filter(model.Package.notes.like('%Source agency%'))\ .all() if not pkgs: print stats.add('Package title did not match', ns_title) continue if len(pkgs) > 1: print stats.add('Multiple package title matches', ns_title) continue pkg = pkgs[0] for res_ in pkg.resources: if res_.description[:len( res_title)] == res_title and 'hub-id' in res_.extras: res = res_ break else: print stats.add('Resource title did not match', ns_title) continue # Update the link if res.url == good_link: print stats.add('Resource URL already fixed', ns_title) continue if res.url != bad_link and res.url.startswith( 'http://webarchive.nationalarchives.gov.uk'): print stats.add( 'Resource is already pointing to the webarchive - leave it', ns_title) continue if res.url != bad_link: print stats.add('Resource URL is not expected', ns_title) continue if write: print stats.add('Update link (written)', ns_title) res.url = good_link needs_commit = True else: print stats.add('Update link (not written)', ns_title) print stats.report() if write and needs_commit: model.repo.commit_and_remove()
guid = gemini.read_value("guid") except KeyError, e: couple_stats.add("Could not get GUID from Gemini downloaded" % href, couple_id) log.warning('Could not get GUID from Gemini downloaded href:"%s"', href) couples_all_detected = False break except etree.XMLSyntaxError, e: couple_stats.add('Could not parse "Gemini" downloaded', couple_id) log.warning('Could not parse "Gemini" downloaded href:"%s"', href) couples_all_detected = False break try: harvest_object = cls.find_harvest_object_by_guid(guid) except FindError, e: log.error("%s href=%s couple=%s", e, href, couple_id) couple_stats.add(str(e), couple_id) couples_all_detected = False continue dataset_record = harvest_object.package # res.resource_group.package couple_stats.add("Couple completed", couple_id) log.info("Couple completed %s <-> %s", service_record.name, dataset_record.name) cls.add_coupling(service_record, dataset_record, harvest_object, href) couples_detected = True if coupled_resources: if couples_all_detected: service_stats.add("Service couples all completed", service_record.name) elif couples_detected: service_stats.add("Service couples partially completed", service_record.name)
def fix_duplicates(options): from ckan import model from ckanext.archiver.model import Archival write = options.write if write: rev = model.repo.new_revision() rev.author = 'Fix duplicate resources' needs_commit = False stats = StatsList() pkgs = model.Session.query(model.Package)\ .filter_by(state='active')\ .join(model.PackageExtra)\ .filter_by(state='active')\ .filter_by(key='external_reference')\ .filter_by(value='ONSHUB')\ .order_by(model.Package.name) if options.dataset: pkg = model.Package.get(options.dataset) pkgs = pkgs.filter(model.Package.id==pkg.id) pkgs = pkgs.all() for pkg in pkgs: previous_resources = {} def get_res_properties(resource): return {'url': resource.url, 'hub-id': resource.extras.get('hub-id'), 'date': resource.extras.get('date'), 'publish-date': resource.extras.get('publish-date')} def is_res_broken(resource): archival = Archival.get_for_resource(resource.id) if not archival: return None return archival.is_broken has_duplicates = False if not pkg.resources: print stats.add('No resources', pkg.name) for res in pkg.resources: res_properties = get_res_properties(res) res_identity = '%s %s' % (pkg.name, res.description) if res.description in previous_resources: has_duplicates = True prev_res = previous_resources[res.description] prev_res_properties = get_res_properties(prev_res) if res_properties == prev_res_properties: needs_commit=True print stats.add('Resource indentical - dedupe', res_identity) merge_resources((res, prev_res), write) elif prev_res_properties['date'] != res_properties['date']: print stats.add('Resource same description, different date in timeseries - ok', res_identity) elif prev_res_properties['hub-id'] and res_properties['hub-id'] and prev_res_properties['hub-id'] != res_properties['hub-id']: print stats.add('Resource same description, different hub-id - ok', res_identity) elif prev_res_properties['hub-id'] and prev_res_properties['hub-id'] == res_properties['hub-id']: needs_commit=True print stats.add('Resource with same hub-id - merge', res_identity) pprint(prev_res_properties) pprint(res_properties) merge_resources((res, prev_res), write) elif prev_res_properties['url'] == res_properties['url']: needs_commit=True print stats.add('Resource same description & url, different other properties - merge', res_identity) pprint(prev_res_properties) pprint(res_properties) merge_resources((res, prev_res), write) elif is_res_broken(prev_res) or is_res_broken(res): print stats.add('Resource same description, different properties, some breakage - delete one', res_identity) if is_res_broken(prev_res): print 'BROKEN:' pprint(prev_res_properties) if is_res_broken(res): print 'BROKEN:' pprint(res_properties) else: print stats.add('Resource same description, different properties - manual decision', res_identity) pprint(prev_res_properties) pprint(res_properties) previous_resources[res.description] = res if not has_duplicates: print stats.add('Package without duplicates', pkg.name) print stats.report() if write and needs_commit: print 'Writing...' model.repo.commit_and_remove() print '...done' elif write: print 'Nothing to write' else: print 'Not written'
def fix_links(csv_filepath, write=False): from ckan import model stats = StatsList() if write: rev = model.repo.new_revision() rev.author = 'Link fix from CSV' needs_commit = False with open(csv_filepath, 'rU') as f: reader = csv.reader(f) header = reader.next() assert header == ['NS Title', 'Bad link', 'Good link'], header for row in reader: ns_title, bad_link, good_link = row # Find the package and resource pkg_title = ns_title.split(' - ')[0] res_title = ' - '.join(ns_title.split(' - ')[1:]) pkgs = model.Session.query(model.Package)\ .filter_by(title=pkg_title)\ .filter_by(state='active')\ .filter(model.Package.notes.like('%Source agency%'))\ .all() if not pkgs: print stats.add('Package title did not match', ns_title) continue if len(pkgs) > 1: print stats.add('Multiple package title matches', ns_title) continue pkg = pkgs[0] for res_ in pkg.resources: if res_.description[:len(res_title)] == res_title and 'hub-id' in res_.extras: res = res_ break else: print stats.add('Resource title did not match', ns_title) continue # Update the link if res.url == good_link: print stats.add('Resource URL already fixed', ns_title) continue if res.url != bad_link and res.url.startswith('http://webarchive.nationalarchives.gov.uk'): print stats.add('Resource is already pointing to the webarchive - leave it', ns_title) continue if res.url != bad_link: print stats.add('Resource URL is not expected', ns_title) continue if write: print stats.add('Update link (written)', ns_title) res.url = good_link needs_commit = True else: print stats.add('Update link (not written)', ns_title) print stats.report() if write and needs_commit: model.repo.commit_and_remove()