def _get_pkg(self, pkg_ref):
    """Return the package entity for ``pkg_ref``, fetching it at most once.

    On a cache miss the package is requested from ``self.ckanclient``,
    passed through ``remove_readonly_fields`` and stored in
    ``self._pkg_cache``; later calls with the same reference return the
    cached object.

    Raises:
        ScriptError: if the client's ``last_status`` after the request is
            not 200. Nothing is cached in that case.
    """
    # ``dict.has_key`` is deprecated and no longer exists in Python 3;
    # the ``in`` operator is the equivalent membership test.
    if pkg_ref not in self._pkg_cache:
        pkg = self.ckanclient.package_entity_get(pkg_ref)
        if self.ckanclient.last_status != 200:
            raise ScriptError("Could not get package ID %s: %r" %
                              (pkg_ref, self.ckanclient.last_status))
        remove_readonly_fields(pkg)
        self._pkg_cache[pkg_ref] = pkg
    return self._pkg_cache[pkg_ref]
def _get_pkg(self, pkg_ref):
    """Return the package entity for ``pkg_ref``, fetching it at most once.

    On a cache miss the package is requested from ``self.ckanclient``,
    passed through ``remove_readonly_fields`` and stored in
    ``self._pkg_cache``; later calls with the same reference return the
    cached object.

    Raises:
        ScriptError: if the client's ``last_status`` after the request is
            not 200. Nothing is cached in that case.
    """
    # ``dict.has_key`` is deprecated and no longer exists in Python 3;
    # the ``in`` operator is the equivalent membership test.
    if pkg_ref not in self._pkg_cache:
        pkg = self.ckanclient.package_entity_get(pkg_ref)
        if self.ckanclient.last_status != 200:
            # Parentheses continue the expression; no backslash is needed.
            raise ScriptError('Could not get package ID %s: %r' %
                              (pkg_ref, self.ckanclient.last_status))
        remove_readonly_fields(pkg)
        self._pkg_cache[pkg_ref] = pkg
    return self._pkg_cache[pkg_ref]
def run(self):
    """Move ONS-imported 'Education' packages to the Department for Education.

    Searches CKAN for packages matching 'Education', pages through the
    results and, for every package that passes the filters below, sets
    ``extras['department']`` and ``author`` to 'Department for Education'
    and blanks ``extras['agency']``.

    A package is rejected (and counted under its reason) when:
      * ``extras['import_source']`` does not contain 'ONS';
      * its state is not 'active';
      * the 'Source agency' line(s) in its notes are not exactly
        'Education';
      * its department already contains 'Department for Education'.

    When ``self.dry_run`` is set nothing is written back, but the packages
    that would have been updated are still counted. A summary of processed
    and rejected packages is printed at the end.
    """
    limit = 100

    def search(page=None):
        """Run the 'Education' search, optionally for one result page."""
        opts = {
            # 'external_reference': 'ONSHUB',
            'limit': limit,
        }
        if page is not None:
            opts['offset'] = page * limit
        return self.ckanclient.package_search(
            'Education',
            # 'Source agency: Education',
            opts)

    res = search()
    print('Found %i packages possibly related.' % res['count'])
    pkgs_done = []
    pkgs_rejected = defaultdict(list)  # reason: [pkgs]

    # Ceiling division: plain division floors the page count, which skips
    # the final partial page (and every result when count < limit).
    num_pages = (res['count'] + limit - 1) // limit
    for page in range(num_pages):
        res = search(page)
        for pkg_ref in res['results']:
            pkg = self.ckanclient.package_entity_get(pkg_ref)
            extras = pkg['extras']
            if 'ONS' not in extras.get('import_source', ''):
                pkgs_rejected['Not imported from ONS'].append(pkg)
                continue
            if pkg.get('state', 'active') != 'active':
                pkgs_rejected['Package state = %r' %
                              pkg.get('state')].append(pkg)
                continue
            # Notes may be None for packages without a description.
            notes = pkg['notes'] or ''
            source_agency = '|'.join(
                line.replace('Source agency:', '').strip()
                for line in notes.split('\n')
                if 'Source agency' in line)
            if source_agency != 'Education':
                pkgs_rejected['Source agency = %r' %
                              source_agency].append(pkg)
                continue
            if 'Department for Education' in extras.get('department', ''):
                pkgs_rejected['Department = %r' %
                              extras.get('department', '')].append(pkg)
                continue

            pkg_name = pkg['name']
            dept = extras.get('department')
            agency = extras.get('agency')
            author = pkg['author']
            print('%s :\n %r %r %r' % (pkg_name, dept, agency, author))
            if not self.dry_run:
                extras['department'] = 'Department for Education'
                extras['agency'] = ''
                pkg['author'] = 'Department for Education'
                remove_readonly_fields(pkg)
                self.ckanclient.package_entity_put(pkg)
                print('...done')
            pkgs_done.append(pkg)

    print('Processed %i packages' % len(pkgs_done))
    print('Rejected packages:')
    for reason, pkgs in pkgs_rejected.items():
        print(' %i: %s' % (len(pkgs), reason))
# run: walk every package in the CKAN register and mark active ONS Hub
# packages that have no resources as deleted.
#
# Visible flow, for each reference returned by package_register_get():
#   * fetch the package; a CkanApiError is logged, recorded in
#     pkgs_rejected under a 'Could not get package' reason and skipped;
#   * take a deep copy so later changes can be detected;
#   * skip (and record) packages whose state is not 'active' or whose
#     extras['external_reference'] is not 'ONSHUB';
#   * set state to 'deleted' when pkg['resources'] is an empty list;
#   * skip (and record) packages that still equal the deep copy;
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package; a CkanApiError is logged, recorded and skipped;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): the source is collapsed onto one line, so statement nesting
# is inferred from syntax only. The outer `try:` has no visible handler, so
# the definition appears to be cut off here -- confirm against the full file
# before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = self.ckanclient.package_register_get() log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) continue pkg_before_changes = copy.deepcopy(pkg) if pkg['state'] != 'active': msg = 'Not active (%s)' % pkg['state'] log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue if pkg['extras'].get('external_reference') != 'ONSHUB': msg = 'Not ONS' log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue if pkg['resources'] == []: pkg['state'] = 'deleted' if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged'].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not put package: %r' % e].append(pkg_ref) continue log.info('...done') pkgs_done.append(pkg)
# run: walk every package in the CKAN register and rewrite selected extras
# through the `mapped_attributes` lookup tables.
#
# Visible flow, for each reference returned by package_register_get():
#   * fetch the package; a CkanApiError is logged, recorded in
#     pkgs_rejected and the package is skipped;
#   * take a deep copy so later changes can be detected;
#   * for each attribute in mapped_attributes, read
#     pkg['extras'][attribute]; empty/missing values are left alone, a
#     value found in mapped_attributes[attribute] is replaced by its mapped
#     value (logged), anything else only produces an 'Invalid value'
#     warning;
#   * skip (and record) packages that still equal the deep copy;
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package; a CkanApiError is logged, recorded and skipped;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): the 'Package unchanged: %r' rejection key embeds the package
# name, so every unchanged package gets its own reason bucket instead of
# sharing one -- confirm this is intended.
# NOTE(review): the source is collapsed onto one line, so statement nesting
# is inferred from syntax only. The outer `try:` has no visible handler, so
# the definition appears to be cut off here -- confirm against the full file
# before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = self.ckanclient.package_register_get() log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) continue pkg_before_changes = copy.deepcopy(pkg) for attribute in mapped_attributes: orig_value = pkg['extras'].get(attribute) if not orig_value: continue mapped_value = mapped_attributes[attribute].get(orig_value) if mapped_value: pkg['extras'][attribute] = mapped_value log.info('%s: %r -> %r', \ attribute, orig_value, mapped_value) else: log.warn('Invalid value for %r: %r', \ attribute, orig_value) if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged: %r' % pkg['name']].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not put package: %r' % e].append(pkg_ref) continue log.info('...done') pkgs_done.append(pkg)
# run: walk every package in the CKAN register and rewrite selected extras
# through the `mapped_attributes` lookup tables.
#
# Visible flow, for each reference returned by package_register_get():
#   * fetch the package; a CkanApiError is logged, recorded in
#     pkgs_rejected and the package is skipped;
#   * take a deep copy so later changes can be detected;
#   * for each attribute in mapped_attributes, read
#     pkg["extras"][attribute]; empty/missing values are left alone, a
#     value found in mapped_attributes[attribute] is replaced by its mapped
#     value (logged), anything else only produces an "Invalid value"
#     warning;
#   * skip (and record) packages that still equal the deep copy;
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package; a CkanApiError is logged, recorded and skipped;
#   * log "...done" and append the package to pkgs_done.
#
# NOTE(review): the "Package unchanged: %r" rejection key embeds the package
# name, so every unchanged package gets its own reason bucket instead of
# sharing one -- confirm this is intended.
# NOTE(review): the source is collapsed onto one line, so statement nesting
# is inferred from syntax only. The outer `try:` has no visible handler, so
# the definition appears to be cut off here -- confirm against the full file
# before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = self.ckanclient.package_register_get() log.info("Working on %i packages", len(all_pkgs)) for pkg_ref in all_pkgs: log.info("Package: %s", pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error("Could not get: %r" % e) pkgs_rejected["Could not get package: %r" % e].append(pkg_ref) continue pkg_before_changes = copy.deepcopy(pkg) for attribute in mapped_attributes: orig_value = pkg["extras"].get(attribute) if not orig_value: continue mapped_value = mapped_attributes[attribute].get(orig_value) if mapped_value: pkg["extras"][attribute] = mapped_value log.info("%s: %r -> %r", attribute, orig_value, mapped_value) else: log.warn("Invalid value for %r: %r", attribute, orig_value) if pkg == pkg_before_changes: log.info("...package unchanged: %r" % pkg["name"]) pkgs_rejected["Package unchanged: %r" % pkg["name"]].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg) except CkanApiError, e: log.error("Could not put: %r" % e) pkgs_rejected["Could not put package: %r" % e].append(pkg_ref) continue log.info("...done") pkgs_done.append(pkg)
# run: walk every package in the CKAN register, normalise selected extras
# through `mapped_attributes`, and fill in the publisher extras.
#
# Visible flow, for each reference returned by package_register_get():
#   * fetch the package; a CkanApiError is logged, recorded in
#     pkgs_rejected and the package is skipped;
#   * take a deep copy so later changes can be detected;
#   * mapped attributes: for each attribute with a non-empty value, look the
#     value up in mapped_attributes[attribute] -- first as-is, then
#     lower-cased and stripped, and finally accept the lower-cased value if
#     it is itself one of the mapping's values. A mapped value that differs
#     from the original replaces it (logged).
#   * publisher fields: when self.update_all is set or 'published_by' is
#     empty, derive published_by / published_via from the 'department' and
#     'agency' extras via self.get_organisation(). With a department, it is
#     the publisher and the agency (if any) is 'via'; without one, the
#     agency (if any) is the publisher and 'via' is ''.
#   * skip (and record) packages that still equal the deep copy;
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package; a CkanApiError is logged, recorded and skipped;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): the `else:` after `if mapped_value and orig_value !=
# mapped_value` also runs when the value is already the mapped value, so an
# already-valid value is logged as 'Invalid value' -- looks unintended.
# NOTE(review): `if not pub_by or pub_via` parses as `(not pub_by) or
# pub_via`, so the 'No publisher' warning fires whenever pub_via is set --
# confirm the intended condition.
# NOTE(review): the 'Could not put ... package' literal is broken across two
# physical lines in this file.
# NOTE(review): the source is collapsed, so statement nesting is inferred
# from syntax only. The outer `try:` has no visible handler, so the
# definition appears to be cut off here -- confirm against the full file
# before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = self.ckanclient.package_register_get() log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) continue pkg_before_changes = copy.deepcopy(pkg) # mapped attributes for attribute in mapped_attributes: orig_value = pkg['extras'].get(attribute) if not orig_value: continue mapped_value = mapped_attributes[attribute].get(orig_value) if not mapped_value: mapped_value = mapped_attributes[attribute].get(orig_value.lower().strip()) if not mapped_value: if orig_value.lower() in mapped_attributes[attribute].values(): mapped_value = orig_value.lower() if mapped_value and orig_value != mapped_value: pkg['extras'][attribute] = mapped_value log.info('%s: %r -> %r', \ attribute, orig_value, mapped_value) else: log.warn('Invalid value for %r: %r', \ attribute, orig_value) # create publisher fields if self.update_all or not pkg['extras'].get('published_by'): dept = pkg['extras'].get('department') agency = pkg['extras'].get('agency') if dept: pub_by = self.get_organisation(dept) pub_via = self.get_organisation(agency) if agency else '' else: pub_by = self.get_organisation(agency) if agency else '' pub_via = '' if not pub_by or pub_via: log.warn('No publisher for package: %s', pkg['name']) log.info('%s:\n %r/%r ->\n %r/%r', \ pkg['name'], dept, agency, pub_by, pub_via) pkg['extras']['published_by'] = pub_by pkg['extras']['published_via'] = pub_via if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged'].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not put 
package: %r' % e].append(pkg_ref) continue log.info('...done') pkgs_done.append(pkg)
# run: walk every package in the CKAN register, normalise selected extras
# through `mapped_attributes`, and fill in the publisher extras.
#
# Visible flow, for each reference returned by package_register_get():
#   * fetch the package; a CkanApiError is logged, recorded in
#     pkgs_rejected and the package is skipped;
#   * take a deep copy so later changes can be detected;
#   * mapped attributes: for each attribute with a non-empty value, look the
#     value up in mapped_attributes[attribute] -- first as-is, then
#     lower-cased and stripped, and finally accept the lower-cased value if
#     it is itself one of the mapping's values. A mapped value that differs
#     from the original replaces it (logged).
#   * publisher fields: when self.update_all is set or 'published_by' is
#     empty, derive published_by / published_via from the 'department' and
#     'agency' extras via self.get_organisation(). With a department, it is
#     the publisher and the agency (if any) is 'via'; without one, the
#     agency (if any) is the publisher and 'via' is ''.
#   * skip (and record) packages that still equal the deep copy;
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package; a CkanApiError is logged, recorded and skipped;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): the `else:` after `if mapped_value and orig_value !=
# mapped_value` also runs when the value is already the mapped value, so an
# already-valid value is logged as 'Invalid value' -- looks unintended.
# NOTE(review): `if not pub_by or pub_via` parses as `(not pub_by) or
# pub_via`, so the 'No publisher' warning fires whenever pub_via is set --
# confirm the intended condition.
# NOTE(review): the 'Could not put ... package' literal is broken across two
# physical lines in this file.
# NOTE(review): the source is collapsed, so statement nesting is inferred
# from syntax only. The outer `try:` has no visible handler, so the
# definition appears to be cut off here -- confirm against the full file
# before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = self.ckanclient.package_register_get() log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) continue pkg_before_changes = copy.deepcopy(pkg) # mapped attributes for attribute in mapped_attributes: orig_value = pkg['extras'].get(attribute) if not orig_value: continue mapped_value = mapped_attributes[attribute].get(orig_value) if not mapped_value: mapped_value = mapped_attributes[attribute].get( orig_value.lower().strip()) if not mapped_value: if orig_value.lower( ) in mapped_attributes[attribute].values(): mapped_value = orig_value.lower() if mapped_value and orig_value != mapped_value: pkg['extras'][attribute] = mapped_value log.info('%s: %r -> %r', \ attribute, orig_value, mapped_value) else: log.warn('Invalid value for %r: %r', \ attribute, orig_value) # create publisher fields if self.update_all or not pkg['extras'].get('published_by'): dept = pkg['extras'].get('department') agency = pkg['extras'].get('agency') if dept: pub_by = self.get_organisation(dept) pub_via = self.get_organisation( agency) if agency else '' else: pub_by = self.get_organisation( agency) if agency else '' pub_via = '' if not pub_by or pub_via: log.warn('No publisher for package: %s', pkg['name']) log.info('%s:\n %r/%r ->\n %r/%r', \ pkg['name'], dept, agency, pub_by, pub_via) pkg['extras']['published_by'] = pub_by pkg['extras']['published_via'] = pub_via if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged'].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not put 
package: %r' % e].append(pkg_ref) continue log.info('...done') pkgs_done.append(pkg)
def run(self):
    """Move ONS-imported 'Education' packages to the Department for Education.

    Searches CKAN for packages matching 'Education', pages through the
    results and, for every package that passes the filters below, sets
    ``extras['department']`` and ``author`` to 'Department for Education'
    and blanks ``extras['agency']``.

    A package is rejected (and counted under its reason) when:
      * ``extras['import_source']`` does not contain 'ONS';
      * its state is not 'active';
      * the 'Source agency' line(s) in its notes are not exactly
        'Education';
      * its department already contains 'Department for Education'.

    When ``self.dry_run`` is set nothing is written back, but the packages
    that would have been updated are still counted. A summary of processed
    and rejected packages is printed at the end.
    """
    limit = 100

    def search(page=None):
        """Run the 'Education' search, optionally for one result page."""
        opts = {
            # 'external_reference': 'ONSHUB',
            'limit': limit,
        }
        if page is not None:
            opts['offset'] = page * limit
        return self.ckanclient.package_search(
            'Education',
            # 'Source agency: Education',
            opts)

    res = search()
    print('Found %i packages possibly related.' % res['count'])
    pkgs_done = []
    pkgs_rejected = defaultdict(list)  # reason: [pkgs]

    # Ceiling division: plain division floors the page count, which skips
    # the final partial page (and every result when count < limit).
    num_pages = (res['count'] + limit - 1) // limit
    for page in range(num_pages):
        res = search(page)
        for pkg_ref in res['results']:
            pkg = self.ckanclient.package_entity_get(pkg_ref)
            extras = pkg['extras']
            if 'ONS' not in extras.get('import_source', ''):
                pkgs_rejected['Not imported from ONS'].append(pkg)
                continue
            if pkg.get('state', 'active') != 'active':
                pkgs_rejected['Package state = %r' %
                              pkg.get('state')].append(pkg)
                continue
            # Notes may be None for packages without a description.
            notes = pkg['notes'] or ''
            source_agency = '|'.join(
                line.replace('Source agency:', '').strip()
                for line in notes.split('\n')
                if 'Source agency' in line)
            if source_agency != 'Education':
                pkgs_rejected['Source agency = %r' %
                              source_agency].append(pkg)
                continue
            if 'Department for Education' in extras.get('department', ''):
                pkgs_rejected['Department = %r' %
                              extras.get('department', '')].append(pkg)
                continue

            pkg_name = pkg['name']
            dept = extras.get('department')
            agency = extras.get('agency')
            author = pkg['author']
            print('%s :\n %r %r %r' % (pkg_name, dept, agency, author))
            if not self.dry_run:
                extras['department'] = 'Department for Education'
                extras['agency'] = ''
                pkg['author'] = 'Department for Education'
                remove_readonly_fields(pkg)
                self.ckanclient.package_entity_put(pkg)
                print('...done')
            pkgs_done.append(pkg)

    print('Processed %i packages' % len(pkgs_done))
    print('Rejected packages:')
    for reason, pkgs in pkgs_rejected.items():
        print(' %i: %s' % (len(pkgs), reason))
# run: walk every package in the CKAN register (in sorted order) and, for
# active packages imported from ONS, prefix the name with '_' and mark the
# package as deleted.
#
# Visible flow, for each reference:
#   * fetch the package; on CkanApiError the failure is logged and
#     recorded, then the loop continues if self.force is set and otherwise
#     logs 'Exiting due to error' and breaks out;
#   * take a deep copy so later changes can be detected;
#   * skip (and record) packages whose state is not 'active' or whose
#     extras['import_source'] does not start with 'ONS';
#   * remember the current name, then rename to '_' + the first 99
#     characters of the name and set state to 'deleted';
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package under its previous name (package_name=existing_name); a
#     CkanApiError is handled like the fetch failure above;
#   * if the state in ckanclient.last_message differs from the requested
#     'deleted' state, issue a separate package_entity_delete() -- per the
#     inline comment, a workaround for older CKANs;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): every package reaching the `pkg == pkg_before_changes` test
# has just had its name changed, so the 'Package unchanged' branch looks
# unreachable.
# NOTE(review): the source is collapsed onto one line, so statement nesting
# is inferred from syntax only. Whether the last_message check sits inside
# the `if not self.dry_run` block cannot be told from here. The outer `try:`
# has no visible handler, so the definition appears to be cut off here --
# confirm against the full file before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = sorted(self.ckanclient.package_register_get()) log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) if self.force: continue else: log.error('Exiting due to error') break pkg_before_changes = copy.deepcopy(pkg) if pkg['state'] != 'active': msg = 'Not active (%s)' % pkg['state'] log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue is_ons = pkg['extras'].get('import_source', '').startswith('ONS') if not is_ons: msg = 'Not ONS' log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue # comment out name existing_name = pkg['name'] pkg['name'] = '_' + pkg['name'][:99] # delete pkg['state'] = 'deleted' if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged'].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put( pkg, package_name=existing_name) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not PUT package: %r' % e].append(pkg_ref) if self.force: continue else: log.error('Exiting due to error') break # just check the state is correct as older CKANs don't # work properly if self.ckanclient.last_message['state'] != pkg['state'] and \ pkg['state'] == 'deleted': self.ckanclient.package_entity_delete(pkg['name']) log.info('...deleted separately') log.info('...done') pkgs_done.append(pkg)
# run: walk every package in the CKAN register (in sorted order) and, for
# active packages imported from ONS, prefix the name with '_' and mark the
# package as deleted.
#
# Visible flow, for each reference:
#   * fetch the package; on CkanApiError the failure is logged and
#     recorded, then the loop continues if self.force is set and otherwise
#     logs 'Exiting due to error' and breaks out;
#   * take a deep copy so later changes can be detected;
#   * skip (and record) packages whose state is not 'active' or whose
#     extras['import_source'] does not start with 'ONS';
#   * remember the current name, then rename to '_' + the first 99
#     characters of the name and set state to 'deleted';
#   * unless self.dry_run is set, call remove_readonly_fields(pkg) and PUT
#     the package under its previous name (package_name=existing_name); a
#     CkanApiError is handled like the fetch failure above;
#   * if the state in ckanclient.last_message differs from the requested
#     'deleted' state, issue a separate package_entity_delete() -- per the
#     inline comment, a workaround for older CKANs;
#   * log '...done' and append the package to pkgs_done.
#
# NOTE(review): every package reaching the `pkg == pkg_before_changes` test
# has just had its name changed, so the 'Package unchanged' branch looks
# unreachable.
# NOTE(review): the source is collapsed onto one line, so statement nesting
# is inferred from syntax only. Whether the last_message check sits inside
# the `if not self.dry_run` block cannot be told from here. The outer `try:`
# has no visible handler, so the definition appears to be cut off here --
# confirm against the full file before restructuring.
# NOTE(review): `except CkanApiError, e` is Python 2-only syntax.
def run(self): pkgs_done = [] pkgs_rejected = defaultdict(list) # reason: [pkgs] all_pkgs = sorted(self.ckanclient.package_register_get()) log.info('Working on %i packages', len(all_pkgs)) for pkg_ref in all_pkgs: log.info('Package: %s', pkg_ref) try: try: pkg = self.ckanclient.package_entity_get(pkg_ref) except CkanApiError, e: log.error('Could not get: %r' % e) pkgs_rejected['Could not get package: %r' % e].append(pkg_ref) if self.force: continue else: log.error('Exiting due to error') break pkg_before_changes = copy.deepcopy(pkg) if pkg['state'] != 'active': msg = 'Not active (%s)' % pkg['state'] log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue is_ons = pkg['extras'].get('import_source', '').startswith('ONS') if not is_ons: msg = 'Not ONS' log.info('...%s: %r' % (msg, pkg['name'])) pkgs_rejected[msg].append(pkg) continue # comment out name existing_name = pkg['name'] pkg['name'] = '_' + pkg['name'][:99] # delete pkg['state'] = 'deleted' if pkg == pkg_before_changes: log.info('...package unchanged: %r' % pkg['name']) pkgs_rejected['Package unchanged'].append(pkg) continue if not self.dry_run: remove_readonly_fields(pkg) try: self.ckanclient.package_entity_put(pkg, package_name=existing_name) except CkanApiError, e: log.error('Could not put: %r' % e) pkgs_rejected['Could not PUT package: %r' % e].append(pkg_ref) if self.force: continue else: log.error('Exiting due to error') break # just check the state is correct as older CKANs don't # work properly if self.ckanclient.last_message['state'] != pkg['state'] and \ pkg['state'] == 'deleted': self.ckanclient.package_entity_delete(pkg['name']) log.info('...deleted separately') log.info('...done') pkgs_done.append(pkg)