def run(cls, config_ini_or_ckan_url, dataset_names):
    ckan = common.get_ckanapi(config_ini_or_ckan_url)
    stats = Stats()
    for dataset_name in dataset_names:
        dataset_name = common.name_stripped_of_url(dataset_name)
        try:
            ckan.call_action('dataset_delete', {'id': dataset_name})
            print stats.add('Deleted (or was already deleted)', dataset_name)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception, e:
            if 'CKANAPIError' in str(e):
                print e
                print 'Not calling API correctly - aborting'
                sys.exit(1)
            print stats.add('Error %s' % type(e).__name__,
                            '%s %s' % (dataset_name, e))
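
# Hedged sketch, not part of the original script: roughly what
# common.get_ckanapi() plus call_action('dataset_delete', ...) in run() above
# amount to if you only have the ckanapi library to hand. The URL, API key
# and dataset name are placeholders, and package_delete (the core CKAN delete
# action) is assumed to match the 'dataset_delete' action called above.
def _delete_dataset_sketch(ckan_url, apikey, dataset_name):
    import ckanapi
    ckan = ckanapi.RemoteCKAN(ckan_url, apikey=apikey)
    # CKAN soft-deletes: the dataset is marked 'deleted', not purged, which is
    # why run() reports 'Deleted (or was already deleted)'.
    ckan.action.package_delete(id=dataset_name)
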
def main(source, source_type, destination, save_relevant_datasets_json,
         write, dataset_filter=None, res_url_filter=None):
    if source_type == 'json':
        all_datasets = get_datasets_from_json(source)
    elif source_type == 'jsonl':
        all_datasets = get_datasets_from_jsonl(source)
    else:
        all_datasets = get_datasets_from_ckan(source)
    datasets = []  # legacy ones
    revamped_datasets = []  # ones created on 3rd October 2016 launch
    revamped_datasets_by_org = {}
    revamped_resources = {}
    csv_out_rows = []
    csv_corrected_rows = []
    try:
        # find all the legacy organogram datasets
        all_datasets = list(all_datasets)  # since we need to iterate it twice
        for dataset in all_datasets:
            if dataset_filter and dataset['name'] != dataset_filter:
                continue
            if res_url_filter and \
                    res_url_filter not in [r['url']
                                           for r in dataset['resources']]:
                continue
            # check it is an organogram dataset
            dataset_str = repr(dataset).lower()
            if 'rganog' not in dataset_str \
                    and 'roles and salaries' not in dataset_str \
                    and 'pay and post' not in dataset_str \
                    and 'posts and pay' not in dataset_str \
                    and 'organisation chart' not in dataset_str \
                    and 'organization chart' not in dataset_str \
                    and 'org chart' not in dataset_str:
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if dataset['name'] in (
                    'eastbourne-borough-council-public-toilets',
                    'staff-organograms-and-pay-government-offices',
                    ) \
                    or dataset['id'] in (
                    '47f69ebb-9939-419f-880d-1b976676cb0e',
                    ):
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if asbool(dataset.get('unpublished')):
                stats_datasets.add('Ignored - unpublished', dataset['name'])
                continue
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') == 'organograms_v2':
                continue
            if extras.get('import_source') == 'harvest':
                stats_datasets.add('Ignored - harvested so can\'t edit it',
                                   dataset['name'])
                continue
            # legacy dataset
            datasets.append(dataset)

        # find the revamped organogram datasets
        for dataset in all_datasets:
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') != 'organograms_v2':
                continue
            org_id = dataset['owner_org']
            revamped_datasets.append(dataset)
            assert org_id not in revamped_datasets_by_org, org_id
            revamped_datasets_by_org[org_id] = dataset
            for res in dataset['resources']:
                date = date_to_year_month(res['date'])
                revamped_resources[(org_id, date)] = res
            continue

        if save_relevant_datasets_json:
            filename = 'datasets_organograms.json'
            if not (dataset_filter or res_url_filter):
                output = json.dumps(
                    datasets + revamped_datasets,
                    indent=4, separators=(',', ': '),  # pretty print
                    )
                with open(filename, 'wb') as f:
                    f.write(output)
                print 'Written %s' % filename
            else:
                print 'Not written %s because you filtered by a ' \
                    'dataset/resource' % filename

        all_resource_ids_to_delete = defaultdict(list)  # dataset_name: res_id_list
        dataset_names_to_delete = set()
        for dataset in datasets:
            org_id = dataset['owner_org']

            # save csv as it has been
            save_csv_rows(csv_out_rows, dataset, None, None)

            original_dataset = copy.deepcopy(dataset)
            delete_dataset = False
            dataset_to_merge_to = \
                get_dataset_to_merge_to(dataset, revamped_datasets_by_org)

            # detect dates
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                stats = timeseries_convert.add_date_to_resource(
                    res, dataset=dataset)

            # resource corrections
            resources_to_delete = []
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                resource_corrections(res, dataset, extras,
                                     revamped_resources,
                                     revamped_datasets_by_org,
                                     dataset_to_merge_to, org_id,
                                     resources_to_delete, stats_res)
            for res in resources_to_delete:
                dataset['resources'].remove(res)
            if not dataset['resources']:
                delete_dataset = True
            elif resources_to_delete and not dataset_to_merge_to:
                all_resource_ids_to_delete[dataset['name']].extend(
                    res['id'] for res in resources_to_delete)
            org_id = dataset['owner_org']  # it might have changed

            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                if res.get('resource_type') != 'documentation' \
                        and not res.get('date'):
                    stats_dates.add('Missing date', dataset['name'])
                    break
            else:
                stats_dates.add('Ok dates', dataset['name'])

            # record changes
            if delete_dataset:
                stats_datasets.add('Delete dataset - no resources',
                                   dataset['name'])
                dataset_names_to_delete.add(dataset['name'])
                continue
            elif original_dataset != dataset:
                stats_datasets.add('Updated dataset', dataset['name'])
                has_changed = True
            else:
                stats_datasets.add('Unchanged dataset', dataset['name'])
                has_changed = False
            if dataset_to_merge_to:
                stats_merge.add('Merge', dataset_to_merge_to)
            else:
                stats_merge.add('No merge', dataset['name'])

            # save csv with corrections
            save_csv_rows(csv_corrected_rows, dataset, has_changed,
                          dataset_to_merge_to)
    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()

    stats_merge.report_value_limit = 500
    stats_res.report_value_limit = 500
    print '\nDatasets\n', stats_datasets
    print '\nDataset merges\n', stats_merge
    print '\nDates\n', stats_dates
    print '\nResources\n', stats_res

    # save csvs
    if dataset_filter or res_url_filter:
        for row in csv_corrected_rows:
            if res_url_filter and row['res_url'] != res_url_filter:
                continue
            pprint(row)
        print 'Not written csv because you specified a particular dataset'
    else:
        headers = [
            'name', 'org_title', 'org_id', 'notes', 'res_id', 'res_name',
            'res_url', 'res_format', 'res_date', 'res_type', 'has_changed',
            'merge_to_dataset',
            ]
        for csv_rows, out_filename in (
                (csv_out_rows, 'organogram_legacy_datasets.csv'),
                (csv_corrected_rows,
                 'organogram_legacy_datasets_corrected.csv'),
                ):
            with open(out_filename, 'wb') as csv_write_file:
                csv_writer = unicodecsv.DictWriter(csv_write_file,
                                                   fieldnames=headers,
                                                   encoding='utf-8')
                csv_writer.writeheader()
                for row in sorted(csv_rows, key=lambda r: r['res_url']):
                    csv_writer.writerow(row)
            print 'Written', out_filename

    # group merges by the revamped_dataset
    resources_to_merge = defaultdict(list)  # revamped_dataset_name: resource_list
    resources_to_update = defaultdict(list)  # dataset_name: resource_list
    for row in csv_corrected_rows:
        if row['has_changed'] is False:
            continue
        res = dict(
            id=row['res_id'],
            description=row['res_name'],  # description is required
            url=row['res_url'],
            format=row['res_format'],
            date=row['res_date'],
            resource_type=row['res_type'])
        if row['merge_to_dataset']:
            res['id'] = None  # ignore the id
            resources_to_merge[row['merge_to_dataset']].append(res)
            # also delete the merged dataset
            if row['name'] not in dataset_names_to_delete:
                dataset_names_to_delete.add(row['name'])
        else:
            resources_to_update[row['name']].append(res)

    # write changes - merges etc
    try:
        if destination:
            if write:
                write_caveat = ''
            else:
                write_caveat = ' (NOP without --write)'
            print 'Writing changes to datasets' + write_caveat
            stats_write_res = Stats()
            stats_write_dataset = Stats()
            ckan = common.get_ckanapi(destination)
            import ckanapi

            print 'Updating datasets'
            for dataset_name, res_list in resources_to_update.iteritems():
                dataset = ckan.action.package_show(id=dataset_name)
                resources_by_id = dict((r['id'], r)
                                       for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (dataset_name, res_list.index(res))
                    res_to_update = resources_by_id.get(res['id'])
                    if res_to_update:
                        res_changed = False
                        for key in res.keys():
                            if res[key] != res_to_update.get(key):
                                res_to_update[key] = res[key]
                                dataset_changed = True
                                res_changed = True
                        if res_changed:
                            stats_write_res.add('update - ok' + write_caveat,
                                                res_ref)
                        else:
                            stats_write_res.add('update - not needed',
                                                res_ref)
                    else:
                        stats_write_res.add(
                            'update - could not find resource id',
                            dataset_name)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Update done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Update not needed', dataset_name)

            print 'Merging datasets'
            for revamped_dataset_name, res_list in \
                    resources_to_merge.iteritems():
                try:
                    dataset = ckan.action.package_show(
                        id=revamped_dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Merge - dataset not found',
                                            revamped_dataset_name)
                    continue
                existing_res_urls = set(r['url']
                                        for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (revamped_dataset_name,
                                         res_list.index(res))
                    if res['url'] in existing_res_urls:
                        stats_write_res.add(
                            'merge - no change - resource URL already there',
                            res_ref)
                    else:
                        dataset_changed = True
                        res['description'] += ' (from legacy dataset)'
                        dataset['resources'].append(res)
                        stats_write_res.add('merge - add' + write_caveat,
                                            res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Merge done' + write_caveat,
                                            revamped_dataset_name)
                else:
                    stats_write_dataset.add('Merge not needed',
                                            revamped_dataset_name)

            print 'Deleting resources'
            for dataset_name, res_id_list in \
                    all_resource_ids_to_delete.iteritems():
                if dataset_name in dataset_names_to_delete:
                    stats_write_dataset.add(
                        'Delete resources not needed as deleting dataset later',
                        dataset_name)
                    continue
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete res - dataset not found',
                                            dataset_name)
                    continue
                existing_resources = \
                    dict((r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res_id in res_id_list:
                    res_ref = '%s-%s' % (dataset_name,
                                         res_id_list.index(res_id))
                    existing_resource = existing_resources.get(res_id)
                    if existing_resource:
                        dataset_changed = True
                        dataset['resources'].remove(existing_resource)
                        stats_write_res.add(
                            'delete res - done' + write_caveat, res_ref)
                    else:
                        stats_write_res.add(
                            'delete res - could not find res id', res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Delete res done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Delete res not needed',
                                            dataset_name)

            print 'Deleting datasets'
            for dataset_name in dataset_names_to_delete:
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete dataset - not found',
                                            dataset_name)
                else:
                    if write:
                        ckan.action.package_delete(id=dataset_name)
                    stats_write_dataset.add(
                        'Delete dataset - done' + write_caveat, dataset_name)

            print '\nResources\n', stats_write_res
            print '\nDatasets\n', stats_write_dataset
        else:
            print 'Not written changes to datasets'
    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()
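
# Hedged sketch, not part of the original script: the write phase above uses
# the ckanapi read-modify-write pattern - package_show fetches the full
# dataset dict, it is mutated locally, then package_update pushes the whole
# thing back. Because package_update replaces the resource list wholesale,
# removing entries from dataset['resources'] before the call is how resources
# get deleted. The URL, API key, dataset name and field tweak below are
# illustrative placeholders only.
def _read_modify_write_sketch(ckan_url, apikey, dataset_name):
    import ckanapi
    ckan = ckanapi.RemoteCKAN(ckan_url, apikey=apikey)
    dataset = ckan.action.package_show(id=dataset_name)
    for res in dataset['resources']:
        # illustrative local edit; any field can be corrected before the push
        res['format'] = (res.get('format') or '').upper()
    ckan.action.package_update(**dataset)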