def run(cls, config_ini_or_ckan_url, dataset_names):
    """Delete each of the named datasets through the CKAN API.

    A summary line is printed for every dataset, whether the delete call
    succeeded or raised.

    :param config_ini_or_ckan_url: handed to common.get_ckanapi() to obtain
        the API client
    :param dataset_names: iterable of dataset names; each one is passed
        through common.name_stripped_of_url() before use
    :returns: None
    :raises SystemExit: (exit status 1) when the text of an error contains
        'CKANAPIError'
    """
    api = common.get_ckanapi(config_ini_or_ckan_url)
    tally = Stats()
    for given_name in dataset_names:
        name = common.name_stripped_of_url(given_name)
        try:
            api.call_action('dataset_delete', {'id': name})
            print(tally.add('Deleted (or was already deleted)', name))
        except (KeyboardInterrupt, SystemExit):
            # never swallow a request to stop the script
            raise
        except Exception as err:
            if 'CKANAPIError' in str(err):
                # the request itself is wrong, so carrying on is pointless
                print(err)
                print('Not calling API correctly - aborting')
                sys.exit(1)
            problem = 'Error %s' % type(err).__name__
            print(tally.add(problem, '%s %s' % (name, err)))
def command(cls, config_ini, dataset_names, options): common.load_config(config_ini) common.register_translator() from pylons import config apikey = config['dgu.merge_datasets.apikey'] ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey) #ckan = ckanapi.LocalCKAN() if options.publisher: org_name = common.name_stripped_of_url(options.publisher) if options.search: results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100) dataset_names.extend([dataset['name'] for dataset in results['results']]) else: org = ckan.action.organization_show(id=org_name, include_datasets=True) dataset_names.extend([d['name'] for d in org['packages']]) datasets = [] datasets_by_name = {} def get_extra(dataset, key): for extra in dataset['extras']: if extra['key'] == key: return extra['value'] for dataset_name in dataset_names: print 'Dataset: %s' % dataset_name for dataset_name in dataset_names: # strip off the url part of the dataset name, if there is one dataset_name = common.name_stripped_of_url(dataset_name) dataset = ckan.action.package_show(id=dataset_name) harvest_source_ref = get_extra(dataset, 'harvest_source_reference') if harvest_source_ref: print '** Discarding dataset %s due to harvest source: %s **' \ % (dataset_name, harvest_source_ref) continue datasets.append(dataset) datasets_by_name[dataset['name']] = dataset datasets.sort(key=lambda x: x['metadata_modified']) # aggregate resources def resource_identity(res_dict, dataset_name): return (res_dict.get('date'), res_dict['url'], res_dict.get('title') or res_dict['description'], res_dict.get('format'), dataset_name) combined_resources = {} # identity res_stats = Stats() for dataset in datasets: for resource in dataset['resources']: identity = resource_identity(resource, dataset['name']) resource['dataset_name'] = dataset['name'] if identity in combined_resources: print res_stats.add('Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity])) else: 
combined_resources[identity] = resource resources = combined_resources.values() # find dates for resources # NB This has been pulled out into timeseries_convert.py - # TODO call that instead of having the code here too. if options.frequency: url_munge_re = re.compile('(%20|-|_|\.)') def fields_to_hunt_for_date(res): date = res.get('date') if date: yield 'date', date title = res.get('title') if title: yield 'title', title yield 'description', res['description'] yield 'url', url_munge_re.sub(' ', res['url']) if not options.update: dataset = datasets_by_name[res['dataset_name']] yield 'dataset-title', dataset['title'] yield 'dataset-notes', dataset['notes'] ensure_regexes_are_initialized() global regexes for resource in resources: for field_name, field_value in fields_to_hunt_for_date(resource): if options.frequency in ('monthly', 'quarterly', 'twice annually'): month, year = hunt_for_month_and_year(field_value) if year and month: resource['date'] = '%02d/%s' % (month, year) res_stats.add('Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get('resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break elif options.frequency == 'annually': year = regexes['year'].search(field_value) if year: resource['date'] = year.groups()[0] res_stats.add('Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get('resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break else: if resource.get('resource_type') == 'documentation': print res_stats.add('Could not find date but it\'s Additional Resource', resource) continue print res_stats.add('Could not find date', resource) continue print 'Resources: \n', res_stats resources_without_date = [res for res in resources if not res.get('date') and res.get('resource_type') != 'documentation'] for i, res in 
enumerate(resources_without_date): print 'Resources without dates %s/%s' % (i+1, len(resources_without_date)) for field_name, field_value in fields_to_hunt_for_date(res): print ' %s: %s' % (field_name, field_value.encode('latin-1', 'ignore')) print 'https://data.gov.uk/dataset/%s/resource/%s' % (res['dataset_name'], res['id']) date_format = {'annually': 'YYYY', 'monthly': 'MM/YYYY', 'twice annually': 'MM/YYYY', 'quarterly': 'MM/YYYY'} input_ = raw_input('Date (%s) or DOCS to make it an Additional Resource: ' % date_format[options.frequency]) if input_.strip().lower() == 'docs': res['date'] = '' res['resource_type'] = 'documentation' else: res['date'] = input_ resources.sort(key=lambda x: x.get('date', '').split('/')[::-1]) # Ensure there is not a mixture of resources with and without a date have_dates = None for res in resources: if res.get('resource_type') == 'documentation': continue if have_dates is None: have_dates = bool(res.get('date')) else: has_date = bool(res.get('date')) if has_date != have_dates: print [res.get('date') for res in resources] print 'Cannot mix resources with dates and others without!' import pdb pdb.set_trace()
def command(cls, config_ini, dataset_names, options): common.load_config(config_ini) common.register_translator() from pylons import config apikey = config['dgu.merge_datasets.apikey'] ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey) #ckan = ckanapi.LocalCKAN() if options.publisher: org_name = common.name_stripped_of_url(options.publisher) if options.search: results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100) dataset_names.extend( [dataset['name'] for dataset in results['results']]) else: org = ckan.action.organization_show(id=org_name, include_datasets=True) dataset_names.extend([d['name'] for d in org['packages']]) datasets = [] datasets_by_name = {} def get_extra(dataset, key): for extra in dataset['extras']: if extra['key'] == key: return extra['value'] for dataset_name in dataset_names: print 'Dataset: %s' % dataset_name for dataset_name in dataset_names: # strip off the url part of the dataset name, if there is one dataset_name = common.name_stripped_of_url(dataset_name) dataset = ckan.action.package_show(id=dataset_name) harvest_source_ref = get_extra(dataset, 'harvest_source_reference') if harvest_source_ref: print '** Discarding dataset %s due to harvest source: %s **' \ % (dataset_name, harvest_source_ref) continue datasets.append(dataset) datasets_by_name[dataset['name']] = dataset datasets.sort(key=lambda x: x['metadata_modified']) # aggregate resources def resource_identity(res_dict, dataset_name): return (res_dict.get('date'), res_dict['url'], res_dict.get('title') or res_dict['description'], res_dict.get('format'), dataset_name) combined_resources = {} # identity res_stats = Stats() for dataset in datasets: for resource in dataset['resources']: identity = resource_identity(resource, dataset['name']) resource['dataset_name'] = dataset['name'] if identity in combined_resources: print res_stats.add( 'Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity])) else: 
combined_resources[identity] = resource resources = combined_resources.values() # find dates for resources # NB This has been pulled out into timeseries_convert.py - # TODO call that instead of having the code here too. if options.frequency: url_munge_re = re.compile('(%20|-|_|\.)') def fields_to_hunt_for_date(res): date = res.get('date') if date: yield 'date', date title = res.get('title') if title: yield 'title', title yield 'description', res['description'] yield 'url', url_munge_re.sub(' ', res['url']) if not options.update: dataset = datasets_by_name[res['dataset_name']] yield 'dataset-title', dataset['title'] yield 'dataset-notes', dataset['notes'] ensure_regexes_are_initialized() global regexes for resource in resources: for field_name, field_value in fields_to_hunt_for_date( resource): if options.frequency in ('monthly', 'quarterly', 'twice annually'): month, year = hunt_for_month_and_year(field_value) if year and month: resource['date'] = '%02d/%s' % (month, year) res_stats.add( 'Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get( 'resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break elif options.frequency == 'annually': year = regexes['year'].search(field_value) if year: resource['date'] = year.groups()[0] res_stats.add( 'Found date in %s' % field_name, '%s %r' % (resource['date'], resource)) if resource.get( 'resource_type') == 'documentation': resource['resource_type'] = 'file' res_stats.add('Converted additional resource', resource) break else: if resource.get('resource_type') == 'documentation': print res_stats.add( 'Could not find date but it\'s Additional Resource', resource) continue print res_stats.add('Could not find date', resource) continue print 'Resources: \n', res_stats resources_without_date = [ res for res in resources if not res.get('date') and res.get('resource_type') != 'documentation' ] for i, res in 
enumerate(resources_without_date): print 'Resources without dates %s/%s' % ( i + 1, len(resources_without_date)) for field_name, field_value in fields_to_hunt_for_date(res): print ' %s: %s' % ( field_name, field_value.encode('latin-1', 'ignore')) print 'https://data.gov.uk/dataset/%s/resource/%s' % ( res['dataset_name'], res['id']) date_format = { 'annually': 'YYYY', 'monthly': 'MM/YYYY', 'twice annually': 'MM/YYYY', 'quarterly': 'MM/YYYY' } input_ = raw_input( 'Date (%s) or DOCS to make it an Additional Resource: ' % date_format[options.frequency]) if input_.strip().lower() == 'docs': res['date'] = '' res['resource_type'] = 'documentation' else: res['date'] = input_ resources.sort(key=lambda x: x.get('date', '').split('/')[::-1]) # Ensure there is not a mixture of resources with and without a date have_dates = None for res in resources: if res.get('resource_type') == 'documentation': continue if have_dates is None: have_dates = bool(res.get('date')) else: has_date = bool(res.get('date')) if has_date != have_dates: print[res.get('date') for res in resources] print 'Cannot mix resources with dates and others without!' 
import pdb pdb.set_trace() # Remove 'dataset_name' and others fields from resources ignore_res_fields = set( ('dataset_name', 'created', 'position', 'revision_id', 'id', 'tracking_summary', 'qa', 'archiver')) for res in resources: for field in ignore_res_fields & set(res.keys()): del res[field] # Merge dataset fields def get_all_fields_and_values(datasets): ignore_fields = set(( 'id', 'resources', 'last_major_modification', 'data_dict', 'revision_timestamp', 'num_tags', 'metadata_created', 'metadata_modified', 'odi_certificate', 'extras', # they are at top level already 'timeseries_resources', 'individual_resources', 'additional_resources', 'revision_id', 'organization', 'tracking_summary', 'num_resources', 'license_title', 'author', 'author_email', 'maintainer', 'maintainer_email', 'temporal_granularity', 'geographic_granularity', 'state', 'isopen', 'url', 'date_update_future', 'date_updated', 'date_released', 'precision', 'taxonomy_url', 'temporal_coverage-from', 'temporal_coverage-to', 'published_via', 'creator_user_id', 'qa', 'archiver', )) first_fields = [ 'title', 'name', 'notes', 'theme-primary', 'theme-secondary' ] all_field_values = defaultdict(list) for dataset in datasets: for field in dataset: if field not in ignore_fields and dataset[field]: all_field_values[field].append(dataset[field]) for field in first_fields: yield field, all_field_values.get(field, []) for field in all_field_values: if field not in first_fields: yield field, all_field_values[field] spend_data_defaults = { 'geographic_coverage': None, 'theme-primary': 'Government Spending', 'theme-secondary': None, 'update_frequency': 'monthly', } combined_dataset = {'resources': resources} all_fields_and_values = get_all_fields_and_values(datasets) for field, values in all_fields_and_values: if field == 'notes': values = [value.strip() for value in values] if field == 'tags': # just merge them up-front and # dont offer user any choice tags_by_name = {} for dataset_tags in values: for tag in 
dataset_tags: if tag['name'] not in tags_by_name: tags_by_name[tag['name']] = tag values = [tags_by_name.values()] if field in ('codelist', 'schema'): # just merge them up-front # And convert the dict into just an id string ids = set() for dataset_values in values: for value_dict in dataset_values: ids.add(value_dict['id']) values = [list(ids)] print '\n%s:' % field pprint(list(enumerate(values))) if options.spend and field in spend_data_defaults: value = spend_data_defaults[field] print 'Spend data defaults to: %s' % value values = [value] if value is not None else None # dont be case-sensitive for boolean fields if field == 'core-dataset': values = [v.lower() for v in values] try: values_identicle = len(set(values)) == 1 except TypeError: if values and len(values): val1 = values[0] for val in values[1:]: if val != val1: values_identicle = False break else: values_identicle = True if (not values) or (not len(values)): pass elif values_identicle: value = values[0] elif field == 'name': while True: from ckan.lib.munge import munge_title_to_name munged_title = munge_title_to_name( combined_dataset['title']) print munge_title_to_name( datasets[0]['organization']['title']) value = raw_input('Type new value (%s): ' % (munged_title)) if not value: value = munged_title if len(value) < 3: print 'Too short' continue if value in values: print 'That name is taken' continue existing = ckan.action.package_autocomplete(q=value) if value in existing: print 'That name is taken on CKAN' continue break else: while True: response = raw_input( '%s: value (number) or type new one: ' % field) try: value_index = int(response) value = values[value_index] print value except ValueError: # fix pound signs if the user pasted from the repr'd version response = re.sub(r'\\xa3', u'\xa3', response) value = response if not value and field in ('title', 'owner_org', 'notes', 'license_id'): print 'You must have a value for this field!' 
continue break if value: combined_dataset[field] = value # Store print '\nMerged dataset:\n' pprint(combined_dataset) response = raw_input( 'Press enter to write or pdb to edit in pdb first: ') if response == 'pdb': import pdb pdb.set_trace() try: if options.update: ckan.action.dataset_update(**combined_dataset) else: ckan.action.dataset_create(**combined_dataset) except Exception, e: print e import pdb pdb.set_trace()