Example #1
    def run(cls, config_ini_or_ckan_url, dataset_names):
        ckan = common.get_ckanapi(config_ini_or_ckan_url)

        stats = Stats()
        for dataset_name in dataset_names:
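            # accept either a bare dataset name or a full dataset URL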
            dataset_name = common.name_stripped_of_url(dataset_name)
            try:
                ckan.call_action('dataset_delete',
                                 {'id': dataset_name})
                print stats.add('Deleted (or was already deleted)', dataset_name)
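            # let Ctrl-C and sys.exit() propagate rather than being
            # swallowed by the catch-all handler below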
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception as e:
                if 'CKANAPIError' in str(e):
                    print e
                    print 'Not calling API correctly - aborting'
                    sys.exit(1)
                print stats.add('Error %s' % type(e).__name__,
                                '%s %s' % (dataset_name, e))
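
The snippet above assumes 'sys' is imported and that a common helper module
and a Stats class exist in the surrounding codebase. Judging only from how
Stats is used here - add(category, item) returns a printable progress line,
and the object itself prints as a summary - a minimal stand-in might look
like the following sketch; the real class will differ:

    from collections import defaultdict

    class Stats(object):
        """Minimal tally of outcomes, inferred from the usage above."""
        def __init__(self):
            self.categories = defaultdict(list)

        def add(self, category, item):
            # record the item and hand back a line suitable for printing
            self.categories[category].append(item)
            return '%s: %s' % (category, item)

        def __str__(self):
            return '\n'.join('%s: %i' % (cat, len(items))
                             for cat, items in self.categories.items())
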
Example #2
    def command(cls, config_ini, dataset_names, options):
        common.load_config(config_ini)
        common.register_translator()

        from pylons import config
        apikey = config['dgu.merge_datasets.apikey']
        ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
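        # (ckanapi.LocalCKAN, commented out below, talks to CKAN in-process
        # instead of over HTTP when the script runs on the CKAN server)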
        #ckan = ckanapi.LocalCKAN()

        if options.publisher:
            org_name = common.name_stripped_of_url(options.publisher)
            if options.search:
                results = ckan.action.package_search(q=options.search,
                                                     fq='publisher:%s' %
                                                     org_name,
                                                     rows=100)
                dataset_names.extend(
                    [dataset['name'] for dataset in results['results']])
            else:
                org = ckan.action.organization_show(id=org_name,
                                                    include_datasets=True)
                dataset_names.extend([d['name'] for d in org['packages']])

        datasets = []
        datasets_by_name = {}

        def get_extra(dataset, key):
            for extra in dataset['extras']:
                if extra['key'] == key:
                    return extra['value']

        for dataset_name in dataset_names:
            print 'Dataset: %s' % dataset_name
        for dataset_name in dataset_names:
            # strip off the url part of the dataset name, if there is one
            dataset_name = common.name_stripped_of_url(dataset_name)
            dataset = ckan.action.package_show(id=dataset_name)
            harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
            if harvest_source_ref:
                print '** Discarding dataset %s due to harvest source: %s **' \
                    % (dataset_name, harvest_source_ref)
                continue
            datasets.append(dataset)
            datasets_by_name[dataset['name']] = dataset
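        # sort oldest-first by modification time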
        datasets.sort(key=lambda x: x['metadata_modified'])

        # aggregate resources
        def resource_identity(res_dict, dataset_name):
            return (res_dict.get('date'), res_dict['url'],
                    res_dict.get('title') or res_dict['description'],
                    res_dict.get('format'), dataset_name)

        combined_resources = {}  # resource identity tuple -> resource dict
        res_stats = Stats()
        for dataset in datasets:
            for resource in dataset['resources']:
                identity = resource_identity(resource, dataset['name'])
                resource['dataset_name'] = dataset['name']
                if identity in combined_resources:
                    print res_stats.add(
                        'Discarding duplicate', '\n%s duplicate of \n%s' %
                        (resource, combined_resources[identity]))
                else:
                    combined_resources[identity] = resource
        resources = combined_resources.values()

        # find dates for resources
        # NB This has been pulled out into timeseries_convert.py -
        # TODO call that instead of having the code here too.
        if options.frequency:
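            # swap common URL separators for spaces so a date embedded in a
            # URL can be matched like free text (see fields_to_hunt_for_date)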
            url_munge_re = re.compile(r'(%20|-|_|\.)')

            def fields_to_hunt_for_date(res):
                date = res.get('date')
                if date:
                    yield 'date', date
                title = res.get('title')
                if title:
                    yield 'title', title
                yield 'description', res['description']
                yield 'url', url_munge_re.sub(' ', res['url'])
                if not options.update:
                    dataset = datasets_by_name[res['dataset_name']]
                    yield 'dataset-title', dataset['title']
                    yield 'dataset-notes', dataset['notes']

            ensure_regexes_are_initialized()
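            # 'regexes' is module-level state populated by the call above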
            global regexes
            for resource in resources:
                for field_name, field_value in fields_to_hunt_for_date(
                        resource):
                    if options.frequency in ('monthly', 'quarterly',
                                             'twice annually'):
                        month, year = hunt_for_month_and_year(field_value)
                        if year and month:
                            resource['date'] = '%02d/%s' % (month, year)
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                    elif options.frequency == 'annually':
                        year = regexes['year'].search(field_value)
                        if year:
                            resource['date'] = year.groups()[0]
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                else:
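                    # for/else: reached only when the inner loop finished
                    # without break, i.e. no date was found in any field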
                    if resource.get('resource_type') == 'documentation':
                        print res_stats.add(
                            'Could not find date but it\'s Additional Resource',
                            resource)
                        continue
                    print res_stats.add('Could not find date', resource)
                    continue

            print 'Resources: \n', res_stats

            resources_without_date = [
                res for res in resources if not res.get('date')
                and res.get('resource_type') != 'documentation'
            ]
            for i, res in enumerate(resources_without_date):
                print 'Resources without dates %s/%s' % (
                    i + 1, len(resources_without_date))
                for field_name, field_value in fields_to_hunt_for_date(res):
                    print '  %s: %s' % (
                        field_name, field_value.encode('latin-1', 'ignore'))
                print 'https://data.gov.uk/dataset/%s/resource/%s' % (
                    res['dataset_name'], res['id'])
                date_format = {
                    'annually': 'YYYY',
                    'monthly': 'MM/YYYY',
                    'twice annually': 'MM/YYYY',
                    'quarterly': 'MM/YYYY'
                }
                input_ = raw_input(
                    'Date (%s) or DOCS to make it an Additional Resource: ' %
                    date_format[options.frequency])
                if input_.strip().lower() == 'docs':
                    res['date'] = ''
                    res['resource_type'] = 'documentation'
                else:
                    res['date'] = input_

            resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])
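            # NB the sort key turns 'MM/YYYY' into ['YYYY', 'MM'], so
            # resources order chronologically rather than by month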

        # Ensure there is not a mixture of resources with and without a date
        have_dates = None
        for res in resources:
            if res.get('resource_type') == 'documentation':
                continue
            if have_dates is None:
                have_dates = bool(res.get('date'))
            else:
                has_date = bool(res.get('date'))
                if has_date != have_dates:
                    print [res.get('date') for res in resources]
                    print 'Cannot mix resources with dates and others without!'
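                    # drop into the debugger so the operator can resolve the
                    # mixture by hand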
                    import pdb
                    pdb.set_trace()

        # Remove 'dataset_name' and other fields from resources
        ignore_res_fields = set(
            ('dataset_name', 'created', 'position', 'revision_id', 'id',
             'tracking_summary', 'qa', 'archiver'))
        for res in resources:
            for field in ignore_res_fields & set(res.keys()):
                del res[field]

        # Merge dataset fields
        def get_all_fields_and_values(datasets):
            ignore_fields = set((
                'id',
                'resources',
                'last_major_modification',
                'data_dict',
                'revision_timestamp',
                'num_tags',
                'metadata_created',
                'metadata_modified',
                'odi_certificate',
                'extras',  # they are at top level already
                'timeseries_resources',
                'individual_resources',
                'additional_resources',
                'revision_id',
                'organization',
                'tracking_summary',
                'num_resources',
                'license_title',
                'author',
                'author_email',
                'maintainer',
                'maintainer_email',
                'temporal_granularity',
                'geographic_granularity',
                'state',
                'isopen',
                'url',
                'date_update_future',
                'date_updated',
                'date_released',
                'precision',
                'taxonomy_url',
                'temporal_coverage-from',
                'temporal_coverage-to',
                'published_via',
                'creator_user_id',
                'qa',
                'archiver',
            ))
            first_fields = [
                'title', 'name', 'notes', 'theme-primary', 'theme-secondary'
            ]
            all_field_values = defaultdict(list)
            for dataset in datasets:
                for field in dataset:
                    if field not in ignore_fields and dataset[field]:
                        all_field_values[field].append(dataset[field])
            for field in first_fields:
                yield field, all_field_values.get(field, [])
            for field in all_field_values:
                if field not in first_fields:
                    yield field, all_field_values[field]

        spend_data_defaults = {
            'geographic_coverage': None,
            'theme-primary': 'Government Spending',
            'theme-secondary': None,
            'update_frequency': 'monthly',
        }
        combined_dataset = {'resources': resources}
        all_fields_and_values = get_all_fields_and_values(datasets)
        for field, values in all_fields_and_values:
            if field == 'notes':
                values = [value.strip() for value in values]
            if field == 'tags':
                # just merge them up-front and
                # don't offer the user any choice
                tags_by_name = {}
                for dataset_tags in values:
                    for tag in dataset_tags:
                        if tag['name'] not in tags_by_name:
                            tags_by_name[tag['name']] = tag
                values = [tags_by_name.values()]
            if field in ('codelist', 'schema'):
                # just merge them up-front
                # And convert the dict into just an id string
                ids = set()
                for dataset_values in values:
                    for value_dict in dataset_values:
                        ids.add(value_dict['id'])
                values = [list(ids)]
            print '\n%s:' % field
            pprint(list(enumerate(values)))
            if options.spend and field in spend_data_defaults:
                value = spend_data_defaults[field]
                print 'Spend data defaults to: %s' % value
                values = [value] if value is not None else None
            # don't be case-sensitive for boolean fields
            if field == 'core-dataset':
                values = [v.lower() for v in values]
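            # values may hold unhashable items (e.g. lists of tag dicts), in
            # which case set() raises TypeError and we fall back to pairwise
            # comparison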
            try:
                values_identical = len(set(values)) == 1
            except TypeError:
                if values and len(values):
                    val1 = values[0]
                    for val in values[1:]:
                        if val != val1:
                            values_identical = False
                            break
                    else:
                        values_identical = True
            if not values:
                value = None  # nothing to merge for this field
            elif values_identical:
                value = values[0]
            elif field == 'name':
                while True:
                    from ckan.lib.munge import munge_title_to_name
                    munged_title = munge_title_to_name(
                        combined_dataset['title'])
                    print munge_title_to_name(
                        datasets[0]['organization']['title'])
                    value = raw_input('Type new value (%s): ' % (munged_title))
                    if not value:
                        value = munged_title
                    if len(value) < 3:
                        print 'Too short'
                        continue
                    if value in values:
                        print 'That name is taken'
                        continue
                    existing = ckan.action.package_autocomplete(q=value)
                    if value in existing:
                        print 'That name is taken on CKAN'
                        continue
                    break
            else:
                while True:
                    response = raw_input(
                        '%s: value (number) or type new one: ' % field)
                    try:
                        value_index = int(response)
                        value = values[value_index]
                        print value
                    except ValueError:
                        # fix pound signs if the user pasted from the repr'd version
                        response = re.sub(r'\\xa3', u'\xa3', response)
                        value = response
                    if not value and field in ('title', 'owner_org', 'notes',
                                               'license_id'):
                        print 'You must have a value for this field!'
                        continue
                    break
            if value:
                combined_dataset[field] = value

        # Store
        print '\nMerged dataset:\n'
        pprint(combined_dataset)

        response = raw_input(
            'Press enter to write or pdb to edit in pdb first: ')
        if response == 'pdb':
            import pdb
            pdb.set_trace()
        try:
            if options.update:
                ckan.action.dataset_update(**combined_dataset)
            else:
                ckan.action.dataset_create(**combined_dataset)
        except Exception as e:
            print e
            import pdb
            pdb.set_trace()
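
Both examples also lean on common.name_stripped_of_url, which is not shown.
Going by the comment 'strip off the url part of the dataset name, if there
is one', a plausible sketch is the following; treat it as a guess at the
helper, not its actual implementation:

    def name_stripped_of_url(name_or_url):
        # accept either a bare dataset name or a full dataset URL such as
        # https://data.gov.uk/dataset/some-name
        if '://' in name_or_url:
            return name_or_url.rstrip('/').split('/')[-1]
        return name_or_url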