def command(cls, config_ini, org_names):
        common.load_config(config_ini)
        common.register_translator()
        from ckan.plugins import toolkit
        from ckan import model
        orgs = [toolkit.get_action('organization_show')(
                data_dict={'id': org_name})
                for org_name in org_names]
        source_org, dest_org = orgs
        assert source_org
        assert dest_org
        search_results = toolkit.get_action('package_search')(
            data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
        print 'Datasets: %s' % search_results['count']
        stats = Stats()
        if len(search_results['results']) != search_results['count']:
            assert 0, 'need to implement paging'

        #context = {
        #    'user': get_script_user(__name__)['name'],
        #    'ignore_auth': True,
        #    'model': model}
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__
        for dataset in search_results['results']:
            model.Package.get(dataset['id']).owner_org = dest_org['id']
            #dataset_ = toolkit.get_action('package_patch')(
            #    context=context,
            #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
            print stats.add('Changed owner_org', dataset['name'])
        print stats.report()
        print 'Writing'
        model.Session.commit()
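The assert above simply bails out when a publisher has more than 1000 datasets. A minimal paging sketch, assuming only the standard start/rows parameters of CKAN's package_search action (the helper name is illustrative):

def search_all_datasets(toolkit, fq, page_size=1000):
    # page through package_search instead of asserting when count > rows
    datasets = []
    start = 0
    while True:
        results = toolkit.get_action('package_search')(
            data_dict=dict(fq=fq, rows=page_size, start=start))
        datasets.extend(results['results'])
        start += page_size
        if start >= results['count']:
            break
    return datasets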
def dgu_update(apikey):
    from ckanext.dgu.forms import validators
    import ckanapi
    dgu = ckanapi.RemoteCKAN('http://data.gov.uk',
                             user_agent=__file__,
                             apikey=apikey)
    dgu_categories = dict(validators.categories)
    dgu_categories_by_title = dict(
        (title, id) for id, title in validators.categories)
    stats_category = Stats()
    stats_state = Stats()
    org_names_request = requests.get(
        'http://data.gov.uk/api/action/organization_list')
    # NB Not using all_fields as it doesn't include extras, like category
    org_names = json.loads(org_names_request.content)['result']
    opennames = nomenklatura.Dataset('public-bodies-uk')
    for org_name in org_names:
        org_request = requests.get(
            'http://data.gov.uk/api/action/organization_show?id=%s' % org_name)
        org = json.loads(org_request.content)['result']
        # convert the extras into a dict
        org['extras'] = dict(
            (extra['key'], extra['value']) for extra in org['extras'])
        try:
            entity = opennames.entity_by_name(org['title'])
        except NoMatch:
            # BTW it hasn't been added for review
            msg = 'Org not found in nomenklatura'
            print stats_category.add(msg, org_name)
            stats_state.add(msg, org_name)
            continue
        entity = entity.dereference()
        changed_org = dgu_update_category(org_name, org, entity,
                                          stats_category, dgu_categories,
                                          dgu_categories_by_title)
        if changed_org:
            # convert the extras back into a list of dicts
            org['extras'] = [{
                'key': key,
                'value': value
            } for key, value in org['extras'].items()]
            try:
                org = dgu.action.organization_update(**org)
            except ckanapi.errors.CKANAPIError, e:
                if '504 Gateway Time-out' in str(e):
                    print stats_category.add('Time-out writing', org_name)
                else:
                    raise
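Both dgu_update() above and dgu_reconcile() below convert an organization's extras from CKAN's list-of-dicts form into a plain dict and back before writing. A standalone sketch of that round trip (the helper names are illustrative):

def extras_to_dict(extras_list):
    # [{'key': 'category', 'value': 'x'}, ...] -> {'category': 'x', ...}
    return dict((extra['key'], extra['value']) for extra in extras_list)

def extras_to_list(extras_dict):
    # inverse conversion, in the shape expected by organization_update
    return [{'key': key, 'value': value}
            for key, value in extras_dict.items()]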
def dgu_reconcile():
    from ckanext.dgu.forms import validators
    stats = Stats()
    messages = Messages()
    org_names_request = requests.get(
        'http://data.gov.uk/api/action/organization_list')
    dgu_categories = dict(validators.categories)

    # NB Not using all_fields as it doesn't include extras, like category
    org_names = json.loads(org_names_request.content)['result']
    for org_name in org_names:
        org_request = requests.get(
            'http://data.gov.uk/api/action/organization_show?id=%s' % org_name)
        org = json.loads(org_request.content)['result']
        # convert the extras into a dict
        org['extras'] = dict(
            (extra['key'], extra['value']) for extra in org['extras'])
        attributes = {
            'dgu-name': org['name'],
            'dgu-uri': 'http://data.gov.uk/publisher/%s' % org['name'],
        }
        category = org['extras'].get('category')
        merge_attributes = {
            'abbreviation':
            org['extras'].get('abbreviation'),
            'category':
            dgu_categories[category] if category in dgu_categories else None,
        }
        _merge_org(org['title'], attributes, merge_attributes, stats, messages)
    print stats
    _print_messages(messages)
def gds_reconcile():
    stats = Stats()
    messages = Messages()
    gds_page_number = 0
    while True:
        gds_page_number += 1
        url = 'https://www.gov.uk/api/organisations?page=%d' % gds_page_number
        print url
        gds_page = requests.get(url)
        orgs = json.loads(gds_page.content)['results']
        if not orgs:
            break
        for org in orgs:
            status = 'closed' if org['details'][
                'govuk_status'] == 'closed' else 'active'
            # TODO for closed departments, scrape gov.uk to find out the
            # replacement department. e.g.
            # https://www.gov.uk/api/organisations/department-of-constitutional-affairs
            attributes = {
                'govuk-id': org['id'],
                'govuk-url': org['web_url'],
                'category': org['format'],  # e.g. "Ministerial department"
            }
            merge_attributes = {
                'abbreviation': org['details']['abbreviation'],
                'status': status  # "closed"/"active"
            }
            _merge_org(org['title'], attributes, merge_attributes, stats,
                       messages)
    print stats
    _print_messages(messages)
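For the TODO above about closed departments, a hedged sketch of how a replacement might be looked up from the organisation's own gov.uk API record; the 'superseding_organisations' field name is an assumption about that API response, not something confirmed by this code:

import json
import requests

def find_replacement_orgs(org_slug):
    # e.g. org_slug = 'department-of-constitutional-affairs'
    url = 'https://www.gov.uk/api/organisations/%s' % org_slug
    org = json.loads(requests.get(url).content)
    # 'superseding_organisations' is assumed, not verified here
    return [replacement.get('title')
            for replacement in org.get('superseding_organisations', [])]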
Example 5
    def run(cls, config_ini_or_ckan_url, dataset_names):
        ckan = common.get_ckanapi(config_ini_or_ckan_url)

        stats = Stats()
        for dataset_name in dataset_names:
            try:
                ckan.call_action('dataset_delete',
                                 {'id': dataset_name})
                print stats.add('Deleted (or was already deleted)', dataset_name)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception, e:
                if 'CKANAPIError' in str(e):
                    print e
                    print 'Not calling API correctly - aborting'
                    sys.exit(1)
                print stats.add('Error %s' % type(e).__name__,
                                '%s %s' % (dataset_name, e))
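All of these scripts report progress through running_stats.Stats. Judging from the usage above (add() returns a printable line, and printing the object gives a per-category summary), a typical pattern looks like this sketch:

from running_stats import Stats

stats = Stats()
for name in ('dataset-a', 'dataset-b'):
    # add() files the item under a category and returns a line to print as you go
    print stats.add('Processed', name)
# printing the object (or calling stats.report()) summarises counts per category
print stats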
Example 7
def add_date_to_resource(resource, just_year=False, dataset=None, stats=None):
    stats = stats or Stats()
    parsed_date = False
    if not just_year:
        # month and year
        for field_name, field_value in fields_to_hunt_for_date(
                resource, dataset):
            month, year = hunt_for_month_and_year(field_value)
            if year and month:
                resource['date'] = '%02d/%s' % (month, year)
                stats.add('Found date in %s' % field_name,
                          '%s %r' % (resource['date'], resource))
                if resource.get('resource_type') == 'documentation':
                    resource['resource_type'] = 'file'
                    stats.add('Converted additional resource', resource)
                parsed_date = True
                break

    if not parsed_date:
        for field_name, field_value in fields_to_hunt_for_date(resource):

            # year
            year = re.search(r'%s(20\d{2})%s' % (BOUNDARY, BOUNDARY),
                             field_value)
            if year:
                resource['date'] = year.groups()[0]
                stats.add('Found date in %s' % field_name,
                          '%s %r' % (resource['date'], resource))
                if resource.get('resource_type') == 'documentation':
                    resource['resource_type'] = 'file'
                    stats.add('Converted additional resource', resource)
                parsed_date = True
                break

    if not parsed_date:
        if resource.get('resource_type') == 'documentation':
            stats.add(
                'Could not find date but it\'s an Additional '
                'Resource', resource)
            return stats
        stats.add('Could not find date', resource)
        return stats
    return stats
Example 8
def add_date_to_resources(resources,
                          just_year=False,
                          dataset=None,
                          stats=None):
    '''Given a list of resource dicts, it tries to add a date value to them
    all. Sets date to e.g. '09/2012'.

    Specify just_year if it is an annual dataset and you want to ignore months.

    Specify a dataset if you want to look for dates in its title/description -
    suitable if you are merging several datasets into a series.
    '''
    stats = stats or Stats()
    for resource in resources:
        add_date_to_resource(resource,
                             just_year=just_year,
                             dataset=dataset,
                             stats=stats)
    return stats
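A minimal usage sketch for the helper above, with purely illustrative resource dicts (the field names mirror the ones the date-hunting code inspects):

resources = [
    {'description': 'Spend over 25k, March 2014',
     'url': 'http://example.org/spend-03-2014.csv'},
    {'description': 'Guidance notes',
     'url': 'http://example.org/guidance.pdf',
     'resource_type': 'documentation'},
]
stats = add_date_to_resources(resources)
print stats                      # reports which field each date was found in
print resources[0].get('date')   # e.g. '03/2014' if the month and year were recognised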
Example 10
    def run(cls, config_ini_or_ckan_url, dataset_names):
        ckan = common.get_ckanapi(config_ini_or_ckan_url)

        stats = Stats()
        for dataset_name in dataset_names:
            dataset_name = common.name_stripped_of_url(dataset_name)
            try:
                ckan.call_action('dataset_delete', {'id': dataset_name})
                print stats.add('Deleted (or was already deleted)',
                                dataset_name)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception, e:
                if 'CKANAPIError' in str(e):
                    print e
                    print 'Not calling API correctly - aborting'
                    sys.exit(1)
                print stats.add('Error %s' % type(e).__name__,
                                '%s %s' % (dataset_name, e))
"""
Set the resource.format from the QA value for these resources:
* where the resource.format is empty (or just whitespace)
* where resource.format is not a poor one and QA is most likely more accurate

This will trigger an archive/packagezip/qa cycle, so should not be used too
frequently.
"""
from sqlalchemy.exc import IntegrityError
from optparse import OptionParser

from ckan.logic import ValidationError
from ckanext.dgu.bin import common
from running_stats import Stats

res_stats = Stats()
ds_stats = Stats()

UPDATE_FORMATS = {
    'CSV / ZIP': 'CSV',  # legacy - we now drop mention of the zip
    'XML': set((
        'WFS',  # previously we set UKLP WFS resources as XML but we can detect WFS now
        'Atom Feed',
        'SHP',
        'WCS',
        'WMTS',
        )),
    'DBASE': 'SHP',
    'ZIP': 'SHP',
    }
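A hedged sketch of how UPDATE_FORMATS above might be consulted when deciding whether to overwrite a stored format with the QA-detected one; the helper is illustrative, and the script's actual decision code is not shown here:

def should_update_format(current_format, qa_format):
    current = (current_format or '').strip()
    if not current:
        # empty or whitespace-only formats always take the QA value
        return True
    wanted = UPDATE_FORMATS.get(current)
    if wanted is None:
        return False
    # a key can map to a single QA format or to a set of acceptable ones
    if isinstance(wanted, set):
        return qa_format in wanted
    return qa_format == wanted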
Example 12
def dgu_account(args):
    ckan = get_ckan(args.ckan)
    uploads = get_uploads()

    stats = Stats()
    publishers = {}  # by email.lower()
    for upload in uploads:
        version = datetime.datetime.strptime(upload['version'], '%d/%m/%Y')
        if version < datetime.datetime(2015, 1, 1):
            #stats.add('Ignore - before 2015',
            #          '%s %s' % (upload['version'], upload['submitter_email']))
            continue
        if '@' not in upload['submitter_email']:
            stats.add('Ignore - bad email address', upload['submitter_email'])
            continue
        if upload['submitter_email'].lower() not in publishers:
            publishers[upload['submitter_email'].lower()] = []
            stats.add('Added', upload['submitter_email'])
        else:
            stats.add('Appended', upload['submitter_email'])
        publishers[upload['submitter_email'].lower()].append(
            dict(email=upload['submitter_email'],
                 org_name=upload['org_name'],
                 version=version))

    print 'Email addresses:'
    print stats

    cache_filename = '.users.%s.cache' % (args.ckan.replace(':', '-'))
    if os.path.exists(cache_filename):
        print 'Getting users from %s' % cache_filename
        with open(cache_filename, 'rb') as f:
            users_str = f.read()
        users = json.loads(users_str)
    else:
        print 'Getting users from %s' % args.ckan
        # NB this doesn't work remotely because varnish times out,
        # so run from prod3 itself against 8080 from ~/organograms
        users = ckan.action.user_list()
        print 'Saving users to %s' % cache_filename
        users_str = json.dumps(users)
        with open(cache_filename, 'wb') as f:
            f.write(users_str)
    print '%s users' % len(users)
    users_by_email = dict([(user['email'], user) for user in users])

    def get_user(email_variants):
        for email_variant in email_variants:
            if email_variant in users_by_email:
                return users_by_email[email_variant]

    stats = Stats()
    user_table = []
    for email_lower in publishers:
        user_row = dict(email=email_lower)

        versions = (upload['version'] for upload in publishers[email_lower])
        latest_version = sorted(versions)[-1]
        user_row['source of contact'] = '%s organogram published' \
            % datetime.datetime.strftime(latest_version, '%Y-%m')

        # find the organization
        org_names_raw = set(
            (upload['org_name'] for upload in publishers[email_lower]))
        orgs = []
        for org_name_raw in org_names_raw:
            title = canonize(org_name_raw)
            match = DguOrgs.by_canonized_title().get(
                title) or Aliases.get_from_canonized(title)
            assert match, 'No match: %s' % org_name_raw
            if isinstance(match, basestring):
                match = DguOrgs.by_title()[match]
            if match not in orgs:
                orgs.append(match)
        user_row['organization'] = ' / '.join([org['title'] for org in orgs])

        # see if they are a user on data.gov.uk
        email_variants = set(
            (upload['email'] for upload in publishers[email_lower]))
        user = get_user(email_variants)
        user_table.append(user_row)

        emails_str = '/'.join(email_variants)
        if not user:
            user_row['has dgu login'] = '******'
            print stats.add('Not registered', emails_str)
            continue
        # assume has confirmed email
        user_row['has dgu login'] = '******'
        user_row['name'] = user['fullname']
        user_row['email'] = user['email']

        # see if this user is an editor/admin for the organization
        user_permissions = []
        for org in orgs:
            editors_and_admins = (user['name'] for user in org['users'])
            if user['name'] in editors_and_admins:
                user_permissions.append('yes')
                print stats.add('Already an editor/admin',
                                '%s %s' % (emails_str, org['title']))
            else:
                user_permissions.append('no')
                # use a list, not a generator, so the emptiness check below works
                admins = [u['name'] for u in org['users']
                          if u['capacity'] == 'admin']
                if admins:
                    print stats.add(
                        'Need to get permission. Admin exists', '%s %s %s' %
                        (emails_str, org['title'], ', '.join('"%s"' % a
                                                             for a in admins)))
                else:
                    print stats.add('Need to get permission. No admin',
                                    '%s %s' % (emails_str, org['title']))
        user_row['editor or admin'] = ' / '.join(user_permissions)

    def extract_email(stat):
        emails = stat.split(' ')[0]  # the first word
        email = emails.split('/')[0]  # ignore variants
        return email

    print '\nFor emailing:'
    print '-------------'
    print '\nNot registered:'
    print ', '.join(stats['Not registered'])
    print '\nAlready an editor/admin:'
    print ', '.join([
        extract_email(email_and_org)
        for email_and_org in stats['Already an editor/admin']
    ])
    print '\nNeed to get permission. Admin exists:'
    print ', '.join([
        extract_email(email_and_org)
        for email_and_org in stats['Need to get permission. Admin exists']
    ])

    print '\nTable:'
    print '-------------'
    headers = ('name', 'email', 'organization', 'has dgu login',
               'editor or admin', 'source of contact')
    print '\t'.join(headers)
    for row in user_table:
        print '\t'.join(row.get(header, '') for header in headers)

    print '\nPermissions'
    print '-------------'
    print stats
    def command(cls, config_ini, dataset_names, options):
        common.load_config(config_ini)
        common.register_translator()

        from pylons import config
        apikey = config['dgu.merge_datasets.apikey']
        ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
        #ckan = ckanapi.LocalCKAN()

        if options.publisher:
            org_name = common.name_stripped_of_url(options.publisher)
            if options.search:
                results = ckan.action.package_search(q=options.search,
                                                     fq='publisher:%s' %
                                                     org_name,
                                                     rows=100)
                dataset_names.extend(
                    [dataset['name'] for dataset in results['results']])
            else:
                org = ckan.action.organization_show(id=org_name,
                                                    include_datasets=True)
                dataset_names.extend([d['name'] for d in org['packages']])

        datasets = []
        datasets_by_name = {}

        def get_extra(dataset, key):
            for extra in dataset['extras']:
                if extra['key'] == key:
                    return extra['value']

        for dataset_name in dataset_names:
            print 'Dataset: %s' % dataset_name
        for dataset_name in dataset_names:
            # strip off the url part of the dataset name, if there is one
            dataset_name = common.name_stripped_of_url(dataset_name)
            dataset = ckan.action.package_show(id=dataset_name)
            harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
            if harvest_source_ref:
                print '** Discarding dataset %s due to harvest source: %s **' \
                    % (dataset_name, harvest_source_ref)
                continue
            datasets.append(dataset)
            datasets_by_name[dataset['name']] = dataset
        datasets.sort(key=lambda x: x['metadata_modified'])

        # aggregate resources
        def resource_identity(res_dict, dataset_name):
            return (res_dict.get('date'), res_dict['url'],
                    res_dict.get('title') or res_dict['description'],
                    res_dict.get('format'), dataset_name)

        combined_resources = {}  # identity
        res_stats = Stats()
        for dataset in datasets:
            for resource in dataset['resources']:
                identity = resource_identity(resource, dataset['name'])
                resource['dataset_name'] = dataset['name']
                if identity in combined_resources:
                    print res_stats.add(
                        'Discarding duplicate', '\n%s duplicate of \n%s' %
                        (resource, combined_resources[identity]))
                else:
                    combined_resources[identity] = resource
        resources = combined_resources.values()

        # find dates for resources
        # NB This has been pulled out into timeseries_convert.py -
        # TODO call that instead of having the code here too.
        if options.frequency:
            url_munge_re = re.compile('(%20|-|_|\.)')

            def fields_to_hunt_for_date(res):
                date = res.get('date')
                if date:
                    yield 'date', date
                title = res.get('title')
                if title:
                    yield 'title', title
                yield 'description', res['description']
                yield 'url', url_munge_re.sub(' ', res['url'])
                if not options.update:
                    dataset = datasets_by_name[res['dataset_name']]
                    yield 'dataset-title', dataset['title']
                    yield 'dataset-notes', dataset['notes']

            ensure_regexes_are_initialized()
            global regexes
            for resource in resources:
                for field_name, field_value in fields_to_hunt_for_date(
                        resource):
                    if options.frequency in ('monthly', 'quarterly',
                                             'twice annually'):
                        month, year = hunt_for_month_and_year(field_value)
                        if year and month:
                            resource['date'] = '%02d/%s' % (month, year)
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                    elif options.frequency == 'annually':
                        year = regexes['year'].search(field_value)
                        if year:
                            resource['date'] = year.groups()[0]
                            res_stats.add(
                                'Found date in %s' % field_name,
                                '%s %r' % (resource['date'], resource))
                            if resource.get(
                                    'resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource',
                                              resource)
                            break
                else:
                    if resource.get('resource_type') == 'documentation':
                        print res_stats.add(
                            'Could not find date but it\'s Additional Resource',
                            resource)
                        continue
                    print res_stats.add('Could not find date', resource)
                    continue

            print 'Resources: \n', res_stats

            resources_without_date = [
                res for res in resources if not res.get('date')
                and res.get('resource_type') != 'documentation'
            ]
            for i, res in enumerate(resources_without_date):
                print 'Resources without dates %s/%s' % (
                    i + 1, len(resources_without_date))
                for field_name, field_value in fields_to_hunt_for_date(res):
                    print '  %s: %s' % (
                        field_name, field_value.encode('latin-1', 'ignore'))
                print 'https://data.gov.uk/dataset/%s/resource/%s' % (
                    res['dataset_name'], res['id'])
                date_format = {
                    'annually': 'YYYY',
                    'monthly': 'MM/YYYY',
                    'twice annually': 'MM/YYYY',
                    'quarterly': 'MM/YYYY'
                }
                input_ = raw_input(
                    'Date (%s) or DOCS to make it an Additional Resource: ' %
                    date_format[options.frequency])
                if input_.strip().lower() == 'docs':
                    res['date'] = ''
                    res['resource_type'] = 'documentation'
                else:
                    res['date'] = input_

            resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

        # Ensure there is not a mixture of resources with and without a date
        have_dates = None
        for res in resources:
            if res.get('resource_type') == 'documentation':
                continue
            if have_dates is None:
                have_dates = bool(res.get('date'))
            else:
                has_date = bool(res.get('date'))
                if has_date != have_dates:
                    print [res.get('date') for res in resources]
                    print 'Cannot mix resources with dates and others without!'
                    import pdb
                    pdb.set_trace()

        # Remove 'dataset_name' and others fields from resources
        ignore_res_fields = set(
            ('dataset_name', 'created', 'position', 'revision_id', 'id',
             'tracking_summary', 'qa', 'archiver'))
        for res in resources:
            for field in ignore_res_fields & set(res.keys()):
                del res[field]

        # Merge dataset fields
        def get_all_fields_and_values(datasets):
            ignore_fields = set((
                'id',
                'resources',
                'last_major_modification',
                'data_dict',
                'revision_timestamp',
                'num_tags',
                'metadata_created',
                'metadata_modified',
                'odi_certificate',
                'extras',  # they are at top level already
                'timeseries_resources',
                'individual_resources',
                'additional_resources',
                'revision_id',
                'organization',
                'tracking_summary',
                'num_resources',
                'license_title',
                'author',
                'author_email',
                'maintainer',
                'maintainer_email',
                'temporal_granularity',
                'geographic_granularity',
                'state',
                'isopen',
                'url',
                'date_update_future',
                'date_updated',
                'date_released',
                'precision',
                'taxonomy_url',
                'temporal_coverage-from',
                'temporal_coverage-to',
                'published_via',
                'creator_user_id',
                'qa',
                'archiver',
            ))
            first_fields = [
                'title', 'name', 'notes', 'theme-primary', 'theme-secondary'
            ]
            all_field_values = defaultdict(list)
            for dataset in datasets:
                for field in dataset:
                    if field not in ignore_fields and dataset[field]:
                        all_field_values[field].append(dataset[field])
            for field in first_fields:
                yield field, all_field_values.get(field, [])
            for field in all_field_values:
                if field not in first_fields:
                    yield field, all_field_values[field]

        spend_data_defaults = {
            'geographic_coverage': None,
            'theme-primary': 'Government Spending',
            'theme-secondary': None,
            'update_frequency': 'monthly',
        }
        combined_dataset = {'resources': resources}
        all_fields_and_values = get_all_fields_and_values(datasets)
        for field, values in all_fields_and_values:
            if field == 'notes':
                values = [value.strip() for value in values]
            if field == 'tags':
                # just merge them up-front and
                # dont offer user any choice
                tags_by_name = {}
                for dataset_tags in values:
                    for tag in dataset_tags:
                        if tag['name'] not in tags_by_name:
                            tags_by_name[tag['name']] = tag
                values = [tags_by_name.values()]
            if field in ('codelist', 'schema'):
                # just merge them up-front
                # And convert the dict into just an id string
                ids = set()
                for dataset_values in values:
                    for value_dict in dataset_values:
                        ids.add(value_dict['id'])
                values = [list(ids)]
            print '\n%s:' % field
            pprint(list(enumerate(values)))
            if options.spend and field in spend_data_defaults:
                value = spend_data_defaults[field]
                print 'Spend data defaults to: %s' % value
                values = [value] if value is not None else None
            # dont be case-sensitive for boolean fields
            if field == 'core-dataset':
                values = [v.lower() for v in values]
            try:
                values_identicle = len(set(values)) == 1
            except TypeError:
                if values and len(values):
                    val1 = values[0]
                    for val in values[1:]:
                        if val != val1:
                            values_identicle = False
                            break
                    else:
                        values_identicle = True
            if (not values) or (not len(values)):
                pass
            elif values_identicle:
                value = values[0]
            elif field == 'name':
                while True:
                    from ckan.lib.munge import munge_title_to_name
                    munged_title = munge_title_to_name(
                        combined_dataset['title'])
                    print munge_title_to_name(
                        datasets[0]['organization']['title'])
                    value = raw_input('Type new value (%s): ' % (munged_title))
                    if not value:
                        value = munged_title
                    if len(value) < 3:
                        print 'Too short'
                        continue
                    if value in values:
                        print 'That name is taken'
                        continue
                    existing = ckan.action.package_autocomplete(q=value)
                    if value in existing:
                        print 'That name is taken on CKAN'
                        continue
                    break
            else:
                while True:
                    response = raw_input(
                        '%s: value (number) or type new one: ' % field)
                    try:
                        value_index = int(response)
                        value = values[value_index]
                        print value
                    except ValueError:
                        # fix pound signs if the user pasted from the repr'd version
                        response = re.sub(r'\\xa3', u'\xa3', response)
                        value = response
                    if not value and field in ('title', 'owner_org', 'notes',
                                               'license_id'):
                        print 'You must have a value for this field!'
                        continue
                    break
            if value:
                combined_dataset[field] = value

        # Store
        print '\nMerged dataset:\n'
        pprint(combined_dataset)

        response = raw_input(
            'Press enter to write or pdb to edit in pdb first: ')
        if response == 'pdb':
            import pdb
            pdb.set_trace()
        try:
            if options.update:
                ckan.action.dataset_update(**combined_dataset)
            else:
                ckan.action.dataset_create(**combined_dataset)
        except Exception, e:
            print e
            import pdb
            pdb.set_trace()
    def import_(cls, csv_filepath):
        log = global_log

        from ckan import model
        stats_category = Stats()

        pub_categories = csv.reader(open(csv_filepath, 'rb'))
        header = pub_categories.next()
        assert_equal('"%s"\n' % '","'.join(header), cls.header)
        for id, title, parent, category, spending_published_by in pub_categories:
            pub = model.Session.query(model.Group).get(id)
            if not pub:
                print stats_category.add('Publisher ID not known', '%s %s' % (id, title))
                continue
            category = category.strip()

            # set category
            existing_category = pub.extras.get('category')
            if not category and not existing_category:
                print stats_category.add('No category info - ignored', title)
                continue
            if not category and existing_category:
                print stats_category.add('Category deleted', '%s %s' % (existing_category, title))
                rev = model.repo.new_revision()
                rev.author = 'script_' + __file__
                pub.extras['category'] = None
                model.Session.commit()
                continue
            if category not in categories_dict.keys():
                print stats_category.add('Category %s not known - ignored' % category, title)
                continue
            if existing_category != category:
                print stats_category.add('Changing category',
                    '%s->%s %s' % (existing_category or '(none)', category, title))
                rev = model.repo.new_revision()
                rev.author = 'script_' + __file__
                pub.extras['category'] = category
                model.Session.commit()
            else:
                print stats_category.add('No change',
                        '%s %s' % (existing_category or '(none)', title))
                log.info('Leaving category for %r as %s', title, category)

            # set spending_published_by
            existing_spb = pub.extras.get('spending_published_by')
            if not spending_published_by:
                log.info('No spending_published_by for %r', title)
                continue
            spb_publisher = model.Group.get(spending_published_by)
            if not spb_publisher:
                spb_publisher = model.Group.search_by_name_or_title(spending_published_by)
                if not spb_publisher:
                    warn('Spending_published_by not known %s - skipping %s %s',
                         spending_published_by, id, title)
                    import pdb; pdb.set_trace()
                    continue
            spending_published_by = spb_publisher.name
            if existing_spb != spending_published_by:
                log.info('Changing SPB %r %s -> %s',
                         title, existing_spb or '(none)', spending_published_by)
                model.repo.new_revision()
                pub.extras['spending_published_by'] = spending_published_by
                model.Session.commit()
            else:
                log.info('Leaving SPB for %r as %s', title, spending_published_by)

        model.Session.remove()

        print stats_category
        log.info('Warnings: %r', warnings)
Example 15
e.g.

["[", "\\"", "G", "o", "v", "e", "r", "n", "m", "e", "n", "t", "", "S", "p", "e", "n", "d",     "i", "n", "g", "\\"", "]"] -> ["Government Spending"]

["C", "r", "i", "m", "e", "", "&", "", "J", "u", "s", "t", "i", "c", "e"] -> ["Crime & Justice"]

'''

import json
from optparse import OptionParser

import common
from running_stats import Stats

stats_format = Stats()
stats_outcome = Stats()

LOOKUP = {
    '["[", "]"]': '[]',
    '["[", "\\"", "G", "o", "v", "e", "r", "n", "m", "e", "n", "t", "", "S", "p", "e", "n", "d", "i", "n", "g", "\\"", "]"]': '["Government Spending"]',
    '["[", "\\"", "B", "u", "s", "i", "n", "e", "s", "s", "", "&", "", "E", "c", "o", "n", "o", "m", "y", "\\"", "]"]': '["Business & Economy"]',
    '["[", "", "", "\\"", "", "", "M", "", "", "a", "", "", "p", "", "", "p", "", "", "i", "", "", "n", "", "", "g", "", "", "\\"", "", "", "]"]': '["Mapping"]',
    '["[", "\\"", "M", "a", "p", "p", "i", "n", "g", "\\"", "]"]': '["Mapping"]',
    '["[", "\\"", "E", "n", "v", "i", "r", "o", "n", "m", "e", "n", "t", "\\"", "]"]': '["Environment"]',
    '["[", "\\"", "G", "o", "v", "e", "r", "n", "m", "e", "n", "t", "\\"", "]"]': '["Government"]',
    '["[", "\\"", "B", "u", "s", "i", "n", "e", "s", "s", "", "&", "", "E", "c", "o", "n", "o", "m", "y", "\\"", "", "", "", "\\"", "G", "o", "v", "e", "r", "n", "m", "e", "n", "t", "", "S", "p", "e", "n", "d", "i", "n", "g", "\\"", "", "", "", "\\"", "H", "e", "a", "l", "t", "h", "\\"", "]"]': '["Business & Economy", "Government Spending", "Health"]',
}
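The LOOKUP table handles the specific broken values seen in the data. A more general repair, sketched here only to illustrate what the table encodes (the script itself appears to rely on the explicit table), would rejoin the split characters and re-parse them:

import json

def rejoin_characters(char_list):
    # e.g. ["C", "r", "i", "m", "e", "", "&", ...] -> ["Crime & Justice"]
    # empty strings stand in for the spaces that were lost
    text = ''.join(c if c else ' ' for c in char_list)
    try:
        # some values were a whole JSON list split up, e.g. '["Mapping"]'
        return json.loads(text)
    except ValueError:
        # others were a bare theme name, e.g. 'Crime & Justice'
        return [text]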


class FixSecondaryTheme3(object):
Example 16
'''
Remove the no-longer-used Resource extra 'cache_filepath' and related fields.
'''
import math
from copy import deepcopy

import common
from optparse import OptionParser
from ckan import model

from running_stats import Stats

stats_rp = Stats()
stats_re = Stats()
stats_dp = Stats()
stats_de = Stats()

# These are ckan resource properties that an old version of the archiver filled
# in and are no longer updated. They are now stored in the Archival table.
res_properties_to_make_null = set((
    'cache_last_updated',
    'size',
    'hash',
    'last_modified',
    'mimetype',
    'cache_url',
    ))

# These are custom extra fields added by old versions of the archiver &
# qa, whereas now they store them in the Archival table.
res_extras_to_remove = set((
import json
import os.path
import unicodecsv
import copy
import re
from pprint import pprint
import datetime
import traceback  # used by the error handlers at the end of main()
from collections import defaultdict

from paste.deploy.converters import asbool

import common
import timeseries_convert
from running_stats import Stats

stats_datasets = Stats()
stats_merge = Stats()
stats_dates = Stats()
stats_res = Stats()


def main(source, source_type, destination,
         save_relevant_datasets_json,
         write,
         dataset_filter=None, res_url_filter=None):

    if source_type == 'json':
        all_datasets = get_datasets_from_json(source)
    elif source_type == 'jsonl':
        all_datasets = get_datasets_from_jsonl(source)
    else:
        all_datasets = get_datasets_from_ckan(source)

    datasets = []  # legacy ones
    revamped_datasets = []  # ones created on 3rd October 2016 launch
    revamped_datasets_by_org = {}
    revamped_resources = {}
    csv_out_rows = []
    csv_corrected_rows = []
    try:
        # find all the legacy organogram datasets
        all_datasets = list(all_datasets)  # since we need to iterate it twice
        for dataset in all_datasets:

            if dataset_filter and dataset['name'] != dataset_filter:
                continue
            if res_url_filter and \
                res_url_filter not in [r['url'] for r in dataset['resources']]:
                continue

            # check it is an organogram dataset
            dataset_str = repr(dataset).lower()
            if 'rganog' not in dataset_str \
                    and 'roles and salaries' not in dataset_str \
                    and 'pay and post' not in dataset_str \
                    and 'posts and pay' not in dataset_str \
                    and 'organisation chart' not in dataset_str \
                    and 'organization chart' not in dataset_str \
                    and 'org chart' not in dataset_str:
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if dataset['name'] in (
                    'eastbourne-borough-council-public-toilets',
                    'staff-organograms-and-pay-government-offices',
                    ) \
                    or dataset['id'] in (
                        '47f69ebb-9939-419f-880d-1b976676cb0e',
                    ):
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if asbool(dataset.get('unpublished')):
                stats_datasets.add('Ignored - unpublished',
                                   dataset['name'])
                continue
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') == 'organograms_v2':
                continue
            if extras.get('import_source') == 'harvest':
                stats_datasets.add('Ignored - harvested so can\'t edit it',
                                   dataset['name'])
                continue

            # legacy dataset
            datasets.append(dataset)

        # find the revamped organogram datasets
        for dataset in all_datasets:
            extras = dict((extra['key'], extra['value'])
                          for extra in dataset['extras'])
            if extras.get('import_source') != 'organograms_v2':
                continue

            org_id = dataset['owner_org']
            revamped_datasets.append(dataset)
            assert org_id not in revamped_datasets_by_org, org_id
            revamped_datasets_by_org[org_id] = dataset
            for res in dataset['resources']:
                date = date_to_year_month(res['date'])
                revamped_resources[(org_id, date)] = res
            continue

        if save_relevant_datasets_json:
            filename = 'datasets_organograms.json'
            if not (dataset_filter or res_url_filter):
                output = json.dumps(
                    datasets + revamped_datasets,
                    indent=4, separators=(',', ': '),  # pretty print
                    )
                with open(filename, 'wb') as f:
                    f.write(output)
                print 'Written %s' % filename
            else:
                print 'Not written %s because you filtered by a ' \
                    'dataset/resource' % filename

        all_resource_ids_to_delete = defaultdict(list)  # dataset_name: res_id_list
        dataset_names_to_delete = set()
        for dataset in datasets:
            org_id = dataset['owner_org']

            # save csv as it has been
            save_csv_rows(csv_out_rows, dataset, None, None)

            original_dataset = copy.deepcopy(dataset)
            delete_dataset = False

            dataset_to_merge_to = \
                get_dataset_to_merge_to(dataset, revamped_datasets_by_org)

            # detect dates
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                stats = timeseries_convert.add_date_to_resource(
                    res, dataset=dataset)

            # resource corrections
            resources_to_delete = []
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                resource_corrections(res, dataset, extras,
                                     revamped_resources,
                                     revamped_datasets_by_org,
                                     dataset_to_merge_to,
                                     org_id,
                                     resources_to_delete,
                                     stats_res)
            for res in resources_to_delete:
                dataset['resources'].remove(res)
            if not dataset['resources']:
                delete_dataset = True
            elif resources_to_delete and not dataset_to_merge_to:
                all_resource_ids_to_delete[dataset['name']].extend(
                    res['id'] for res in resources_to_delete)
            org_id = dataset['owner_org']  # it might have changed

            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                if res.get('resource_type') != 'documentation' and not res.get('date'):
                    stats_dates.add('Missing date', dataset['name'])
                    break
            else:
                stats_dates.add('Ok dates', dataset['name'])

            # record changes
            if delete_dataset:
                stats_datasets.add('Delete dataset - no resources', dataset['name'])
                dataset_names_to_delete.add(dataset['name'])
                continue
            elif original_dataset != dataset:
                stats_datasets.add('Updated dataset', dataset['name'])
                has_changed = True
            else:
                stats_datasets.add('Unchanged dataset', dataset['name'])
                has_changed = False

            if dataset_to_merge_to:
                stats_merge.add('Merge', dataset_to_merge_to)
            else:
                stats_merge.add('No merge', dataset['name'])

            # save csv with corrections
            save_csv_rows(csv_corrected_rows, dataset, has_changed, dataset_to_merge_to)

    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()

    stats_merge.report_value_limit = 500
    stats_res.report_value_limit = 500
    print '\nDatasets\n', stats_datasets
    print '\nDataset merges\n', stats_merge
    print '\nDates\n', stats_dates
    print '\nResources\n', stats_res

    # save csvs
    if dataset_filter or res_url_filter:
        for row in csv_corrected_rows:
            if res_url_filter and row['res_url'] != res_url_filter:
                continue
            pprint(row)
        print 'Not written csv because you specified a particular dataset'
    else:
        headers = [
            'name', 'org_title', 'org_id', 'notes',
            'res_id', 'res_name', 'res_url', 'res_format',
            'res_date', 'res_type',
            'has_changed',
            'merge_to_dataset',
            ]
        for csv_rows, out_filename in (
                (csv_out_rows, 'organogram_legacy_datasets.csv'),
                (csv_corrected_rows, 'organogram_legacy_datasets_corrected.csv'),
                ):
            with open(out_filename, 'wb') as csv_write_file:
                csv_writer = unicodecsv.DictWriter(csv_write_file,
                                                   fieldnames=headers,
                                                   encoding='utf-8')
                csv_writer.writeheader()
                for row in sorted(csv_rows, key=lambda r: r['res_url']):
                    csv_writer.writerow(row)
            print 'Written', out_filename

    # group merges by the revamped_dataset
    resources_to_merge = defaultdict(list)  # revamped_dataset_name: resource_list
    resources_to_update = defaultdict(list)  # dataset_name: resource_list
    for row in csv_corrected_rows:
        if row['has_changed'] is False:
            continue
        res = dict(
            id=row['res_id'],
            description=row['res_name'],  # description is required
            url=row['res_url'],
            format=row['res_format'],
            date=row['res_date'],
            resource_type=row['res_type'])
        if row['merge_to_dataset']:
            res['id'] = None  # ignore the id
            resources_to_merge[row['merge_to_dataset']].append(res)
            # also delete the merged dataset
            if row['name'] not in dataset_names_to_delete:
                dataset_names_to_delete.add(row['name'])
        else:
            resources_to_update[row['name']].append(res)

    # write changes - merges etc
    try:
        if destination:
            if write:
                write_caveat = ''
            else:
                write_caveat = ' (NOP without --write)'
            print 'Writing changes to datasets' + write_caveat
            stats_write_res = Stats()
            stats_write_dataset = Stats()
            ckan = common.get_ckanapi(destination)
            import ckanapi

            print 'Updating datasets'
            for dataset_name, res_list in resources_to_update.iteritems():
                dataset = ckan.action.package_show(id=dataset_name)
                resources_by_id = dict((r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (dataset_name, res_list.index(res))
                    res_to_update = resources_by_id.get(res['id'])
                    if res_to_update:
                        res_changed = False
                        for key in res.keys():
                            if res[key] != res_to_update.get(key):
                                res_to_update[key] = res[key]
                                dataset_changed = True
                                res_changed = True
                        if res_changed:
                            stats_write_res.add(
                                'update - ok' + write_caveat, res_ref)
                        else:
                            stats_write_res.add(
                                'update - not needed', res_ref)
                    else:
                        stats_write_res.add(
                            'update - could not find resource id', dataset_name)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add(
                        'Update done' + write_caveat, dataset_name)
                else:
                    stats_write_dataset.add(
                        'Update not needed', dataset_name)

            print 'Merging datasets'
            for revamped_dataset_name, res_list in \
                    resources_to_merge.iteritems():
                try:
                    dataset = ckan.action.package_show(id=revamped_dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add(
                        'Merge - dataset not found', revamped_dataset_name)
                    continue
                existing_res_urls = set(r['url'] for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (revamped_dataset_name, res_list.index(res))
                    if res['url'] in existing_res_urls:
                        stats_write_res.add(
                            'merge - no change - resource URL already there',
                            res_ref)
                    else:
                        dataset_changed = True
                        res['description'] += ' (from legacy dataset)'
                        dataset['resources'].append(res)
                        stats_write_res.add(
                            'merge - add' + write_caveat, res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add(
                        'Merge done' + write_caveat, revamped_dataset_name)
                else:
                    stats_write_dataset.add('Merge not needed', revamped_dataset_name)

            print 'Deleting resources'
            for dataset_name, res_id_list in \
                    all_resource_ids_to_delete.iteritems():
                if dataset_name in dataset_names_to_delete:
                    stats_write_dataset.add(
                        'Delete resources not needed as deleting dataset later',
                        dataset_name)
                    continue
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add(
                        'Delete res - dataset not found', dataset_name)
                    continue
                existing_resources = \
                    dict((r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res_id in res_id_list:
                    res_ref = '%s-%s' % (dataset_name, res_id_list.index(res_id))
                    existing_resource = existing_resources.get(res_id)
                    if existing_resource:
                        dataset_changed = True
                        dataset['resources'].remove(existing_resource)
                        stats_write_res.add(
                            'delete res - done' + write_caveat, res_ref)
                    else:
                        stats_write_res.add(
                            'delete res - could not find res id', res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add(
                        'Delete res done' + write_caveat, dataset_name)
                else:
                    stats_write_dataset.add(
                        'Delete res not needed', dataset_name)

            print 'Deleting datasets'
            for dataset_name in dataset_names_to_delete:
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add(
                        'Delete dataset - not found', dataset_name)
                else:
                    if write:
                        ckan.action.package_delete(id=dataset_name)
                    stats_write_dataset.add(
                        'Delete dataset - done' + write_caveat, dataset_name)

            print '\nResources\n', stats_write_res
            print '\nDatasets\n', stats_write_dataset
        else:
            print 'Not written changes to datasets'
    except:
        traceback.print_exc()
        import pdb; pdb.set_trace()
"""
Set the resource.format from the QA value for these resources:
* where the resource.format is empty (or just whitespace)
* where resource.format is a known poor/legacy one (see UPDATE_FORMATS below) and the QA value is most likely more accurate

This will trigger an archive/packagezip/qa cycle, so should not be used too
frequently.
"""
from sqlalchemy.exc import IntegrityError
from optparse import OptionParser

from ckan.logic import ValidationError
from ckanext.dgu.bin import common
from running_stats import Stats

res_stats = Stats()
ds_stats = Stats()

UPDATE_FORMATS = {
    'CSV / ZIP': 'CSV',  # legacy - we now drop mention of the zip
    'XML': set((
        'WFS',  # previously we set UKLP WFS resources as XML but we can detect WFS now
        'Atom Feed',
        'SHP',
        'WCS',
        'WMTS',
    )),
    'DBASE': 'SHP',
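
A minimal sketch (not part of the original script; the helper name and arguments are assumed) of the update rule the docstring above describes: take QA's detected format when the stored format is blank, or when the stored format is a known poor/legacy value whose UPDATE_FORMATS entry matches the QA value.

def should_update_format(current_format, qa_format):
    # rule 1: blank or whitespace-only format - trust QA if it detected anything
    if not (current_format or '').strip():
        return bool(qa_format)
    # rule 2: known poor/legacy format - only replace it with the mapped value(s)
    acceptable = UPDATE_FORMATS.get(current_format.strip())
    if acceptable is None:
        return False
    if isinstance(acceptable, set):
        return qa_format in acceptable
    return qa_format == acceptable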
Example 20
'''
Fix the themes

* map short forms to the long form, e.g. Crime -> Crime & Justice
* fix badly formatted values, e.g. {Crime}
* recategorize bad theme names
'''

import common
import json
from optparse import OptionParser

from running_stats import Stats

stats_primary = Stats()
stats_secondary = Stats()
stats_recategorize = Stats()

THEME_MAP = {
    u"Health": u"Health",
    u"Environment": u"Environment",
    u"Education": u"Education",
    u"Crime": u"Crime & Justice",
    u"Government": u"Government",
    u"Defence": u"Defence",
    u"Economy": u"Business & Economy",
    u"Transport": u"Transport",
    u"Spending": u"Government Spending",
    u"Society": u"Society",
    u"Mapping": u"Mapping",
    u"Towns": u"Towns & Cities",
Example 21
    def command(cls, config_ini, options, submissions_csv_filepath):

        # Incentive CSV. Columns:
        # applicationnumber, applicationdate, jobrole, laname,
        # officerauthorised, theme, responsedate, acceptancestatus,
        # odicertificateurl, dguurl, inventoryurl, localcodes, dataseturl,
        # schemaurl, guidanceurl, frequencyofpublishing, foinumberest,
        # submissioncomplete, lastlaupdate, techreviewstatus, lasttechupdate,
        # adminreviewstatus, paymentamount, closed, lastadminupdate,
        # applicantnotes, administrationnotes, technicalnotes, lastupdated
        with open(submissions_csv_filepath, 'rb') as f:
            csv = UnicodeCsvReader(f, encoding='iso-8859-1')
            header = csv.next()
            header = [col_name.strip().lower().replace(' ', '_') for col_name in header]
            Submission = namedtuple('Submission', header)
            submissions = [Submission(*row) for row in csv]

        if config_ini:
            # this is only for when running from the command-line
            #print 'Loading CKAN config...'
            common.load_config(config_ini)
            common.register_translator()
            #print '...done'

        from ckan import model
        from ckan.plugins import toolkit
        from ckanext.dgu.lib import helpers as dgu_helpers
        from ckanext.dgu.model.schema_codelist import Schema

        log = __import__('logging').getLogger(__name__)

        # Match the organizations in the submissions
        lga_orgs_by_dgu_org_name = {}
        accepted_submission_dgu_orgs = set()
        for submission in submissions:
            la_title = la_map.get(submission.laname, submission.laname)
            org = model.Session.query(model.Group) \
                       .filter_by(title=la_title) \
                       .first()
            assert org, 'Submission org title not found: %r' % la_title
            lga_orgs_by_dgu_org_name[org.name] = submission.laname
            if submission.acceptancestatus == 'Accepted':
                accepted_submission_dgu_orgs.add(org.name)

        stats = Stats()
        stats_incentive = Stats()
        results = []

        if options.write:
            rev = model.repo.new_revision()
            rev.author = 'script-%s.py' % __file__

        # Iterate over organizations
        if options.dataset:
            dataset = toolkit.get_action('package_show')(data_dict={'id': options.dataset})
            org_names = [dataset['organization']['name']]
        elif options.organization:
            org_names = [options.organization]
        elif options.incentive_only:
            org_names = sorted(accepted_submission_dgu_orgs)
        else:
            org_names = dgu_helpers.all_la_org_names()
        #print '%s organizations' % len(org_names)
        for org_name in org_names:
            org_title = model.Group.by_name(org_name).title
            lga_org = lga_orgs_by_dgu_org_name.get(org_name)

            # Iterate over the schemas
            if options.schema:
                schema = all_schemas_by_dgu_name[options.schema]
                if options.incentive_only and not schema.lga_name:
                    # not an incentive schema, so no results
                    schemas = []
                elif options.incentive_only:
                    schemas = [all_schemas_by_lga_name[submission.theme]
                               for submission in submissions
                               if submission.laname == lga_org
                               and submission.theme == schema.lga_name
                               and submission.acceptancestatus == 'Accepted']
                else:
                    schemas = [all_schemas_by_lga_name.get(
                               options.schema,
                               schema)]
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org
                           and submission.acceptancestatus == 'Accepted']
            else:
                schemas = all_schemas
            #print '%s schemas' % len(schemas)
            for schema in schemas:

                # Find the relevant incentive submission
                if lga_org:
                    for submission in submissions:
                        if submission.laname == lga_org and \
                                submission.theme == schema.lga_name:
                            break
                    else:
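                        # for/else: the loop finished without a break, so no
                        # submission matched this org and schema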
                        submission = None
                else:
                    submission = None

                result = dict(
                    org_name=org_name,
                    org_title=org_title,
                    org_name_lga=submission.laname if submission else '',
                    schema_dgu_title=schema.dgu_schema_name,
                    schema_lga=schema.lga_name,
                    lga_application_number=submission.applicationnumber if submission else '',
                    lga_application_acceptance_status=submission.acceptancestatus if submission else '',
                    dataset_names=[],
                    dataset_titles=[],
                    dataset_schema_applied=[],
                    )

                stat_id = '%s %s' % (org_name, schema.lga_name)
                if submission:
                    stat_id += ' %s' % submission.applicationnumber

                def add_datasets_to_results(datasets, result):
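                    # records each dataset in `result`; when --write is set and
                    # the schema is not yet applied, it also appends the schema
                    # id to the package's 'schema' extra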
                    for dataset in datasets:
                        if dataset['name'] not in result['dataset_names']:
                            result['dataset_names'].append(dataset['name'])
                            result['dataset_titles'].append(dataset['title'])
                            schema_applied = schema.dgu_schema_name in \
                                [s['title'] for s in dataset.get('schema', [])]
                            result['dataset_schema_applied'].append(schema_applied)
                            if not schema_applied and options.write:
                                pkg = model.Package.get(dataset['name'])
                                schema_obj = Schema.by_title(schema.dgu_schema_name)
                                assert schema_obj, schema.dgu_schema_name
                                try:
                                    schema_ids = json.loads(pkg.extras.get('schema') or '[]')
                                except ValueError:
                                    log.error('Not valid JSON in schema field: %s %r',
                                              dataset['name'], pkg.extras.get('schema'))
                                    schema_ids = []
                                schema_ids.append(schema_obj.id)
                                pkg.extras['schema'] = json.dumps(schema_ids)

                # Already a schema?
                data_dict = {'fq': 'publisher:%s ' % org_name +
                                   'schema_multi:"%s"' % schema.dgu_schema_name}
                datasets = toolkit.get_action('package_search')(data_dict=data_dict)
                if datasets['count'] > 0:
                    add_datasets_to_results(datasets['results'], result)
                    stats.add('OK - Dataset with schema',
                              stat_id + ' %s' % ';'.join(result['dataset_names']))
                    found_schema = True
                else:
                    found_schema = False

                # Submission specifies DGU dataset
                if submission and submission.dguurl:
                    match = re.match('http://data.gov.uk/dataset/(.*)', submission.dguurl)
                    if match:
                        dataset_name = dataset_name_original = match.groups()[0]
                        # some have trailing /
                        dataset_name = dataset_name.strip('/')
                        # hampshire have a hash appended
                        if '#' in dataset_name:
                            dataset_name = dataset_name.split('#')[0]
                        # poole have a resource name appended
                        if '/resource' in dataset_name:
                            dataset_name = dataset_name.split('/resource')[0]
                        # manual corrections
                        if dataset_name in dataset_name_corrections:
                            dataset_name = dataset_name_corrections[dataset_name]
                        dataset = model.Package.by_name(dataset_name)
                        # salford ones added a '1'
                        if not dataset:
                            dataset = model.Package.by_name(dataset_name + '1')
                            if dataset:
                                dataset_name += '1'

                        if dataset and dataset.state == 'active':
                            dataset_dict = toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                            add_datasets_to_results([dataset_dict], result)
                            if dataset_name != dataset_name_original:
                                stats_incentive.add('OK - DGU Dataset listed and with corrections it checks out',
                                          stat_id + ' %s' % dataset_name)
                            else:
                                stats_incentive.add('OK - DGU Dataset listed and it checks out',
                                          stat_id + ' %s' % dataset_name)
                        elif dataset:
                            stats_incentive.add('ERROR - DGU Dataset listed BUT it is deleted!',
                                            '%s %s' % (stat_id, submission.dguurl))
                        else:
                            stats_incentive.add('ERROR - DGU Dataset listed BUT it is not found',
                                            '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add('ERROR - DGU Dataset listed BUT the URL is not the correct format',
                                        '%s %s' % (stat_id, submission.dguurl))

                # Submission mentions dataset on LA site - maybe it is in DGU already?
                elif submission and submission.dataseturl:
                    datasets = model.Session.query(model.Package) \
                                    .join(model.ResourceGroup) \
                                    .join(model.Resource) \
                                    .filter(model.Resource.url==submission.dataseturl) \
                                    .filter(model.Package.state=='active') \
                                    .filter(model.Resource.state=='active') \
                                    .all()
                    dataset_dicts = [
                        toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                        for dataset in datasets]
                    add_datasets_to_results(dataset_dicts, result)
                    if len(datasets) > 1:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                                            '%s %s' % (stat_id, datasets[0].name))
                    elif len(datasets) == 0:
                        stats_incentive.add('No DGU Dataset and Dataset URL not found on DGU',
                                            stat_id)
                    else:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches DGU dataset',
                                            '%s %s' % (stat_id, datasets[0].name))

                # Search for datasets in the catalogue
                datasets = cls.find_dataset_for_schema(schema=schema, org_name=org_name)
                if datasets is None:
                    if not found_schema:
                        stats.add('Search revealed none', stat_id)
                elif len(datasets) > 1:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found datasets (multiple) in search', '%s %r' % (stat_id, [d['name'] for d in datasets]))
                elif datasets:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found dataset in search', '%s %s' % (stat_id, datasets[0]['name']))
                else:
                    if not found_schema:
                        stats.add('No dataset for submission', stat_id)

                results.append(result)

        rows_with_datasets_count = \
            len([result for result in results
                 if any(result['dataset_schema_applied'])])
        rows_with_datasets_or_candidate_datasets_count = \
            len([result for result in results
                 if result['dataset_schema_applied']])

        if options.print_:
            print '\n Incentive stats\n' + stats_incentive.report()
            print '\n Overall stats\n' + stats.report()

        if options.write:
            print 'Writing'
            model.Session.commit()

        return {'table': results,
                'rows_with_datasets_count': rows_with_datasets_count,
                'rows_with_datasets_or_candidate_datasets_count': rows_with_datasets_or_candidate_datasets_count}
Example 22
    def command(cls, config_ini, dataset_names, options):
        common.load_config(config_ini)
        common.register_translator()

        from pylons import config
        apikey = config['dgu.merge_datasets.apikey']
        ckan = ckanapi.RemoteCKAN('https://data.gov.uk', apikey=apikey)
        #ckan = ckanapi.LocalCKAN()

        if options.publisher:
            org_name = common.name_stripped_of_url(options.publisher)
            if options.search:
                results = ckan.action.package_search(q=options.search, fq='publisher:%s' % org_name, rows=100)
                dataset_names.extend([dataset['name']
                                      for dataset in results['results']])
            else:
                org = ckan.action.organization_show(id=org_name,
                                                    include_datasets=True)
                dataset_names.extend([d['name'] for d in org['packages']])


        datasets = []
        datasets_by_name = {}

        def get_extra(dataset, key):
            for extra in dataset['extras']:
                if extra['key'] == key:
                    return extra['value']
        for dataset_name in dataset_names:
            print 'Dataset: %s' % dataset_name
        for dataset_name in dataset_names:
            # strip off the url part of the dataset name, if there is one
            dataset_name = common.name_stripped_of_url(dataset_name)
            dataset = ckan.action.package_show(id=dataset_name)
            harvest_source_ref = get_extra(dataset, 'harvest_source_reference')
            if harvest_source_ref:
                print '** Discarding dataset %s due to harvest source: %s **' \
                    % (dataset_name, harvest_source_ref)
                continue
            datasets.append(dataset)
            datasets_by_name[dataset['name']] = dataset
        datasets.sort(key=lambda x: x['metadata_modified'])

        # aggregate resources
        def resource_identity(res_dict, dataset_name):
            return (res_dict.get('date'), res_dict['url'],
                    res_dict.get('title') or res_dict['description'],
                    res_dict.get('format'),
                    dataset_name)
        combined_resources = {}  # identity
        res_stats = Stats()
        for dataset in datasets:
            for resource in dataset['resources']:
                identity = resource_identity(resource, dataset['name'])
                resource['dataset_name'] = dataset['name']
                if identity in combined_resources:
                    print res_stats.add('Discarding duplicate', '\n%s duplicate of \n%s' % (resource, combined_resources[identity]))
                else:
                    combined_resources[identity] = resource
        resources = combined_resources.values()

        # find dates for resources
        # NB This has been pulled out into timeseries_convert.py -
        # TODO call that instead of having the code here too.
        if options.frequency:
            url_munge_re = re.compile('(%20|-|_|\.)')

            def fields_to_hunt_for_date(res):
                date = res.get('date')
                if date:
                    yield 'date', date
                title = res.get('title')
                if title:
                    yield 'title', title
                yield 'description', res['description']
                yield 'url', url_munge_re.sub(' ', res['url'])
                if not options.update:
                    dataset = datasets_by_name[res['dataset_name']]
                    yield 'dataset-title', dataset['title']
                    yield 'dataset-notes', dataset['notes']

            ensure_regexes_are_initialized()
            global regexes
            for resource in resources:
                for field_name, field_value in fields_to_hunt_for_date(resource):
                    if options.frequency in ('monthly', 'quarterly', 'twice annually'):
                        month, year = hunt_for_month_and_year(field_value)
                        if year and month:
                            resource['date'] = '%02d/%s' % (month, year)
                            res_stats.add('Found date in %s' % field_name,
                                          '%s %r' %
                                          (resource['date'], resource))
                            if resource.get('resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource', resource)
                            break
                    elif options.frequency == 'annually':
                        year = regexes['year'].search(field_value)
                        if year:
                            resource['date'] = year.groups()[0]
                            res_stats.add('Found date in %s' % field_name,
                                          '%s %r' %
                                          (resource['date'], resource))
                            if resource.get('resource_type') == 'documentation':
                                resource['resource_type'] = 'file'
                                res_stats.add('Converted additional resource', resource)
                            break
                else:
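                    # for/else: no date was found in any of the hunted fields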
                    if resource.get('resource_type') == 'documentation':
                        print res_stats.add('Could not find date but it\'s Additional Resource', resource)
                        continue
                    print res_stats.add('Could not find date', resource)
                    continue

            print 'Resources: \n', res_stats

            resources_without_date = [res for res in resources
                                      if not res.get('date') and
                                      res.get('resource_type') != 'documentation']
            for i, res in enumerate(resources_without_date):
                print 'Resources without dates %s/%s' % (i+1, len(resources_without_date))
                for field_name, field_value in fields_to_hunt_for_date(res):
                    print '  %s: %s' % (field_name, field_value.encode('latin-1', 'ignore'))
                print 'https://data.gov.uk/dataset/%s/resource/%s' % (res['dataset_name'], res['id'])
                date_format = {'annually': 'YYYY',
                               'monthly': 'MM/YYYY',
                               'twice annually': 'MM/YYYY',
                               'quarterly': 'MM/YYYY'}
                input_ = raw_input('Date (%s) or DOCS to make it an Additional Resource: ' %
                                   date_format[options.frequency])
                if input_.strip().lower() == 'docs':
                    res['date'] = ''
                    res['resource_type'] = 'documentation'
                else:
                    res['date'] = input_

            resources.sort(key=lambda x: x.get('date', '').split('/')[::-1])

        # Ensure there is not a mixture of resources with and without a date
        have_dates = None
        for res in resources:
            if res.get('resource_type') == 'documentation':
                continue
            if have_dates is None:
                have_dates = bool(res.get('date'))
            else:
                has_date = bool(res.get('date'))
                if has_date != have_dates:
                    print [res.get('date') for res in resources]
                    print 'Cannot mix resources with dates and others without!'
                    import pdb
                    pdb.set_trace()
https://github.com/datagovuk/ckanext-dgu/issues/348
"""
import ast
from optparse import OptionParser
import copy
import re

from sqlalchemy.exc import IntegrityError

from ckan.logic import ValidationError
from ckanext.dgu.bin import common
from ckanext.dgu.lib import helpers as dgu_helpers
from running_stats import Stats

stats = Stats()
ckan_license_ids = None


mangled_unicode_re = re.compile(r'\\u\d{4,6}')


class LicenceTidy(object):

    def __init__(self, config_or_url):
        self.ckan = common.get_ckanapi(config_or_url)

    def run(self, options):
        """ Iterate over datasets and tidy """

        if not options.write:
def main(source,
         source_type,
         destination,
         save_relevant_datasets_json,
         write,
         dataset_filter=None,
         res_url_filter=None):

    if source_type == 'json':
        all_datasets = get_datasets_from_json(source)
    elif source_type == 'jsonl':
        all_datasets = get_datasets_from_jsonl(source)
    else:
        all_datasets = get_datasets_from_ckan(source)

    datasets = []  # legacy ones
    revamped_datasets = []  # ones created on 3rd October 2016 launch
    revamped_datasets_by_org = {}
    revamped_resources = {}
    csv_out_rows = []
    csv_corrected_rows = []
    try:
        # find all the legacy organogram datasets
        all_datasets = list(all_datasets)  # since we need to iterate it twice
        for dataset in all_datasets:

            if dataset_filter and dataset['name'] != dataset_filter:
                continue
            if res_url_filter and \
                res_url_filter not in [r['url'] for r in dataset['resources']]:
                continue

            # check it is an organogram dataset
            dataset_str = repr(dataset).lower()
            if 'rganog' not in dataset_str \
                    and 'roles and salaries' not in dataset_str \
                    and 'pay and post' not in dataset_str \
                    and 'posts and pay' not in dataset_str \
                    and 'organisation chart' not in dataset_str \
                    and 'organization chart' not in dataset_str \
                    and 'org chart' not in dataset_str:
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if dataset['name'] in (
                    'eastbourne-borough-council-public-toilets',
                    'staff-organograms-and-pay-government-offices',
                    ) \
                    or dataset['id'] in (
                        '47f69ebb-9939-419f-880d-1b976676cb0e',
                    ):
                stats_datasets.add('Ignored - not organograms',
                                   dataset['name'])
                continue
            if asbool(dataset.get('unpublished')):
                stats_datasets.add('Ignored - unpublished', dataset['name'])
                continue
            extras = dict(
                (extra['key'], extra['value']) for extra in dataset['extras'])
            if extras.get('import_source') == 'organograms_v2':
                continue
            if extras.get('import_source') == 'harvest':
                stats_datasets.add('Ignored - harvested so can\'t edit it',
                                   dataset['name'])
                continue

            # legacy dataset
            datasets.append(dataset)

        # find the revamped organogram datasets
        for dataset in all_datasets:
            extras = dict(
                (extra['key'], extra['value']) for extra in dataset['extras'])
            if extras.get('import_source') != 'organograms_v2':
                continue

            org_id = dataset['owner_org']
            revamped_datasets.append(dataset)
            assert org_id not in revamped_datasets_by_org, org_id
            revamped_datasets_by_org[org_id] = dataset
            for res in dataset['resources']:
                date = date_to_year_month(res['date'])
                revamped_resources[(org_id, date)] = res
            continue

        if save_relevant_datasets_json:
            filename = 'datasets_organograms.json'
            if not (dataset_filter or res_url_filter):
                output = json.dumps(
                    datasets + revamped_datasets,
                    indent=4,
                    separators=(',', ': '),  # pretty-print
                )
                with open(filename, 'wb') as f:
                    f.write(output)
                print 'Written %s' % filename
            else:
                print 'Not written %s because you filtered by a ' \
                    'dataset/resource' % filename

        all_resource_ids_to_delete = defaultdict(
            list)  # dataset_name: res_id_list
        dataset_names_to_delete = set()
        for dataset in datasets:
            org_id = dataset['owner_org']

            # save csv as it has been
            save_csv_rows(csv_out_rows, dataset, None, None)

            original_dataset = copy.deepcopy(dataset)
            delete_dataset = False

            dataset_to_merge_to = \
                get_dataset_to_merge_to(dataset, revamped_datasets_by_org)

            # detect dates
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                stats = timeseries_convert.add_date_to_resource(
                    res, dataset=dataset)

            # resource corrections
            resources_to_delete = []
            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                resource_corrections(res, dataset, extras, revamped_resources,
                                     revamped_datasets_by_org,
                                     dataset_to_merge_to, org_id,
                                     resources_to_delete, stats_res)
            for res in resources_to_delete:
                dataset['resources'].remove(res)
            if not dataset['resources']:
                delete_dataset = True
            elif resources_to_delete and not dataset_to_merge_to:
                all_resource_ids_to_delete[dataset['name']].extend(
                    res['id'] for res in resources_to_delete)
            org_id = dataset['owner_org']  # it might have changed

            for res in dataset['resources']:
                if res_url_filter and res['url'] != res_url_filter:
                    continue
                if res.get('resource_type') != 'documentation' and not res.get(
                        'date'):
                    stats_dates.add('Missing date', dataset['name'])
                    break
            else:
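                # for/else: no non-documentation resource was missing a date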
                stats_dates.add('Ok dates', dataset['name'])

            # record changes
            if delete_dataset:
                stats_datasets.add('Delete dataset - no resources',
                                   dataset['name'])
                dataset_names_to_delete.add(dataset['name'])
                continue
            elif original_dataset != dataset:
                stats_datasets.add('Updated dataset', dataset['name'])
                has_changed = True
            else:
                stats_datasets.add('Unchanged dataset', dataset['name'])
                has_changed = False

            if dataset_to_merge_to:
                stats_merge.add('Merge', dataset_to_merge_to)
            else:
                stats_merge.add('No merge', dataset['name'])

            # save csv with corrections
            save_csv_rows(csv_corrected_rows, dataset, has_changed,
                          dataset_to_merge_to)

    except:
        traceback.print_exc()
        import pdb
        pdb.set_trace()

    stats_merge.report_value_limit = 500
    stats_res.report_value_limit = 500
    print '\nDatasets\n', stats_datasets
    print '\nDataset merges\n', stats_merge
    print '\nDates\n', stats_dates
    print '\nResources\n', stats_res

    # save csvs
    if dataset_filter or res_url_filter:
        for row in csv_corrected_rows:
            if res_url_filter and row['res_url'] != res_url_filter:
                continue
            pprint(row)
        print 'Not written csv because you specified a particular dataset'
    else:
        headers = [
            'name',
            'org_title',
            'org_id',
            'notes',
            'res_id',
            'res_name',
            'res_url',
            'res_format',
            'res_date',
            'res_type',
            'has_changed',
            'merge_to_dataset',
        ]
        for csv_rows, out_filename in (
            (csv_out_rows, 'organogram_legacy_datasets.csv'),
            (csv_corrected_rows, 'organogram_legacy_datasets_corrected.csv'),
        ):
            with open(out_filename, 'wb') as csv_write_file:
                csv_writer = unicodecsv.DictWriter(csv_write_file,
                                                   fieldnames=headers,
                                                   encoding='utf-8')
                csv_writer.writeheader()
                for row in sorted(csv_rows, key=lambda r: r['res_url']):
                    csv_writer.writerow(row)
            print 'Written', out_filename

    # group merges by the revamped_dataset
    resources_to_merge = defaultdict(
        list)  # revamped_dataset_name: resource_list
    resources_to_update = defaultdict(list)  # dataset_name: resource_list
    for row in csv_corrected_rows:
        if row['has_changed'] is False:
            continue
        res = dict(
            id=row['res_id'],
            description=row['res_name'],  # description is required
            url=row['res_url'],
            format=row['res_format'],
            date=row['res_date'],
            resource_type=row['res_type'])
        if row['merge_to_dataset']:
            res['id'] = None  # ignore the id
            resources_to_merge[row['merge_to_dataset']].append(res)
            # also delete the merged dataset
            if row['name'] not in dataset_names_to_delete:
                dataset_names_to_delete.add(row['name'])
        else:
            resources_to_update[row['name']].append(res)

    # write changes - merges etc
    try:
        if destination:
            if write:
                write_caveat = ''
            else:
                write_caveat = ' (NOP without --write)'
            print 'Writing changes to datasets' + write_caveat
            stats_write_res = Stats()
            stats_write_dataset = Stats()
            ckan = common.get_ckanapi(destination)
            import ckanapi

            print 'Updating datasets'
            for dataset_name, res_list in resources_to_update.iteritems():
                dataset = ckan.action.package_show(id=dataset_name)
                resources_by_id = dict(
                    (r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (dataset_name, res_list.index(res))
                    res_to_update = resources_by_id.get(res['id'])
                    if res_to_update:
                        res_changed = False
                        for key in res.keys():
                            if res[key] != res_to_update.get(key):
                                res_to_update[key] = res[key]
                                dataset_changed = True
                                res_changed = True
                        if res_changed:
                            stats_write_res.add('update - ok' + write_caveat,
                                                res_ref)
                        else:
                            stats_write_res.add('update - not needed', res_ref)
                    else:
                        stats_write_res.add(
                            'update - could not find resource id',
                            dataset_name)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Update done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Update not needed', dataset_name)

            print 'Merging datasets'
            for revamped_dataset_name, res_list in \
                    resources_to_merge.iteritems():
                try:
                    dataset = ckan.action.package_show(
                        id=revamped_dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Merge - dataset not found',
                                            revamped_dataset_name)
                    continue
                existing_res_urls = set(r['url'] for r in dataset['resources'])
                dataset_changed = False
                for res in res_list:
                    res_ref = '%s-%s' % (revamped_dataset_name,
                                         res_list.index(res))
                    if res['url'] in existing_res_urls:
                        stats_write_res.add(
                            'merge - no change - resource URL already there',
                            res_ref)
                    else:
                        dataset_changed = True
                        res['description'] += ' (from legacy dataset)'
                        dataset['resources'].append(res)
                        stats_write_res.add('merge - add' + write_caveat,
                                            res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Merge done' + write_caveat,
                                            revamped_dataset_name)
                else:
                    stats_write_dataset.add('Merge not needed',
                                            revamped_dataset_name)

            print 'Deleting resources'
            for dataset_name, res_id_list in \
                    all_resource_ids_to_delete.iteritems():
                if dataset_name in dataset_names_to_delete:
                    stats_write_dataset.add(
                        'Delete resources not needed as deleting dataset later',
                        dataset_name)
                    continue
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete res - dataset not found',
                                            dataset_name)
                    continue
                existing_resources = \
                    dict((r['id'], r) for r in dataset['resources'])
                dataset_changed = False
                for res_id in res_id_list:
                    res_ref = '%s-%s' % (dataset_name,
                                         res_id_list.index(res_id))
                    existing_resource = existing_resources.get(res_id)
                    if existing_resource:
                        dataset_changed = True
                        dataset['resources'].remove(existing_resource)
                        stats_write_res.add('delete res - done' + write_caveat,
                                            res_ref)
                    else:
                        stats_write_res.add(
                            'delete res - could not find res id', res_ref)
                if dataset_changed:
                    if write:
                        ckan.action.package_update(**dataset)
                    stats_write_dataset.add('Delete res done' + write_caveat,
                                            dataset_name)
                else:
                    stats_write_dataset.add('Delete res not needed',
                                            dataset_name)

            print 'Deleting datasets'
            for dataset_name in dataset_names_to_delete:
                try:
                    dataset = ckan.action.package_show(id=dataset_name)
                except ckanapi.NotFound:
                    stats_write_dataset.add('Delete dataset - not found',
                                            dataset_name)
                else:
                    if write:
                        ckan.action.package_delete(id=dataset_name)
                    stats_write_dataset.add(
                        'Delete dataset - done' + write_caveat, dataset_name)

            print '\nResources\n', stats_write_res
            print '\nDatasets\n', stats_write_dataset
        else:
            print 'Not written changes to datasets'
    except:
        traceback.print_exc()
        import pdb
        pdb.set_trace()
import traceback
import sys
from collections import OrderedDict
import unicodecsv

# see install instructions above
from pymongo import MongoClient
from pymongo.errors import OperationFailure
from sqlalchemy import distinct
from sqlalchemy import func

from running_stats import Stats
import common

args = None
stats = Stats()

EVENT_TYPES = [
    'create-account',
    'login',
    'publish-form-new-submit-success',
    'dataset-created',
    'publish-form-edit-submit-success',
    'dataset-updated',
]

QUARTER_PROJECTION = {
    '$concat': [
        {
            '$substr': [{
                '$year': '$date'
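
The QUARTER_PROJECTION above is cut off in this listing. As an illustration only (client-side Python rather than the Mongo aggregation expression, and assuming a 'YYYY-Qn' label, which the excerpt does not confirm), an equivalent quarter label could be derived like this:

def quarter_label(date):
    # e.g. datetime(2016, 10, 3) -> '2016-Q4'
    return '%d-Q%d' % (date.year, (date.month - 1) // 3 + 1)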
Example 26
'''
Generates the location data for the data.gov.uk alpha.

'''
import argparse
import json
import traceback
import csv
from pprint import pprint

import requests
import requests_cache

from running_stats import Stats

stats_types = Stats()
stats = Stats()
args = None
max_pk = None

one_day = 60 * 60 * 24
requests_cache.install_cache('.drupal_dump', expire_after=one_day)

# Is there a register? Probably easier to hard-code
countries_of_the_uk = [
    ('England', ),  # leave room for geo-data
    ('Wales', ),
    ('Scotland', ),
    ('Northern Ireland', ),
]
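
The tuples above deliberately leave room for geo-data to be attached alongside each name. A minimal sketch (assumed record shape, hypothetical helper; not the script's actual output) of flattening them into location records:

def location_rows(names_with_geo, location_type):
    for row in names_with_geo:
        yield {
            'name': row[0],
            'type': location_type,
            'geo': row[1] if len(row) > 1 else None,  # filled in later
        }

# e.g. list(location_rows(countries_of_the_uk, 'country'))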
regions_of_the_uk = [