def command(cls, config_ini, org_names):
        common.load_config(config_ini)
        common.register_translator()
        from ckan.plugins import toolkit
        from ckan import model
        orgs = [toolkit.get_action('organization_show')(
                data_dict={'id': org_name})
                for org_name in org_names]
        source_org, dest_org = orgs
        assert source_org
        assert dest_org
        search_results = toolkit.get_action('package_search')(
            data_dict=dict(fq='publisher:%s' % source_org['name'], rows=1000))
        print 'Datasets: %s' % search_results['count']
        stats = Stats()
        if len(search_results['results']) != search_results['count']:
            assert 0, 'need to implement paging'

        #context = {
        #    'user': get_script_user(__name__)['name'],
        #    'ignore_auth': True,
        #    'model': model}
        rev = model.repo.new_revision()
        rev.author = 'script-%s.py' % __file__
        for dataset in search_results['results']:
            model.Package.get(dataset['id']).owner_org = dest_org['id']
            #dataset_ = toolkit.get_action('package_patch')(
            #    context=context,
            #    data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
            print stats.add('Changed owner_org', dataset['name'])
        print stats.report()
        print 'Writing'
        model.Session.commit()
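
The commented-out lines above sketch the action-layer alternative to setting owner_org on the model directly: package_patch validates the change and handles the revision and commit itself, so the manual new_revision and Session.commit should not be needed. A minimal sketch of that variant, assuming the codebase's get_script_user helper referenced in the comments:

    context = {
        'user': get_script_user(__name__)['name'],
        'ignore_auth': True,
        'model': model}
    for dataset in search_results['results']:
        toolkit.get_action('package_patch')(
            context=context,
            data_dict=dict(id=dataset['id'], owner_org=dest_org['id']))
        print stats.add('Changed owner_org', dataset['name'])

The rows=1000 cap and the 'need to implement paging' assert can also be avoided by paging package_search with its start and rows parameters, e.g.:

    results = []
    start = 0
    while True:
        page = toolkit.get_action('package_search')(data_dict=dict(
            fq='publisher:%s' % source_org['name'], start=start, rows=100))
        results.extend(page['results'])
        start += 100
        if start >= page['count']:
            break

The fragment that follows is the tail end of set_missing_resource_formats.py, named in its usage string below; it applies the same write-flag dry-run pattern.
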
                if options.write:
                    try:
                        self.ckan.action.package_update(**pkg)
                        ds_stats.add('Dataset updated', pkg['name'])
                    except ValidationError, ve:
                        print ds_stats.add('Validation error on update',
                                           pkg['name'])
                        print ve
                    except IntegrityError:
                        print ds_stats.add('Integrity error on update',
                                           pkg['name'])
                else:
                    ds_stats.add('Dataset would be updated', pkg['name'])
            else:
                ds_stats.add('No change', pkg['name'])
        print '\nResources:\n', res_stats.report(show_time_taken=True)
        print '\nDatasets:\n', ds_stats.report(show_time_taken=True)


usage = __doc__ + '''
Usage:
    python set_missing_resource_formats.py <CKAN config.ini or URL> [-d DATASET_NAME] [-o ORGANISATION_NAME] -w'''

if __name__ == '__main__':
    parser = OptionParser(usage=usage)
    parser.add_option('-d', '--dataset', dest='dataset')
    parser.add_option('-o', '--organization', dest='organization')
    parser.add_option("-w", "--write",
                      action="store_true",
                      dest="write",
                      default=False,
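
With action="store_true" and default=False, the -w/--write flag is the dry-run guard used throughout these scripts: unless it is passed, the loop above only records 'Dataset would be updated' rather than calling package_update. A self-contained illustration of the optparse pattern (the names here are illustrative, not this script's):

    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-w', '--write', action='store_true', dest='write',
                      default=False)
    (options, args) = parser.parse_args(['-w'])
    print options.write   # True; omit '-w' and it stays False
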
                if options.write:
                    try:
                        self.ckan.action.package_update(**dataset)
                        print stats.add('Dataset updated', dataset['name'])
                    except ValidationError, ve:
                        print stats.add('Validation error on update',
                                        dataset['name'])
                        print ve
                    except IntegrityError:
                        print stats.add('Integrity error on update',
                                        dataset['name'])
                else:
                    stats.add('Dataset would be updated', dataset['name'])
            else:
                stats.add('No change', dataset['name'])
        print '\nDatasets:\n', stats.report(show_time_taken=True)

    def get_tidied_dataset(self, dataset):
        is_dataset_updated = False
        license_id = dataset['license_id'] or ''
        extras = dict((extra['key'], extra['value'])
                      for extra in dataset['extras'])
        licence = extras.get('licence') or ''

        if licence:
            # INSPIRE datasets are a python list repr'd
            # ast.literal_eval() is safer than eval()
            try:
                licence_bits = ast.literal_eval(licence) or []
                is_dataset_updated = True
            except (ValueError, SyntaxError):
                # assumption: a value that is not a repr'd list is a single
                # plain-text licence
                licence_bits = [licence]
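
As the comment in get_tidied_dataset notes, some INSPIRE-harvested datasets store the licence extra as the repr of a Python list, and ast.literal_eval parses such literals without executing code, which eval would. A small demonstration with a made-up licence value:

    import ast

    licence = "['Open Government Licence', 'http://www.nationalarchives.gov.uk/doc/open-government-licence/']"
    print ast.literal_eval(licence)  # a real two-element list of strings
    try:
        ast.literal_eval('__import__("os").system("true")')
    except ValueError:
        print 'rejected: not a literal'  # eval() would have run the command
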
Example #6
    def command(cls, config_ini, options, submissions_csv_filepath):

        # Incentive CSV. Columns:
        # applicationnumber, applicationdate, jobrole, laname,
        # officerauthorised, theme, responsedate, acceptancestatus,
        # odicertificateurl, dguurl, inventoryurl, localcodes, dataseturl,
        # schemaurl, guidanceurl, frequencyofpublishing, foinumberest,
        # submissioncomplete, lastlaupdate, techreviewstatus, lasttechupdate,
        # adminreviewstatus, paymentamount, closed, lastadminupdate,
        # applicantnotes, administrationnotes, technicalnotes, lastupdated
        with open(submissions_csv_filepath, 'rb') as f:
            csv = UnicodeCsvReader(f, encoding='iso-8859-1')
            header = csv.next()
            header = [col_name.strip().lower().replace(' ', '_') for col_name in header]
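            # the normalised header cells become the namedtuple field names
            # below, so each row can be read as submission.laname,
            # submission.theme, submission.acceptancestatus, etc.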
            Submission = namedtuple('Submission', header)
            submissions = [Submission(*row) for row in csv]

        if config_ini:
            # this is only for when running from the command-line
            #print 'Loading CKAN config...'
            common.load_config(config_ini)
            common.register_translator()
            #print '...done'

        from ckan import model
        from ckan.plugins import toolkit
        from ckanext.dgu.lib import helpers as dgu_helpers
        from ckanext.dgu.model.schema_codelist import Schema

        log = __import__('logging').getLogger(__name__)

        # Match the organizations in the submissions
        lga_orgs_by_dgu_org_name = {}
        accepted_submission_dgu_orgs = set()
        for submission in submissions:
            la_title = la_map.get(submission.laname, submission.laname)
            org = model.Session.query(model.Group) \
                       .filter_by(title=la_title) \
                       .first()
            assert org, 'Submission org title not found: %r' % la_title
            lga_orgs_by_dgu_org_name[org.name] = submission.laname
            if submission.acceptancestatus == 'Accepted':
                accepted_submission_dgu_orgs.add(org.name)

        stats = Stats()
        stats_incentive = Stats()
        results = []

        if options.write:
            rev = model.repo.new_revision()
            rev.author = 'script-%s.py' % __file__

        # Iterate over organizations
        if options.dataset:
            dataset = toolkit.get_action('package_show')(data_dict={'id': options.dataset})
            org_names = [dataset['organization']['name']]
        elif options.organization:
            org_names = [options.organization]
        elif options.incentive_only:
            org_names = sorted(accepted_submission_dgu_orgs)
        else:
            org_names = dgu_helpers.all_la_org_names()
        #print '%s organizations' % len(org_names)
        for org_name in org_names:
            org_title = model.Group.by_name(org_name).title
            lga_org = lga_orgs_by_dgu_org_name.get(org_name)

            # Iterate over the schemas
            if options.schema:
                schema = all_schemas_by_dgu_name[options.schema]
                if options.incentive_only and not schema.lga_name:
                    # not an incentive schema, so no results
                    schemas = []
                elif options.incentive_only:
                    schemas = [all_schemas_by_lga_name[submission.theme]
                               for submission in submissions
                               if submission.laname == lga_org
                               and submission.theme == schema.lga_name
                               and submission.acceptancestatus == 'Accepted']
                else:
                    schemas = [all_schemas_by_lga_name.get(
                               options.schema,
                               schema)]
            elif options.incentive_only:
                schemas = [all_schemas_by_lga_name[submission.theme]
                           for submission in submissions
                           if submission.laname == lga_org
                           and submission.acceptancestatus == 'Accepted']
            else:
                schemas = all_schemas
            #print '%s schemas' % len(schemas)
            for schema in schemas:

                # Find the relevant incentive submission
                if lga_org:
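                    # for/else: the else clause runs only when the loop
                    # finishes without break, i.e. when no submission matches
                    # this organization and schema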
                    for submission in submissions:
                        if submission.laname == lga_org and \
                                submission.theme == schema.lga_name:
                            break
                    else:
                        submission = None
                else:
                    submission = None

                result = dict(
                    org_name=org_name,
                    org_title=org_title,
                    org_name_lga=submission.laname if submission else '',
                    schema_dgu_title=schema.dgu_schema_name,
                    schema_lga=schema.lga_name,
                    lga_application_number=submission.applicationnumber if submission else '',
                    lga_application_acceptance_status=submission.acceptancestatus if submission else '',
                    dataset_names=[],
                    dataset_titles=[],
                    dataset_schema_applied=[],
                    )

                stat_id = '%s %s' % (org_name, schema.lga_name)
                if submission:
                    stat_id += ' %s' % submission.applicationnumber

                def add_datasets_to_results(datasets, result):
                    for dataset in datasets:
                        if dataset['name'] not in result['dataset_names']:
                            result['dataset_names'].append(dataset['name'])
                            result['dataset_titles'].append(dataset['title'])
                            schema_applied = schema.dgu_schema_name in \
                                [s['title'] for s in dataset.get('schema', [])]
                            result['dataset_schema_applied'].append(schema_applied)
                            if not schema_applied and options.write:
                                pkg = model.Package.get(dataset['name'])
                                schema_obj = Schema.by_title(schema.dgu_schema_name)
                                assert schema_obj, schema.dgu_schema_name
                                try:
                                    schema_ids = json.loads(pkg.extras.get('schema') or '[]')
                                except ValueError:
                                    log.error('Not valid JSON in schema field: %s %r',
                                              dataset['name'], pkg.extras.get('schema'))
                                    schema_ids = []
                                schema_ids.append(schema_obj.id)
                                pkg.extras['schema'] = json.dumps(schema_ids)

                # Already a schema?
                data_dict = {'fq': 'publisher:%s ' % org_name +
                                   'schema_multi:"%s"' % schema.dgu_schema_name}
                datasets = toolkit.get_action('package_search')(data_dict=data_dict)
                if datasets['count'] > 0:
                    add_datasets_to_results(datasets['results'], result)
                    stats.add('OK - Dataset with schema',
                              stat_id + ' %s' % ';'.join(result['dataset_names']))
                    found_schema = True
                else:
                    found_schema = False

                # Submission specifies DGU dataset
                if submission and submission.dguurl:
                    match = re.match('http://data.gov.uk/dataset/(.*)', submission.dguurl)
                    if match:
                        dataset_name = dataset_name_original = match.groups()[0]
                        # some have trailing /
                        dataset_name = dataset_name.strip('/')
                        # hampshire have a hash appended
                        if '#' in dataset_name:
                            dataset_name = dataset_name.split('#')[0]
                        # poole have a resource name appended
                        if '/resource' in dataset_name:
                            dataset_name = dataset_name.split('/resource')[0]
                        # manual corrections
                        if dataset_name in dataset_name_corrections:
                            dataset_name = dataset_name_corrections[dataset_name]
                        dataset = model.Package.by_name(dataset_name)
                        # salford ones added a '1'
                        if not dataset:
                            dataset = model.Package.by_name(dataset_name + '1')
                            if dataset:
                                dataset_name += '1'

                        if dataset and dataset.state == 'active':
                            dataset_dict = toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                            add_datasets_to_results([dataset_dict], result)
                            if dataset_name != dataset_name_original:
                                stats_incentive.add(
                                    'OK - DGU Dataset listed and with corrections it checks out',
                                    stat_id + ' %s' % dataset_name)
                            else:
                                stats_incentive.add(
                                    'OK - DGU Dataset listed and it checks out',
                                    stat_id + ' %s' % dataset_name)
                        elif dataset:
                            stats_incentive.add(
                                'ERROR - DGU Dataset listed BUT it is deleted!',
                                '%s %s' % (stat_id, submission.dguurl))
                        else:
                            stats_incentive.add(
                                'ERROR - DGU Dataset listed BUT it is not found',
                                '%s %s' % (stat_id, submission.dguurl))
                    else:
                        stats_incentive.add(
                            'ERROR - DGU Dataset listed BUT the URL is not the correct format',
                            '%s %s' % (stat_id, submission.dguurl))

                # Submission mentions dataset on LA site - maybe it is in DGU already?
                elif submission and submission.dataseturl:
                    datasets = model.Session.query(model.Package) \
                                    .join(model.ResourceGroup) \
                                    .join(model.Resource) \
                                    .filter(model.Resource.url==submission.dataseturl) \
                                    .filter(model.Package.state=='active') \
                                    .filter(model.Resource.state=='active') \
                                    .all()
                    dataset_dicts = [
                        toolkit.get_action('package_show')(data_dict={'id': dataset.id})
                        for dataset in datasets]
                    add_datasets_to_results(dataset_dicts, result)
                    if len(datasets) > 1:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches multiple DGU datasets',
                                            '%s %r' % (stat_id, [d.name for d in datasets]))
                    elif len(datasets) == 0:
                        stats_incentive.add('No DGU Dataset and Dataset URL not found on DGU',
                                            stat_id)
                    else:
                        stats_incentive.add('No DGU Dataset, but Dataset URL matches DGU dataset',
                                            '%s %s' % (stat_id, datasets[0].name))

                # Search for datasets in the catalogue
                datasets = cls.find_dataset_for_schema(schema=schema, org_name=org_name)
                if datasets is None:
                    if not found_schema:
                        stats.add('Search revealed none', stat_id)
                elif len(datasets) > 1:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found datasets (multiple) in search', '%s %r' % (stat_id, [d['name'] for d in datasets]))
                elif datasets:
                    add_datasets_to_results(datasets, result)
                    if not found_schema:
                        stats.add('Found dataset in search', '%s %s' % (stat_id, datasets[0]['name']))
                else:
                    if not found_schema:
                        stats.add('No dataset for submission', stat_id)

                results.append(result)

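        # any(...) counts rows where the schema is applied to at least one
        # dataset; the bare list test below counts rows that found any
        # candidate datasets at all (an empty list is falsy)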
        rows_with_datasets_count = \
            len([result for result in results
                 if any(result['dataset_schema_applied'])])
        rows_with_datasets_or_candidate_datasets_count = \
            len([result for result in results
                 if result['dataset_schema_applied']])

        if options.print_:
            print '\n Incentive stats\n' + stats_incentive.report()
            print '\n Overall stats\n' + stats.report()

        if options.write:
            print 'Writing'
            model.Session.commit()

        return {'table': results,
                'rows_with_datasets_count': rows_with_datasets_count,
                'rows_with_datasets_or_candidate_datasets_count': rows_with_datasets_or_candidate_datasets_count}