Example #1
import logging
import re

from ckanclient import CkanApiError

# Project-specific helpers. These import paths are assumptions inferred
# from how the names are used below: StatsList.add() returns a loggable
# message and report() summarises the run; PackageLoader wraps package
# fetching and searching; OnsImporter maps a "Source agency" string to a
# publisher name.
from running_stats import StatsList
from ckanext.importlib.loader import PackageLoader
from ckanext.dgu.ons.importer import OnsImporter

log = logging.getLogger(__name__)


class Tool:
    def __init__(self, ckanclient, dry_run=False, force=False):
        '''
        @param ckanclient: instance of ckanclient to make the changes
        @param dry_run: change nothing
        @param force: do not stop if there is an error with one package
        '''
        self.client = ckanclient
        self.dry_run = dry_run
        self.force = force
        self.loader = PackageLoader(self.client)

    def add_missing_onshub_extra(self):
        '''Some ONSHUB datasets were edited manually and due to a bug, many
        of the extras got lost. Here we restore the external_reference=ONSHUB
        extra.
        '''
        stats = StatsList()

        res = self.client.action('package_search',
                                 q='!external_reference:ONSHUB "Source agency"',
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets missing extras: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(stats.add(
                    'Could not find "Source agency: " line after all',
                    pkg['name']))
                continue

            # Add the extra
            pkg['extras']['external_reference'] = 'ONSHUB'
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) writing package over API: %s' %
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing package over API %s' %
                              self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added extra', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')

    def correct_home_office_titles(self):
        '''Home Office edited their ONSHUB titles to be prefixed with
        "UK National Statistics Publication Hub: ". Because of this title
        change, the ons_loader can no longer add to these datasets, so
        remove the prefix here.
        e.g. scientific_procedures_on_living_animals_great_britain
        '''
        stats = StatsList()
        prefix = 'UK National Statistics Publication Hub: '
        
        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB "%s"' % prefix,
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets with Home Office prefix: %i', res['count'])

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if not pkg['title'].startswith(prefix):
                log.error(stats.add('Prefix not there after all', pkg['name']))
                continue

            # Remove the prefix
            pkg['title'] = pkg['title'][len(prefix):]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) writing package over API: %s' %
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error writing package over API %s' %
                              self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Removed prefix', pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')

    def add_missing_publisher(self):
        '''Assign a publisher (group) to ONSHUB datasets that lack one, by
        mapping the "Source agency: " line in the notes to a publisher.
        '''
        stats = StatsList()

        res = self.client.action('package_search',
                                 q='external_reference:ONSHUB !groups:["" TO *]',
                                 sort='name asc',
                                 fq=' +site_id:"dgu" +state:active',
                                 wt='json',
                                 rows=100,
                                 escape_q=False)

        log.info('ONSHUB datasets missing publisher: %i', res['count'])
        source_agency_re = re.compile('^Source agency: (.*)$', re.MULTILINE)

        for pkg in res['results']:
            # solr data_dict is not the correct sort of pkg dictionary so
            # get it via the API
            pkg = self.loader._get_package(pkg['name'])
            if pkg['groups']:
                log.error(stats.add('Package had a publisher', pkg['name']))
                continue
            match = source_agency_re.search(pkg['notes'])
            if not match:
                log.error(stats.add('Could not match source agency', pkg['name']))
                continue
            # Find equivalent publisher
            source_agency = match.groups()[0]
            publisher_name = OnsImporter._source_to_publisher_(
                source_agency, self.client)
            if not publisher_name:
                log.error(stats.add('Could not map source agency %s' %
                                    source_agency, pkg['name']))
                continue
            pkg['groups'] = [publisher_name]
            if not self.dry_run:
                try:
                    self.client.package_entity_put(pkg)
                except CkanApiError:
                    log.error('Error (%s) adding publisher over API: %s' %
                              (self.client.last_status,
                               self.client.last_message))
                    stats.add('Error adding publisher over API %s' %
                              self.client.last_status, pkg['name'])
                    continue
            log.info(stats.add('Added publisher %s' % publisher_name,
                               pkg['name']))

        print(stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')

    def merge_duplicates(self):
        '''Find active ONSHUB datasets that share a title and publisher and
        merge each set of duplicates into a single package.
        '''
        merge_stats = StatsList()

        onshub_packages_search_options = {'external_reference': 'ONSHUB',
                                          'state': 'active'}
        res = self.loader._package_search(onshub_packages_search_options)
        log.info('ONSHUB records: %i', res['count'])
        pkgs_already_merged = set()
        for pkg_ref in res['results']:
            pkg = self.loader._get_package(pkg_ref)
            if pkg['name'] in pkgs_already_merged:
                log.info(merge_stats.add('Already merged', pkg['name']))
                continue                
            if not self.loader._pkg_matches_search_options(
                    pkg, onshub_packages_search_options):
                log.error(merge_stats.add(
                    'Did not match ONSHUB search after all', pkg['name']))
                continue
            # look for duplicates
            dupe_search_options = {'title': pkg['title'],
                                   'groups': pkg['groups'][0] if pkg['groups'] else '',
                                   'external_reference': 'ONSHUB',
                                   'state': 'active'}
            res = self.loader._package_search(dupe_search_options)
            if not res['count']:
                log.error(merge_stats.add('Could not find itself', pkg['name']))
                continue
            dupe_pkgs = []
            for dupe_pkg_ref in res['results']:
                dupe_pkg = self.loader._get_package(dupe_pkg_ref)
                if dupe_pkg['name'] == pkg['name']:
                    continue
                if not self.loader._pkg_matches_search_options(
                        dupe_pkg, dupe_search_options):
                    log.warning('Did not match duplicate search after all '
                                '%s %s', pkg['name'], dupe_pkg['name'])
                    continue
                dupe_pkgs.append(dupe_pkg)
            if dupe_pkgs:
                log.info('Found duplicates for %s: %r',
                         pkg['name'],
                         [pkg_['name'] for pkg_ in dupe_pkgs])
                for dupe_pkg in dupe_pkgs:
                    pkgs_already_merged.add(dupe_pkg['name'])
                # Merge the duplicates into the kept package; do_merge
                # returns an error message string on failure
                result = self.do_merge(pkg, dupe_pkgs)
                if result is not True:
                    log.error(merge_stats.add('Merge failed: %s' % result,
                                              pkg['name']))
                    continue
                merge_stats.add('%i duplicates found and merged' %
                                len(dupe_pkgs), pkg['name'])
            else:
                log.info(merge_stats.add('No duplicates', pkg['name']))

        print(merge_stats.report())
        if self.dry_run:
            print('NB: No packages changed - dry run.')

    def do_merge(self, pkg, dupe_pkgs):
        '''Does the merge. Returns an error message string on failure, or
        True if successful.'''
        # Keep the package with the fewest underscores in its name
        pkgs_scored = sorted([pkg] + dupe_pkgs,
                             key=lambda p: p['name'].count('_'))
        pkg = pkgs_scored[0]
        dupe_pkgs = pkgs_scored[1:]
        log.info('Keeping %s and merging in %r', pkg['name'],
                 [p['name'] for p in dupe_pkgs])
        copy_keys = ('description', 'url', 'format', 'hub-id', 'size',
                     'cache_filepath', 'last_modified', 'hash', 'mimetype',
                     'cache_url')
        # Copy each duplicate's resources onto the kept package
        for dupe_pkg in dupe_pkgs:
            for res in dupe_pkg['resources']:
                res_copy = dict((key, res.get(key)) for key in copy_keys)
                pkg['resources'].append(res_copy)
        if not self.dry_run:
            # Write the package
            try:
                self.client.package_entity_put(pkg)
            except CkanApiError:
                log.error('Error (%s) editing package over API: %s' %
                          (self.client.last_status,
                           self.client.last_message))
                return 'Could not edit package: %s' % self.client.last_status
            # Delete the duplicates
            for dupe_pkg in dupe_pkgs:
                try:
                    self.client.package_entity_delete(dupe_pkg['name'])
                except CkanApiError:
                    log.error('Error (%s) deleting over API: %s' %
                              (self.client.last_status,
                               self.client.last_message))
                    return ('Could not delete package: %s' %
                            self.client.last_status)

        return True
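
A minimal driver for the class above might look like the following. This is a sketch, not part of the original module: the catalogue URL and API key are placeholders, logging setup is an assumption, and ckanclient.CkanClient is the legacy CKAN API client that Tool expects. Running with dry_run=True makes every method print its stats report without writing any packages.

import logging
import ckanclient

logging.basicConfig(level=logging.INFO)

# Hypothetical endpoint and key - replace with a real CKAN API URL and key
client = ckanclient.CkanClient(
    base_location='http://catalogue.example.org/api',
    api_key='my-api-key')

tool = Tool(client, dry_run=True)  # dry run: report only, change nothing
tool.add_missing_onshub_extra()
tool.correct_home_office_titles()
tool.add_missing_publisher()
tool.merge_duplicates()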