Example #1
    def command(self):
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)
        log = logging.getLogger(__name__)

        try:
            if self.options.days:
                self.options.days = int(self.options.days)
            if self.options.start_date:
                self.options.start_date = self.parse_date(self.options.start_date)
            if self.options.end_date:
                self.options.end_date = self.parse_date(self.options.end_date)
            if self.options.month:
                self.options.month = self.parse_month(self.options.month)
            if self.options.months_since:
                self.options.months_since = self.parse_month(self.options.months_since)
            if not self.options.ons_cache_dir:
                self.options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

            if self.options.days or \
                self.options.start_date or \
                self.options.end_date:
                data_filepaths = OnsData.download_flexible(
                    days=self.options.days,
                    start_date=self.options.start_date,
                    end_date=self.options.end_date,
                    ons_cache_dir=self.options.ons_cache_dir)

            elif self.options.month:
                data_filepaths = OnsData.download_month(year=self.options.month.year,
                                                        month=self.options.month.month)
            elif self.options.months_since:
                data_filepaths = OnsData.download_months_since(
                    year=self.options.months_since.year,
                    month=self.options.months_since.month,
                    force_download=self.options.force_download)
            elif self.options.all_time:
                data_filepaths = OnsData.download_all(force_download=self.options.force_download)
            else:
                self.parser.error('Please specify a time period')

            filter_ = {}
            if self.options.publisher:
                filter_['publisher'] = self.options.publisher

            stats = StatsList()
            importer = OnsImporter(filepaths=data_filepaths,
                                   ckanclient=self.client, stats=stats,
                                   filter_=filter_)
            loader = OnsLoader(self.client, stats)

            loader.load_packages(importer.pkg_dict())
            log.info('Summary:\n' + stats.report())
        except:
            # Any problem, make sure it gets logged
            log.exception('ONS Loader exception')
            raise
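
The command above drives everything from self.options, so the surrounding paster command class must register matching optparse options. That wiring is not part of this excerpt; the sketch below is only a guess at it, with flag names inferred from the self.options.* attributes used in command() (the real command may name them differently).

# Hypothetical option registration for the ONS loader command above.
# Flag names are assumptions derived from the self.options.* attributes.
from optparse import OptionParser

parser = OptionParser()
parser.add_option('--days', dest='days',
                  help='Load data published in the last N days')
parser.add_option('--start-date', dest='start_date')
parser.add_option('--end-date', dest='end_date')
parser.add_option('--month', dest='month')
parser.add_option('--months-since', dest='months_since')
parser.add_option('--all-time', dest='all_time', action='store_true')
parser.add_option('--force-download', dest='force_download', action='store_true')
parser.add_option('--ons-cache-dir', dest='ons_cache_dir')
parser.add_option('--publisher', dest='publisher',
                  help='Only import datasets of this publisher')
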
Example #2
def change_publisher(from_publisher_name, to_publisher_name, options):
    from ckan import model
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script_dataset_change_publisher'
        needs_commit = False
    from_publisher = model.Group.get(from_publisher_name)
    to_publisher = model.Group.get(to_publisher_name)
    datasets = common.get_datasets(dataset_name=options.dataset,
                                   organization_ref=from_publisher_name)
    assert to_publisher
    for dataset in datasets:
        member = model.Session.query(model.Member) \
                      .filter_by(group_id=from_publisher.id) \
                      .filter_by(table_name='package') \
                      .filter_by(table_id=dataset.id) \
                      .first()
        if member:
            print stats.add('Change owner_id and Member', dataset.name)
        else:
            print stats.add('Change owner_id but no Member', dataset.name)
        if options.write:
            dataset.owner_org = to_publisher.id
            if member:
                member.group_id = to_publisher.id
            needs_commit = True

    print stats.report()
    if options.write and needs_commit:
        model.repo.commit_and_remove()
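
change_publisher() expects an options object with write and dataset attributes; the scripts in ckanext-dgu/bin normally build that with optparse and load the CKAN config first. A minimal, hypothetical wrapper is sketched below; the flag names and argument order are assumptions, not the real script.

# Hypothetical command-line wrapper around change_publisher (dry run unless -w).
import ckanext.dgu.bin.common as common
from optparse import OptionParser

if __name__ == '__main__':
    parser = OptionParser(
        usage='%prog [options] <config.ini> <from-publisher> <to-publisher>')
    parser.add_option('-w', '--write', dest='write', action='store_true',
                      default=False, help='Commit the changes')
    parser.add_option('--dataset', dest='dataset', default=None,
                      help='Only move this one dataset')
    options, args = parser.parse_args()
    config_ini, from_publisher_name, to_publisher_name = args
    common.load_config(config_ini)        # as in the other scripts on this page
    common.register_translator()
    change_publisher(from_publisher_name, to_publisher_name, options)
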
Example #4
'''

import os
import sys
import glob
from itertools import chain
import ckanext.dgu.bin.common as common
from optparse import OptionParser
from ckan import model
from ckanext.archiver.model import Archival

from ckanext.dgu.bin.running_stats import StatsList

DEFAULT_CACHE_DIR = '/media/hulk/ckan_resource_cache/'

stats = StatsList()


class CleanCachedResources(object):
    @classmethod
    def command(cls, config_ini, cache_dir, delete):
        common.load_config(config_ini)
        common.register_translator()

        #rev = model.repo.new_revision()
        #rev.author = 'fix_secondary_theme.py'

        no_archival = []
        deleted_res = []
        for f in glob.glob(os.path.join(cache_dir, '*/*/*')):
            a = model.Session.query(Archival).filter(
Example #5
'''

import re
import os
import logging
import sys
import requests

from pylons import config
from nose.tools import assert_equal
from lxml import etree

from ckanext.dgu.bin import common
from ckanext.dgu.bin.running_stats import StatsList

service_stats = StatsList()
couple_stats = StatsList()
additional_couple_stats = StatsList()

class FindError(Exception):
    pass

class CoupledResources(object):
    @classmethod
    def detect(cls):
        '''Finds datasets that are coupled and adds their
        harvest_source_reference to the HarvestObject and package extras.
        '''
        from ckan.lib.base import json
        from ckan import model
        from ckanext.harvest.model import HarvestObject
Example #6
    def command(self):
        self._load_config()

        log = logging.getLogger('ckanext')

        import ckan.model as model
        from ckanext.dgu.bin.running_stats import StatsList
        from ckanext.dgu.lib.publisher_matcher import PublisherMatcher

        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")

        self.working_directory = self.args[0]
        log.info("Working directory set to %s" % self.working_directory)

        start = time.time()
        self.authorities_file = self._get_authorities_csv()

        # Read in the WDTK publishers and store in matcher
        wdtk_publishers = {}  # slug: name
        matcher = PublisherMatcher()
        with open(self.authorities_file, 'rU') as f:
            reader = csv.reader(f)
            for row in reader:
                name, short_name, slug = row[0:3]
                matcher.add_external_publisher(slug, name, short_name)
                wdtk_publishers[slug] = name.replace('\x92',
                                                     "'").decode('utf8')

        # Match up DGU publishers
        publishers = model.Session.query(model.Group) \
            .filter(model.Group.type == 'publisher') \
            .filter(model.Group.state == 'active').all()
        log.info("Found %d publishers to process in DB" % len(publishers))
        match_stats = StatsList()
        for publisher in publishers:

            match = matcher.match_to_external_publisher(publisher.title)

            if not match:
                match = matcher.match_to_external_publisher(
                    publisher.extras.get('abbreviation', ''))

            if not match:
                match = matcher.match_to_external_publisher(
                    re.sub('[-_]+', ' ', publisher.name))

            if not match and publisher.name in direct_matches:
                match = direct_matches[publisher.name]
                log.info(match_stats.add('Direct match', publisher.name))
                continue

            # We don't want to write any details automatically if we have
            # any existing phone, email or web details for FOI.
            have_previous_details = any([
                publisher.extras.get('foi-phone'),
                publisher.extras.get('foi-email'),
                publisher.extras.get('foi-web')
            ])

            if not match:
                if have_previous_details:
                    log.info(
                        match_stats.add(
                            'No match but already have FOI details',
                            publisher.name))
                else:
                    log.info(
                        match_stats.add('No match and still needs FOI details',
                                        publisher.name))
                continue

            # Save the publisher
            log.info('%s matches WDTK %s', publisher.name, match)

            # Store the match. Used for publisher_sync and publicbodies/nomen work.
            if not DRY_RUN and publisher.get('wdtk-id') != match and \
               publisher.get('wdtk-title') != wdtk_publishers[match]:
                publisher.extras['wdtk-id'] = match
                publisher.extras['wdtk-title'] = wdtk_publishers[match]
                model.Session.commit()

            # Check if previous WDTK details are still correct
            wdtk_url = WDTK_REQUEST_URL % match
            if 'whatdotheyknow' in publisher.extras.get('foi-web', ''):
                if publisher.extras['foi-web'] == wdtk_url:
                    log.info(
                        match_stats.add(
                            'Match, but already have WDTK FOI details',
                            publisher.name))
                    continue
                else:
                    log.info(
                        match_stats.add(
                            'Match, and correcting WDTK FOI details',
                            publisher.name))
            elif have_previous_details:
                log.info(
                    match_stats.add('Match, but already have FOI details',
                                    publisher.name))
                continue
            else:
                log.info(
                    match_stats.add('Match and added FOI details',
                                    publisher.name))

            if not DRY_RUN:
                publisher.extras['foi-web'] = wdtk_url
                model.Session.commit()

        print 'Full list of publishers not matched:'
        for name in match_stats[
                'No match and still needs FOI details'] + match_stats[
                    'No match but already have FOI details']:
            print name, repr(model.Group.by_name(name).title)

        end = time.time()
        took = str(datetime.timedelta(seconds=end - start))
        log.info('Time taken: %s' % took)
        print match_stats.report()

        if DRY_RUN:
            print 'NB: No changes made - this was a dry run'
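
The authorities CSV is read as rows of (name, short_name, slug), and match_to_external_publisher() returns the matched WDTK slug (it is used above as the key into wdtk_publishers). A rough illustration of that step with made-up data, assuming an exact title match is recognised by the matcher:

# Illustration only: a made-up WDTK entry and the matcher calls used above.
from ckanext.dgu.lib.publisher_matcher import PublisherMatcher

matcher = PublisherMatcher()
matcher.add_external_publisher('environment-agency',   # slug (made up)
                               'Environment Agency',   # name
                               'EA')                   # short_name
match = matcher.match_to_external_publisher('Environment Agency')
if match:
    print 'Matched WDTK slug:', match
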
Example #7
    def command(self):
        self._load_config()

        log = logging.getLogger('ckanext')

        import ckan.model as model
        from ckanext.dgu.bin.running_stats import StatsList
        from ckanext.dgu.lib.publisher_matcher import PublisherMatcher
        
        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")

        self.working_directory = self.args[0]
        log.info("Working directory set to %s" % self.working_directory)

        start = time.time()
        self.authorities_file = self._get_authorities_csv()

        # Read in the WDTK publishers and store in matcher
        wdtk_publishers = {} # slug: name
        matcher = PublisherMatcher()
        with open(self.authorities_file, 'rU') as f:
            reader = csv.reader(f)
            for row in reader:
                name, short_name, slug = row[0:3]
                matcher.add_external_publisher(slug, name, short_name)
                wdtk_publishers[slug] = name.replace('\x92', "'").decode('utf8')

        # Match up DGU publishers
        publishers = model.Session.query(model.Group) \
            .filter(model.Group.type == 'publisher') \
            .filter(model.Group.state == 'active').all()
        log.info("Found %d publishers to process in DB" %
            len(publishers))
        match_stats = StatsList()
        for publisher in publishers:
            
            match = matcher.match_to_external_publisher(publisher.title)

            if not match:
                match = matcher.match_to_external_publisher(publisher.extras.get('abbreviation', ''))

            if not match:
                match = matcher.match_to_external_publisher(re.sub('[-_]+', ' ', publisher.name))

            if not match and publisher.name in direct_matches:
                match = direct_matches[publisher.name]
                log.info(match_stats.add('Direct match', publisher.name))
                continue
                
            # We don't want to write any details automatically if we have
            # any existing phone, email or web details for FOI.
            have_previous_details = any([publisher.extras.get('foi-phone'),
                                         publisher.extras.get('foi-email'),
                                         publisher.extras.get('foi-web')])

            if not match:
                if have_previous_details:
                    log.info(match_stats.add('No match but already have FOI details', publisher.name))
                else:
                    log.info(match_stats.add('No match and still needs FOI details', publisher.name))
                continue

            # Save the publisher
            log.info('%s matches WDTK %s', publisher.name, match)

            # Store the match. Used for publisher_sync and publicbodies/nomen work.
            if not DRY_RUN and publisher.get('wdtk-id') != match and \
               publisher.get('wdtk-title') != wdtk_publishers[match]:
                publisher.extras['wdtk-id'] = match
                publisher.extras['wdtk-title'] = wdtk_publishers[match]
                model.Session.commit()

            # Check if previous WDTK details are still correct
            wdtk_url = WDTK_REQUEST_URL % match
            if 'whatdotheyknow' in publisher.extras.get('foi-web', ''):
                if publisher.extras['foi-web'] == wdtk_url:
                    log.info(match_stats.add('Match, but already have WDTK FOI details', publisher.name))
                    continue
                else:
                    log.info(match_stats.add('Match, and correcting WDTK FOI details', publisher.name))
            elif have_previous_details:
                log.info(match_stats.add('Match, but already have FOI details', publisher.name))
                continue
            else:
                log.info(match_stats.add('Match and added FOI details', publisher.name))

            if not DRY_RUN:
                publisher.extras['foi-web'] = wdtk_url
                model.Session.commit()

        print 'Full list of publishers not matched:'
        for name in match_stats['No match and still needs FOI details'] + match_stats['No match but already have FOI details']:
            print name, repr(model.Group.by_name(name).title)

        end = time.time()
        took = str(datetime.timedelta(seconds=end-start))
        log.info('Time taken: %s' % took)
        print match_stats.report()

        if DRY_RUN:
            print 'NB: No changes made - this was a dry run'
Example #9
    def command(self):
        from ckanext.dgu.ons.downloader import OnsData, ONS_DEFAULT_CACHE_PATH
        from ckanext.dgu.ons.importer import OnsImporter
        from ckanext.dgu.ons.loader import OnsLoader

        ApiCommand.command(self)
        log = logging.getLogger(__name__)

        try:
            if self.options.days:
                self.options.days = int(self.options.days)
            if self.options.start_date:
                self.options.start_date = self.parse_date(
                    self.options.start_date)
            if self.options.end_date:
                self.options.end_date = self.parse_date(self.options.end_date)
            if self.options.month:
                self.options.month = self.parse_month(self.options.month)
            if self.options.months_since:
                self.options.months_since = self.parse_month(
                    self.options.months_since)
            if not self.options.ons_cache_dir:
                self.options.ons_cache_dir = ONS_DEFAULT_CACHE_PATH

            if self.options.days or \
                self.options.start_date or \
                self.options.end_date:
                data_filepaths = OnsData.download_flexible(
                    days=self.options.days,
                    start_date=self.options.start_date,
                    end_date=self.options.end_date,
                    ons_cache_dir=self.options.ons_cache_dir)

            elif self.options.month:
                data_filepaths = OnsData.download_month(
                    year=self.options.month.year,
                    month=self.options.month.month)
            elif self.options.months_since:
                data_filepaths = OnsData.download_months_since(
                    year=self.options.months_since.year,
                    month=self.options.months_since.month,
                    force_download=self.options.force_download)
            elif self.options.all_time:
                data_filepaths = OnsData.download_all(
                    force_download=self.options.force_download)
            else:
                self.parser.error('Please specify a time period')

            filter_ = {}
            if self.options.publisher:
                filter_['publisher'] = self.options.publisher

            stats = StatsList()
            importer = OnsImporter(filepaths=data_filepaths,
                                   ckanclient=self.client,
                                   stats=stats,
                                   filter_=filter_)
            loader = OnsLoader(self.client, stats)

            loader.load_packages(importer.pkg_dict())
            log.info('Summary:\n' + stats.report())
        except:
            # Any problem, make sure it gets logged
            log.exception('ONS Loader exception')
            raise
Example #10
def fix_links(csv_filepath, write=False):
    from ckan import model
    stats = StatsList()
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Link fix from CSV'
        needs_commit = False
    with open(csv_filepath, 'rU') as f:
        reader = csv.reader(f)
        header = reader.next()
        assert header == ['NS Title', 'Bad link', 'Good link'], header
        for row in reader:
            ns_title, bad_link, good_link = row
            # Find the package and resource
            pkg_title = ns_title.split(' - ')[0]
            res_title = ' - '.join(ns_title.split(' - ')[1:])
            pkgs = model.Session.query(model.Package)\
                        .filter_by(title=pkg_title)\
                        .filter_by(state='active')\
                        .filter(model.Package.notes.like('%Source agency%'))\
                        .all()
            if not pkgs:
                print stats.add('Package title did not match', ns_title)
                continue
            if len(pkgs) > 1:
                print stats.add('Multiple package title matches', ns_title)
                continue
            pkg = pkgs[0]
            for res_ in pkg.resources:
                if (res_.description.startswith(res_title)
                        and 'hub-id' in res_.extras):
                    res = res_
                    break
            else:
                print stats.add('Resource title did not match', ns_title)
                continue
            # Update the link
            if res.url == good_link:
                print stats.add('Resource URL already fixed', ns_title)
                continue
            if res.url != bad_link and res.url.startswith(
                    'http://webarchive.nationalarchives.gov.uk'):
                print stats.add(
                    'Resource is already pointing to the webarchive - leave it',
                    ns_title)
                continue
            if res.url != bad_link:
                print stats.add('Resource URL is not expected', ns_title)
                continue
            if write:
                print stats.add('Update link (written)', ns_title)
                res.url = good_link
                needs_commit = True
            else:
                print stats.add('Update link (not written)', ns_title)
    print stats.report()
    if write and needs_commit:
        model.repo.commit_and_remove()
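
fix_links() takes the path of a CSV whose header row must be exactly "NS Title,Bad link,Good link" (enforced by the assert above). A hypothetical dry run followed by a write run, assuming the CKAN config has already been loaded as in the other scripts on this page:

# links.csv (made-up example):
#   NS Title,Bad link,Good link
#   Some Dataset - Some Resource,http://old.example/bad,http://new.example/good
fix_links('links.csv', write=False)   # dry run: only report what would change
fix_links('links.csv', write=True)    # apply the URL fixes and commit
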
Example #11
def fix_duplicates(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Fix duplicate resources'
        needs_commit = False
    stats = StatsList()
    pkgs = model.Session.query(model.Package)\
                .filter_by(state='active')\
                .join(model.PackageExtra)\
                .filter_by(state='active')\
                .filter_by(key='external_reference')\
                .filter_by(value='ONSHUB')\
                .order_by(model.Package.name)
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        pkgs = pkgs.filter(model.Package.id == pkg.id)
    pkgs = pkgs.all()
    for pkg in pkgs:
        previous_resources = {}

        def get_res_properties(resource):
            return {'url': resource.url,
                    'hub-id': resource.extras.get('hub-id'),
                    'date': resource.extras.get('date'),
                    'publish-date': resource.extras.get('publish-date')}

        def is_res_broken(resource):
            archival = Archival.get_for_resource(resource.id)
            if not archival:
                return None
            return archival.is_broken

        has_duplicates = False
        if not pkg.resources:
            print stats.add('No resources', pkg.name)
        for res in pkg.resources:
            res_properties = get_res_properties(res)
            res_identity = '%s %s' % (pkg.name, res.description)
            if res.description in previous_resources:
                has_duplicates = True
                prev_res = previous_resources[res.description]
                prev_res_properties = get_res_properties(prev_res)
                if res_properties == prev_res_properties:
                    needs_commit = True
                    print stats.add('Resource identical - dedupe', res_identity)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['date'] != res_properties['date']:
                    print stats.add('Resource same description, different date in timeseries - ok', res_identity)
                elif prev_res_properties['hub-id'] and res_properties['hub-id'] and prev_res_properties['hub-id'] != res_properties['hub-id']:
                    print stats.add('Resource same description, different hub-id - ok', res_identity)
                elif prev_res_properties['hub-id'] and prev_res_properties['hub-id'] == res_properties['hub-id']:
                    needs_commit = True
                    print stats.add('Resource with same hub-id - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['url'] == res_properties['url']:
                    needs_commit = True
                    print stats.add('Resource same description & url, different other properties - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif is_res_broken(prev_res) or is_res_broken(res):
                    print stats.add('Resource same description, different properties, some breakage - delete one', res_identity)
                    if is_res_broken(prev_res):
                        print 'BROKEN:'
                    pprint(prev_res_properties)
                    if is_res_broken(res):
                        print 'BROKEN:'
                    pprint(res_properties)
                else:
                    print stats.add('Resource same description, different properties - manual decision', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
            previous_resources[res.description] = res

        if not has_duplicates:
            print stats.add('Package without duplicates', pkg.name)
    print stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
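
fix_duplicates() takes the same kind of optparse options object as the other scripts here: options.write controls whether anything is committed, and options.dataset (a name or id) restricts the run to a single dataset instead of every ONSHUB package. A minimal stand-in for a dry run, assuming the CKAN config is already loaded; only the attribute names are real requirements, the values are made up:

# Hypothetical stand-in for the optparse options object, dry run on one dataset.
class _Options(object):
    write = False                     # dry run: nothing is committed
    dataset = 'some_onshub_dataset'   # made-up name; use None to scan everything

fix_duplicates(_Options())
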
Example #12
def fix_links(csv_filepath, write=False):
    from ckan import model
    stats = StatsList()
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Link fix from CSV'
        needs_commit = False
    with open(csv_filepath, 'rU') as f:
        reader = csv.reader(f)
        header = reader.next()
        assert header == ['NS Title', 'Bad link', 'Good link'], header
        for row in reader:
            ns_title, bad_link, good_link = row
            # Find the package and resource
            pkg_title = ns_title.split(' - ')[0]
            res_title = ' - '.join(ns_title.split(' - ')[1:])
            pkgs = model.Session.query(model.Package)\
                        .filter_by(title=pkg_title)\
                        .filter_by(state='active')\
                        .filter(model.Package.notes.like('%Source agency%'))\
                        .all()
            if not pkgs:
                print stats.add('Package title did not match', ns_title)
                continue
            if len(pkgs) > 1:
                print stats.add('Multiple package title matches', ns_title)
                continue
            pkg = pkgs[0]
            for res_ in pkg.resources:
                if res_.description.startswith(res_title) and 'hub-id' in res_.extras:
                    res = res_
                    break
            else:
                print stats.add('Resource title did not match', ns_title)
                continue
            # Update the link
            if res.url == good_link:
                print stats.add('Resource URL already fixed', ns_title)
                continue
            if res.url != bad_link and res.url.startswith('http://webarchive.nationalarchives.gov.uk'):
                print stats.add('Resource is already pointing to the webarchive - leave it', ns_title)
                continue
            if res.url != bad_link:
                print stats.add('Resource URL is not expected', ns_title)
                continue
            if write:
                print stats.add('Update link (written)', ns_title)
                res.url = good_link
                needs_commit = True
            else:
                print stats.add('Update link (not written)', ns_title)
    print stats.report()
    if write and needs_commit:
        model.repo.commit_and_remove()