def change_publisher(from_publisher_name, to_publisher_name, options):
    from ckan import model
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script_dataset_change_publisher'
        needs_commit = False
    from_publisher = model.Group.get(from_publisher_name)
    to_publisher = model.Group.get(to_publisher_name)
    datasets = common.get_datasets(dataset_name=options.dataset,
                                   organization_ref=from_publisher_name)
    assert to_publisher
    for dataset in datasets:
        member = model.Session.query(model.Member) \
                      .filter_by(group_id=from_publisher.id) \
                      .filter_by(table_name='package') \
                      .filter_by(table_id=dataset.id) \
                      .first()
        if member:
            print stats.add('Change owner_id and Member', dataset.name)
        else:
            print stats.add('Change owner_id but no Member', dataset.name)
        if options.write:
            dataset.owner_org = to_publisher.id
            if member:
                member.group_id = to_publisher.id
            needs_commit = True

    print stats.report()
    if options.write and needs_commit:
        model.repo.commit_and_remove()
Example #2
0
def change_publisher(from_publisher_name, to_publisher_name, options):
    from ckan import model
    stats = StatsList()
    if options.write:
        rev = model.repo.new_revision()
        rev.author = 'script_dataset_change_publisher'
        needs_commit = False
    from_publisher = model.Group.get(from_publisher_name)
    to_publisher = model.Group.get(to_publisher_name)
    datasets = common.get_datasets(dataset_name=options.dataset,
                                   organization_ref=from_publisher_name)
    assert to_publisher
    for dataset in datasets:
        member = model.Session.query(model.Member) \
                      .filter_by(group_id=from_publisher.id) \
                      .filter_by(table_name='package') \
                      .filter_by(table_id=dataset.id) \
                      .first()
        if member:
            print stats.add('Change owner_id and Member', dataset.name)
        else:
            print stats.add('Change owner_id but no Member', dataset.name)
        if options.write:
            dataset.owner_org = to_publisher.id
            if member:
                member.group_id = to_publisher.id
            needs_commit = True

    print stats.report()
    if options.write and needs_commit:
        model.repo.commit_and_remove()
Example #3
0
                    # NOTE(review): fragment - the enclosing for/try begins
                    # outside this excerpt; variables like href, couple_id,
                    # service_record come from that outer scope.
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"', \
                                     href)
                    couples_all_detected = False
                    break
                except etree.XMLSyntaxError, e:
                    # The downloaded document was not well-formed XML.
                    couple_stats.add('Could not parse "Gemini" downloaded',
                                     couple_id)
                    log.warning('Could not parse "Gemini" downloaded href:"%s"', \
                                     href)
                    couples_all_detected = False
                    break
                try:
                    harvest_object = cls.find_harvest_object_by_guid(guid)
                except FindError, e:
                    # Unlike the parse failures above (which break out of the
                    # loop), a lookup failure just skips this couple.
                    log.error('%s href=%s couple=%s', e, href, couple_id)
                    couple_stats.add(str(e), couple_id)
                    couples_all_detected = False
                    continue

                dataset_record = harvest_object.package #res.resource_group.package
                couple_stats.add('Couple completed', couple_id)
                log.info('Couple completed %s <-> %s',
                         service_record.name, dataset_record.name)

                cls.add_coupling(service_record, dataset_record, harvest_object, href)
                couples_detected = True

            if coupled_resources:
                if couples_all_detected:
                    service_stats.add('Service couples all completed', service_record.name)
                elif couples_detected:
Example #4
0
    def command(self):
        '''Match DGU publishers to WhatDoTheyKnow (WDTK) public bodies and
        fill in their FOI contact details.

        Reads the WDTK authorities CSV, matches each active DGU publisher
        against it (by title, abbreviation, normalised name, or a manual
        direct-match table), then stores the WDTK id/title and - where no
        existing FOI details would be overwritten - the WDTK FOI request
        URL on the publisher. DRY_RUN suppresses all database writes.
        '''
        self._load_config()

        log = logging.getLogger('ckanext')

        import ckan.model as model
        from ckanext.dgu.bin.running_stats import StatsList
        from ckanext.dgu.lib.publisher_matcher import PublisherMatcher

        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")

        # First CLI argument: directory the authorities CSV is fetched into.
        self.working_directory = self.args[0]
        log.info("Working directory set to %s" % self.working_directory)

        start = time.time()
        self.authorities_file = self._get_authorities_csv()

        # Read in the WDTK publishers and store in matcher
        wdtk_publishers = {}  # slug: name
        matcher = PublisherMatcher()
        with open(self.authorities_file, 'rU') as f:
            reader = csv.reader(f)
            for row in reader:
                name, short_name, slug = row[0:3]
                matcher.add_external_publisher(slug, name, short_name)
                # \x92 is the Windows-1252 right single quote found in the
                # CSV; swap it for an ASCII apostrophe before decoding.
                wdtk_publishers[slug] = name.replace('\x92',
                                                     "'").decode('utf8')

        # Match up DGU publishers
        publishers = model.Session.query(model.Group) \
            .filter(model.Group.type == 'publisher') \
            .filter(model.Group.state == 'active').all()
        log.info("Found %d publishers to process in DB" % len(publishers))
        match_stats = StatsList()
        for publisher in publishers:

            # Try progressively looser matches: exact title, declared
            # abbreviation, then the name with -/_ turned into spaces.
            match = matcher.match_to_external_publisher(publisher.title)

            if not match:
                match = matcher.match_to_external_publisher(
                    publisher.extras.get('abbreviation', ''))

            if not match:
                match = matcher.match_to_external_publisher(
                    re.sub('[-_]+', ' ', publisher.name))

            if not match and publisher.name in direct_matches:
                # Manually curated fallback table.
                # NOTE(review): this branch records the match but then
                # continues, so direct matches never get FOI details
                # written below - confirm that is intended.
                match = direct_matches[publisher.name]
                log.info(match_stats.add('Direct match', publisher.name))
                continue

            # We don't want to write any details automatically if we have
            # any existing phone, email or web details for FOI.
            have_previous_details = any([
                publisher.extras.get('foi-phone'),
                publisher.extras.get('foi-email'),
                publisher.extras.get('foi-web')
            ])

            if not match:
                if have_previous_details:
                    log.info(
                        match_stats.add(
                            'No match but already have FOI details',
                            publisher.name))
                else:
                    log.info(
                        match_stats.add('No match and still needs FOI details',
                                        publisher.name))
                continue

            # Save the publisher
            log.info('%s matches WDTK %s', publisher.name, match)

            # Store the match. Used for publisher_sync and publicbodies/nomen work.
            # NOTE(review): the 'and' means the extras are written only when
            # BOTH wdtk-id and wdtk-title differ ('or' may be intended); also
            # publisher.get(...) may not read the extras dict - verify
            # against model.Group's API.
            if not DRY_RUN and publisher.get('wdtk-id') != match and \
               publisher.get('wdtk-title') != wdtk_publishers[match]:
                publisher.extras['wdtk-id'] = match
                publisher.extras['wdtk-title'] = wdtk_publishers[match]
                model.Session.commit()

            # Check if previous WDTK details are still correct
            wdtk_url = WDTK_REQUEST_URL % match
            if 'whatdotheyknow' in publisher.extras.get('foi-web', ''):
                if publisher.extras['foi-web'] == wdtk_url:
                    log.info(
                        match_stats.add(
                            'Match, but already have WDTK FOI details',
                            publisher.name))
                    continue
                else:
                    log.info(
                        match_stats.add(
                            'Match, and correcting WDTK FOI details',
                            publisher.name))
            elif have_previous_details:
                log.info(
                    match_stats.add('Match, but already have FOI details',
                                    publisher.name))
                continue
            else:
                log.info(
                    match_stats.add('Match and added FOI details',
                                    publisher.name))

            if not DRY_RUN:
                publisher.extras['foi-web'] = wdtk_url
                model.Session.commit()

        print 'Full list of publishers not matched:'
        for name in match_stats[
                'No match and still needs FOI details'] + match_stats[
                    'No match but already have FOI details']:
            print name, repr(model.Group.by_name(name).title)

        end = time.time()
        took = str(datetime.timedelta(seconds=end - start))
        log.info('Time taken: %s' % took)
        print match_stats.report()

        if DRY_RUN:
            print 'NB: No changes made - this was a dry run'
Example #5
0
    def command(self):
        '''Match DGU publishers to WhatDoTheyKnow (WDTK) public bodies and
        fill in their FOI contact details.

        Reads the WDTK authorities CSV, matches each active DGU publisher
        against it (title, abbreviation, normalised name, or a manual
        direct-match table), then stores the WDTK id/title and - where no
        existing FOI details would be overwritten - the WDTK FOI request
        URL. DRY_RUN suppresses all database writes.
        '''
        self._load_config()

        log = logging.getLogger('ckanext')

        import ckan.model as model
        from ckanext.dgu.bin.running_stats import StatsList
        from ckanext.dgu.lib.publisher_matcher import PublisherMatcher

        model.Session.remove()
        model.Session.configure(bind=model.meta.engine)
        model.repo.new_revision()
        log.info("Database access initialised")

        # First CLI argument: directory the authorities CSV is fetched into.
        self.working_directory = self.args[0]
        log.info("Working directory set to %s" % self.working_directory)

        start = time.time()
        self.authorities_file = self._get_authorities_csv()

        # Read in the WDTK publishers and store in matcher
        wdtk_publishers = {} # slug: name
        matcher = PublisherMatcher()
        with open(self.authorities_file, 'rU') as f:
            reader = csv.reader(f)
            for row in reader:
                name, short_name, slug = row[0:3]
                matcher.add_external_publisher(slug, name, short_name)
                # \x92 is a Windows-1252 right single quote in the CSV.
                wdtk_publishers[slug] = name.replace('\x92', "'").decode('utf8')

        # Match up DGU publishers
        publishers = model.Session.query(model.Group) \
            .filter(model.Group.type == 'publisher') \
            .filter(model.Group.state == 'active').all()
        log.info("Found %d publishers to process in DB" %
            len(publishers))
        match_stats = StatsList()
        for publisher in publishers:

            # Progressively looser matches: title, abbreviation, then the
            # name with -/_ replaced by spaces.
            match = matcher.match_to_external_publisher(publisher.title)

            if not match:
                match = matcher.match_to_external_publisher(publisher.extras.get('abbreviation', ''))

            if not match:
                match = matcher.match_to_external_publisher(re.sub('[-_]+', ' ', publisher.name))

            if not match and publisher.name in direct_matches:
                # NOTE(review): continues, so direct matches never get FOI
                # details written below - confirm intended.
                match = direct_matches[publisher.name]
                log.info(match_stats.add('Direct match', publisher.name))
                continue

            # We don't want to write any details automatically if we have
            # any existing phone, email or web details for FOI.
            have_previous_details = any([publisher.extras.get('foi-phone'),
                                         publisher.extras.get('foi-email'),
                                         publisher.extras.get('foi-web')])

            if not match:
                if have_previous_details:
                    log.info(match_stats.add('No match but already have FOI details', publisher.name))
                else:
                    log.info(match_stats.add('No match and still needs FOI details', publisher.name))
                continue

            # Save the publisher
            log.info('%s matches WDTK %s', publisher.name, match)

            # Store the match. Used for publisher_sync and publicbodies/nomen work.
            # NOTE(review): 'and' means extras are written only when BOTH
            # values differ ('or' may be intended); publisher.get(...) may
            # not read extras - verify against model.Group's API.
            if not DRY_RUN and publisher.get('wdtk-id') != match and \
               publisher.get('wdtk-title') != wdtk_publishers[match]:
                publisher.extras['wdtk-id'] = match
                publisher.extras['wdtk-title'] = wdtk_publishers[match]
                model.Session.commit()

            # Check if previous WDTK details are still correct
            wdtk_url = WDTK_REQUEST_URL % match
            if 'whatdotheyknow' in publisher.extras.get('foi-web', ''):
                if publisher.extras['foi-web'] == wdtk_url:
                    log.info(match_stats.add('Match, but already have WDTK FOI details', publisher.name))
                    continue
                else:
                    log.info(match_stats.add('Match, and correcting WDTK FOI details', publisher.name))
            elif have_previous_details:
                log.info(match_stats.add('Match, but already have FOI details', publisher.name))
                continue
            else:
                log.info(match_stats.add('Match and added FOI details', publisher.name))

            if not DRY_RUN:
                publisher.extras['foi-web'] = wdtk_url
                model.Session.commit()

        print 'Full list of publishers not matched:'
        for name in match_stats['No match and still needs FOI details'] + match_stats['No match but already have FOI details']:
            print name, repr(model.Group.by_name(name).title)

        end = time.time()
        took = str(datetime.timedelta(seconds=end-start))
        log.info('Time taken: %s' % took)
        print match_stats.report()

        if DRY_RUN:
            print 'NB: No changes made - this was a dry run'
Example #6
0
                    # NOTE(review): fragment - the enclosing for/try begins
                    # outside this excerpt; href, couple_id, service_record
                    # and the stats objects come from that outer scope.
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"', \
                                     href)
                    couples_all_detected = False
                    break
                except etree.XMLSyntaxError, e:
                    # The downloaded document was not well-formed XML.
                    couple_stats.add('Could not parse "Gemini" downloaded',
                                     couple_id)
                    log.warning('Could not parse "Gemini" downloaded href:"%s"', \
                                     href)
                    couples_all_detected = False
                    break
                try:
                    harvest_object = cls.find_harvest_object_by_guid(guid)
                except FindError, e:
                    # A lookup failure skips this couple only (continue),
                    # unlike the parse failures above which break.
                    log.error('%s href=%s couple=%s', e, href, couple_id)
                    couple_stats.add(str(e), couple_id)
                    couples_all_detected = False
                    continue

                dataset_record = harvest_object.package #res.resource_group.package
                couple_stats.add('Couple completed', couple_id)
                log.info('Couple completed %s <-> %s',
                         service_record.name, dataset_record.name)

                cls.add_coupling(service_record, dataset_record, harvest_object, href)
                couples_detected = True

            if coupled_resources:
                if couples_all_detected:
                    service_stats.add('Service couples all completed', service_record.name)
                elif couples_detected:
Example #7
0
def fix_links(csv_filepath, write=False):
    from ckan import model
    stats = StatsList()
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Link fix from CSV'
        needs_commit = False
    with open(csv_filepath, 'rU') as f:
        reader = csv.reader(f)
        header = reader.next()
        assert header == ['NS Title', 'Bad link', 'Good link'], header
        for row in reader:
            ns_title, bad_link, good_link = row
            # Find the package and resource
            pkg_title = ns_title.split(' - ')[0]
            res_title = ' - '.join(ns_title.split(' - ')[1:])
            pkgs = model.Session.query(model.Package)\
                        .filter_by(title=pkg_title)\
                        .filter_by(state='active')\
                        .filter(model.Package.notes.like('%Source agency%'))\
                        .all()
            if not pkgs:
                print stats.add('Package title did not match', ns_title)
                continue
            if len(pkgs) > 1:
                print stats.add('Multiple package title matches', ns_title)
                continue
            pkg = pkgs[0]
            for res_ in pkg.resources:
                if res_.description[:len(
                        res_title)] == res_title and 'hub-id' in res_.extras:
                    res = res_
                    break
            else:
                print stats.add('Resource title did not match', ns_title)
                continue
            # Update the link
            if res.url == good_link:
                print stats.add('Resource URL already fixed', ns_title)
                continue
            if res.url != bad_link and res.url.startswith(
                    'http://webarchive.nationalarchives.gov.uk'):
                print stats.add(
                    'Resource is already pointing to the webarchive - leave it',
                    ns_title)
                continue
            if res.url != bad_link:
                print stats.add('Resource URL is not expected', ns_title)
                continue
            if write:
                print stats.add('Update link (written)', ns_title)
                res.url = good_link
                needs_commit = True
            else:
                print stats.add('Update link (not written)', ns_title)
    print stats.report()
    if write and needs_commit:
        model.repo.commit_and_remove()
                    # NOTE(review): fragment - the enclosing try begins
                    # outside this excerpt.
                    guid = gemini.read_value("guid")
                except KeyError, e:
                    # BUG(review): the format string below has no %s
                    # placeholder, so '% href' raises TypeError ("not all
                    # arguments converted") instead of recording the stat.
                    couple_stats.add("Could not get GUID from Gemini downloaded" % href, couple_id)
                    log.warning('Could not get GUID from Gemini downloaded href:"%s"', href)
                    couples_all_detected = False
                    break
                except etree.XMLSyntaxError, e:
                    # The downloaded document was not well-formed XML.
                    couple_stats.add('Could not parse "Gemini" downloaded', couple_id)
                    log.warning('Could not parse "Gemini" downloaded href:"%s"', href)
                    couples_all_detected = False
                    break
                try:
                    harvest_object = cls.find_harvest_object_by_guid(guid)
                except FindError, e:
                    # A lookup failure skips this couple only (continue),
                    # unlike the parse failures above which break.
                    log.error("%s href=%s couple=%s", e, href, couple_id)
                    couple_stats.add(str(e), couple_id)
                    couples_all_detected = False
                    continue

                dataset_record = harvest_object.package  # res.resource_group.package
                couple_stats.add("Couple completed", couple_id)
                log.info("Couple completed %s <-> %s", service_record.name, dataset_record.name)

                cls.add_coupling(service_record, dataset_record, harvest_object, href)
                couples_detected = True

            if coupled_resources:
                if couples_all_detected:
                    service_stats.add("Service couples all completed", service_record.name)
                elif couples_detected:
                    service_stats.add("Service couples partially completed", service_record.name)
Example #9
0
def fix_duplicates(options):
    from ckan import model
    from ckanext.archiver.model import Archival
    write = options.write
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Fix duplicate resources'
        needs_commit = False
    stats = StatsList()
    pkgs = model.Session.query(model.Package)\
                .filter_by(state='active')\
                .join(model.PackageExtra)\
                .filter_by(state='active')\
                .filter_by(key='external_reference')\
                .filter_by(value='ONSHUB')\
                .order_by(model.Package.name)
    if options.dataset:
        pkg = model.Package.get(options.dataset)
        pkgs = pkgs.filter(model.Package.id==pkg.id)
    pkgs = pkgs.all()
    for pkg in pkgs:
        previous_resources = {}

        def get_res_properties(resource):
            return {'url': resource.url,
                    'hub-id': resource.extras.get('hub-id'),
                    'date': resource.extras.get('date'),
                    'publish-date': resource.extras.get('publish-date')}

        def is_res_broken(resource):
            archival = Archival.get_for_resource(resource.id)
            if not archival:
                return None
            return archival.is_broken

        has_duplicates = False
        if not pkg.resources:
            print stats.add('No resources', pkg.name)
        for res in pkg.resources:
            res_properties = get_res_properties(res)
            res_identity = '%s %s' % (pkg.name, res.description)
            if res.description in previous_resources:
                has_duplicates = True
                prev_res = previous_resources[res.description]
                prev_res_properties = get_res_properties(prev_res)
                if res_properties == prev_res_properties:
                    needs_commit=True
                    print stats.add('Resource indentical - dedupe', res_identity)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['date'] != res_properties['date']:
                    print stats.add('Resource same description, different date in timeseries - ok', res_identity)
                elif prev_res_properties['hub-id'] and res_properties['hub-id'] and prev_res_properties['hub-id'] != res_properties['hub-id']:
                    print stats.add('Resource same description, different hub-id - ok', res_identity)
                elif prev_res_properties['hub-id'] and prev_res_properties['hub-id'] == res_properties['hub-id']:
                    needs_commit=True
                    print stats.add('Resource with same hub-id - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif prev_res_properties['url'] == res_properties['url']:
                    needs_commit=True
                    print stats.add('Resource same description & url, different other properties - merge', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
                    merge_resources((res, prev_res), write)
                elif is_res_broken(prev_res) or is_res_broken(res):
                    print stats.add('Resource same description, different properties, some breakage - delete one', res_identity)
                    if is_res_broken(prev_res):
                        print 'BROKEN:'
                    pprint(prev_res_properties)
                    if is_res_broken(res):
                        print 'BROKEN:'
                    pprint(res_properties)
                else:
                    print stats.add('Resource same description, different properties - manual decision', res_identity)
                    pprint(prev_res_properties)
                    pprint(res_properties)
            previous_resources[res.description] = res

        if not has_duplicates:
            print stats.add('Package without duplicates', pkg.name)
    print stats.report()
    if write and needs_commit:
        print 'Writing...'
        model.repo.commit_and_remove()
        print '...done'
    elif write:
        print 'Nothing to write'
    else:
        print 'Not written'
Example #10
0
def fix_links(csv_filepath, write=False):
    '''Fix resource URLs from a CSV of (NS Title, Bad link, Good link) rows.

    Looks up each row's package/resource by its NS title and replaces the
    bad link with the good one. Only commits when write is True.
    '''
    from ckan import model
    stats = StatsList()
    if write:
        rev = model.repo.new_revision()
        rev.author = 'Link fix from CSV'
        needs_commit = False
    with open(csv_filepath, 'rU') as f:
        reader = csv.reader(f)
        header = reader.next()
        assert header == ['NS Title', 'Bad link', 'Good link'], header
        for row in reader:
            ns_title, bad_link, good_link = row
            # Find the package and resource
            # The NS title is '<package title> - <resource title>'.
            pkg_title = ns_title.split(' - ')[0]
            res_title = ' - '.join(ns_title.split(' - ')[1:])
            pkgs = model.Session.query(model.Package)\
                        .filter_by(title=pkg_title)\
                        .filter_by(state='active')\
                        .filter(model.Package.notes.like('%Source agency%'))\
                        .all()
            if not pkgs:
                print stats.add('Package title did not match', ns_title)
                continue
            if len(pkgs) > 1:
                print stats.add('Multiple package title matches', ns_title)
                continue
            pkg = pkgs[0]
            # for/else: the else runs only when no resource matched (no break).
            for res_ in pkg.resources:
                if res_.description[:len(res_title)] == res_title and 'hub-id' in res_.extras:
                    res = res_
                    break
            else:
                print stats.add('Resource title did not match', ns_title)
                continue
            # Update the link
            if res.url == good_link:
                print stats.add('Resource URL already fixed', ns_title)
                continue
            # URLs already moved to the web archive are deliberately kept.
            if res.url != bad_link and res.url.startswith('http://webarchive.nationalarchives.gov.uk'):
                print stats.add('Resource is already pointing to the webarchive - leave it', ns_title)
                continue
            if res.url != bad_link:
                print stats.add('Resource URL is not expected', ns_title)
                continue
            if write:
                print stats.add('Update link (written)', ns_title)
                res.url = good_link
                needs_commit = True
            else:
                print stats.add('Update link (not written)', ns_title)
    print stats.report()
    if write and needs_commit:
        model.repo.commit_and_remove()