def datasets_without_resources():
    """Report active, published datasets that have no resources at all.

    Returns {'table': rows}, one OrderedDict per dataset, ordered by
    dataset title.
    """
    rows = []
    active_pkgs = model.Session.query(model.Package) \
        .filter_by(state='active') \
        .order_by(model.Package.title) \
        .all()
    for pkg in active_pkgs:
        # Only datasets with zero resources are of interest, and
        # unpublished ones are excluded from the report.
        has_resources = len(pkg.resources) != 0
        is_unpublished = pkg.extras.get('unpublished', '').lower() == 'true'
        if has_resources or is_unpublished:
            continue
        org = pkg.get_organization()
        deleted, url = last_resource_deleted(pkg)
        rows.append(OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('last resource deleted',
             deleted.isoformat() if deleted else None),
            ('last resource url', url),
            ('dataset_notes', lib.dataset_notes(pkg)),
        )))
    return {'table': rows}
def app_dataset_report():
    """Report of datasets that have a related item of type 'App'.

    Returns {'table': rows}, each row pairing an app (title/url) with its
    dataset and organization, sorted by top-level organization title then
    direct organization title.
    """
    app_dataset_dicts = []
    # BUG FIX: filtering on model.Related.type without joining Related to
    # RelatedDataset made SQLAlchemy emit an implicit cross join between the
    # two tables, pairing every RelatedDataset with every 'App' Related row.
    # Join explicitly on the foreign key instead.
    related_datasets = model.Session.query(model.RelatedDataset) \
        .join(model.Related,
              model.RelatedDataset.related_id == model.Related.id) \
        .filter(model.Related.type == 'App') \
        .all()
    for related in related_datasets:
        dataset = related.dataset
        org = dataset.get_organization()
        # go_up_tree yields the org and then its ancestors, so the last
        # element is the top-level organization.
        top_org = list(go_up_tree(org))[-1]

        app_dataset_dict = OrderedDict((
            ('app title', related.related.title),
            ('app url', related.related.url),
            ('dataset name', dataset.name),
            ('dataset title', dataset.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
            ('dataset theme', related.dataset.extras.get('theme-primary', '')),
            ('dataset notes', lib.dataset_notes(dataset)),
        ))
        app_dataset_dicts.append(app_dataset_dict)

    app_dataset_dicts.sort(key=lambda row:
                           row['top-level organization title'] +
                           row['organization title'])

    return {'table': app_dataset_dicts}
def openness_for_organization(organization=None, include_sub_organizations=False):
    """Openness (five-star) scores for an organization's datasets.

    Scores every active, public dataset owned by the organization
    (optionally including its sub-organizations) via the
    'qa_package_openness_show' action.

    :param organization: organization name
    :param include_sub_organizations: also cover the organization's sub-tree
    :raises p.toolkit.ObjectNotFound: if the organization doesn't exist
    :returns: dict with the per-dataset table plus aggregate score stats
    """
    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound

    if not include_sub_organizations:
        orgs = [org]
    else:
        orgs = lib.go_down_tree(org)

    context = {'model': model, 'session': model.Session, 'ignore_auth': True}
    score_counts = Counter()
    rows = []
    num_packages = 0
    for org in orgs:
        # NB org.packages() misses out many - see:
        # http://redmine.dguteam.org.uk/issues/1844
        pkgs = model.Session.query(model.Package) \
            .filter_by(owner_org=org.id) \
            .filter_by(state='active') \
            .filter_by(private=False) \
            .all()
        num_packages += len(pkgs)
        for pkg in pkgs:
            try:
                qa = p.toolkit.get_action('qa_package_openness_show')(
                    context, {'id': pkg.id})
            except p.toolkit.ObjectNotFound:
                # BUG FIX: this used to `return`, so the whole report came
                # back as None whenever a single package had no QA info.
                # Skip just that package instead.
                log.warning('No QA info for package %s', pkg.name)
                continue
            rows.append(
                OrderedDict((
                    ('dataset_name', pkg.name),
                    ('dataset_title', pkg.title),
                    ('dataset_notes', lib.dataset_notes(pkg)),
                    ('organization_name', org.name),
                    ('organization_title', org.title),
                    ('openness_score', qa['openness_score']),
                    ('openness_score_reason', qa['openness_score_reason']),
                )))
            score_counts[qa['openness_score']] += 1

    # None scores (not yet assessed) are excluded from the averages.
    total_stars = sum([k * v for k, v in score_counts.items() if k])
    num_pkgs_with_stars = sum(
        [v for k, v in score_counts.items() if k is not None])
    average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
        if num_pkgs_with_stars else 0.0

    return {
        'table': rows,
        'score_counts': jsonify_counter(score_counts),
        'total_stars': total_stars,
        'average_stars': average_stars,
        'num_packages_scored': len(rows),
        'num_packages': num_packages,
    }
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******', 'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title' 'CO2 monthly', 'notes': '', 'user': '******', 'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(
            model.PackageTag.id == None  # noqa: E711
        )
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    # BUG FIX: count all tagless packages *before* truncating the table to
    # 100 rows; previously the percentage below used len(tagless_pkgs),
    # which the slice capped at 100.
    num_tagless_pkgs = q.count()
    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        ))
        for pkg in q.slice(0, 100)
    ]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None

    packages_without_tags_percent = lib.percent(num_tagless_pkgs,
                                                num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def html_datasets_report(organization, include_sub_organizations=False):
    '''
    Returns datasets that only have an HTML link, by organization.
    '''
    # Get packages
    query = model.Session.query(model.Package)\
        .filter_by(state='active')
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)
    pkgs = query.all()

    # See if HTML
    num_datasets_published = 0
    num_datasets_only_html = 0
    datasets_only_html = []
    # use yield_per, otherwise memory use just goes up til the script is
    # killed by the os.
    for pkg in pkgs:
        if p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue
        num_datasets_published += 1

        # BUG FIX: res.format may be None, which used to raise
        # AttributeError on .lower() (the original code even anticipated
        # None in the discard-set below).
        formats = set([
            (res.format or '').lower()
            for res in pkg.resources
            if res.resource_type != 'documentation'
        ])
        if 'html' not in formats:
            continue

        # Ignore non-data formats when deciding whether HTML is the only
        # data format on offer.
        data_formats = formats - set(('asp', '', None))
        if data_formats == set(('html', )):
            num_datasets_only_html += 1
            datasets_only_html.append(pkg)

    rows = []
    for pkg in datasets_only_html:
        row = OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('dataset_notes', lib.dataset_notes(pkg)),
        ))
        rows.append(row)

    return {
        'table': rows,
        'num_datasets_published': num_datasets_published,
        'num_datasets_only_html': num_datasets_only_html,
    }
def openness_for_organization(organization=None, include_sub_organizations=False):
    """Openness (five-star) scores for an organization's active datasets.

    NOTE(review): unlike the other version of this function in this file,
    this one does not filter out private datasets - confirm which is
    intended.

    :raises p.toolkit.ObjectNotFound: if the organization doesn't exist
    :returns: dict with the per-dataset table plus aggregate score stats
    """
    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound

    if not include_sub_organizations:
        orgs = [org]
    else:
        orgs = lib.go_down_tree(org)

    context = {'model': model, 'session': model.Session, 'ignore_auth': True}
    score_counts = Counter()
    rows = []
    num_packages = 0
    for org in orgs:
        # NB org.packages() misses out many - see:
        # http://redmine.dguteam.org.uk/issues/1844
        pkgs = model.Session.query(model.Package) \
            .filter_by(owner_org=org.id) \
            .filter_by(state='active') \
            .all()
        num_packages += len(pkgs)
        for pkg in pkgs:
            try:
                qa = p.toolkit.get_action('qa_package_openness_show')(
                    context, {'id': pkg.id})
            except p.toolkit.ObjectNotFound:
                # BUG FIX: previously `return`ed here, so one package
                # without QA info made the entire report None. Skip just
                # that package.
                log.warning('No QA info for package %s', pkg.name)
                continue
            rows.append(OrderedDict((
                ('dataset_name', pkg.name),
                ('dataset_title', pkg.title),
                ('dataset_notes', lib.dataset_notes(pkg)),
                ('organization_name', org.name),
                ('organization_title', org.title),
                ('openness_score', qa['openness_score']),
                ('openness_score_reason', qa['openness_score_reason']),
            )))
            score_counts[qa['openness_score']] += 1

    # None scores (not yet assessed) are excluded from the averages.
    total_stars = sum([k*v for k, v in score_counts.items() if k])
    num_pkgs_with_stars = sum([v for k, v in score_counts.items()
                               if k is not None])
    average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
        if num_pkgs_with_stars else 0.0

    return {'table': rows,
            'score_counts': jsonify_counter(score_counts),
            'total_stars': total_stars,
            'average_stars': average_stars,
            'num_packages_scored': len(rows),
            'num_packages': num_packages,
            }
def create_row(pkg_, resource_dict):
    """Build one report row from a dataset and one of its resource dicts."""
    publisher = pkg_.get_organization()
    row = OrderedDict()
    row['publisher_title'] = publisher.title
    row['publisher_name'] = publisher.name
    row['package_title'] = pkg_.title
    row['package_name'] = pkg_.name
    row['package_notes'] = lib.dataset_notes(pkg_)
    # Copy the resource fields across, prefixed, preserving column order.
    for field in ('position', 'id', 'description', 'url', 'format',
                  'created'):
        row['resource_%s' % field] = resource_dict.get(field)
    return row
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******', 'created': '2008-06-13T10:24:59.435631'},  # noqa
            {'name': 'co2-monthly', 'title' 'CO2 monthly', 'notes': '', 'user': '******', 'created': '2009-12-14T08:42:45.473827'},  # noqa
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(model.PackageTag.id == None)
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    # BUG FIX: count all tagless packages *before* slicing to 100 rows;
    # previously the percentage below used len(tagless_pkgs), which the
    # slice capped at 100.
    num_tagless_pkgs = q.count()
    tagless_pkgs = [OrderedDict((
        ('name', pkg.name),
        ('title', pkg.title),
        ('notes', lib.dataset_notes(pkg)),
        ('user', pkg.creator_user_id),
        ('created', pkg.metadata_created.isoformat()),
    )) for pkg in q.slice(0, 100)]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None

    packages_without_tags_percent = lib.percent(
        num_tagless_pkgs, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def create_row(pkg_, resource_dict):
    """Assemble a single report row combining publisher, dataset and
    resource fields."""
    organization = pkg_.get_organization()
    package_fields = [
        ("publisher_title", organization.title),
        ("publisher_name", organization.name),
        ("package_title", pkg_.title),
        ("package_name", pkg_.name),
        ("package_notes", lib.dataset_notes(pkg_)),
    ]
    # Resource columns come after the package columns, in this fixed order.
    resource_fields = [
        ("resource_" + key, resource_dict.get(key))
        for key in ("position", "id", "description", "url", "format",
                    "created")
    ]
    return OrderedDict(package_fields + resource_fields)
def datasets_without_resources():
    """Tabulate active, published datasets that have zero resources."""
    table = []
    query = (
        model.Session.query(model.Package)
        .filter_by(state="active")
        .order_by(model.Package.title)
    )
    for pkg in query.all():
        # Datasets with any resources are out of scope.
        if pkg.resources:
            continue
        # So are unpublished datasets.
        if pkg.extras.get("unpublished", "").lower() == "true":
            continue
        org = pkg.get_organization()
        deleted, url = last_resource_deleted(pkg)
        table.append(OrderedDict((
            ("name", pkg.name),
            ("title", pkg.title),
            ("organization title", org.title),
            ("organization name", org.name),
            ("metadata created", pkg.metadata_created.isoformat()),
            ("metadata modified", pkg.metadata_modified.isoformat()),
            ("last resource deleted",
             deleted.isoformat() if deleted else None),
            ("last resource url", url),
            ("dataset_notes", lib.dataset_notes(pkg)),
        )))
    return {"table": table}
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the organization
    or if organization it returns the index page for all organizations.

    params:
      organization - name of an organization

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title:': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason',
        'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]
    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    # All archival records marked broken, restricted to active packages and
    # active resources.
    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's
        # tree
        org_ids = ['%s' % organization.id
                   for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group,
                               model.Package.owner_org == model.Group.id)

    results = []
    for archival, pkg, org in archivals.all():
        # Re-fetch by id (rather than using the tuple members directly).
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        # 'via' flags resources that came in through known import routes.
        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        # Use the resource as it was at archival time; fall back to the
        # current resource if no matching revision exists.
        archived_resource = model.Session.query(model.ResourceRevision)\
            .filter_by(id=resource.id)\
            .filter_by(revision_timestamp=archival.resource_timestamp)\
            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            # True when the live resource URL still matches the archived one.
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat()
             if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat()
             if archival.updated else None),
            ('last_success', archival.last_success.isoformat()
             if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
        ))
        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active')\
        .count()
    num_resources = model.Session.query(model.Resource)\
        .filter_by(state='active')
    # Pre-2.3 CKAN linked resources to packages via ResourceGroup.
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages,
                                                  num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources,
                                                   num_resources),
            'table': results}
def _get_activity(organization_name, include_sub_organizations, periods):
    """Collect dataset creation/modification activity per period.

    :param organization_name: organization to restrict to, or falsy for all
        packages
    :param include_sub_organizations: include the organization's sub-tree
    :param periods: dict mapping period name -> (start, end) datetimes
    :returns: tuple (created, modified) - dicts keyed by period name, each
        a list of tuples describing the activity for one dataset
    """
    import ckan.model as model
    from paste.deploy.converters import asbool

    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = (
        'autotheme', 'co-prod3.dh.bytemark.co.uk',
        'Date format tidier', 'current_revision_fixer',
        'current_revision_fixer2', 'fix_contact_details.py',
        'Repoint 410 Gone to webarchive url', 'Fix duplicate resources',
        'fix_secondary_theme.py',
    )
    system_author_template = 'script%'  # "%" is a wildcard

    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization_name:
        pkgs = model.Session.query(model.Package) \
            .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(
            pkgs, organization, include_sub_organizations).all()

    for pkg in pkgs:
        # The package's earliest revision records when it was created.
        created_ = model.Session.query(model.PackageRevision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions of the package itself, its
        # resources and its extras; narrowed to each period below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.ResourceRevision,
                  model.Package.id == model.ResourceRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.id, created_.name, created_.title,
                     lib.dataset_notes(pkg),
                     'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            prs = pr_q.filter(
                model.PackageRevision.revision_timestamp > period_start) \
                .filter(model.PackageRevision.revision_timestamp < period[1])
            rrs = rr_q.filter(
                model.ResourceRevision.revision_timestamp > period_start) \
                .filter(model.ResourceRevision.revision_timestamp < period[1])
            pes = pe_q.filter(
                model.PackageExtraRevision.revision_timestamp > period_start) \
                .filter(
                    model.PackageExtraRevision.revision_timestamp < period[1])
            # Gather distinct authors/dates across all three revision kinds.
            authors = ' '.join(
                set([r[1].author for r in prs] +
                    [r[2].author for r in rrs] +
                    [r[2].author for r in pes]))
            dates = set([r[1].timestamp.date() for r in prs] +
                        [r[2].timestamp.date() for r in rrs] +
                        [r[2].timestamp.date() for r in pes])
            dates_formatted = ' '.join(
                [date.isoformat() for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.id, pkg.name, pkg.title,
                     lib.dataset_notes(pkg),
                     'modified', period_name,
                     dates_formatted, authors, published))
    return created, modified
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    # NOTE(review): another version of this function in this file also
    # ignores 'fix_secondary_theme.py' and 'script-fix-links-tna' - confirm
    # which author list is intended.
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      )

    created = {'this': [], 'last': []}
    modified = {'this': [], 'last': []}

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization:
        pkgs = model.Session.query(model.Package)\
            .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # The package's earliest revision records when it was created.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions of the package, its
        # resources (linked via ResourceGroup - pre-CKAN-2.3 schema) and
        # its extras.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id ==
                  model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]
            if quarter[0] < created_.revision_timestamp < quarter[1]:
                # Dataset was created within this quarter.
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                # Otherwise look for non-system modifications (to the
                # package, its resources or extras) within the quarter.
                prs = pr_q.filter(model.PackageRevision.revision_timestamp >
                                  quarter[0])\
                    .filter(model.PackageRevision.revision_timestamp <
                            quarter[1])
                rrs = rr_q.filter(model.ResourceRevision.revision_timestamp >
                                  quarter[0])\
                    .filter(model.ResourceRevision.revision_timestamp <
                            quarter[1])
                pes = pe_q.filter(
                    model.PackageExtraRevision.revision_timestamp >
                    quarter[0])\
                    .filter(model.PackageExtraRevision.revision_timestamp <
                            quarter[1])
                # Distinct authors/dates across all three revision kinds.
                authors = ' '.join(
                    set([r[1].author for r in prs] +
                        [r[2].author for r in rrs] +
                        [r[2].author for r in pes]))
                dates = set([r[1].timestamp.date() for r in prs] +
                            [r[2].timestamp.date() for r in rrs] +
                            [r[2].timestamp.date() for r in pes])
                dates_formatted = ' '.join(
                    [date.isoformat() for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name,
                         dates_formatted, authors, published))

    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes',
               'Modified or created', 'Quarter', 'Timestamp', 'Author',
               'Published')

    return {'table': datasets, 'columns': columns, 'quarters': quarters}
def nii_report():
    '''A list of the NII datasets, grouped by publisher, with details of
    broken links and source.'''
    # NII datasets are those flagged with the 'core-dataset' extra.
    nii_dataset_q = model.Session.query(model.Package)\
        .join(model.PackageExtra,
              model.PackageExtra.package_id == model.Package.id)\
        .join(model.Group, model.Package.owner_org == model.Group.id)\
        .filter(model.PackageExtra.key == 'core-dataset')\
        .filter(model.PackageExtra.value == 'true')\
        .filter(model.Package.state == 'active')
    nii_dataset_objects = nii_dataset_q\
        .order_by(model.Group.title, model.Package.title).all()

    def broken_resources_for_package(package_id):
        # Returns (description, id) pairs for the package's active
        # resources whose archival record marks them broken.
        from ckanext.archiver.model import Archival

        results = model.Session.query(Archival, model.Resource)\
            .filter(Archival.package_id == package_id)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')

        broken_resources = [(resource.description, resource.id)
                            for archival, resource in results.all()]
        return broken_resources

    nii_dataset_details = []
    num_resources = 0
    num_broken_resources = 0
    num_broken_datasets = 0
    broken_organization_names = set()
    nii_organizations = set()
    for dataset_object in nii_dataset_objects:
        broken_resources = broken_resources_for_package(dataset_object.id)
        org = dataset_object.get_organization()
        dataset_details = {
            'name': dataset_object.name,
            'title': dataset_object.title,
            'dataset_notes': lib.dataset_notes(dataset_object),
            'organization_name': org.name,
            'unpublished':
                p.toolkit.asbool(dataset_object.extras.get('unpublished')),
            'num_broken_resources': len(broken_resources),
            'broken_resources': broken_resources,
        }
        nii_dataset_details.append(dataset_details)
        # Aggregate broken-link statistics per dataset/organization.
        if broken_resources:
            num_broken_resources += len(broken_resources)
            num_broken_datasets += 1
            broken_organization_names.add(org.name)
        nii_organizations.add(org)
        num_resources += len(dataset_object.resources)

    org_tuples = [(org.name, org.title) for org in
                  sorted(nii_organizations, key=lambda o: o.title)]

    return {
        'table': nii_dataset_details,
        'organizations': org_tuples,
        'num_resources': num_resources,
        'num_datasets': len(nii_dataset_objects),
        'num_organizations': len(nii_organizations),
        'num_broken_resources': num_broken_resources,
        'num_broken_datasets': num_broken_datasets,
        'num_broken_organizations': len(broken_organization_names),
    }
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme',
                      'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier',
                      'current_revision_fixer',
                      'current_revision_fixer2',
                      'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      'fix_secondary_theme.py',
                      'script-fix-links-tna',
                      )

    created = {'this': [], 'last': []}
    modified = {'this': [], 'last': []}

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization:
        pkgs = model.Session.query(model.Package)\
            .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # The package's earliest revision records when it was created.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions of the package, its
        # resources (linked via ResourceGroup - pre-CKAN-2.3 schema) and
        # its extras.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id ==
                  model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]
            if quarter[0] < created_.revision_timestamp < quarter[1]:
                # Dataset was created within this quarter.
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                # Otherwise look for non-system modifications (to the
                # package, its resources or extras) within the quarter.
                prs = pr_q.filter(model.PackageRevision.revision_timestamp >
                                  quarter[0])\
                    .filter(model.PackageRevision.revision_timestamp <
                            quarter[1])
                rrs = rr_q.filter(model.ResourceRevision.revision_timestamp >
                                  quarter[0])\
                    .filter(model.ResourceRevision.revision_timestamp <
                            quarter[1])
                pes = pe_q.filter(
                    model.PackageExtraRevision.revision_timestamp >
                    quarter[0])\
                    .filter(model.PackageExtraRevision.revision_timestamp <
                            quarter[1])
                # Distinct authors/dates across all three revision kinds.
                authors = ' '.join(set([r[1].author for r in prs] +
                                       [r[2].author for r in rrs] +
                                       [r[2].author for r in pes]))
                dates = set([r[1].timestamp.date() for r in prs] +
                            [r[2].timestamp.date() for r in rrs] +
                            [r[2].timestamp.date() for r in pes])
                dates_formatted = ' '.join([date.isoformat()
                                            for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name,
                         dates_formatted, authors, published))

    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes',
               'Modified or created', 'Quarter', 'Timestamp', 'Author',
               'Published')

    return {'table': datasets, 'columns': columns, 'quarters': quarters}
def nii_report():
    '''A list of the NII datasets, grouped by publisher, with details of
    broken links and source.'''
    # NII datasets are those flagged with the 'core-dataset' extra.
    nii_dataset_q = model.Session.query(model.Package)\
        .join(model.PackageExtra,
              model.PackageExtra.package_id == model.Package.id)\
        .join(model.Group, model.Package.owner_org == model.Group.id)\
        .filter(model.PackageExtra.key == 'core-dataset')\
        .filter(model.PackageExtra.value == 'true')\
        .filter(model.Package.state == 'active')
    nii_dataset_objects = nii_dataset_q\
        .order_by(model.Group.title, model.Package.title).all()

    def broken_resources_for_package(package_id):
        # Returns (description, id) pairs for the package's active
        # resources whose archival record marks them broken.
        from ckanext.archiver.model import Archival

        results = model.Session.query(Archival, model.Resource)\
            .filter(Archival.package_id == package_id)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')

        broken_resources = [(resource.description, resource.id)
                            for archival, resource in results.all()]
        return broken_resources

    nii_dataset_details = []
    num_resources = 0
    num_broken_resources = 0
    num_broken_datasets = 0
    broken_organization_names = set()
    nii_organizations = set()
    for dataset_object in nii_dataset_objects:
        broken_resources = broken_resources_for_package(dataset_object.id)
        org = dataset_object.get_organization()
        dataset_details = {
            'name': dataset_object.name,
            'title': dataset_object.title,
            'dataset_notes': lib.dataset_notes(dataset_object),
            'organization_name': org.name,
            'unpublished':
                p.toolkit.asbool(dataset_object.extras.get('unpublished')),
            'num_broken_resources': len(broken_resources),
            'broken_resources': broken_resources,
        }
        nii_dataset_details.append(dataset_details)
        # Aggregate broken-link statistics per dataset/organization.
        if broken_resources:
            num_broken_resources += len(broken_resources)
            num_broken_datasets += 1
            broken_organization_names.add(org.name)
        nii_organizations.add(org)
        num_resources += len(dataset_object.resources)

    org_tuples = [(org.name, org.title) for org in
                  sorted(nii_organizations, key=lambda o: o.title)]

    return {'table': nii_dataset_details,
            'organizations': org_tuples,
            'num_resources': num_resources,
            'num_datasets': len(nii_dataset_objects),
            'num_organizations': len(nii_organizations),
            'num_broken_resources': num_broken_resources,
            'num_broken_datasets': num_broken_datasets,
            'num_broken_organizations': len(broken_organization_names),
            }
def nii_report():
    """A list of the NII datasets, grouped by publisher, with details of
    broken links and source.

    NOTE(review): this file defines ``nii_report`` twice; this later
    definition shadows the earlier one at import time — confirm which
    copy is intended and remove the other.
    """

    def find_broken(package_id):
        # (description, id) pairs for the package's broken, active resources,
        # as recorded by the archiver.
        from ckanext.archiver.model import Archival
        q = model.Session.query(Archival, model.Resource) \
            .filter(Archival.package_id == package_id) \
            .filter(Archival.is_broken == True) \
            .join(model.Package, Archival.package_id == model.Package.id) \
            .filter(model.Package.state == "active") \
            .join(model.Resource, Archival.resource_id == model.Resource.id) \
            .filter(model.Resource.state == "active")
        return [(r.description, r.id) for _a, r in q.all()]

    pkgs = model.Session.query(model.Package) \
        .join(model.PackageExtra,
              model.PackageExtra.package_id == model.Package.id) \
        .join(model.Group, model.Package.owner_org == model.Group.id) \
        .filter(model.PackageExtra.key == "core-dataset") \
        .filter(model.PackageExtra.value == "true") \
        .filter(model.Package.state == "active") \
        .order_by(model.Group.title, model.Package.title) \
        .all()

    table = []
    resource_count = 0
    broken_resource_count = 0
    broken_dataset_count = 0
    orgs_seen = set()
    orgs_with_breakage = set()
    for pkg in pkgs:
        broken = find_broken(pkg.id)
        org = pkg.get_organization()
        table.append({
            "name": pkg.name,
            "title": pkg.title,
            "dataset_notes": lib.dataset_notes(pkg),
            "organization_name": org.name,
            "unpublished": p.toolkit.asbool(pkg.extras.get("unpublished")),
            "num_broken_resources": len(broken),
            "broken_resources": broken,
        })
        if broken:
            broken_resource_count += len(broken)
            broken_dataset_count += 1
            orgs_with_breakage.add(org.name)
        orgs_seen.add(org)
        resource_count += len(pkg.resources)

    # (name, title) pairs, ordered alphabetically by organization title.
    org_tuples = [(o.name, o.title)
                  for o in sorted(orgs_seen, key=lambda o: o.title)]
    return {
        "table": table,
        "organizations": org_tuples,
        "num_resources": resource_count,
        "num_datasets": len(pkgs),
        "num_organizations": len(orgs_seen),
        "num_broken_resources": broken_resource_count,
        "num_broken_datasets": broken_dataset_count,
        "num_broken_organizations": len(orgs_with_breakage),
    }
def _get_activity(organization_name, include_sub_organizations, periods):
    '''Collect dataset creation and modification activity per time period.

    :param organization_name: name of the organization to restrict to, or a
        falsy value for all packages
    :param include_sub_organizations: whether to include packages of
        sub-organizations of ``organization_name``
    :param periods: mapping of period name -> (start, end) timestamp pair
    :returns: two dicts keyed by period name, ``(created, modified)``, each
        value a list of tuples
        (name, title, notes, 'created'/'modified', period name,
         date(s), author(s), published)
    :raises p.toolkit.ObjectNotFound: if ``organization_name`` is given but
        does not exist
    '''
    import ckan.model as model
    from paste.deploy.converters import asbool

    # One bucket per period for each kind of activity.
    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources', 'fix_secondary_theme.py',
                      )
    system_author_template = 'script%'  # "%" is a wildcard

    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization_name:
        # No organization filter - consider every package.
        pkgs = model.Session.query(model.Package)\
            .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # Earliest revision of the package, i.e. its creation.
        # NOTE(review): if a package had no PackageRevision rows this would
        # be None and the uses below would raise AttributeError - presumably
        # every package has at least one revision; confirm.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions of the package itself, its
        # resources and its extras; the per-period timestamp filters are
        # layered on inside the loop below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            prs = pr_q.filter(
                model.PackageRevision.revision_timestamp > period_start)\
                .filter(model.PackageRevision.revision_timestamp < period[1])
            rrs = rr_q.filter(
                model.ResourceRevision.revision_timestamp > period_start)\
                .filter(model.ResourceRevision.revision_timestamp < period[1])
            pes = pe_q.filter(
                model.PackageExtraRevision.revision_timestamp > period_start)\
                .filter(
                    model.PackageExtraRevision.revision_timestamp < period[1])
            # De-duplicated authors/dates across all three revision tables.
            authors = ' '.join(set([r[1].author for r in prs] +
                                   [r[2].author for r in rrs] +
                                   [r[2].author for r in pes]))
            dates = set([r[1].timestamp.date() for r in prs] +
                        [r[2].timestamp.date() for r in rrs] +
                        [r[2].timestamp.date() for r in pes])
            dates_formatted = ' '.join([date.isoformat()
                                        for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.name, pkg.title, lib.dataset_notes(pkg),
                     'modified', period_name,
                     dates_formatted, authors, published))
    return created, modified