def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    :param organization: organization to restrict the report to (handed to
        lib.filter_by_organizations); a falsy value reports on all packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
        together with the organization

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels',
             'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly',
             'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }

    'table' holds at most the first 100 tagless packages, whereas
    'packages_without_tags_percent' is based on all of them.
    'average_tags_per_package' is None when there are no packages.
    '''
    # Find the packages without tags: the outer join leaves a NULL
    # PackageTag row for every package that has no tagging.
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(model.PackageTag.id == None)  # noqa: E711
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    # Count before slicing, so the percentage is not capped by the 100-row
    # limit applied to the table below.
    num_tagless = q.count()
    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        ))
        for pkg in q.slice(0, 100)  # First 100 only for this demo
    ]

    # Average number of tags per package
    q = model.Session.query(model.Package)
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    num_packages = q.count()
    # The join yields one row per (package, tag) pair, i.e. one per tagging
    num_taggings = q.join(model.PackageTag).count()
    if num_packages:
        average_tags_per_package = round(
            float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    packages_without_tags_percent = lib.percent(num_tagless, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.

    :param organization: organization to restrict the report to (handed to
        lib.filter_by_organizations); a falsy value reports on all packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
        together with the organization

    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels',
             'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly',
             'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }

    Only the first 100 tagless packages are listed in 'table', but
    'packages_without_tags_percent' takes all tagless packages into account.
    'average_tags_per_package' is None when there are no packages.
    '''
    def scoped(query):
        # Restrict a package query to the requested organization(s), if any
        if organization:
            return lib.filter_by_organizations(
                query, organization, include_sub_organizations)
        return query

    # Find the packages without tags (NULL PackageTag after the outer join)
    tagless_q = scoped(
        model.Session.query(model.Package)
        .outerjoin(model.PackageTag)
        .filter(model.PackageTag.id == None))  # noqa: E711
    # Total is taken before slicing so the percentage is not capped at 100
    num_tagless = tagless_q.count()
    tagless_pkgs = []
    for pkg in tagless_q.slice(0, 100):  # First 100 only for this demo
        tagless_pkgs.append(OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        )))

    # Average number of tags per package
    all_q = scoped(model.Session.query(model.Package))
    num_packages = all_q.count()
    # One joined row per (package, tag) pair, i.e. per tagging
    num_taggings = all_q.join(model.PackageTag).count()
    if num_packages:
        average_tags_per_package = round(
            float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    packages_without_tags_percent = lib.percent(num_tagless, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
def dataset_creation(organization=OD['organization'],
                     include_sub_organizations=OD['include_sub_organizations'],
                     include_private=OD['include_private'],
                     include_draft=OD['include_draft'],
                     page=1):
    """Build a report with basic information about each dataset.

    Datasets are packages of type 'dataset' in state 'active' (plus 'draft'
    when include_draft is set). Private datasets are left out unless
    include_private is set, and the result is restricted to the given
    organization when one is supplied. ``page`` is accepted but not used.

    Returns a dict whose 'table' is a list of rows (title, url, owner,
    created_at) and whose 'a' is the row count reported by the query.
    """
    if include_draft:
        wanted_states = {'active', 'draft'}
    else:
        wanted_states = {'active'}

    base = model.Session.query(model.Package)
    query = base.filter(model.Package.type == 'dataset',
                        model.Package.state.in_(wanted_states))
    if not include_private:
        query = query.filter(model.Package.private.is_(False))
    if organization:
        query = lib.filter_by_organizations(
            query, organization, include_sub_organizations)

    def as_row(pkg):
        # One report row per dataset, columns in display order
        return OrderedDict((
            ('title', pkg.title),
            ('url', url_for(controller='package', action='read',
                            id=pkg.id, qualified=True)),
            ('owner', get_org_title(pkg)),
            ('created_at', pkg.metadata_created.isoformat()),
        ))

    table = [as_row(pkg) for pkg in query.all()]
    total = query.count()
    return {
        'table': table,
        'a': total
    }
def dataset_creation(organization=OD['organization'],
                     include_sub_organizations=OD['include_sub_organizations'],
                     include_private=OD['include_private'],
                     include_draft=OD['include_draft'],
                     page=1):
    """Produce a report with basic dataset info.

    Selects packages of type 'dataset' that are 'active' (and 'draft' too
    when include_draft is true), drops private ones unless include_private
    is true, and narrows to the organization when one is given. The
    ``page`` argument is not used by this function.

    The returned dict has a 'table' list of per-dataset rows and an 'a'
    entry holding the count returned by the query.
    """
    states = {'active'}
    if include_draft:
        states |= {'draft'}

    type_matches = model.Package.type == 'dataset'
    state_matches = model.Package.state.in_(states)
    query = model.Session.query(model.Package).filter(type_matches,
                                                      state_matches)
    if not include_private:
        query = query.filter(model.Package.private.is_(False))
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)

    rows = []
    for dataset in query.all():
        row = OrderedDict()
        row['title'] = dataset.title
        row['url'] = url_for(controller='package', action='read',
                             id=dataset.id, qualified=True)
        row['owner'] = get_org_title(dataset)
        row['created_at'] = dataset.metadata_created.isoformat()
        rows.append(row)

    return {
        'table': rows,
        'a': query.count()
    }
def publisher_resources(organization=None, include_sub_organizations=False):
    '''
    Lists the resources of every active dataset belonging to the given
    organisation, one row per resource. Datasets that have no resources
    still get a single row, with the resource columns set to None.

    Raises ObjectNotFound if no organisation has the given name.
    '''
    org = model.Group.by_name(organization)
    if not org:
        raise p.toolkit.ObjectNotFound('Publisher not found')

    # Active packages of the organisation(s)
    active_q = model.Session.query(model.Package).filter_by(state='active')
    pkgs = lib.filter_by_organizations(
        active_q, organization, include_sub_organizations).all()

    def create_row(pkg_, resource_dict):
        # resource_dict may be empty, in which case .get() yields None
        org_ = pkg_.get_organization()
        row = OrderedDict()
        row['publisher_title'] = org_.title
        row['publisher_name'] = org_.name
        row['package_title'] = pkg_.title
        row['package_name'] = pkg_.name
        row['package_notes'] = lib.dataset_notes(pkg_)
        for column in ('position', 'id', 'description', 'url', 'format',
                       'created'):
            row['resource_' + column] = resource_dict.get(column)
        return row

    rows = []
    num_resources = 0
    for pkg in pkgs:
        resources = pkg.resources
        if not resources:
            # packages with no resources are still listed
            rows.append(create_row(pkg, {}))
            continue
        for res in resources:
            rows.append(create_row(pkg, {
                'id': res.id,
                'position': res.position,
                'description': res.description,
                'url': res.url,
                'format': res.format,
                'created': (res.created.isoformat()
                            if res.created else None),
            }))
        num_resources += len(resources)

    return {
        'organization_name': org.name,
        'organization_title': org.title,
        'num_datasets': len(pkgs),
        'num_resources': num_resources,
        'table': rows,
    }
def datasets_without_resources(organization, include_sub_organizations=False):
    '''
    Lists active datasets that have no resources, ordered by title.
    Datasets whose 'unpublished' extra is 'true' (case-insensitive) are
    left out. Each row also carries what last_resource_deleted() reports
    for the dataset.

    Returns {'table': [row, ...]}.
    '''
    query = model.Session.query(model.Package)
    query = query.filter_by(state='active')
    query = query.order_by(model.Package.title)
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)

    rows = []
    for pkg in query.all():
        if len(pkg.resources) != 0:
            continue
        if pkg.extras.get('unpublished', '').lower() == 'true':
            continue
        deleted, url = last_resource_deleted(pkg)
        rows.append(OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('last resource deleted',
             deleted.isoformat() if deleted else None),
            ('last resource url', url),
            ('dataset_notes', lib.dataset_notes(pkg)),
        )))
    return {'table': rows}
def publisher_resources(organization=None, include_sub_organizations=False):
    """
    Builds one row per resource for every active dataset of the given
    organisation. A dataset without resources still produces one row,
    whose resource columns are None. The resource creation value is
    passed through exactly as stored on the resource.

    Raises ObjectNotFound if no organisation has the given name.
    """
    org = model.Group.by_name(organization)
    if not org:
        raise p.toolkit.ObjectNotFound(u"Izdava\u010D nije pronaden")

    # Active packages of the organisation(s)
    active_q = model.Session.query(model.Package).filter_by(state="active")
    pkgs = lib.filter_by_organizations(
        active_q, organization, include_sub_organizations
    ).all()

    def create_row(pkg_, resource_dict):
        # resource_dict may be empty, in which case .get() yields None
        org_ = pkg_.get_organization()
        row = OrderedDict()
        row["publisher_title"] = org_.title
        row["publisher_name"] = org_.name
        row["package_title"] = pkg_.title
        row["package_name"] = pkg_.name
        row["package_notes"] = lib.dataset_notes(pkg_)
        row["resource_position"] = resource_dict.get("position")
        row["resource_id"] = resource_dict.get("id")
        row["resource_description"] = resource_dict.get("description")
        row["resource_url"] = resource_dict.get("url")
        row["resource_format"] = resource_dict.get("format")
        row["resource_created"] = resource_dict.get("created")
        return row

    rows = []
    num_resources = 0
    for pkg in pkgs:
        resources = pkg.resources
        if not resources:
            # packages with no resources are still listed
            rows.append(create_row(pkg, {}))
            continue
        for res in resources:
            rows.append(
                create_row(
                    pkg,
                    {
                        "id": res.id,
                        "position": res.position,
                        "description": res.description,
                        "url": res.url,
                        "format": res.format,
                        "created": res.created,
                    },
                )
            )
        num_resources += len(resources)

    return {
        "organization_name": org.name,
        "organization_title": org.title,
        "num_datasets": len(pkgs),
        "num_resources": num_resources,
        "table": rows,
    }
def publisher_resources(organization=None, include_sub_organizations=False):
    '''
    Details the resources of each active dataset in the organisation
    specified: one row per resource, plus one row with empty resource
    columns for each dataset that has no resources.

    Raises ObjectNotFound if no organisation has the given name.
    '''
    org = model.Group.by_name(organization)
    if not org:
        raise p.toolkit.ObjectNotFound('Publisher not found')

    # Get packages
    query = model.Session.query(model.Package).filter_by(state='active')
    query = lib.filter_by_organizations(query, organization,
                                        include_sub_organizations)
    pkgs = query.all()

    # Get their resources
    def create_row(pkg_, resource_dict):
        org_ = pkg_.get_organization()
        package_columns = [
            ('publisher_title', org_.title),
            ('publisher_name', org_.name),
            ('package_title', pkg_.title),
            ('package_name', pkg_.name),
            ('package_notes', lib.dataset_notes(pkg_)),
        ]
        # Missing keys (e.g. an empty dict) come out as None
        resource_columns = [
            ('resource_' + key, resource_dict.get(key))
            for key in ('position', 'id', 'description', 'url', 'format',
                        'created')
        ]
        return OrderedDict(package_columns + resource_columns)

    def describe(res):
        return {'id': res.id,
                'position': res.position,
                'description': res.description,
                'url': res.url,
                'format': res.format,
                'created': (res.created.isoformat()
                            if res.created else None)}

    num_resources = 0
    rows = []
    for pkg in pkgs:
        resources = pkg.resources
        if resources:
            for res in resources:
                rows.append(create_row(pkg, describe(res)))
            num_resources += len(resources)
        else:
            # packages with no resources are still listed
            rows.append(create_row(pkg, {}))

    return {'organization_name': org.name,
            'organization_title': org.title,
            'num_datasets': len(pkgs),
            'num_resources': num_resources,
            'table': rows,
            }
def html_datasets_report(organization, include_sub_organizations=False):
    '''
    Returns datasets that only have an HTML link, by organization.

    Unpublished datasets are ignored. For the rest, the formats of all
    resources other than documentation are collected; a dataset counts as
    "only HTML" when, after discarding 'asp' and blank formats, 'html' is
    the single format left.

    :param organization: organization to restrict the report to (handed to
        lib.filter_by_organizations); falsy means all active datasets
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :returns: dict with 'table' (one row per HTML-only dataset),
        'num_datasets_published' and 'num_datasets_only_html'
    '''
    # Get packages
    query = model.Session.query(model.Package)\
        .filter_by(state='active')
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)
    pkgs = query.all()

    # See if HTML
    num_datasets_published = 0
    datasets_only_html = []
    for pkg in pkgs:
        if p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue
        num_datasets_published += 1

        # A resource may have no format at all; treat that as blank rather
        # than calling .lower() on None.
        formats = set(
            (res.format or '').lower()
            for res in pkg.resources
            if res.resource_type != 'documentation'
        )
        if 'html' not in formats:
            continue

        data_formats = formats - set(('asp', ''))
        if data_formats == set(('html', )):
            datasets_only_html.append(pkg)

    rows = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('dataset_notes', lib.dataset_notes(pkg)),
        ))
        for pkg in datasets_only_html
    ]

    return {
        'table': rows,
        'num_datasets_published': num_datasets_published,
        'num_datasets_only_html': len(datasets_only_html),
    }
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.

    For every quarter returned by get_quarter_dates() a package is listed
    as 'created' if its first revision falls inside the quarter, otherwise
    as 'modified' if it has package, resource or extra revisions inside the
    quarter by authors other than the ignored system authors.

    :param organization: organization name; a falsy value reports on all
        packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :returns: dict with 'table' (row tuples laid out as in 'columns'),
        'columns' and 'quarters'
    :raises ObjectNotFound: if the named organization does not exist
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = (
        'autotheme',
        'co-prod3.dh.bytemark.co.uk',
        'Date format tidier',
        'current_revision_fixer',
        'current_revision_fixer2',
        'fix_contact_details.py',
        'Repoint 410 Gone to webarchive url',
        'Fix duplicate resources',
    )

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)
    # One bucket per quarter actually returned, so the loops below can never
    # hit a missing key.
    created = dict((quarter_name, []) for quarter_name in quarters)
    modified = dict((quarter_name, []) for quarter_name in quarters)

    pkgs = model.Session.query(model.Package)
    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations)
    pkgs = pkgs.all()

    for pkg in pkgs:
        # Earliest revision of the package, i.e. its creation
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]

            if quarter[0] < created_.revision_timestamp < quarter[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                # .all() runs each query once; the results are read twice
                # below (authors, then dates).
                prs = pr_q\
                    .filter(model.PackageRevision.revision_timestamp >
                            quarter[0])\
                    .filter(model.PackageRevision.revision_timestamp <
                            quarter[1])\
                    .all()
                rrs = rr_q\
                    .filter(model.ResourceRevision.revision_timestamp >
                            quarter[0])\
                    .filter(model.ResourceRevision.revision_timestamp <
                            quarter[1])\
                    .all()
                pes = pe_q\
                    .filter(model.PackageExtraRevision.revision_timestamp >
                            quarter[0])\
                    .filter(model.PackageExtraRevision.revision_timestamp <
                            quarter[1])\
                    .all()
                # The Revision object is the last element of each result row
                revisions = [r[1] for r in prs] + \
                            [r[2] for r in rrs] + \
                            [r[2] for r in pes]
                # Sorted so the report text is stable between runs
                authors = ' '.join(sorted(set(
                    rev.author for rev in revisions)))
                dates = set(rev.timestamp.date() for rev in revisions)
                dates_formatted = ' '.join(
                    [date.isoformat() for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name,
                         dates_formatted, authors, published))

    # Within each quarter: created datasets first, then modified, each
    # ordered by title.
    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes',
               'Modified or created', 'Quarter', 'Timestamp', 'Author',
               'Published')

    return {'table': datasets, 'columns': columns, 'quarters': quarters}
def feedback_report(organization=None, include_sub_organizations=False,
                    include_published=False):
    """
    For the publisher provided (and optionally for sub-publishers) this
    function will generate a report on the feedback for that publisher.

    By default only unpublished datasets are reported; published ones are
    added when include_published is true.

    :param organization: organization name, or a falsy value for no
        organization restriction
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :param include_published: also report datasets that are not marked as
        unpublished
    :returns: dict with 'table' (per-dataset dicts, most commented first),
        'dataset_count' and 'dataset_count_with_feedback'
    :raises ObjectNotFound: if the named organization does not exist
    """
    import ckan.lib.helpers as helpers
    from ckanext.dgu.model.feedback import Feedback

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()
    else:
        organization = None

    # Get packages for these organization(s)
    memberships = model.Session.query(model.Member)\
        .join(model.Package, model.Package.id == model.Member.table_id)\
        .filter(model.Member.state == 'active')
    memberships = lib.filter_by_organizations(memberships, organization,
                                              include_sub_organizations)\
        .filter(model.Member.table_name == 'package')\
        .filter(model.Package.state == 'active')

    # For each package, count the feedback comments
    results = []
    num_pkgs_with_feedback = 0
    for member in memberships.all():
        pkg = model.Package.get(member.table_id)

        # Skip published datasets unless they were asked for. The extra is
        # parsed with asbool because a stored string such as 'false' is
        # truthy and would otherwise count as unpublished.
        if not include_published and \
                not p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue

        # defaultdict(int) lets the per-category counters start at zero
        pkg_data = collections.defaultdict(int)
        pkg_data['organization-name'] = member.group.name
        pkg_data['generated-at'] = helpers.render_datetime(
            datetime.datetime.now(), "%d/%m/%Y %H:%M")
        pkg_data['organization-title'] = member.group.title
        pkg_data['package-name'] = pkg.name
        pkg_data['package-title'] = pkg.title
        pkg_data['publish-date'] = pkg.extras.get('publish-date', '')

        feedback_q = model.Session.query(Feedback)\
            .filter(Feedback.visible == True)\
            .filter(Feedback.package_id == member.table_id)\
            .filter(Feedback.active == True)  # noqa: E712
        for feedback in feedback_q:
            if feedback.economic:
                pkg_data['economic'] += 1
            if feedback.social:
                pkg_data['social'] += 1
            if feedback.linked:
                pkg_data['linked'] += 1
            if feedback.other:
                pkg_data['other'] += 1
            if feedback.effective:
                pkg_data['effective'] += 1

        pkg_data['total-comments'] = sum([
            pkg_data['economic'], pkg_data['social'], pkg_data['linked'],
            pkg_data['other'], pkg_data['effective']
        ])
        results.append(pkg_data)
        if pkg_data['total-comments'] > 0:
            num_pkgs_with_feedback += 1

    return {
        'table': sorted(results, key=lambda x: -x.get('total-comments')),
        'dataset_count': len(results),
        'dataset_count_with_feedback': num_pkgs_with_feedback,
    }
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.

    For every quarter returned by get_quarter_dates() a package is listed
    as 'created' if its first revision falls inside the quarter, otherwise
    as 'modified' if it has package, resource or extra revisions inside the
    quarter by authors other than the ignored system authors.

    :param organization: organization name; a falsy value reports on all
        packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :returns: dict with 'table' (row tuples laid out as in 'columns'),
        'columns' and 'quarters'
    :raises ObjectNotFound: if the named organization does not exist
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      'fix_secondary_theme.py',
                      'script-fix-links-tna',
                      )

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)
    # Buckets follow the quarters actually returned rather than fixed names
    created = dict((quarter_name, []) for quarter_name in quarters)
    modified = dict((quarter_name, []) for quarter_name in quarters)

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    pkgs = model.Session.query(model.Package)
    if organization:
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations)
    pkgs = pkgs.all()

    def in_quarter(query, timestamp_column, quarter):
        # Run the query once, limited to revisions strictly inside the
        # quarter; the caller reads the resulting list more than once.
        return query.filter(timestamp_column > quarter[0])\
                    .filter(timestamp_column < quarter[1])\
                    .all()

    for pkg in pkgs:
        # Earliest revision of the package, i.e. its creation
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]

            if quarter[0] < created_.revision_timestamp < quarter[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                prs = in_quarter(
                    pr_q, model.PackageRevision.revision_timestamp, quarter)
                rrs = in_quarter(
                    rr_q, model.ResourceRevision.revision_timestamp, quarter)
                pes = in_quarter(
                    pe_q, model.PackageExtraRevision.revision_timestamp,
                    quarter)
                # The Revision object is the last element of each result row
                revisions = [r[1] for r in prs] + \
                            [r[2] for r in rrs] + \
                            [r[2] for r in pes]
                # Sorted so the report text is stable between runs
                authors = ' '.join(sorted(set(
                    rev.author for rev in revisions)))
                dates = set(rev.timestamp.date() for rev in revisions)
                dates_formatted = ' '.join(
                    [date.isoformat() for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name,
                         dates_formatted, authors, published))

    # Within each quarter: created datasets first, then modified, each
    # ordered by title.
    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes',
               'Modified or created', 'Quarter', 'Timestamp', 'Author',
               'Published')

    return {'table': datasets, 'columns': columns, 'quarters': quarters}
def feedback_report(organization=None, include_sub_organizations=False,
                    include_published=False):
    """
    For the publisher provided (and optionally for sub-publishers) this
    function will generate a report on the feedback for that publisher.

    Only unpublished datasets are reported unless include_published is
    true, in which case published datasets are reported as well.

    :param organization: organization name, or a falsy value for no
        organization restriction
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :param include_published: also report datasets that are not marked as
        unpublished
    :returns: dict with 'table' (per-dataset dicts, most commented first),
        'dataset_count' and 'dataset_count_with_feedback'
    :raises ObjectNotFound: if the named organization does not exist
    """
    import ckan.lib.helpers as helpers
    from ckanext.dgu.model.feedback import Feedback

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()
    else:
        organization = None

    # Get packages for these organization(s)
    memberships = model.Session.query(model.Member)\
        .join(model.Package, model.Package.id == model.Member.table_id)\
        .filter(model.Member.state == 'active')
    memberships = lib.filter_by_organizations(memberships, organization,
                                              include_sub_organizations)\
        .filter(model.Member.table_name == 'package')\
        .filter(model.Package.state == 'active')

    feedback_categories = ('economic', 'social', 'linked', 'other',
                           'effective')

    # For each package, count the feedback comments
    results = []
    num_pkgs_with_feedback = 0
    for member in memberships.all():
        pkg = model.Package.get(member.table_id)

        # Skip published datasets unless they were asked for. asbool is
        # needed because the extra is a string and 'false' is truthy.
        if not include_published and \
                not p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue

        # defaultdict(int) lets the per-category counters start at zero
        pkg_data = collections.defaultdict(int)
        pkg_data['organization-name'] = member.group.name
        pkg_data['generated-at'] = helpers.render_datetime(
            datetime.datetime.now(), "%d/%m/%Y %H:%M")
        pkg_data['organization-title'] = member.group.title
        pkg_data['package-name'] = pkg.name
        pkg_data['package-title'] = pkg.title
        pkg_data['publish-date'] = pkg.extras.get('publish-date', '')

        feedback_q = model.Session.query(Feedback)\
            .filter(Feedback.visible == True)\
            .filter(Feedback.package_id == member.table_id)\
            .filter(Feedback.active == True)  # noqa: E712
        for feedback in feedback_q:
            # A single comment may be flagged in several categories
            for category in feedback_categories:
                if getattr(feedback, category):
                    pkg_data[category] += 1

        pkg_data['total-comments'] = sum(
            [pkg_data[category] for category in feedback_categories])
        results.append(pkg_data)
        if pkg_data['total-comments'] > 0:
            num_pkgs_with_feedback += 1

    return {'table': sorted(results, key=lambda x: -x.get('total-comments')),
            'dataset_count': len(results),
            'dataset_count_with_feedback': num_pkgs_with_feedback,
            }
def licence_report(organization=None, include_sub_organizations=False):
    '''
    Returns a dictionary detailing licences for datasets in the
    organisation specified, and optionally sub organizations.

    Unpublished datasets are ignored. The remaining datasets are grouped by
    the combination of license id, license title and the 'licence' extra;
    the groups are listed largest first.

    :param organization: organization name; a falsy value reports on all
        active datasets
    :param include_sub_organizations: also include the datasets of the
        organization's sub organizations
    :returns: dict with 'num_datasets', 'num_licences' and 'table' (one row
        per licence combination)
    :raises ObjectNotFound: if the named organization does not exist
    '''
    # Get packages
    query = model.Session.query(model.Package).filter_by(state='active')
    if organization:
        if not model.Group.by_name(organization):
            raise p.toolkit.ObjectNotFound('Publisher not found')
        # A single filtered query covers the organization and, when asked,
        # its sub organizations. Previously every sub organization was
        # looped over but the top-level organization was queried each time,
        # so sub organizations were never actually included.
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)
    pkgs = query.all()

    # Get their licences
    packages_by_licence = collections.defaultdict(list)
    num_pkgs = 0
    for pkg in pkgs:
        if asbool(pkg.extras.get('unpublished')):
            # Ignore unpublished datasets
            continue
        licence_tuple = (pkg.license_id or '',
                         pkg.license.title if pkg.license else '',
                         pkg.extras.get('licence', ''))
        packages_by_licence[licence_tuple].append((pkg.name, pkg.title))
        num_pkgs += 1

    rows = []
    for licence_tuple, dataset_tuples in sorted(packages_by_licence.items(),
                                                key=lambda x: -len(x[1])):
        license_id, license_title, licence = licence_tuple
        dataset_tuples.sort(key=lambda x: x[0])
        # Every group holds at least one dataset, so zip(*...) always
        # yields exactly two tuples.
        dataset_names, dataset_titles = zip(*dataset_tuples)
        rows.append(OrderedDict((
            ('license_id', license_id),
            ('license_title', license_title),
            ('licence', licence),
            ('dataset_titles', '|'.join(dataset_titles)),
            ('dataset_names', ' '.join(dataset_names)),
        )))

    return {
        'num_datasets': num_pkgs,
        'num_licences': len(rows),
        'table': rows,
    }
def _get_activity(organization_name, include_sub_organizations, periods):
    """
    Collects, per period, the datasets that were created and the datasets
    that were modified by authors other than the ignored system authors.

    :param organization_name: organization name; a falsy value covers all
        packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :param periods: dict of period name -> pair whose two elements are
        compared against revision timestamps as exclusive start and end
    :returns: (created, modified), two dicts keyed by period name holding
        lists of row tuples (name, title, notes, 'created'/'modified',
        period name, timestamp(s), author(s), published)
    :raises ObjectNotFound: if the named organization does not exist
    """
    import ckan.model as model
    from paste.deploy.converters import asbool

    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      'fix_secondary_theme.py',
                      )
    system_author_template = 'script%'  # "%" is a wildcard

    pkgs = model.Session.query(model.Package)
    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations)
    pkgs = pkgs.all()

    for pkg in pkgs:
        # Earliest revision of the package, i.e. its creation
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id ==
                  model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            # .all() runs each query once; the results are read twice below
            prs = pr_q\
                .filter(model.PackageRevision.revision_timestamp >
                        period_start)\
                .filter(model.PackageRevision.revision_timestamp <
                        period[1])\
                .all()
            rrs = rr_q\
                .filter(model.ResourceRevision.revision_timestamp >
                        period_start)\
                .filter(model.ResourceRevision.revision_timestamp <
                        period[1])\
                .all()
            pes = pe_q\
                .filter(model.PackageExtraRevision.revision_timestamp >
                        period_start)\
                .filter(model.PackageExtraRevision.revision_timestamp <
                        period[1])\
                .all()
            # The Revision object is the last element of each result row
            revisions = [r[1] for r in prs] + \
                        [r[2] for r in rrs] + \
                        [r[2] for r in pes]
            # Sorted so the report text is stable between runs
            authors = ' '.join(sorted(set(rev.author for rev in revisions)))
            dates = set(rev.timestamp.date() for rev in revisions)
            dates_formatted = ' '.join(
                [date.isoformat() for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.name, pkg.title, lib.dataset_notes(pkg),
                     'modified', period_name,
                     dates_formatted, authors, published))
    return created, modified
def _get_activity(organization_name, include_sub_organizations, periods):
    """
    Collects, per period, the datasets that were created and the datasets
    that were modified by authors other than the ignored system authors.

    :param organization_name: organization name; a falsy value covers all
        packages
    :param include_sub_organizations: handed to lib.filter_by_organizations
    :param periods: dict of period name -> pair whose two elements are
        compared against revision timestamps as exclusive start and end
    :returns: (created, modified), two dicts keyed by period name holding
        lists of row tuples (id, name, title, notes, 'created'/'modified',
        period name, timestamp(s), author(s), published)
    :raises ObjectNotFound: if the named organization does not exist
    """
    import ckan.model as model
    from paste.deploy.converters import asbool

    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = (
        'autotheme',
        'co-prod3.dh.bytemark.co.uk',
        'Date format tidier',
        'current_revision_fixer',
        'current_revision_fixer2',
        'fix_contact_details.py',
        'Repoint 410 Gone to webarchive url',
        'Fix duplicate resources',
        'fix_secondary_theme.py',
    )
    system_author_template = 'script%'  # "%" is a wildcard

    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    pkgs = model.Session.query(model.Package)
    if organization_name:
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations)
    pkgs = pkgs.all()

    def in_period(query, timestamp_column, start, end):
        # Run the query once, limited to revisions strictly between start
        # and end; the caller reads the resulting list more than once.
        return query.filter(timestamp_column > start) \
                    .filter(timestamp_column < end) \
                    .all()

    for pkg in pkgs:
        # Earliest revision of the package, i.e. its creation
        created_ = model.Session.query(model.PackageRevision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        pr_q = model.Session.query(model.PackageRevision, model.Revision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision,
                                   model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.ResourceRevision,
                  model.Package.id == model.ResourceRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision,
                                   model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.id, created_.name, created_.title,
                     lib.dataset_notes(pkg), 'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            prs = in_period(pr_q, model.PackageRevision.revision_timestamp,
                            period_start, period[1])
            rrs = in_period(rr_q, model.ResourceRevision.revision_timestamp,
                            period_start, period[1])
            pes = in_period(pe_q,
                            model.PackageExtraRevision.revision_timestamp,
                            period_start, period[1])
            # The Revision object is the last element of each result row
            revisions = [r[1] for r in prs] + \
                        [r[2] for r in rrs] + \
                        [r[2] for r in pes]
            # Sorted so the report text is stable between runs
            authors = ' '.join(sorted(set(rev.author for rev in revisions)))
            dates = set(rev.timestamp.date() for rev in revisions)
            dates_formatted = ' '.join(
                [date.isoformat() for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.id, pkg.name, pkg.title, lib.dataset_notes(pkg),
                     'modified', period_name,
                     dates_formatted, authors, published))
    return created, modified