def app_dataset_report():
    app_dataset_dicts = []
    for related in model.Session.query(model.RelatedDataset) \
                        .filter(model.Related.type == 'App') \
                        .all():
        dataset = related.dataset
        org = dataset.get_organization()
        top_org = list(go_up_tree(org))[-1]

        app_dataset_dict = OrderedDict((
            ('app title', related.related.title),
            ('app url', related.related.url),
            ('dataset name', dataset.name),
            ('dataset title', dataset.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
            ('dataset theme', related.dataset.extras.get('theme-primary', '')),
            ('dataset notes', lib.dataset_notes(dataset)),
        ))
        app_dataset_dicts.append(app_dataset_dict)

    app_dataset_dicts.sort(key=lambda row: row['top-level organization title'] +
                           row['organization title'])

    return {'table': app_dataset_dicts}
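# `go_up_tree` is used by several reports here (this one, the HTML-only report
# and the publisher usage report) but is not defined in this extract. A minimal
# sketch of the assumed behaviour -- a generator that yields the organization
# and then each parent up to the root. The use of get_parent_groups() is an
# assumption for illustration; the real helper may use a different hierarchy API.
def go_up_tree_sketch(group):
    '''Yield the given group, then each parent in turn, ending at the root.'''
    while group is not None:
        yield group
        parents = group.get_parent_groups(type='organization')
        group = parents[0] if parents else None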
def dataset_creation(organization=OD['organization'],
                     include_sub_organizations=OD['include_sub_organizations'],
                     include_private=OD['include_private'],
                     include_draft=OD['include_draft'],
                     page=1):
    """Produce a report with basic dataset info."""
    selectable_states = set(['active'])
    if include_draft:
        selectable_states.add('draft')

    query = model.Session.query(model.Package)\
        .filter(model.Package.type == 'dataset',
                model.Package.state.in_(selectable_states))
    if not include_private:
        query = query.filter(model.Package.private.is_(False))
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)

    return {
        'table': [
            OrderedDict((
                ('title', pkg.title),
                ('url', url_for(controller='package', action='read',
                                id=pkg.id, qualified=True)),
                ('owner', get_org_title(pkg)),
                ('created_at', pkg.metadata_created.isoformat()),
            )) for pkg in query.all()
        ],
        'a': query.count()
    }
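# How this generator could plug into ckanext-report, mirroring the
# *_report_info dicts used elsewhere in this extract (tagless_report_info,
# nii_report_info, etc.). The report name and template path below are
# assumptions for illustration, not values taken from the original code.
def dataset_creation_option_combinations():
    for organization in lib.all_organizations(include_none=True):
        for include_sub_organizations in (False, True):
            yield {'organization': organization,
                   'include_sub_organizations': include_sub_organizations,
                   'include_private': False,
                   'include_draft': False}


dataset_creation_report_info = {
    'name': 'dataset-creation',  # assumed report name
    'description': 'Basic details of datasets, including creation date.',
    'option_defaults': OD.copy(),
    'option_combinations': dataset_creation_option_combinations,
    'generate': dataset_creation,
    'template': 'report/dataset_creation.html',  # assumed template path
}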
def datasets_without_resources():
    pkg_dicts = []
    pkgs = model.Session.query(model.Package)\
        .filter_by(state='active')\
        .order_by(model.Package.title)\
        .all()
    for pkg in pkgs:
        if len(pkg.resources) != 0 or \
                pkg.extras.get('unpublished', '').lower() == 'true':
            continue
        org = pkg.get_organization()
        deleted, url = last_resource_deleted(pkg)
        pkg_dict = OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('last resource deleted', deleted.isoformat() if deleted else None),
            ('last resource url', url),
            ('dataset_notes', lib.dataset_notes(pkg)),
        ))
        pkg_dicts.append(pkg_dict)
    return {'table': pkg_dicts}
def latest_datasets(self):
    try:
        limit = int(request.params.get('limit', default_limit))
    except ValueError:
        limit = default_limit
    limit = min(100, limit)  # max value

    query = model.Session.query(model.PackageRevision)
    query = query.filter(model.PackageRevision.state == 'active')
    query = query.filter(model.PackageRevision.current == True)
    query = query.order_by(
        model.package_revision_table.c.revision_timestamp.desc())
    query = query.limit(limit)

    pkg_dicts = []
    for pkg_rev in query:
        pkg = pkg_rev.continuity
        publishers = pkg.get_groups('publisher')
        if publishers:
            pub_title = publishers[0].title
            pub_link = '/publisher/%s' % publishers[0].name
        else:
            pub_title = pub_link = None
        pkg_dict = OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', pkg.notes),
            ('dataset_link', '/dataset/%s' % pkg.name),
            ('publisher_title', pub_title),
            ('publisher_link', pub_link),
            ('metadata_modified', pkg.metadata_modified.isoformat()),
        ))
        pkg_dicts.append(pkg_dict)
    return self._finish_ok(pkg_dicts)
def html_datasets_report():
    '''Returns datasets that only have an HTML link, by organization.'''
    # Get packages
    pkgs = model.Session.query(model.Package)\
        .filter_by(state='active')

    # See if HTML
    num_datasets_published = 0
    num_datasets_only_html = 0
    datasets_by_publisher_only_html = collections.defaultdict(list)
    # use yield_per, otherwise memory use just goes up til the script is killed
    # by the os.
    for pkg in pkgs.yield_per(100):
        if p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue
        num_datasets_published += 1

        formats = set([res.format.lower() for res in pkg.resources
                       if res.resource_type != 'documentation'])
        if 'html' not in formats:
            continue

        org = pkg.get_organization().name
        data_formats = formats - set(('asp', '', None))
        if data_formats == set(('html',)):
            num_datasets_only_html += 1
            datasets_by_publisher_only_html[org].append((pkg.name, pkg.title))

    rows = []
    for org_name, datasets_only_html in sorted(
            datasets_by_publisher_only_html.iteritems(),
            key=lambda x: -len(x[1])):
        org = model.Session.query(model.Group) \
                   .filter_by(name=org_name) \
                   .first()
        top_org = list(go_up_tree(org))[-1]
        row = OrderedDict((
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
            ('num datasets only html', len(datasets_only_html)),
            ('name datasets only html',
             ' '.join(d[0] for d in datasets_only_html)),
            ('title datasets only html',
             '|'.join(d[1] for d in datasets_only_html)),
        ))
        rows.append(row)

    return {
        'table': rows,
        'num_datasets_published': num_datasets_published,
        'num_datasets_only_html': num_datasets_only_html,
    }
def openness_for_organization(organization=None,
                              include_sub_organizations=False):
    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound

    if not include_sub_organizations:
        orgs = [org]
    else:
        orgs = lib.go_down_tree(org)

    context = {'model': model, 'session': model.Session, 'ignore_auth': True}
    score_counts = Counter()
    rows = []
    num_packages = 0
    for org in orgs:
        # NB org.packages() misses out many - see:
        # http://redmine.dguteam.org.uk/issues/1844
        pkgs = model.Session.query(model.Package) \
            .filter_by(owner_org=org.id) \
            .filter_by(state='active') \
            .all()
        num_packages += len(pkgs)
        for pkg in pkgs:
            try:
                qa = p.toolkit.get_action('qa_package_openness_show')(
                    context, {'id': pkg.id})
            except p.toolkit.ObjectNotFound:
                log.warning('No QA info for package %s', pkg.name)
                return
            rows.append(OrderedDict((
                ('dataset_name', pkg.name),
                ('dataset_title', pkg.title),
                ('dataset_notes', lib.dataset_notes(pkg)),
                ('organization_name', org.name),
                ('organization_title', org.title),
                ('openness_score', qa['openness_score']),
                ('openness_score_reason', qa['openness_score_reason']),
            )))
            score_counts[qa['openness_score']] += 1

    total_stars = sum([k * v for k, v in score_counts.items() if k])
    num_pkgs_with_stars = sum([v for k, v in score_counts.items()
                               if k is not None])
    average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
        if num_pkgs_with_stars else 0.0

    return {
        'table': rows,
        'score_counts': jsonify_counter(score_counts),
        'total_stars': total_stars,
        'average_stars': average_stars,
        'num_packages_scored': len(rows),
        'num_packages': num_packages,
    }
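# Worked example of the star arithmetic above, plus a sketch of
# `jsonify_counter`, which is referenced but not defined in this extract. The
# sketch is an assumption: a Counter whose keys may include None is turned into
# a plain, JSON-serialisable dict.
#
#   score_counts = Counter({0: 2, 3: 1, None: 1})
#   total_stars         = 3 * 1             = 3    (0 and None keys excluded)
#   num_pkgs_with_stars = 2 + 1             = 3    (only None excluded)
#   average_stars       = round(3 / 3.0, 1) = 1.0
def jsonify_counter_sketch(counter):
    '''Assumed behaviour: plain dict with string keys, safe to json.dumps().'''
    return dict((str(k), v) for k, v in counter.items())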
def revision_as_dict(revision, include_packages=True, ref_package_by='name'):
    revision_dict = OrderedDict((
        ('id', revision.id),
        ('timestamp', strftimestamp(revision.timestamp)),
        ('message', revision.message),
        ('author', revision.author),
    ))
    if include_packages:
        revision_dict['packages'] = [getattr(pkg, ref_package_by)
                                     for pkg in revision.packages]
    return revision_dict
def migrate(self):
    """
    Adds any missing columns to the database table for Archival by checking
    the schema and adding those that are missing.

    If you wish to add a column, add the column name and sql statement to
    MIGRATIONS_ADD, which will check that the column is not present before
    running the query.

    If you wish to modify or delete a column, add the column name and query
    to MIGRATIONS_MODIFY, which only runs if the column does exist.
    """
    from ckan import model

    # Declared as a list of pairs so the OrderedDict actually preserves the
    # order the migrations are written in (a dict literal would not).
    MIGRATIONS_ADD = OrderedDict([
        ("etag",
         "ALTER TABLE archival ADD COLUMN etag character varying"),
        ("last_modified",
         "ALTER TABLE archival ADD COLUMN last_modified character varying"),
    ])

    MIGRATIONS_MODIFY = OrderedDict([])

    q = ("select column_name from INFORMATION_SCHEMA.COLUMNS "
         "where table_name = 'archival';")
    current_cols = list([m[0] for m in model.Session.execute(q)])

    for k, v in MIGRATIONS_ADD.iteritems():
        if k not in current_cols:
            self.log.info(u"Adding column '{0}'".format(k))
            self.log.info(u"Executing '{0}'".format(v))
            model.Session.execute(v)
            model.Session.commit()

    for k, v in MIGRATIONS_MODIFY.iteritems():
        if k in current_cols:
            self.log.info(u"Modifying column '{0}'".format(k))
            self.log.info(u"Executing '{0}'".format(v))
            model.Session.execute(v)
            model.Session.commit()

    self.log.info("Migrations complete")
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.
    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels',
             'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly', 'notes': '',
             'user': '******', 'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(model.PackageTag.id == None)
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    tagless_pkgs = [OrderedDict((
        ('name', pkg.name),
        ('title', pkg.title),
        ('notes', lib.dataset_notes(pkg)),
        ('user', pkg.creator_user_id),
        ('created', pkg.metadata_created.isoformat()),
    )) for pkg in q.all()]

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None

    packages_without_tags_percent = lib.percent(len(tagless_pkgs), num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
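# `lib.percent` (from ckanext-report's lib) is used throughout these reports.
# A rough sketch of the assumed behaviour -- an integer percentage with a
# guard against division by zero -- for readers without that module to hand:
def percent_sketch(numerator, denominator):
    if not denominator:
        return 0
    return int(round(numerator * 100.0 / denominator))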
def create_row(pkg_, resource_dict):
    org_ = pkg_.get_organization()
    return OrderedDict((
        ('publisher_title', org_.title),
        ('publisher_name', org_.name),
        ('package_title', pkg_.title),
        ('package_name', pkg_.name),
        ('package_notes', lib.dataset_notes(pkg_)),
        ('resource_position', resource_dict.get('position')),
        ('resource_id', resource_dict.get('id')),
        ('resource_description', resource_dict.get('description')),
        ('resource_url', resource_dict.get('url')),
        ('resource_format', resource_dict.get('format')),
        ('resource_created', resource_dict.get('created')),
    ))
def revision_as_dict(revision, include_packages=True, include_groups=True,
                     ref_package_by='name'):
    revision_dict = OrderedDict((
        ('id', revision.id),
        ('timestamp', datetime_to_date_str(revision.timestamp)),
        ('message', revision.message),
        ('author', revision.author),
        ('approved_timestamp',
         datetime_to_date_str(revision.approved_timestamp)
         if revision.approved_timestamp else None),
    ))
    if include_packages:
        revision_dict['packages'] = [getattr(pkg, ref_package_by)
                                     for pkg in revision.packages if pkg]
    if include_groups:
        revision_dict['groups'] = [getattr(grp, ref_package_by)
                                   for grp in revision.groups if grp]
    return revision_dict
def _mini_pkg_dict(self, pkg_id):
    '''For a package id, return the basic details for the package in a
    dictionary.
    Quite expensive - does two database lookups - so be careful with running
    it lots of times.
    '''
    pkg = model.Session.query(model.Package).get(pkg_id)
    pub = pkg.get_organization()
    return OrderedDict((
        ('id', pkg_id),
        ('name', pkg.name),
        ('title', pkg.title),
        ('notes', markdown_extract(pkg.notes)),
        ('dataset_link', '/dataset/%s' % pkg.name),
        ('publisher_title', pub.title if pub else None),
        ('publisher_link', '/publisher/%s' % pub.name if pub else None),
        # Metadata modified is a big query, so leave out unless required
        # ('metadata_modified', pkg.metadata_modified.isoformat()),
    ))
def wms_url_correcter(wms_url):
    '''Corrects basic errors in WMS URLs.
    May raise ValidationError if it really cannot be made sense of.
    '''
    # e.g. wms_url = u'http://lasigpublic.nerc-lancaster.ac.uk/ArcGIS/services/Biodiversity/GMFarmEvaluation/MapServer/WMSServer?request=GetCapabilities&service=WMS'
    # Split up params
    try:
        if '?' in wms_url:
            base_url, params_str = wms_url.split('?')
        else:
            base_url, params_str = wms_url, ''
        params_list = params_str.split('&')
        if params_list == ['']:
            params_list = []
        params = OrderedDict()
        for param_str in params_list:
            if not param_str.strip():
                continue
            key, value = param_str.split('=')
            params[key.lower()] = value  # duplicates get removed here automatically
    except ValueError:
        raise ValidationError('URL structure wrong')

    # Add in request and service params if missing
    if 'request' not in params:
        params['request'] = 'GetCapabilities'
    if 'service' not in params:
        params['service'] = 'WMS'

    # Only allow particular parameter values
    if params['request'].lower() not in ('getcapabilities', 'getfeatureinfo'):
        raise ValidationError('Invalid value for "request"')
    if params['service'].lower() != 'wms':
        raise ValidationError('Invalid value for "service"')

    # Reassemble URL
    params_list = []
    for key, value in params.items():
        params_list.append('%s=%s' % (key, value))
    wms_url = base_url + '?' + '&'.join(params_list)
    return wms_url
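# Example of the kind of clean-up wms_url_correcter performs (illustrative
# input URL; note that parameter *keys* are lowercased while values are left
# alone, and a missing 'request' parameter is appended):
#
#   wms_url_correcter(u'http://example.com/wms?SERVICE=wms&foo=bar')
#   # -> u'http://example.com/wms?service=wms&foo=bar&request=GetCapabilities'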
def tagless_report_option_combinations():
    for organization in lib.all_organizations(include_none=True):
        for include_sub_organizations in (False, True):
            yield {'organization': organization,
                   'include_sub_organizations': include_sub_organizations}


tagless_report_info = {
    'name': 'tagless-datasets',
    'description': 'Datasets which have no tags.',
    'option_defaults': OrderedDict((
        ('organization', None),
        ('include_sub_organizations', False),
    )),
    'option_combinations': tagless_report_option_combinations,
    'generate': tagless_report,
    'template': 'report/tagless-datasets.html',
}
def licence_report(organization=None, include_sub_organizations=False):
    '''
    Returns a dictionary detailing licences for datasets in the organisation
    specified, and optionally sub organizations.
    '''
    # Get packages
    if organization:
        top_org = model.Group.by_name(organization)
        if not top_org:
            raise p.toolkit.ObjectNotFound('Publisher not found')

        if include_sub_organizations:
            orgs = lib.go_down_tree(top_org)
        else:
            orgs = [top_org]

        pkgs = set()
        for org in orgs:
            org_pkgs = model.Session.query(model.Package)\
                .filter_by(state='active')
            org_pkgs = lib.filter_by_organizations(
                org_pkgs, organization, include_sub_organizations=False)\
                .all()
            pkgs |= set(org_pkgs)
    else:
        pkgs = model.Session.query(model.Package)\
            .filter_by(state='active')\
            .all()

    # Get their licences
    packages_by_licence = collections.defaultdict(list)
    rows = []
    num_pkgs = 0

    for pkg in pkgs:
        if asbool(pkg.extras.get('unpublished')) is True:
            # Ignore unpublished datasets
            continue
        licence_tuple = (pkg.license_id or '',
                         pkg.license.title if pkg.license else '',
                         pkg.extras.get('licence', ''))
        packages_by_licence[licence_tuple].append((pkg.name, pkg.title))
        num_pkgs += 1

    for licence_tuple, dataset_tuples in sorted(packages_by_licence.items(),
                                                key=lambda x: -len(x[1])):
        license_id, license_title, licence = licence_tuple
        dataset_tuples.sort(key=lambda x: x[0])
        dataset_names, dataset_titles = zip(*dataset_tuples)
        licence_dict = OrderedDict((
            ('license_id', license_id),
            ('license_title', license_title),
            ('licence', licence),
            ('dataset_titles', '|'.join(t for t in dataset_titles)),
            ('dataset_names', ' '.join(dataset_names)),
        ))
        rows.append(licence_dict)

    return {
        'num_datasets': num_pkgs,
        'num_licences': len(rows),
        'table': rows,
    }
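# The zip(*...) in licence_report transposes the (name, title) pairs into two
# parallel tuples; a tiny standalone illustration with made-up values:
def _zip_transpose_example():
    dataset_tuples = [('a-name', 'A title'), ('b-name', 'B title')]
    names, titles = zip(*dataset_tuples)
    assert names == ('a-name', 'b-name')
    assert titles == ('A title', 'B title')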
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the organization
    or, if no organization is given, it returns the index page for all
    organizations.

    params:
      organization - name of an organization

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason',
        'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]
    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    name = org.name
    title = org.title

    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's tree
        org_ids = ['%s' % organization.id
                   for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group,
                               model.Package.owner_org == model.Group.id)

    results = []
    for archival, pkg, org in archivals.all():
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        archived_resource = model.Session.query(model.ResourceRevision)\
            .filter_by(id=resource.id)\
            .filter_by(revision_timestamp=archival.resource_timestamp)\
            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat()
             if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat()
             if archival.updated else None),
            ('last_success', archival.last_success.isoformat()
             if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
        ))
        results.append(row_data)

    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active')\
        .count()
    num_resources = model.Session.query(model.Resource)\
        .filter_by(state='active')
    if hasattr(model, 'ResourceGroup'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {
        'organization_name': name,
        'organization_title': title,
        'num_broken_packages': num_broken_packages,
        'num_broken_resources': num_broken_resources,
        'num_packages': num_packages,
        'num_resources': num_resources,
        'broken_package_percent': lib.percent(num_broken_packages, num_packages),
        'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
        'table': results,
    }
    return {'table': nii_dataset_details,
            'organizations': org_tuples,
            'num_resources': num_resources,
            'num_datasets': len(nii_dataset_objects),
            'num_organizations': len(nii_organizations),
            'num_broken_resources': num_broken_resources,
            'num_broken_datasets': num_broken_datasets,
            'num_broken_organizations': len(broken_organization_names),
            }


nii_report_info = {
    'name': 'nii',
    'title': 'National Information Infrastructure',
    'description': 'Details of the datasets in the NII.',
    'option_defaults': OrderedDict([]),
    'option_combinations': None,
    'generate': nii_report,
    'template': 'report/nii.html',
}
class DateType(object):
    '''Utils for handling dates in forms.
    * Full or partial dates
    * User inputs in form DD/MM/YYYY and it is stored in db as YYYY-MM-DD.
    '''
    format_types = ('form', 'db')
    datetime_fields = OrderedDict([('year', (1000, 2100, 4, 'YYYY')),
                                   ('month', (1, 12, 2, 'MM')),
                                   ('day', (1, 31, 2, 'DD')),
                                   ('hour', (0, 23, 2, 'HH')),
                                   ('minute', (0, 59, 2, 'MM')),
                                   ])
    datetime_fields_indexes = {'min': 0, 'max': 1, 'digits': 2,
                               'format_code': 3}
    date_fields_order = {'db': ('year', 'month', 'day'),
                         'form': ('day', 'month', 'year')}
    parsing_separators = {'date': '-/', 'time': ':\.'}
    default_separators = {'db': {'date': '-', 'time': ':'},
                          'form': {'date': '/', 'time': ':'},
                          }
    field_code_map = {'year': 'YYYY', 'month': 'MM', 'day': 'DD',
                      'hour': 'HH', 'minute': 'MM'}
    word_match = re.compile('[A-Za-z]+')
    timezone_match = re.compile('(\s[A-Z]{3})|(\s[+-]\d\d:?\d\d)')
    months_abbreviated = [month[:3] for month in months]

    @classmethod
    def parse_timedate(cls, timedate_str, format_type):
        '''Takes a timedate and returns a dictionary of the fields.
        * Little validation is done.
        * If it can't understand the layout it raises DateConvertError
        '''
        assert format_type in cls.format_types
        if not hasattr(cls, 'matchers'):
            # build up a list of re matches for the different
            # acceptable ways of expressing the time and date
            cls.matchers = {}
            cls.readable_formats = {}
            for format_type_ in cls.format_types:
                finished_regexps = []
                # analogous to the regexps, but human readable
                readable_formats = []
                year_re = '(?P<%s>\d{2,4})'
                month_re = '(?P<%s>\w+)'
                two_digit_decimal_re = '(?P<%s>\d{1,2})'
                time_re = '%s[%s]%s' % (
                    two_digit_decimal_re % 'hour',
                    cls.parsing_separators['time'],
                    two_digit_decimal_re % 'minute')
                time_readable = '%s%s%s' % (
                    cls.datetime_fields['hour'][cls.datetime_fields_indexes['format_code']],
                    cls.default_separators[format_type_]['time'],
                    cls.datetime_fields['minute'][cls.datetime_fields_indexes['format_code']])
                date_field_re = {'year': year_re % 'year',
                                 'month': month_re % 'month',
                                 'day': two_digit_decimal_re % 'day'}
                date_fields = list(cls.date_fields_order[format_type_])
                for how_specific in ('day', 'month', 'year'):
                    date_sep_re = '[%s]' % cls.parsing_separators['date']
                    date_sep_readable = cls.default_separators[format_type_]['date']
                    date_field_regexps = [date_field_re[field]
                                          for field in date_fields]
                    date_field_readable = [
                        cls.datetime_fields[field][cls.datetime_fields_indexes['format_code']]
                        for field in date_fields]
                    date_re = date_sep_re.join(date_field_regexps)
                    date_readable = date_sep_readable.join(date_field_readable)
                    finished_regexps.append(date_re)
                    readable_formats.append(date_readable)
                    date_fields.remove(how_specific)
                full_date_re = finished_regexps[0]
                full_date_readable = readable_formats[0]
                # Allow time to be before or after the date
                for format_ in ('%(time_re)s%(sep)s%(full_date_re)s',
                                '%(full_date_re)s%(sep)s%(time_re)s'):
                    finished_regexps.insert(0, format_ % {
                        'time_re': time_re,
                        'sep': '\s',
                        'full_date_re': full_date_re})
                    readable_formats.insert(0, format_ % {
                        'time_re': time_readable,
                        'sep': ' ',
                        'full_date_re': full_date_readable})
                cls.matchers[format_type_] = [re.compile('^%s$' % regexp)
                                              for regexp in finished_regexps]
                cls.readable_formats[format_type_] = readable_formats
                #print format_type_, finished_regexps, readable_formats
        for index, matcher in enumerate(cls.matchers[format_type]):
            match = matcher.match(timedate_str)
            if match:
                timedate_dict = match.groupdict()
                timedate_dict = cls.int_timedate(timedate_dict)
                timedate_dict['readable_format'] = \
                    cls.readable_formats[format_type][index]
                return timedate_dict
        else:
            acceptable_formats = ', '.join(
                ["'%s'" % format_
                 for format_ in cls.readable_formats[format_type]])
            raise DateConvertError(
                "Cannot parse %s date '%s'. Acceptable formats: %s" %
                (format_type, timedate_str, acceptable_formats))

    @classmethod
    def int_timedate(cls, timedate_dict):
        # Convert timedate string values to integers
        int_timedate_dict = timedate_dict.copy()
        for field in cls.datetime_fields.keys():
            if field in timedate_dict:
                val = timedate_dict[field]
                if field == 'year':
                    if len(val) == 2:
                        # Deal with 2 digit dates
                        try:
                            int_val = int(val)
                        except ValueError:
                            raise DateConvertError(
                                'Expecting integer for %s value: %s' %
                                (field, val))
                        val = cls.add_centurys_to_two_digit_year(int_val)
                    elif len(val) == 3:
                        raise DateConvertError(
                            'Expecting 2 or 4 digit year: "%s"' % (val))
                if field == 'month':
                    # Deal with months expressed as words
                    if val in months:
                        val = months.index(val) + 1
                    if val in cls.months_abbreviated:
                        val = cls.months_abbreviated.index(val) + 1
                try:
                    int_timedate_dict[field] = int(val)
                except ValueError:
                    raise DateConvertError(
                        'Expecting integer for %s value: %s' % (field, val))
        return int_timedate_dict

    @classmethod
    def iso_to_db(cls, iso_date, format):
        # e.g. 'Wed, 06 Jan 2010 09:30:00'
        #      '%a, %d %b %Y %H:%M:%S'
        assert isinstance(iso_date, (unicode, str))
        try:
            date_tuple = time.strptime(iso_date, format)
        except ValueError:
            raise DateConvertError(
                'Could not read date as ISO format "%s". Date provided: "%s"' %
                (format, iso_date))
        date_obj = datetime.datetime(*date_tuple[:4])
        date_str = cls.date_to_db(date_obj)
        return date_str
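    # `add_centurys_to_two_digit_year` and `date_to_db` are referenced above
    # but not included in this extract. A hedged sketch of the two-digit-year
    # helper (assumption: it chooses the century that puts the year closest to
    # a recent pivot year, which is how such helpers usually behave):
    @classmethod
    def add_centurys_to_two_digit_year_sketch(cls, year, near_year=2010):
        '''e.g. 99 -> 1999, 11 -> 2011 (with the default pivot year).'''
        assert 0 <= year < 100
        candidates = [century + year for century in (1900, 2000)]
        return min(candidates, key=lambda y: abs(y - near_year))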
def publisher_report(metric):
    orgs = dict(model.Session.query(model.Group.name, model.Group)
                .filter_by(state='active').all())

    org_counts = collections.defaultdict(dict)
    if metric in ('views', 'viewsdownloads', 'visits'):
        if metric == 'views' or metric == 'viewsdownloads':
            sql_function = 'sum(pageviews::int)'
        elif metric == 'visits':
            sql_function = 'sum(visits::int)'
        q = '''
            select department_id, period_name, %s metric
            from ga_url
            where department_id <> ''
              and package_id <> ''
            group by department_id, period_name
            order by department_id
            ''' % sql_function
        org_period_count = model.Session.connection().execute(q)
        for org_name, period_name, count in org_period_count:
            org_counts[org_name][period_name] = count

    if metric in ('downloads', 'viewsdownloads'):
        q = '''
            select g.name as org_name, s.period_name,
                   sum(s.value::int) as downloads
            from GA_Stat as s
            join Package as p on s.key=p.name
            join "group" as g on p.owner_org=g.id
            where stat_name='Downloads'
              and g.state='active'
            group by org_name, s.period_name
            order by downloads desc;
            '''
        org_period_count = model.Session.connection().execute(q)
        if metric == 'viewsdownloads':
            # add it onto the existing counts
            for org_name, period_name, count in org_period_count:
                org_counts[org_name][period_name] = count + \
                    org_counts[org_name].get(period_name, 0)
                org_counts[org_name]['All'] = count + \
                    org_counts[org_name].get('All', 0)
        else:
            for org_name, period_name, count in org_period_count:
                org_counts[org_name][period_name] = count
                org_counts[org_name]['All'] = count + \
                    org_counts[org_name].get('All', 0)

    org_counts = sorted(org_counts.items(),
                        key=lambda x: -x[1].get('All', 0))

    all_periods = [res[0] for res in
                   model.Session.query(GA_Url.period_name)
                   .group_by(GA_Url.period_name)
                   .order_by(GA_Url.period_name).all()]

    rows = []
    for org_name, counts in org_counts:
        org = orgs.get(org_name)
        if not org:
            continue
        top_org = list(go_up_tree(org))[-1]
        row = OrderedDict((
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
        ))
        for period_name in all_periods:
            row[period_name] = counts.get(period_name, 0)
        rows.append(row)

    # Group the periods by year, to help the template draw the table nicely
    #all_periods_tuples = [period.split('-') for period in all_periods
    #                      if '-' in period]
    #all_periods_tuples.sort(key=lambda x: x[0])
    #all_periods_by_year = [
    #    (year, [p for y, p in year_periods])
    #    for year, year_periods in groupby(all_periods_tuples, lambda x: x[0])]
    return {
        'table': rows,
        'all periods': all_periods,
        #'all periods by year': all_periods_by_year
    }
    top_packages = PackageStats.get_top(limit=last)
    return {'table': top_packages.get("packages")}


def google_analytics_dataset_option_combinations():
    options = [20, 25, 30, 35, 40, 45, 50]
    for option in options:
        yield {'last': option}


googleanalytics_dataset_report_info = {
    'name': 'google-analytics-dataset',
    'title': 'Most popular datasets',
    'description': 'Google analytics showing top datasets with most views',
    'option_defaults': OrderedDict((('last', 20),)),
    'option_combinations': google_analytics_dataset_option_combinations,
    'generate': google_analytics_dataset_report,
    'template': 'report/dataset_analytics.html',
}


def google_analytics_resource_report(last):
    '''
    Generates report based on google analytics data.
    Number of views per resource.
    '''
    # get resource objects corresponding to popular GA content
    top_resources = ResourceStats.get_top(limit=last)
    return {'table': top_resources.get("resources")}
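# The registration dict for the resource report is not shown in this extract;
# it would presumably mirror googleanalytics_dataset_report_info. A sketch --
# the name, title, template path and reuse of the same option combinations
# are assumptions for illustration:
googleanalytics_resource_report_info = {
    'name': 'google-analytics-resource',  # assumed report name
    'title': 'Most popular resources',
    'description': 'Google analytics showing top resources with most views',
    'option_defaults': OrderedDict((('last', 20),)),
    'option_combinations': google_analytics_dataset_option_combinations,
    'generate': google_analytics_resource_report,
    'template': 'report/resource_analytics.html',  # assumed template path
}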
def broken_links_index(include_sub_organizations=False):
    '''Returns the count of broken links for all organizations.'''
    from ckanext.archiver.model import Archival

    counts = {}
    # Get all the broken datasets and build up the results by org
    for org in model.Session.query(model.Group)\
            .filter(model.Group.type == 'organization')\
            .filter(model.Group.state == 'active').all():
        archivals = model.Session.query(Archival)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.owner_org == org.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')
        broken_resources = archivals.count()
        broken_datasets = archivals.distinct(model.Package.id).count()
        num_datasets = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')\
            .count()
        num_resources = model.Session.query(model.Package)\
            .filter_by(owner_org=org.id)\
            .filter_by(state='active')
        if hasattr(model, 'ResourceGroup'):
            num_resources = num_resources.join(model.ResourceGroup)
        num_resources = num_resources \
            .join(model.Resource)\
            .filter_by(state='active')\
            .count()
        counts[org.name] = {
            'organization_title': org.title,
            'broken_packages': broken_datasets,
            'broken_resources': broken_resources,
            'packages': num_datasets,
            'resources': num_resources,
        }

    counts_with_sub_orgs = copy.deepcopy(counts)  # new dict
    if include_sub_organizations:
        for org_name in counts_with_sub_orgs:
            org = model.Group.by_name(org_name)

            for sub_org_id, sub_org_name, sub_org_title, sub_org_parent_id \
                    in org.get_children_group_hierarchy(type='organization'):
                if sub_org_name not in counts:
                    # occurs only if there is an organization created since
                    # the last loop?
                    continue
                counts_with_sub_orgs[org_name]['broken_packages'] += \
                    counts[sub_org_name]['broken_packages']
                counts_with_sub_orgs[org_name]['broken_resources'] += \
                    counts[sub_org_name]['broken_resources']
                counts_with_sub_orgs[org_name]['packages'] += \
                    counts[sub_org_name]['packages']
                counts_with_sub_orgs[org_name]['resources'] += \
                    counts[sub_org_name]['resources']
        results = counts_with_sub_orgs
    else:
        results = counts

    data = []
    num_broken_packages = 0
    num_broken_resources = 0
    num_packages = 0
    num_resources = 0
    for org_name, org_counts in sorted(results.iteritems(),
                                       key=lambda r: r[0]):
        data.append(OrderedDict((
            ('organization_title', results[org_name]['organization_title']),
            ('organization_name', org_name),
            ('package_count', org_counts['packages']),
            ('resource_count', org_counts['resources']),
            ('broken_package_count', org_counts['broken_packages']),
            ('broken_package_percent',
             lib.percent(org_counts['broken_packages'],
                         org_counts['packages'])),
            ('broken_resource_count', org_counts['broken_resources']),
            ('broken_resource_percent',
             lib.percent(org_counts['broken_resources'],
                         org_counts['resources'])),
        )))
        # Totals - always use the counts, rather than counts_with_sub_orgs, to
        # avoid counting a package in both its org and parent org
        org_counts_ = counts[org_name]
        num_broken_packages += org_counts_['broken_packages']
        num_broken_resources += org_counts_['broken_resources']
        num_packages += org_counts_['packages']
        num_resources += org_counts_['resources']

    return {
        'table': data,
        'num_broken_packages': num_broken_packages,
        'num_broken_resources': num_broken_resources,
        'num_packages': num_packages,
        'num_resources': num_resources,
        'broken_package_percent': lib.percent(num_broken_packages, num_packages),
        'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
    }
from ckan import model
from ckan.lib.helpers import OrderedDict
from ckanext.report import lib
from ckan.lib.helpers import (url_for, Page)
from ckanext.statsresources.helpers import get_org_title

OD = OrderedDict((
    ('organization', None),
    ('include_sub_organizations', False),
    ('include_private', False),
    ('include_draft', False),
))
# Publisher resources
def publisher_resources(organization=None, include_sub_organizations=False):
    '''
    Returns a dictionary detailing resources for each dataset in the
    organisation specified.
    '''
    org = model.Group.by_name(organization)
    if not org:
        raise p.toolkit.ObjectNotFound('Publisher not found')
    # limit is higher if sysadmin
    if is_sysadmin():
        max_limit = 1000
    else:
        max_limit = 50

    # Get the revisions in the requested time frame
    revs = model.Session.query(model.Revision) \
        .filter(model.Revision.timestamp >= since_timestamp) \
        .order_by(model.Revision.timestamp.asc()) \
        .limit(max_limit) \
        .all()
    result = OrderedDict((
        ('number_of_revisions', len(revs)),
        ('since_timestamp', since_timestamp.strftime('%Y-%m-%d %H:%M')),
        ('current_timestamp', now.strftime('%Y-%m-%d %H:%M')),
        ('since_revision_id', revs[0].id if revs else None),
        ('newest_revision_id', revs[-1].id if revs else None),
        ('results_limited', len(revs) == max_limit)))

    # See which packages have changed in the time frame
    changed_package_ids = set()
    query = model.Session.query(model.PackageRevision) \
        .join(model.Revision) \
        .filter(model.Revision.timestamp >= since_timestamp) \
        .limit(max_limit)
    if query.count() == max_limit:
        result['results_limited'] = True
    changed_package_ids.update([pkg_rev.id for pkg_rev in query.all()])
    if not result['results_limited']:
class DguApiController(ApiController):
    def popular_unpublished(self):
        """
        Returns the most popular unpublished items after it has calculated a
        score for each criteria. We need to be able to calculate the score
        for this.
        """
        return self._finish_ok([])

    def latest_datasets(self, published_only=True):
        '''Designed for the dgu home page, lists the latest datasets that got
        changed (excluding extra, group and tag changes) with lots of details
        about each dataset.
        '''
        try:
            limit = int(request.params.get('limit', default_limit))
        except ValueError:
            limit = default_limit
        limit = min(100, limit)  # max value

        from ckan.lib.search import SearchError
        fq = 'capacity:"public"'
        if published_only:
            fq = fq + ' unpublished:false'
        try:
            # package search
            context = {'model': model, 'session': model.Session,
                       'user': '******'}
            data_dict = {
                'q': '',
                'fq': fq,
                'facet': 'false',
                'start': 0,
                'rows': limit,
                'sort': 'metadata_modified desc',
            }
            query = get_action('package_search')(context, data_dict)
        except SearchError as se:
            log.error('Search error: %s', se)

        pkg_dicts = []
        for pkg_dict in query['results']:
            pkg = model.Package.get(pkg_dict['id'])
            publisher = pkg.get_organization()
            if publisher:
                pub_title = publisher.title
                pub_link = '/publisher/%s' % publisher.name
            else:
                pub_title = pub_link = None
            last_modified = pkg.metadata_modified
            pkg_dict = OrderedDict((
                ('name', pkg.name),
                ('title', pkg.title),
                ('notes', pkg.notes),
                ('dataset_link', '/dataset/%s' % pkg.name),
                ('publisher_title', pub_title),
                ('publisher_link', pub_link),
                ('metadata_modified', last_modified.isoformat()),
            ))
            pkg_dicts.append(pkg_dict)
        return self._finish_ok(pkg_dicts)
def publisher_report_option_combinations():
    return ({'metric': metric}
            for metric in ('views', 'visits', 'downloads', 'viewsdownloads'))


publisher_report_info = {
    'name': 'site-usage-publisher',
    'title': 'Site usage by publisher',
    'description': 'Usage statistics, by publisher for each month. Data is from Google Analytics.',
    'option_defaults': OrderedDict([('metric', 'views')]),
    'option_combinations': publisher_report_option_combinations,
    'generate': publisher_report,
    'template': 'report/publisher.html',
}
def organisation_dataset_scores(organisation_name,
                                include_sub_organisations=False):
    '''
    Returns a dictionary detailing openness scores for the organisation
    for each dataset.

    i.e.:
    {'publisher_name': 'cabinet-office',
     'publisher_title': 'Cabinet Office',
     'data': [
       {'package_name', 'package_title', 'resource_url', 'openness_score',
        'reason', 'last_updated', 'is_broken', 'format'}
      ...]

    NB the list does not contain datasets that have 0 resources and therefore
       score 0
    '''
    values = {}
    sql = """
        select package.id as package_id,
               task_status.key as task_status_key,
               task_status.value as task_status_value,
               task_status.error as task_status_error,
               task_status.last_updated as task_status_last_updated,
               resource.id as resource_id,
               resource.url as resource_url,
               resource.position,
               package.title as package_title,
               package.name as package_name,
               "group".id as publisher_id,
               "group".name as publisher_name,
               "group".title as publisher_title
        from resource
            left join task_status on task_status.entity_id = resource.id
            left join resource_group on resource.resource_group_id = resource_group.id
            left join package on resource_group.package_id = package.id
            left join member on member.table_id = package.id
            left join "group" on member.group_id = "group".id
        where
            entity_id in (select entity_id from task_status where task_status.task_type='qa')
            and package.state = 'active'
            and resource.state='active'
            and resource_group.state='active'
            and "group".state='active'
            and task_status.task_type='qa'
            and task_status.key='status'
            %(org_filter)s
        order by package.title, package.name, resource.position
        """
    sql_options = {}
    org = model.Group.by_name(organisation_name)
    if not org:
        abort(404, 'Publisher not found')
    organisation_title = org.title

    if not include_sub_organisations:
        sql_options['org_filter'] = 'and "group".name = :org_name'
        values['org_name'] = organisation_name
    else:
        sub_org_filters = ['"group".name=\'%s\'' % org.name
                           for org in go_down_tree(org)]
        sql_options['org_filter'] = 'and (%s)' % ' or '.join(sub_org_filters)

    rows = model.Session.execute(sql % sql_options, values)

    data = dict()  # dataset_name: {properties}
    for row in rows:
        package_data = data.get(row.package_name)
        if not package_data:
            package_data = OrderedDict((
                ('dataset_title', row.package_title),
                ('dataset_name', row.package_name),
                ('publisher_title', row.publisher_title),
                ('publisher_name', row.publisher_name),
                # the rest are placeholders to hold the details
                # of the highest scoring resource
                ('resource_position', None),
                ('resource_id', None),
                ('resource_url', None),
                ('openness_score', None),
                ('openness_score_reason', None),
                ('last_updated', None),
            ))

        if row.task_status_value > package_data['openness_score']:
            package_data['resource_position'] = row.position
            package_data['resource_id'] = row.resource_id
            package_data['resource_url'] = row.resource_url

            try:
                package_data.update(json.loads(row.task_status_error))
            except ValueError as e:
                log.error('QA status "error" should have been in JSON format, '
                          'but found: "%s" %s', row.task_status_error, e)
                package_data['reason'] = \
                    'Could not display reason due to a system error'

            package_data['openness_score'] = row.task_status_value
            package_data['openness_score_reason'] = \
                package_data['reason']  # deprecated
            package_data['last_updated'] = row.task_status_last_updated

        data[row.package_name] = package_data

    # Sort the results by openness_score asc so we can see the worst
    # results first
    data = OrderedDict(sorted(data.iteritems(),
                              key=lambda x: x[1]['openness_score']))

    return {
        'publisher_name': organisation_name,
        'publisher_title': organisation_title,
        'data': data.values(),
    }


def feedback_report(publisher, include_sub_publishers=False,
                    include_published=False, use_cache=False):
    """
    For the publisher provided (and optionally for sub-publishers) this
    function will generate a report on the feedback for that publisher.
def organisation_resources(organisation_name,
                           include_sub_organisations=False,
                           date_formatter=None):
    '''
    Returns a dictionary detailing resources for each dataset in the
    organisation specified.

    headings: ['Publisher title', 'Publisher name', 'Dataset title',
               'Dataset name', 'Resource index', 'Description', 'URL',
               'Format', 'Date created']

    i.e.:
    {'publisher_name': 'cabinet-office',
     'publisher_title': 'Cabinet Office',
     'schema': {'Publisher title': 'publisher_title',
                'Publisher name': 'publisher_name',
                ...},
     'rows': [ row_dict, row_dict, ... ]
    }
    '''
    sql = """
        select package.id as package_id,
               package.title as package_title,
               package.name as package_name,
               resource.id as resource_id,
               resource.url as resource_url,
               resource.format as resource_format,
               resource.description as resource_description,
               resource.position as resource_position,
               resource.created as resource_created,
               "group".id as publisher_id,
               "group".name as publisher_name,
               "group".title as publisher_title
        from resource
            left join resource_group on resource.resource_group_id = resource_group.id
            left join package on resource_group.package_id = package.id
            left join member on member.table_id = package.id
            left join "group" on member.group_id = "group".id
        where
            package.state='active'
            and resource.state='active'
            and resource_group.state='active'
            and "group".state='active'
            and %(org_filter)s
        order by "group".name, package.name, resource.position
        """
    org = model.Group.by_name(organisation_name)
    if not org:
        abort(404, 'Publisher not found')
    organisation_title = org.title

    sql_org_filter, sql_params = sql_to_filter_by_organisation(
        org, include_sub_organisations=include_sub_organisations)
    raw_rows = model.Session.execute(sql % sql_org_filter, sql_params)

    schema = OrderedDict((
        ('Publisher title', 'publisher_title'),
        ('Publisher name', 'publisher_name'),
        ('Dataset title', 'package_title'),
        ('Dataset name', 'package_name'),
        ('Resource index', 'resource_position'),
        ('Resource ID', 'resource_id'),
        ('Description', 'resource_description'),
        ('URL', 'resource_url'),
        ('Format', 'resource_format'),
        ('Date created', 'resource_created'),
    ))
    rows = []
    for raw_row in raw_rows:
        #row = [getattr(raw_row, key) for key in schema.values()]
        row = OrderedDict([(key, getattr(raw_row, key))
                           for key in schema.values()])
        if date_formatter:
            for col in ('resource_created',):
                if row[col]:
                    row[col] = date_formatter(row[col])
        rows.append(row)

    return {
        'publisher_name': org.name,
        'publisher_title': org.title,
        'schema': schema,
        'rows': rows,
    }
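# `sql_to_filter_by_organisation` is used above but not defined in this
# extract. It is assumed to build the %(org_filter)s fragment and its bind
# parameters in the same way organisation_dataset_scores does inline. A hedged
# sketch (note the fragment here carries no leading 'and', since the SQL
# template above already supplies it):
def sql_to_filter_by_organisation_sketch(org, include_sub_organisations=False):
    if not include_sub_organisations:
        return ({'org_filter': '"group".name = :org_name'},
                {'org_name': org.name})
    sub_org_filters = ['"group".name=\'%s\'' % o.name
                       for o in go_down_tree(org)]
    return ({'org_filter': '(%s)' % ' or '.join(sub_org_filters)}, {})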