Esempio n. 1
0
def datasets_without_resources():
    '''Report active, published datasets that have no resources attached.

    Datasets flagged as unpublished (extra 'unpublished' == 'true') are
    excluded, since they are not expected to have resources yet.
    '''
    active_pkgs = model.Session.query(model.Package)\
                       .filter_by(state='active')\
                       .order_by(model.Package.title)\
                       .all()
    table = []
    for package in active_pkgs:
        # Skip datasets that do have resources.
        if package.resources:
            continue
        # Skip unpublished datasets.
        if package.extras.get('unpublished', '').lower() == 'true':
            continue
        organization = package.get_organization()
        deleted, url = last_resource_deleted(package)
        table.append(OrderedDict((
            ('name', package.name),
            ('title', package.title),
            ('organization title', organization.title),
            ('organization name', organization.name),
            ('metadata created', package.metadata_created.isoformat()),
            ('metadata modified', package.metadata_modified.isoformat()),
            ('last resource deleted',
             deleted.isoformat() if deleted else None),
            ('last resource url', url),
            ('dataset_notes', lib.dataset_notes(package)),
        )))
    return {'table': table}
Esempio n. 2
0
def datasets_without_resources():
    '''List active, published datasets that currently have no resources.'''

    def _row(pkg):
        # Build one report row for a resource-less dataset.
        org = pkg.get_organization()
        deleted, url = last_resource_deleted(pkg)
        return OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('last resource deleted',
             deleted.isoformat() if deleted else None),
            ('last resource url', url),
            ('dataset_notes', lib.dataset_notes(pkg)),
        ))

    pkgs = model.Session.query(model.Package)\
        .filter_by(state='active')\
        .order_by(model.Package.title)\
        .all()
    # Keep only datasets with no resources that are not unpublished.
    rows = [_row(pkg) for pkg in pkgs
            if not pkg.resources
            and pkg.extras.get('unpublished', '').lower() != 'true']
    return {'table': rows}
Esempio n. 3
0
def app_dataset_report():
    '''Report datasets that have an associated related item of type "App".

    Rows are sorted by top-level organization title then organization
    title (string concatenation of the two, matching the report's
    original ordering).
    '''
    rows = []
    related_q = model.Session.query(model.RelatedDataset) \
                     .filter(model.Related.type == 'App')
    for related in related_q.all():
        dataset = related.dataset
        org = dataset.get_organization()
        # go_up_tree() yields from this org upwards; the last element is
        # the root of the organization hierarchy.
        top_org = list(go_up_tree(org))[-1]

        rows.append(OrderedDict((
            ('app title', related.related.title),
            ('app url', related.related.url),
            ('dataset name', dataset.name),
            ('dataset title', dataset.title),
            ('organization title', org.title),
            ('organization name', org.name),
            ('top-level organization title', top_org.title),
            ('top-level organization name', top_org.name),
            ('dataset theme', related.dataset.extras.get('theme-primary', '')),
            ('dataset notes', lib.dataset_notes(dataset)),
        )))

    rows.sort(key=lambda row: row['top-level organization title']
              + row['organization title'])

    return {'table': rows}
Esempio n. 4
0
def openness_for_organization(organization=None,
                              include_sub_organizations=False):
    '''Report the openness (five-star) scores of an organization's datasets.

    :param organization: name or id of the organization (model.Group)
    :param include_sub_organizations: if True, also include datasets of
        every organization below this one in the hierarchy
    :raises p.toolkit.ObjectNotFound: if the organization does not exist
    :returns: dict with the row 'table', per-score counts and summary stats
    '''
    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound

    if not include_sub_organizations:
        orgs = [org]
    else:
        orgs = lib.go_down_tree(org)

    context = {'model': model, 'session': model.Session, 'ignore_auth': True}
    score_counts = Counter()
    rows = []
    num_packages = 0
    for org in orgs:
        # NB org.packages() misses out many - see:
        # http://redmine.dguteam.org.uk/issues/1844
        pkgs = model.Session.query(model.Package) \
                    .filter_by(owner_org=org.id) \
                    .filter_by(state='active') \
                    .filter_by(private=False) \
                    .all()
        num_packages += len(pkgs)
        for pkg in pkgs:
            try:
                qa = p.toolkit.get_action('qa_package_openness_show')(
                    context, {
                        'id': pkg.id
                    })
            except p.toolkit.ObjectNotFound:
                # Skip just this package. Previously this `return`ed None
                # here, which aborted the whole report and threw away all
                # rows gathered so far because one package lacked QA info.
                log.warning('No QA info for package %s', pkg.name)
                continue
            rows.append(
                OrderedDict((
                    ('dataset_name', pkg.name),
                    ('dataset_title', pkg.title),
                    ('dataset_notes', lib.dataset_notes(pkg)),
                    ('organization_name', org.name),
                    ('organization_title', org.title),
                    ('openness_score', qa['openness_score']),
                    ('openness_score_reason', qa['openness_score_reason']),
                )))
            score_counts[qa['openness_score']] += 1

    # A score of None means "not scored" and is excluded from the average;
    # a score of 0 contributes no stars.
    total_stars = sum(k * v for k, v in score_counts.items() if k)
    num_pkgs_with_stars = sum(
        v for k, v in score_counts.items() if k is not None)
    average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
        if num_pkgs_with_stars else 0.0

    return {
        'table': rows,
        'score_counts': jsonify_counter(score_counts),
        'total_stars': total_stars,
        'average_stars': average_stars,
        'num_packages_scored': len(rows),
        'num_packages': num_packages,
    }
Esempio n. 5
0
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.
    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******',
             'created': '2008-06-13T10:24:59.435631'},
            {'name': 'co2-monthly', 'title': 'CO2 monthly', 'notes': '', 'user': '******',
             'created': '2009-12-14T08:42:45.473827'},
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
        .outerjoin(model.PackageTag) \
        .filter(
        model.PackageTag.id == None  # noqa: E711
    )
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    # Count *all* tagless packages before slicing, so the percentage below
    # is not capped by the 100-row limit applied to the table.
    num_tagless_pkgs = q.count()
    tagless_pkgs = [
        OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
        )) for pkg in q.slice(0, 100)
    ]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    # Previously computed from len(tagless_pkgs), which under-reported
    # whenever there were more than 100 tagless packages.
    packages_without_tags_percent = lib.percent(num_tagless_pkgs,
                                                num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
Esempio n. 6
0
def html_datasets_report(organization, include_sub_organizations=False):
    '''
    Returns datasets that only have an HTML link, by organization.

    :param organization: name of the organization to restrict to, or a
        falsy value for all organizations
    :param include_sub_organizations: also include datasets of
        sub-organizations
    :returns: dict with the row 'table' and published/HTML-only counts
    '''

    # Get packages
    query = model.Session.query(model.Package)\
                .filter_by(state='active')
    if organization:
        query = lib.filter_by_organizations(query, organization,
                                            include_sub_organizations)

    pkgs = query.all()
    # See if HTML
    num_datasets_published = 0
    num_datasets_only_html = 0
    datasets_only_html = []
    for pkg in pkgs:
        if p.toolkit.asbool(pkg.extras.get('unpublished')):
            continue
        num_datasets_published += 1

        # res.format may be None; guard before lower-casing (the filter
        # against set(('asp', '', None)) below shows None was anticipated,
        # but None.lower() would raise AttributeError).
        formats = set([
            (res.format or '').lower() for res in pkg.resources
            if res.resource_type != 'documentation'
        ])
        if 'html' not in formats:
            continue

        # Ignore non-data formats; if only HTML remains, count it.
        data_formats = formats - set(('asp', '', None))
        if data_formats == set(('html', )):
            num_datasets_only_html += 1
            datasets_only_html.append(pkg)

    rows = []
    for pkg in datasets_only_html:
        row = OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('metadata created', pkg.metadata_created.isoformat()),
            ('metadata modified', pkg.metadata_modified.isoformat()),
            ('dataset_notes', lib.dataset_notes(pkg)),
        ))
        rows.append(row)

    return {
        'table': rows,
        'num_datasets_published': num_datasets_published,
        'num_datasets_only_html': num_datasets_only_html,
    }
Esempio n. 7
0
def openness_for_organization(organization=None, include_sub_organizations=False):
    '''Report the openness (five-star) scores of an organization's datasets.

    :param organization: name or id of the organization (model.Group)
    :param include_sub_organizations: include datasets of organizations
        below this one in the hierarchy
    :raises p.toolkit.ObjectNotFound: if the organization does not exist
    :returns: dict with the row 'table', per-score counts and summary stats
    '''
    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound

    if not include_sub_organizations:
        orgs = [org]
    else:
        orgs = lib.go_down_tree(org)

    context = {'model': model, 'session': model.Session, 'ignore_auth': True}
    score_counts = Counter()
    rows = []
    num_packages = 0
    for org in orgs:
        # NB org.packages() misses out many - see:
        # http://redmine.dguteam.org.uk/issues/1844
        pkgs = model.Session.query(model.Package) \
                    .filter_by(owner_org=org.id) \
                    .filter_by(state='active') \
                    .all()
        num_packages += len(pkgs)
        for pkg in pkgs:
            try:
                qa = p.toolkit.get_action('qa_package_openness_show')(context, {'id': pkg.id})
            except p.toolkit.ObjectNotFound:
                # Skip just this package. Previously this `return`ed None,
                # aborting the whole report and discarding all rows
                # gathered so far because one package lacked QA info.
                log.warning('No QA info for package %s', pkg.name)
                continue
            rows.append(OrderedDict((
                ('dataset_name', pkg.name),
                ('dataset_title', pkg.title),
                ('dataset_notes', lib.dataset_notes(pkg)),
                ('organization_name', org.name),
                ('organization_title', org.title),
                ('openness_score', qa['openness_score']),
                ('openness_score_reason', qa['openness_score_reason']),
                )))
            score_counts[qa['openness_score']] += 1

    # None scores mean "not scored" and are excluded from the average;
    # a score of 0 contributes no stars.
    total_stars = sum(k * v for k, v in score_counts.items() if k)
    num_pkgs_with_stars = sum(v for k, v in score_counts.items()
                              if k is not None)
    average_stars = round(float(total_stars) / num_pkgs_with_stars, 1) \
        if num_pkgs_with_stars else 0.0

    return {'table': rows,
            'score_counts': jsonify_counter(score_counts),
            'total_stars': total_stars,
            'average_stars': average_stars,
            'num_packages_scored': len(rows),
            'num_packages': num_packages,
            }
Esempio n. 8
0
 def create_row(pkg_, resource_dict):
     '''Build a single report row combining publisher, package and
     resource details.'''
     org_ = pkg_.get_organization()
     pairs = [
         ('publisher_title', org_.title),
         ('publisher_name', org_.name),
         ('package_title', pkg_.title),
         ('package_name', pkg_.name),
         ('package_notes', lib.dataset_notes(pkg_)),
     ]
     # Resource fields are all plain lookups in the resource dict.
     for field in ('position', 'id', 'description', 'url', 'format',
                   'created'):
         pairs.append(('resource_' + field, resource_dict.get(field)))
     return OrderedDict(pairs)
Esempio n. 9
0
 def create_row(pkg_, resource_dict):
     '''Return an OrderedDict describing one resource along with its
     package and publisher.'''
     org_ = pkg_.get_organization()
     row = OrderedDict()
     row['publisher_title'] = org_.title
     row['publisher_name'] = org_.name
     row['package_title'] = pkg_.title
     row['package_name'] = pkg_.name
     row['package_notes'] = lib.dataset_notes(pkg_)
     row['resource_position'] = resource_dict.get('position')
     row['resource_id'] = resource_dict.get('id')
     row['resource_description'] = resource_dict.get('description')
     row['resource_url'] = resource_dict.get('url')
     row['resource_format'] = resource_dict.get('format')
     row['resource_created'] = resource_dict.get('created')
     return row
Esempio n. 10
0
def tagless_report(organization, include_sub_organizations=False):
    '''
    Produces a report on packages without tags.
    Returns something like this:
        {
         'table': [
            {'name': 'river-levels', 'title': 'River levels', 'notes': 'Harvested', 'user': '******', 'created': '2008-06-13T10:24:59.435631'},  # noqa
            {'name': 'co2-monthly', 'title': 'CO2 monthly', 'notes': '', 'user': '******', 'created': '2009-12-14T08:42:45.473827'},  # noqa
            ],
         'num_packages': 56,
         'packages_without_tags_percent': 4,
         'average_tags_per_package': 3.5,
        }
    '''
    # Find the packages without tags
    q = model.Session.query(model.Package) \
             .outerjoin(model.PackageTag) \
             .filter(model.PackageTag.id == None)  # noqa: E711
    if organization:
        q = lib.filter_by_organizations(q, organization,
                                        include_sub_organizations)
    # Count *all* tagless packages before slicing, so the percentage below
    # is not capped by the 100-row limit applied to the table.
    num_tagless_pkgs = q.count()
    tagless_pkgs = [OrderedDict((
            ('name', pkg.name),
            ('title', pkg.title),
            ('notes', lib.dataset_notes(pkg)),
            ('user', pkg.creator_user_id),
            ('created', pkg.metadata_created.isoformat()),
            )) for pkg in q.slice(0, 100)]  # First 100 only for this demo

    # Average number of tags per package
    q = model.Session.query(model.Package)
    q = lib.filter_by_organizations(q, organization, include_sub_organizations)
    num_packages = q.count()
    q = q.join(model.PackageTag)
    num_taggings = q.count()
    if num_packages:
        average_tags_per_package = round(float(num_taggings) / num_packages, 1)
    else:
        average_tags_per_package = None
    # Previously computed from len(tagless_pkgs), which under-reported
    # whenever there were more than 100 tagless packages.
    packages_without_tags_percent = lib.percent(
        num_tagless_pkgs, num_packages)

    return {
        'table': tagless_pkgs,
        'num_packages': num_packages,
        'packages_without_tags_percent': packages_without_tags_percent,
        'average_tags_per_package': average_tags_per_package,
    }
Esempio n. 11
0
 def create_row(pkg_, resource_dict):
     """Assemble one report row for a package/resource pair."""
     org_ = pkg_.get_organization()
     # Publisher and package columns first, in report order.
     row = OrderedDict((
         ("publisher_title", org_.title),
         ("publisher_name", org_.name),
         ("package_title", pkg_.title),
         ("package_name", pkg_.name),
         ("package_notes", lib.dataset_notes(pkg_)),
     ))
     # Then the resource columns, all simple dict lookups.
     for key in ("position", "id", "description", "url", "format",
                 "created"):
         row["resource_" + key] = resource_dict.get(key)
     return row
Esempio n. 12
0
def datasets_without_resources():
    """Report active, published datasets that have no resources."""
    query = (
        model.Session.query(model.Package)
        .filter_by(state="active")
        .order_by(model.Package.title)
    )
    rows = []
    for pkg in query.all():
        has_resources = bool(pkg.resources)
        is_unpublished = pkg.extras.get("unpublished", "").lower() == "true"
        if has_resources or is_unpublished:
            continue
        org = pkg.get_organization()
        deleted, url = last_resource_deleted(pkg)
        rows.append(OrderedDict((
            ("name", pkg.name),
            ("title", pkg.title),
            ("organization title", org.title),
            ("organization name", org.name),
            ("metadata created", pkg.metadata_created.isoformat()),
            ("metadata modified", pkg.metadata_modified.isoformat()),
            ("last resource deleted",
             deleted.isoformat() if deleted else None),
            ("last resource url", url),
            ("dataset_notes", lib.dataset_notes(pkg)),
        )))
    return {"table": rows}
Esempio n. 13
0
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the given
    organization (optionally including its sub-organizations).

    params:
      organization - name of an organization
      include_sub_organizations - also include datasets belonging to
          organizations beneath this one in the hierarchy

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title:': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason', 'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]

    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    # Capture the requested org's identity now, because `org` is shadowed
    # by the result-loop variable further down.
    name = org.name
    title = org.title

    # Archival records for broken links, restricted to active packages
    # and active resources.
    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's tree
        org_ids = ['%s' % organization.id for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group, model.Package.owner_org == model.Group.id)

    results = []

    for archival, pkg, org in archivals.all():
        # Re-fetch by id; NOTE(review): `pkg`/`resource` from the query row
        # look equivalent — confirm why the extra lookups are needed.
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        # Record how the dataset was imported, if at all.
        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        # The resource as it was at the time of the archival attempt;
        # falls back to the current resource if no revision matches.
        archived_resource = model.Session.query(model.ResourceRevision)\
                            .filter_by(id=resource.id)\
                            .filter_by(revision_timestamp=archival.resource_timestamp)\
                            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            # True if the live resource URL still matches the archived one.
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat() if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat() if archival.updated else None),
            ('last_success', archival.last_success.isoformat() if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
            ))

        results.append(row_data)

    # A package counts once however many of its resources are broken.
    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
                        .filter(model.Package.owner_org.in_(org_ids))\
                        .filter_by(state='active')\
                        .count()
    num_resources = model.Session.query(model.Resource)\
                         .filter_by(state='active')
    # CKAN <= 2.2 linked Resources to Packages via ResourceGroup.
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            'table': results}
Esempio n. 14
0
def _get_activity(organization_name, include_sub_organizations, periods):
    '''Collect dataset creation and modification activity per time period.

    :param organization_name: name of the organization to restrict to, or
        a falsy value for all organizations
    :param include_sub_organizations: include datasets of organizations
        beneath the named one
    :param periods: mapping of period name -> (start, end) pair of
        timestamps (assumed comparable with revision timestamps)
    :returns: (created, modified) — two dicts keyed by period name, each
        holding a list of tuples describing the dataset activity
    '''
    import ckan.model as model
    from paste.deploy.converters import asbool

    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = (
        'autotheme',
        'co-prod3.dh.bytemark.co.uk',
        'Date format tidier',
        'current_revision_fixer',
        'current_revision_fixer2',
        'fix_contact_details.py',
        'Repoint 410 Gone to webarchive url',
        'Fix duplicate resources',
        'fix_secondary_theme.py',
    )
    system_author_template = 'script%'  # "%" is a wildcard

    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization_name:
        pkgs = model.Session.query(model.Package) \
            .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # Earliest revision of the package = its creation.
        created_ = model.Session.query(model.PackageRevision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions to the package itself,
        # its resources, and its extras; filtered per period below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision) \
            .filter(model.PackageRevision.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision, model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.ResourceRevision,
                  model.Package.id == model.ResourceRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision, model.Revision) \
            .filter(model.Package.id == pkg.id) \
            .filter_by(state='active') \
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id) \
            .join(model.Revision) \
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.id, created_.name, created_.title,
                     lib.dataset_notes(pkg), 'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            prs = pr_q.filter(model.PackageRevision.revision_timestamp > period_start) \
                .filter(model.PackageRevision.revision_timestamp < period[1])
            rrs = rr_q.filter(model.ResourceRevision.revision_timestamp > period_start) \
                .filter(model.ResourceRevision.revision_timestamp < period[1])
            pes = pe_q.filter(model.PackageExtraRevision.revision_timestamp > period_start) \
                .filter(model.PackageExtraRevision.revision_timestamp < period[1])
            # Authors and dates across all three revision kinds,
            # de-duplicated and space-joined for display.
            authors = ' '.join(
                set([r[1].author for r in prs] + [r[2].author for r in rrs] +
                    [r[2].author for r in pes]))
            dates = set([r[1].timestamp.date()
                         for r in prs] + [r[2].timestamp.date() for r in rrs] +
                        [r[2].timestamp.date() for r in pes])
            dates_formatted = ' '.join(
                [date.isoformat() for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.id, pkg.name, pkg.title, lib.dataset_notes(pkg),
                     'modified', period_name, dates_formatted, authors,
                     published))
    return created, modified
Esempio n. 15
0
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.

    :param organization: name of the organization, or falsy for all
    :param include_sub_organizations: include datasets of organizations
        beneath the named one
    :raises p.toolkit.ObjectNotFound: if the named organization is unknown
    :returns: dict with 'table' rows, their 'columns' and the 'quarters'
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = (
        'autotheme',
        'co-prod3.dh.bytemark.co.uk',
        'Date format tidier',
        'current_revision_fixer',
        'current_revision_fixer2',
        'fix_contact_details.py',
        'Repoint 410 Gone to webarchive url',
        'Fix duplicate resources',
    )

    created = {'this': [], 'last': []}
    modified = {'this': [], 'last': []}

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization:
        pkgs = model.Session.query(model.Package)\
                .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # Earliest revision of the package = its creation.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Base queries for non-system revisions to the package, its
        # resources and its extras; filtered per quarter below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id == model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]
            # NOTE(review): a dataset created within a quarter is reported
            # only as 'created' there — its later modifications in the same
            # quarter are not listed; confirm this is intended.
            if quarter[0] < created_.revision_timestamp < quarter[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                prs = pr_q.filter(model.PackageRevision.revision_timestamp > quarter[0])\
                          .filter(model.PackageRevision.revision_timestamp < quarter[1])
                rrs = rr_q.filter(model.ResourceRevision.revision_timestamp > quarter[0])\
                          .filter(model.ResourceRevision.revision_timestamp < quarter[1])
                pes = pe_q.filter(model.PackageExtraRevision.revision_timestamp > quarter[0])\
                          .filter(model.PackageExtraRevision.revision_timestamp < quarter[1])
                # De-duplicated authors and revision dates across all three
                # revision kinds, space-joined for display.
                authors = ' '.join(
                    set([r[1].author
                         for r in prs] + [r[2].author for r in rrs] +
                        [r[2].author for r in pes]))
                dates = set([r[1].timestamp.date() for r in prs] +
                            [r[2].timestamp.date() for r in rrs] +
                            [r[2].timestamp.date() for r in pes])
                dates_formatted = ' '.join(
                    [date.isoformat() for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name, dates_formatted, authors,
                         published))

    # Flatten into one table: created rows then modified rows for each
    # quarter, each group sorted by dataset title.
    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes',
               'Modified or created', 'Quarter', 'Timestamp', 'Author',
               'Published')

    return {'table': datasets, 'columns': columns, 'quarters': quarters}
Esempio n. 16
0
def nii_report():
    '''A list of the NII datasets, grouped by publisher, with details of broken
    links and source.'''
    # Active datasets are flagged as part of the National Information
    # Infrastructure via a 'core-dataset' extra set to 'true'; order by
    # publisher title, then dataset title.
    query = (
        model.Session.query(model.Package)
        .join(model.PackageExtra,
              model.PackageExtra.package_id == model.Package.id)
        .join(model.Group, model.Package.owner_org == model.Group.id)
        .filter(model.PackageExtra.key == 'core-dataset')
        .filter(model.PackageExtra.value == 'true')
        .filter(model.Package.state == 'active')
        .order_by(model.Group.title, model.Package.title)
    )
    datasets = query.all()

    def _broken_resources(package_id):
        # (description, id) pairs for every active broken resource of the
        # given active package, per ckanext-archiver's link checks.
        from ckanext.archiver.model import Archival

        rows = (
            model.Session.query(Archival, model.Resource)
            .filter(Archival.package_id == package_id)
            .filter(Archival.is_broken == True)
            .join(model.Package, Archival.package_id == model.Package.id)
            .filter(model.Package.state == 'active')
            .join(model.Resource, Archival.resource_id == model.Resource.id)
            .filter(model.Resource.state == 'active')
        )
        return [(res.description, res.id) for _archival, res in rows.all()]

    table = []
    total_resources = 0
    total_broken_resources = 0
    total_broken_datasets = 0
    orgs_with_broken = set()
    orgs = set()
    for dataset in datasets:
        broken = _broken_resources(dataset.id)
        organization = dataset.get_organization()
        table.append({
            'name': dataset.name,
            'title': dataset.title,
            'dataset_notes': lib.dataset_notes(dataset),
            'organization_name': organization.name,
            'unpublished': p.toolkit.asbool(dataset.extras.get('unpublished')),
            'num_broken_resources': len(broken),
            'broken_resources': broken,
        })
        if broken:
            total_broken_resources += len(broken)
            total_broken_datasets += 1
            orgs_with_broken.add(organization.name)
        orgs.add(organization)
        total_resources += len(dataset.resources)

    org_tuples = [(o.name, o.title)
                  for o in sorted(orgs, key=lambda o: o.title)]

    return {
        'table': table,
        'organizations': org_tuples,
        'num_resources': total_resources,
        'num_datasets': len(datasets),
        'num_organizations': len(orgs),
        'num_broken_resources': total_broken_resources,
        'num_broken_datasets': total_broken_datasets,
        'num_broken_organizations': len(orgs_with_broken),
    }
Esempio n. 17
0
def broken_links_for_organization(organization, include_sub_organizations=False):
    '''
    Returns a dictionary detailing broken resource links for the given
    organization (optionally including its sub-organizations).

    params:
      organization - name of an organization
      include_sub_organizations - whether to also count datasets owned by
        organizations beneath the given one in the hierarchy

    Returns:
    {'organization_name': 'cabinet-office',
     'organization_title': 'Cabinet Office',
     'table': [
       {'package_name', 'package_title', 'resource_url', 'status', 'reason', 'last_success', 'first_failure', 'failure_count', 'last_updated'}
      ...]

    '''
    from ckanext.archiver.model import Archival

    org = model.Group.get(organization)
    if not org:
        raise p.toolkit.ObjectNotFound()

    # Capture these now - `org` is reused as a loop variable further down.
    name = org.name
    title = org.title

    # All broken-link archival records for active packages and resources;
    # the organization filter is applied next.
    archivals = model.Session.query(Archival, model.Package, model.Group).\
        filter(Archival.is_broken == True).\
        join(model.Package, Archival.package_id == model.Package.id).\
        filter(model.Package.state == 'active').\
        join(model.Resource, Archival.resource_id == model.Resource.id).\
        filter(model.Resource.state == 'active')

    if not include_sub_organizations:
        org_ids = [org.id]
        archivals = archivals.filter(model.Package.owner_org == org.id)
    else:
        # We want any organization_id that is part of this organization's tree
        org_ids = ['%s' % organization.id for organization in lib.go_down_tree(org)]
        archivals = archivals.filter(model.Package.owner_org.in_(org_ids))

    archivals = archivals.join(model.Group, model.Package.owner_org == model.Group.id)

    results = []

    for archival, pkg, org in archivals.all():
        # NOTE(review): the row's pkg is immediately re-fetched by id and the
        # outer `org` is shadowed here (name/title were saved above).
        pkg = model.Package.get(archival.package_id)
        resource = model.Resource.get(archival.resource_id)

        # Flag datasets that arrived via known external feeds.
        via = ''
        er = pkg.extras.get('external_reference', '')
        if er == 'ONSHUB':
            via = "Stats Hub"
        elif er.startswith("DATA4NR"):
            via = "Data4nr"

        # Report the resource as it was at archival time, so the URL shown
        # matches the one actually checked; fall back to the current
        # resource if no matching revision is found.
        archived_resource = model.Session.query(model.ResourceRevision)\
                            .filter_by(id=resource.id)\
                            .filter_by(revision_timestamp=archival.resource_timestamp)\
                            .first() or resource
        row_data = OrderedDict((
            ('dataset_title', pkg.title),
            ('dataset_name', pkg.name),
            ('dataset_notes', lib.dataset_notes(pkg)),
            ('organization_title', org.title),
            ('organization_name', org.name),
            ('resource_position', resource.position),
            ('resource_id', resource.id),
            ('resource_url', archived_resource.url),
            ('url_up_to_date', resource.url == archived_resource.url),
            ('via', via),
            ('first_failure', archival.first_failure.isoformat() if archival.first_failure else None),
            ('last_updated', archival.updated.isoformat() if archival.updated else None),
            ('last_success', archival.last_success.isoformat() if archival.last_success else None),
            ('url_redirected_to', archival.url_redirected_to),
            ('reason', archival.reason),
            ('status', archival.status),
            ('failure_count', archival.failure_count),
            ))

        results.append(row_data)

    # Distinct by package name, since one dataset may have several broken
    # resources.
    num_broken_packages = archivals.distinct(model.Package.name).count()
    num_broken_resources = len(results)

    # Get total number of packages & resources
    num_packages = model.Session.query(model.Package)\
                        .filter(model.Package.owner_org.in_(org_ids))\
                        .filter_by(state='active')\
                        .count()
    num_resources = model.Session.query(model.Resource)\
                         .filter_by(state='active')
    # CKAN <= 2.2 linked resources to packages via ResourceGroup.
    if p.toolkit.check_ckan_version(max_version='2.2.99'):
        num_resources = num_resources.join(model.ResourceGroup)
    num_resources = num_resources \
        .join(model.Package)\
        .filter(model.Package.owner_org.in_(org_ids))\
        .filter_by(state='active').count()

    return {'organization_name': name,
            'organization_title': title,
            'num_broken_packages': num_broken_packages,
            'num_broken_resources': num_broken_resources,
            'num_packages': num_packages,
            'num_resources': num_resources,
            'broken_package_percent': lib.percent(num_broken_packages, num_packages),
            'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
            'table': results}
Esempio n. 18
0
def publisher_activity(organization, include_sub_organizations=False):
    """
    Contains information about the datasets a specific organization has
    released in this and last quarter (calendar year). This is needed by
    departments for their quarterly transparency reports.

    params:
      organization - name of an organization, or a falsy value for all
        datasets
      include_sub_organizations - whether to also include datasets of
        organizations beneath the given one in the hierarchy

    Returns a dict with 'table' (created/modified activity rows),
    'columns' (the row headings) and 'quarters' (the date ranges used).
    """
    import datetime
    import ckan.model as model
    from paste.deploy.converters import asbool

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      'fix_secondary_theme.py',
                      'script-fix-links-tna',
                      )

    # Activity rows keyed by quarter name ('this'/'last', as produced by
    # get_quarter_dates).
    created = {'this': [], 'last': []}
    modified = {'this': [], 'last': []}

    now = datetime.datetime.now()
    quarters = get_quarter_dates(now)

    if organization:
        organization = model.Group.by_name(organization)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization:
        pkgs = model.Session.query(model.Package)\
                .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # The earliest PackageRevision marks the dataset's creation.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Non-system-author revisions of the package itself, its resources
        # and its extras; each is further filtered by timestamp per quarter
        # below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        rr_q = model.Session.query(model.Package, model.ResourceRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id == model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))

        for quarter_name in quarters:
            quarter = quarters[quarter_name]
            if quarter[0] < created_.revision_timestamp < quarter[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[quarter_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', quarter_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))
            else:
                # NOTE(review): because this branch is an `else`, a dataset
                # created within a quarter never gets 'modified' rows for
                # that same quarter - presumably deliberate (cf.
                # _get_activity, which instead excludes only the creation
                # revision itself).
                prs = pr_q.filter(model.PackageRevision.revision_timestamp > quarter[0])\
                          .filter(model.PackageRevision.revision_timestamp < quarter[1])
                rrs = rr_q.filter(model.ResourceRevision.revision_timestamp > quarter[0])\
                          .filter(model.ResourceRevision.revision_timestamp < quarter[1])
                pes = pe_q.filter(model.PackageExtraRevision.revision_timestamp > quarter[0])\
                          .filter(model.PackageExtraRevision.revision_timestamp < quarter[1])
                authors = ' '.join(set([r[1].author for r in prs] +
                                      [r[2].author for r in rrs] +
                                      [r[2].author for r in pes]))
                dates = set([r[1].timestamp.date() for r in prs] +
                            [r[2].timestamp.date() for r in rrs] +
                            [r[2].timestamp.date() for r in pes])
                dates_formatted = ' '.join([date.isoformat()
                                            for date in sorted(dates)])
                if authors:
                    published = not asbool(pkg.extras.get('unpublished'))
                    modified[quarter_name].append(
                        (pkg.name, pkg.title, lib.dataset_notes(pkg),
                         'modified', quarter_name,
                         dates_formatted, authors, published))

    # Created rows first within each quarter, each group sorted by title.
    datasets = []
    for quarter_name in quarters:
        datasets += sorted(created[quarter_name], key=lambda x: x[1])
        datasets += sorted(modified[quarter_name], key=lambda x: x[1])
    columns = ('Dataset name', 'Dataset title', 'Dataset notes', 'Modified or created', 'Quarter', 'Timestamp', 'Author', 'Published')

    return {'table': datasets, 'columns': columns,
            'quarters': quarters}
Esempio n. 19
0
def nii_report():
    '''A list of the NII datasets, grouped by publisher, with details of broken
    links and source.'''
    # Datasets belong to the National Information Infrastructure when their
    # 'core-dataset' extra is 'true'; only active datasets are reported.
    nii_query = model.Session.query(model.Package)\
        .join(model.PackageExtra,
              model.PackageExtra.package_id == model.Package.id)\
        .join(model.Group, model.Package.owner_org == model.Group.id)\
        .filter(model.PackageExtra.key == 'core-dataset')\
        .filter(model.PackageExtra.value == 'true')\
        .filter(model.Package.state == 'active')
    nii_packages = nii_query.order_by(model.Group.title,
                                      model.Package.title).all()

    def get_broken_resources(package_id):
        # Broken-link details come from ckanext-archiver's Archival table;
        # only active packages/resources are considered.
        from ckanext.archiver.model import Archival

        q = model.Session.query(Archival, model.Resource)\
            .filter(Archival.package_id == package_id)\
            .filter(Archival.is_broken == True)\
            .join(model.Package, Archival.package_id == model.Package.id)\
            .filter(model.Package.state == 'active')\
            .join(model.Resource, Archival.resource_id == model.Resource.id)\
            .filter(model.Resource.state == 'active')
        return [(res.description, res.id) for _arch, res in q.all()]

    report_rows = []
    resource_count = 0
    broken_resource_count = 0
    broken_dataset_count = 0
    broken_org_names = set()
    organizations = set()
    for package in nii_packages:
        broken = get_broken_resources(package.id)
        org = package.get_organization()
        report_rows.append({
            'name': package.name,
            'title': package.title,
            'dataset_notes': lib.dataset_notes(package),
            'organization_name': org.name,
            'unpublished': p.toolkit.asbool(package.extras.get('unpublished')),
            'num_broken_resources': len(broken),
            'broken_resources': broken,
            })
        if broken:
            broken_resource_count += len(broken)
            broken_dataset_count += 1
            broken_org_names.add(org.name)
        organizations.add(org)
        resource_count += len(package.resources)

    org_tuples = [(org.name, org.title) for org in
                  sorted(organizations, key=lambda o: o.title)]

    return {'table': report_rows,
            'organizations': org_tuples,
            'num_resources': resource_count,
            'num_datasets': len(nii_packages),
            'num_organizations': len(organizations),
            'num_broken_resources': broken_resource_count,
            'num_broken_datasets': broken_dataset_count,
            'num_broken_organizations': len(broken_org_names),
            }
Esempio n. 20
0
def nii_report():
    """A list of the NII datasets, grouped by publisher, with details of broken
    links and source."""
    nii_dataset_q = (
        model.Session.query(model.Package)
        .join(model.PackageExtra, model.PackageExtra.package_id == model.Package.id)
        .join(model.Group, model.Package.owner_org == model.Group.id)
        .filter(model.PackageExtra.key == "core-dataset")
        .filter(model.PackageExtra.value == "true")
        .filter(model.Package.state == "active")
    )
    nii_dataset_objects = nii_dataset_q.order_by(model.Group.title, model.Package.title).all()

    def broken_resources_for_package(package_id):
        from ckanext.archiver.model import Archival

        results = (
            model.Session.query(Archival, model.Resource)
            .filter(Archival.package_id == package_id)
            .filter(Archival.is_broken == True)
            .join(model.Package, Archival.package_id == model.Package.id)
            .filter(model.Package.state == "active")
            .join(model.Resource, Archival.resource_id == model.Resource.id)
            .filter(model.Resource.state == "active")
        )

        broken_resources = [(resource.description, resource.id) for archival, resource in results.all()]
        return broken_resources

    nii_dataset_details = []
    num_resources = 0
    num_broken_resources = 0
    num_broken_datasets = 0
    broken_organization_names = set()
    nii_organizations = set()
    for dataset_object in nii_dataset_objects:
        broken_resources = broken_resources_for_package(dataset_object.id)
        org = dataset_object.get_organization()
        dataset_details = {
            "name": dataset_object.name,
            "title": dataset_object.title,
            "dataset_notes": lib.dataset_notes(dataset_object),
            "organization_name": org.name,
            "unpublished": p.toolkit.asbool(dataset_object.extras.get("unpublished")),
            "num_broken_resources": len(broken_resources),
            "broken_resources": broken_resources,
        }
        nii_dataset_details.append(dataset_details)
        if broken_resources:
            num_broken_resources += len(broken_resources)
            num_broken_datasets += 1
            broken_organization_names.add(org.name)
        nii_organizations.add(org)
        num_resources += len(dataset_object.resources)

    org_tuples = [(org.name, org.title) for org in sorted(nii_organizations, key=lambda o: o.title)]

    return {
        "table": nii_dataset_details,
        "organizations": org_tuples,
        "num_resources": num_resources,
        "num_datasets": len(nii_dataset_objects),
        "num_organizations": len(nii_organizations),
        "num_broken_resources": num_broken_resources,
        "num_broken_datasets": num_broken_datasets,
        "num_broken_organizations": len(broken_organization_names),
    }
Esempio n. 21
0
def _get_activity(organization_name, include_sub_organizations, periods):
    '''Collects dataset creation/modification activity for each time period.

    params:
      organization_name - name of an organization, or a falsy value for all
        datasets
      include_sub_organizations - whether to also include datasets of
        organizations beneath the given one in the hierarchy
      periods - mapping of period name to a (start, end) datetime pair

    Returns a (created, modified) pair of dicts, each keyed by period name
    and holding row tuples of:
      (name, title, notes, 'created'|'modified', period name,
       timestamp(s), author(s), published)
    '''
    import ckan.model as model
    from paste.deploy.converters import asbool

    created = dict((period_name, []) for period_name in periods)
    modified = dict((period_name, []) for period_name in periods)

    # These are the authors whose revisions we ignore, as they are trivial
    # changes. NB we do want to know about revisions by:
    # * harvest (harvested metadata)
    # * dgu (NS Stat Hub imports)
    # * Fix national indicators
    system_authors = ('autotheme', 'co-prod3.dh.bytemark.co.uk',
                      'Date format tidier', 'current_revision_fixer',
                      'current_revision_fixer2', 'fix_contact_details.py',
                      'Repoint 410 Gone to webarchive url',
                      'Fix duplicate resources',
                      'fix_secondary_theme.py',
                      )
    system_author_template = 'script%'  # "%" is a wildcard

    if organization_name:
        organization = model.Group.by_name(organization_name)
        if not organization:
            raise p.toolkit.ObjectNotFound()

    if not organization_name:
        pkgs = model.Session.query(model.Package)\
                    .all()
    else:
        pkgs = model.Session.query(model.Package)
        pkgs = lib.filter_by_organizations(pkgs, organization,
                                           include_sub_organizations).all()

    for pkg in pkgs:
        # The earliest PackageRevision marks the dataset's creation.
        created_ = model.Session.query(model.PackageRevision)\
            .filter(model.PackageRevision.id == pkg.id) \
            .order_by("revision_timestamp asc").first()

        # Non-system-author revisions of the package itself, its resources
        # and its extras; 'script%' excludes script-generated authors too.
        # Each query is further filtered by timestamp per period below.
        pr_q = model.Session.query(model.PackageRevision, model.Revision)\
            .filter(model.PackageRevision.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors)) \
            .filter(~model.Revision.author.like(system_author_template))
        rr_q = model.Session.query(model.Package, model.ResourceRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.ResourceGroup)\
            .join(model.ResourceRevision,
                  model.ResourceGroup.id == model.ResourceRevision.resource_group_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))
        pe_q = model.Session.query(model.Package, model.PackageExtraRevision, model.Revision)\
            .filter(model.Package.id == pkg.id)\
            .filter_by(state='active')\
            .join(model.PackageExtraRevision,
                  model.Package.id == model.PackageExtraRevision.package_id)\
            .join(model.Revision)\
            .filter(~model.Revision.author.in_(system_authors))\
            .filter(~model.Revision.author.like(system_author_template))

        for period_name in periods:
            period = periods[period_name]
            # created
            if period[0] < created_.revision_timestamp < period[1]:
                published = not asbool(pkg.extras.get('unpublished'))
                created[period_name].append(
                    (created_.name, created_.title, lib.dataset_notes(pkg),
                     'created', period_name,
                     created_.revision_timestamp.isoformat(),
                     created_.revision.author, published))

            # modified
            # exclude the creation revision
            period_start = max(period[0], created_.revision_timestamp)
            prs = pr_q.filter(model.PackageRevision.revision_timestamp > period_start)\
                        .filter(model.PackageRevision.revision_timestamp < period[1])
            rrs = rr_q.filter(model.ResourceRevision.revision_timestamp > period_start)\
                        .filter(model.ResourceRevision.revision_timestamp < period[1])
            pes = pe_q.filter(model.PackageExtraRevision.revision_timestamp > period_start)\
                        .filter(model.PackageExtraRevision.revision_timestamp < period[1])
            authors = ' '.join(set([r[1].author for r in prs] +
                                   [r[2].author for r in rrs] +
                                   [r[2].author for r in pes]))
            dates = set([r[1].timestamp.date() for r in prs] +
                        [r[2].timestamp.date() for r in rrs] +
                        [r[2].timestamp.date() for r in pes])
            dates_formatted = ' '.join([date.isoformat()
                                        for date in sorted(dates)])
            if authors:
                published = not asbool(pkg.extras.get('unpublished'))
                modified[period_name].append(
                    (pkg.name, pkg.title, lib.dataset_notes(pkg),
                        'modified', period_name,
                        dates_formatted, authors, published))
    return created, modified