Example #1
0
def _run_field_counts(name, collection_id):
    logger = logging.getLogger('ocdskingfisher.summarize.field-counts')
    logger.info('Processing collection ID %s', collection_id)

    start = time()

    db = Database()
    db.set_search_path([name, 'public'])

    db.execute_values(
        'INSERT INTO field_counts VALUES %s',
        db.all(
            """
        /* kingfisher-summarize field-counts */

        SELECT
            collection_id,
            release_type,
            path,
            sum(object_property) object_property,
            sum(array_item) array_count,
            count(distinct id) distinct_releases
        FROM
            release_summary
        CROSS JOIN
            flatten(release)
        WHERE
            collection_id = %(id)s
        GROUP BY collection_id, release_type, path
    """, {'id': collection_id}))
    db.commit()

    logger.info('Collection ID %s: %ss', collection_id, time() - start)
Example #2
0
def _run_file(name, identifier, content):
    logger = logging.getLogger('ocdskingfisher.summarize.summary-tables')
    logger.info(f'Processing {identifier}')

    start = time()

    db = Database()
    db.set_search_path([name, 'public'])

    db.execute(f'/* kingfisher-summarize {identifier} */\n' + content)
    db.commit()

    logger.info('%s: %ss', identifier, time() - start)
Example #3
0
def cli(ctx):
    load_dotenv()

    path = os.path.join(click.get_app_dir('Kingfisher Summarize'),
                        'logging.json')
    if os.path.isfile(path):
        with open(path) as f:
            logging.config.dictConfig(json.load(f))
    # Python's root logger only prints warning and above.
    else:
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO)

    logger = logging.getLogger('ocdskingfisher.summarize.cli')
    logger.info('Running %s', ctx.invoked_subcommand)

    global db
    db = Database()
Example #4
0
def db():
    return Database()
Example #5
0
def _run_field_lists(name, table, tables_only):
    logger = logging.getLogger('ocdskingfisher.summarize.field-lists')
    logger.info(f'Processing {table.name}')

    start = time()

    db = Database()
    db.set_search_path([name, 'public'])

    summary_table = table.name
    field_list_table = f'{summary_table}_field_list'
    no_field_list_table = f'{summary_table}_no_field_list'

    if tables_only:
        no_field_list_type = sql.SQL('TABLE')
        final_summary_type = sql.SQL('TABLE')
    else:
        no_field_list_type = sql.SQL('TABLE' if table.is_table else 'VIEW')
        final_summary_type = sql.SQL('VIEW')

    # Create a *_field_list table, add a unique index, rename the *_summary table to *_no_field_list, and re-create
    # the *_summary table. Then, copy the comments from the old to the new *_summary table.

    # Use jsonb_object_agg instead of array_agg so that paths are unique, and so that queries against the field use the
    # faster "in" operator for objects (?&) than for arrays (@>).
    statement = """
        CREATE TABLE {field_list_table} AS
        SELECT
            {primary_keys},
            jsonb_object_agg(path, NULL) AS field_list
        FROM
            {summary_table}
        CROSS JOIN
            flatten({summary_table}.{data_column})
        GROUP BY
            {primary_keys}
    """
    db.execute(statement,
               summary_table=summary_table,
               field_list_table=field_list_table,
               data_column=table.data_column,
               primary_keys=table.primary_keys)

    statement = 'CREATE UNIQUE INDEX {index} ON {field_list_table}({primary_keys})'
    db.execute(statement,
               index=f'{field_list_table}_id',
               field_list_table=field_list_table,
               primary_keys=table.primary_keys)

    statement = 'ALTER {no_field_list_type} {summary_table} RENAME TO {no_field_list_table}'
    db.execute(statement,
               no_field_list_type=no_field_list_type,
               summary_table=summary_table,
               no_field_list_table=no_field_list_table)

    statement = """
        CREATE {final_summary_type} {summary_table} AS
        SELECT
            {no_field_list_table}.*,
            {field_list_table}.field_list
        FROM
            {no_field_list_table}
        JOIN
            {field_list_table} USING ({primary_keys})
    """
    db.execute(statement,
               final_summary_type=final_summary_type,
               summary_table=summary_table,
               no_field_list_table=no_field_list_table,
               field_list_table=field_list_table,
               primary_keys=table.primary_keys)

    for row in db.all(COLUMN_COMMENTS_SQL, {
            'schema': name,
            'table': f'{table.name}_no_field_list'
    }):
        statement = 'COMMENT ON COLUMN {table}.{column} IS %(comment)s'
        db.execute(statement, {'comment': row[2]},
                   table=table.name,
                   column=row[0])

    comment = f'All JSON paths in the {table.data_column} object, excluding array indices, expressed as a JSONB ' \
              'object in which keys are paths and values are NULL. This column is only available if the --field-' \
              'lists option was used.'
    db.execute('COMMENT ON COLUMN {table}.field_list IS %(comment)s',
               {'comment': comment},
               table=table.name)

    db.commit()

    logger.info('%s: %ss', table.name, time() - start)
def _run_field_lists(name, summary_table, tables_only):
    logger = logging.getLogger('ocdskingfisher.summarize.field-lists')
    logger.info('Processing %s', summary_table)

    start = time()

    db = Database()
    db.set_search_path([name, 'public'])

    table = SUMMARIES[summary_table]

    variables = {}

    if tables_only:
        no_field_list_type = sql.SQL('TABLE')
        final_summary_type = sql.SQL('TABLE')
    else:
        no_field_list_type = sql.SQL('TABLE' if table.is_table else 'VIEW')
        final_summary_type = sql.SQL('VIEW')

    field_list_table = f'{summary_table}_field_list'
    format_kwargs = {
        'summary_table':
        summary_table,
        'field_list_table':
        field_list_table,
        'no_field_list_table':
        f'{summary_table}_no_field_list',
        'no_field_list_type':
        no_field_list_type,
        'final_summary_type':
        final_summary_type,
        'data_column':
        table.data_column,
        'primary_keys':
        table.primary_keys,
        'qualified_primary_keys':
        [(summary_table, field) for field in table.primary_keys],
        'index':
        f'{field_list_table}_id',
    }

    counts_per_path_select = """
        SELECT
            {primary_keys},
            path,
            GREATEST(sum(array_item), sum(object_property)) path_count
        FROM
            {summary_table}
        CROSS JOIN
            flatten({summary_table}.{data_column})
        GROUP BY
            {primary_keys}, path
    """

    # Allow users to measure co-occurrence of fields across related award and contract objects.
    if summary_table in ('contracts_summary', 'awards_summary'):
        if summary_table == 'contracts_summary':
            variables['path_prefix'] = 'awards'
            format_kwargs['other_data_column'] = 'award'
        else:
            variables['path_prefix'] = 'contracts'
            format_kwargs['other_data_column'] = 'contract'

        counts_per_path_select += """
        UNION ALL

        SELECT
            {qualified_primary_keys},
            %(path_prefix)s || '/' || path AS path,
            GREATEST(sum(array_item), sum(object_property)) path_count
        FROM
            awards_summary
        JOIN
            contracts_summary
        ON
            awards_summary.id = contracts_summary.id AND
            awards_summary.award_id = contracts_summary.awardid
        CROSS JOIN
            flatten({other_data_column})
        GROUP BY
            {qualified_primary_keys}, path

        UNION ALL

        SELECT
            {qualified_primary_keys},
            %(path_prefix)s AS path,
            count(*) path_count
        FROM
            awards_summary
        JOIN
            contracts_summary
        ON
            awards_summary.id = contracts_summary.id AND
            awards_summary.award_id = contracts_summary.awardid
        GROUP BY {qualified_primary_keys}
    """

    # Create a *_field_list table, add a unique index, rename the *_summary table to *_no_field_list, and re-create
    # the *_summary table. Then, copy the comments from the old to the new *_summary table.
    statement = """
        CREATE TABLE {field_list_table} AS
        WITH path_counts AS (
            INNER_SELECT
        )
        SELECT
            {primary_keys},
            jsonb_object_agg(path, path_count) AS field_list
        FROM
            path_counts
        GROUP BY
            {primary_keys}
    """.replace('INNER_SELECT', counts_per_path_select)
    db.execute(statement, variables=variables, **format_kwargs)

    statement = 'CREATE UNIQUE INDEX {index} ON {field_list_table}({primary_keys})'
    db.execute(statement, **format_kwargs)

    statement = 'ALTER {no_field_list_type} {summary_table} RENAME TO {no_field_list_table}'
    db.execute(statement, **format_kwargs)

    statement = """
        CREATE {final_summary_type} {summary_table} AS
        SELECT
            {no_field_list_table}.*,
            {field_list_table}.field_list
        FROM
            {no_field_list_table}
        JOIN
            {field_list_table} USING ({primary_keys})
    """
    db.execute(statement, **format_kwargs)

    for row in db.all(COLUMN_COMMENTS_SQL, {
            'schema': name,
            'table': f'{summary_table}_no_field_list'
    }):
        statement = 'COMMENT ON COLUMN {table}.{column} IS %(comment)s'
        db.execute(statement, {'comment': row[2]},
                   table=summary_table,
                   column=row[0])

    if summary_table == 'contracts_summary':
        comment = f"All JSON paths in the {table.data_column} object as well as in the related award's " \
                  f"{format_kwargs['other_data_column']} object (prefixed by {variables['path_prefix']}/), " \
                  "expressed as a JSONB object in which keys are paths and values are numbers of occurrences. " \
                  "Paths exclude array indices."
    elif summary_table == 'awards_summary':
        comment = f"All JSON paths in the {table.data_column} object as well as in the related contracts' " \
                  f"{format_kwargs['other_data_column']} object (prefixed by {variables['path_prefix']}/), " \
                  "expressed as a JSONB object in which keys are paths and values are numbers of occurrences. " \
                  "Paths exclude array indices."
    else:
        comment = f"All JSON paths in the {table.data_column} object, expressed as a JSONB object in which keys are " \
                  "paths and values are numbers of occurrences. Paths exclude array indices."
    comment += 'This column is only available if the --field-lists option is used.'
    db.execute('COMMENT ON COLUMN {table}.field_list IS %(comment)s',
               {'comment': comment},
               table=summary_table)

    db.commit()

    logger.info('%s: %ss', summary_table, time() - start)
    return summary_table
def migrate():
    db = Database()

    db.execute('CREATE SCHEMA IF NOT EXISTS summaries')
    db.set_search_path(['summaries'])
    db.execute("""CREATE TABLE IF NOT EXISTS selected_collections
                  (schema TEXT NOT NULL, collection_id INTEGER NOT NULL)""")
    db.execute(
        """CREATE UNIQUE INDEX IF NOT EXISTS selected_collections_schema_collection_id
                  ON selected_collections (schema, collection_id)""")
    db.commit()

    for schema in db.schemas():
        db.set_search_path([schema])
        try:
            collections = db.pluck('SELECT id FROM selected_collections')
        except UndefinedTable as e:
            print(f'ERROR: undefined {schema}.selected_collections table: {e}',
                  file=sys.stderr)
            db.connection.rollback()
            continue
        db.execute_values(
            'INSERT INTO summaries.selected_collections (schema, collection_id) VALUES %s',
            [(
                schema,
                _id,
            ) for _id in collections])
        db.commit()