def _run_field_counts(name, collection_id): logger = logging.getLogger('ocdskingfisher.summarize.field-counts') logger.info('Processing collection ID %s', collection_id) start = time() db = Database() db.set_search_path([name, 'public']) db.execute_values( 'INSERT INTO field_counts VALUES %s', db.all( """ /* kingfisher-summarize field-counts */ SELECT collection_id, release_type, path, sum(object_property) object_property, sum(array_item) array_count, count(distinct id) distinct_releases FROM release_summary CROSS JOIN flatten(release) WHERE collection_id = %(id)s GROUP BY collection_id, release_type, path """, {'id': collection_id})) db.commit() logger.info('Collection ID %s: %ss', collection_id, time() - start)
def _run_file(name, identifier, content): logger = logging.getLogger('ocdskingfisher.summarize.summary-tables') logger.info(f'Processing {identifier}') start = time() db = Database() db.set_search_path([name, 'public']) db.execute(f'/* kingfisher-summarize {identifier} */\n' + content) db.commit() logger.info('%s: %ss', identifier, time() - start)
def cli(ctx): load_dotenv() path = os.path.join(click.get_app_dir('Kingfisher Summarize'), 'logging.json') if os.path.isfile(path): with open(path) as f: logging.config.dictConfig(json.load(f)) # Python's root logger only prints warning and above. else: logging.basicConfig( format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO) logger = logging.getLogger('ocdskingfisher.summarize.cli') logger.info('Running %s', ctx.invoked_subcommand) global db db = Database()
def db(): return Database()
def _run_field_lists(name, table, tables_only): logger = logging.getLogger('ocdskingfisher.summarize.field-lists') logger.info(f'Processing {table.name}') start = time() db = Database() db.set_search_path([name, 'public']) summary_table = table.name field_list_table = f'{summary_table}_field_list' no_field_list_table = f'{summary_table}_no_field_list' if tables_only: no_field_list_type = sql.SQL('TABLE') final_summary_type = sql.SQL('TABLE') else: no_field_list_type = sql.SQL('TABLE' if table.is_table else 'VIEW') final_summary_type = sql.SQL('VIEW') # Create a *_field_list table, add a unique index, rename the *_summary table to *_no_field_list, and re-create # the *_summary table. Then, copy the comments from the old to the new *_summary table. # Use jsonb_object_agg instead of array_agg so that paths are unique, and so that queries against the field use the # faster "in" operator for objects (?&) than for arrays (@>). statement = """ CREATE TABLE {field_list_table} AS SELECT {primary_keys}, jsonb_object_agg(path, NULL) AS field_list FROM {summary_table} CROSS JOIN flatten({summary_table}.{data_column}) GROUP BY {primary_keys} """ db.execute(statement, summary_table=summary_table, field_list_table=field_list_table, data_column=table.data_column, primary_keys=table.primary_keys) statement = 'CREATE UNIQUE INDEX {index} ON {field_list_table}({primary_keys})' db.execute(statement, index=f'{field_list_table}_id', field_list_table=field_list_table, primary_keys=table.primary_keys) statement = 'ALTER {no_field_list_type} {summary_table} RENAME TO {no_field_list_table}' db.execute(statement, no_field_list_type=no_field_list_type, summary_table=summary_table, no_field_list_table=no_field_list_table) statement = """ CREATE {final_summary_type} {summary_table} AS SELECT {no_field_list_table}.*, {field_list_table}.field_list FROM {no_field_list_table} JOIN {field_list_table} USING ({primary_keys}) """ db.execute(statement, final_summary_type=final_summary_type, summary_table=summary_table, no_field_list_table=no_field_list_table, field_list_table=field_list_table, primary_keys=table.primary_keys) for row in db.all(COLUMN_COMMENTS_SQL, { 'schema': name, 'table': f'{table.name}_no_field_list' }): statement = 'COMMENT ON COLUMN {table}.{column} IS %(comment)s' db.execute(statement, {'comment': row[2]}, table=table.name, column=row[0]) comment = f'All JSON paths in the {table.data_column} object, excluding array indices, expressed as a JSONB ' \ 'object in which keys are paths and values are NULL. This column is only available if the --field-' \ 'lists option was used.' db.execute('COMMENT ON COLUMN {table}.field_list IS %(comment)s', {'comment': comment}, table=table.name) db.commit() logger.info('%s: %ss', table.name, time() - start)
def _run_field_lists(name, summary_table, tables_only): logger = logging.getLogger('ocdskingfisher.summarize.field-lists') logger.info('Processing %s', summary_table) start = time() db = Database() db.set_search_path([name, 'public']) table = SUMMARIES[summary_table] variables = {} if tables_only: no_field_list_type = sql.SQL('TABLE') final_summary_type = sql.SQL('TABLE') else: no_field_list_type = sql.SQL('TABLE' if table.is_table else 'VIEW') final_summary_type = sql.SQL('VIEW') field_list_table = f'{summary_table}_field_list' format_kwargs = { 'summary_table': summary_table, 'field_list_table': field_list_table, 'no_field_list_table': f'{summary_table}_no_field_list', 'no_field_list_type': no_field_list_type, 'final_summary_type': final_summary_type, 'data_column': table.data_column, 'primary_keys': table.primary_keys, 'qualified_primary_keys': [(summary_table, field) for field in table.primary_keys], 'index': f'{field_list_table}_id', } counts_per_path_select = """ SELECT {primary_keys}, path, GREATEST(sum(array_item), sum(object_property)) path_count FROM {summary_table} CROSS JOIN flatten({summary_table}.{data_column}) GROUP BY {primary_keys}, path """ # Allow users to measure co-occurrence of fields across related award and contract objects. if summary_table in ('contracts_summary', 'awards_summary'): if summary_table == 'contracts_summary': variables['path_prefix'] = 'awards' format_kwargs['other_data_column'] = 'award' else: variables['path_prefix'] = 'contracts' format_kwargs['other_data_column'] = 'contract' counts_per_path_select += """ UNION ALL SELECT {qualified_primary_keys}, %(path_prefix)s || '/' || path AS path, GREATEST(sum(array_item), sum(object_property)) path_count FROM awards_summary JOIN contracts_summary ON awards_summary.id = contracts_summary.id AND awards_summary.award_id = contracts_summary.awardid CROSS JOIN flatten({other_data_column}) GROUP BY {qualified_primary_keys}, path UNION ALL SELECT {qualified_primary_keys}, %(path_prefix)s AS path, count(*) path_count FROM awards_summary JOIN contracts_summary ON awards_summary.id = contracts_summary.id AND awards_summary.award_id = contracts_summary.awardid GROUP BY {qualified_primary_keys} """ # Create a *_field_list table, add a unique index, rename the *_summary table to *_no_field_list, and re-create # the *_summary table. Then, copy the comments from the old to the new *_summary table. statement = """ CREATE TABLE {field_list_table} AS WITH path_counts AS ( INNER_SELECT ) SELECT {primary_keys}, jsonb_object_agg(path, path_count) AS field_list FROM path_counts GROUP BY {primary_keys} """.replace('INNER_SELECT', counts_per_path_select) db.execute(statement, variables=variables, **format_kwargs) statement = 'CREATE UNIQUE INDEX {index} ON {field_list_table}({primary_keys})' db.execute(statement, **format_kwargs) statement = 'ALTER {no_field_list_type} {summary_table} RENAME TO {no_field_list_table}' db.execute(statement, **format_kwargs) statement = """ CREATE {final_summary_type} {summary_table} AS SELECT {no_field_list_table}.*, {field_list_table}.field_list FROM {no_field_list_table} JOIN {field_list_table} USING ({primary_keys}) """ db.execute(statement, **format_kwargs) for row in db.all(COLUMN_COMMENTS_SQL, { 'schema': name, 'table': f'{summary_table}_no_field_list' }): statement = 'COMMENT ON COLUMN {table}.{column} IS %(comment)s' db.execute(statement, {'comment': row[2]}, table=summary_table, column=row[0]) if summary_table == 'contracts_summary': comment = f"All JSON paths in the {table.data_column} object as well as in the related award's " \ f"{format_kwargs['other_data_column']} object (prefixed by {variables['path_prefix']}/), " \ "expressed as a JSONB object in which keys are paths and values are numbers of occurrences. " \ "Paths exclude array indices." elif summary_table == 'awards_summary': comment = f"All JSON paths in the {table.data_column} object as well as in the related contracts' " \ f"{format_kwargs['other_data_column']} object (prefixed by {variables['path_prefix']}/), " \ "expressed as a JSONB object in which keys are paths and values are numbers of occurrences. " \ "Paths exclude array indices." else: comment = f"All JSON paths in the {table.data_column} object, expressed as a JSONB object in which keys are " \ "paths and values are numbers of occurrences. Paths exclude array indices." comment += 'This column is only available if the --field-lists option is used.' db.execute('COMMENT ON COLUMN {table}.field_list IS %(comment)s', {'comment': comment}, table=summary_table) db.commit() logger.info('%s: %ss', summary_table, time() - start) return summary_table
def migrate(): db = Database() db.execute('CREATE SCHEMA IF NOT EXISTS summaries') db.set_search_path(['summaries']) db.execute("""CREATE TABLE IF NOT EXISTS selected_collections (schema TEXT NOT NULL, collection_id INTEGER NOT NULL)""") db.execute( """CREATE UNIQUE INDEX IF NOT EXISTS selected_collections_schema_collection_id ON selected_collections (schema, collection_id)""") db.commit() for schema in db.schemas(): db.set_search_path([schema]) try: collections = db.pluck('SELECT id FROM selected_collections') except UndefinedTable as e: print(f'ERROR: undefined {schema}.selected_collections table: {e}', file=sys.stderr) db.connection.rollback() continue db.execute_values( 'INSERT INTO summaries.selected_collections (schema, collection_id) VALUES %s', [( schema, _id, ) for _id in collections]) db.commit()