import datetime
import multiprocessing

import psycopg2
import psycopg2.errors
from decouple import config

# ``config`` is assumed to come from python-decouple; ``database_connect``,
# ``_get_shared_cols``, ``get_create_ext_query``, ``get_fdw_query``,
# ``get_copy_data_query``, ``clean_image_data``, ``_generate_indices``,
# ``_generate_constraints``, ``_update_progress``, ``slack``, ``log``,
# ``ApproachType``, and the ``UPSTREAM_DB_*`` / ``RELATIVE_UPSTREAM_DB_*``
# constants are defined or imported elsewhere in this package.


def refresh_api_table(
    table: str,
    progress: multiprocessing.Value = None,
    approach: ApproachType = "advanced",
):
    """
    Import updates from the upstream catalog database into the API.

    The process involves the following steps:

    1. Get the list of overlapping columns: ``_get_shared_cols``
    2. Create the FDW extension if it does not exist
    3. Create the FDW for the data transfer: ``get_fdw_query``
    4. Import data into a temporary table: ``get_copy_data_query``
    5. Clean the data: ``clean_image_data``

    This is the main function of this module.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param approach: whether to use advanced logic specific to media ingestion
    """
    # Step 1: Get the list of overlapping columns
    slack.info(
        f"`{table}`: Starting data refresh | _Next: copying data from upstream_"
    )
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname=UPSTREAM_DB_NAME,
        user=UPSTREAM_DB_USER,
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5,
    )
    shared_cols = _get_shared_cols(downstream_db, upstream_db, table)
    upstream_db.close()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 2: Create the FDW extension if it does not exist
        log.info("(Re)initializing foreign data wrapper")
        try:
            create_ext = get_create_ext_query()
            downstream_cur.execute(create_ext)
        except psycopg2.errors.UniqueViolation:
            log.warning("Extension already exists, possible race condition.")

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 3: Create the FDW for the data transfer
        init_fdw = get_fdw_query(
            RELATIVE_UPSTREAM_DB_HOST,
            RELATIVE_UPSTREAM_DB_PORT,
            UPSTREAM_DB_NAME,
            UPSTREAM_DB_USER,
            UPSTREAM_DB_PASSWORD,
            f"{table}_view",
        )
        downstream_cur.execute(init_fdw)

        # Step 4: Import data into a temporary table
        log.info("Copying upstream data...")
        environment = config("ENVIRONMENT", default="local").lower()
        limit_default = 100_000
        if environment in {"prod", "production"}:
            # In production, turn off the limit unless it is explicitly provided
            limit_default = 0
        limit = config("DATA_REFRESH_LIMIT", cast=int, default=limit_default)
        copy_data = get_copy_data_query(
            table, shared_cols, approach=approach, limit=limit
        )
        log.info(f"Running copy-data query: \n{copy_data.as_string(downstream_cur)}")
        downstream_cur.execute(copy_data)

    next_step = (
        "_Next: {starting data cleaning}_"
        if table == "image"
        else "Finished refreshing table"
    )
    slack.verbose(f"`{table}`: Data copy complete | {next_step}")

    if table == "image":
        # Step 5: Clean the data
        log.info("Cleaning data...")
        clean_image_data(table)
        log.info("Cleaning completed!")
        slack.verbose(
            f"`{table}`: Data cleaning complete | Finished refreshing table"
        )

    downstream_db.close()
    log.info(f"Finished refreshing table '{table}'.")
    _update_progress(progress, 100.0)
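# --- Usage sketch (illustrative, not part of the original module) ---
# How a caller might drive ``refresh_api_table`` from a parent process while
# reading the shared ``progress`` value. The worker/polling pattern and the
# helper name ``_example_run_refresh`` are assumptions; the module's real
# entry point lives elsewhere. ``"d"`` is the double-precision typecode,
# matching the 0.0-100.0 scale that ``_update_progress`` writes.
def _example_run_refresh(table="image"):
    import time

    progress = multiprocessing.Value("d", 0.0)
    worker = multiprocessing.Process(
        target=refresh_api_table, args=(table, progress)
    )
    worker.start()
    while worker.is_alive():
        # Poll the shared value; the refresh sets it to 100.0 on completion.
        log.info(f"`{table}` refresh progress: {progress.value:.1f}%")
        time.sleep(10)
    worker.join()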
def reload_upstream(table, progress=None, finish_time=None):
    """
    Import updates from the upstream CC Catalog database into the API.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param finish_time: multiprocessing.Value int for sharing finish timestamp
    :return:
    """
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname='openledger',
        user='******',
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5
    )
    query_cols = ','.join(_get_shared_cols(downstream_db, upstream_db, table))
    upstream_db.close()

    # Connect to upstream database and create references to foreign tables.
    log.info('(Re)initializing foreign data wrapper')
    init_fdw = '''
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS upstream CASCADE;
        CREATE SERVER upstream FOREIGN DATA WRAPPER postgres_fdw
          OPTIONS (host '{host}', dbname 'openledger', port '{port}');

        CREATE USER MAPPING IF NOT EXISTS FOR deploy SERVER upstream
          OPTIONS (user 'deploy', password '{passwd}');
        DROP SCHEMA IF EXISTS upstream_schema CASCADE;
        CREATE SCHEMA upstream_schema AUTHORIZATION deploy;

        IMPORT FOREIGN SCHEMA public LIMIT TO ({table})
          FROM SERVER upstream INTO upstream_schema;
    '''.format(host=UPSTREAM_DB_HOST, passwd=UPSTREAM_DB_PASSWORD,
               table=table, port=UPSTREAM_DB_PORT)

    # 1. Import data into a temporary table.
    # 2. Recreate indices from the original table.
    # 3. Recreate constraints from the original table.
    # 4. Delete orphaned foreign key references.
    # 5. Clean the data.
    # 6. Promote the temporary table and delete the original.
    copy_data = '''
        DROP TABLE IF EXISTS temp_import_{table};
        CREATE TABLE temp_import_{table} (LIKE {table} INCLUDING CONSTRAINTS);
        INSERT INTO temp_import_{table} ({cols})
          SELECT {cols} FROM upstream_schema.{table};
        ALTER TABLE temp_import_{table} ADD PRIMARY KEY (id);
        DROP SERVER upstream CASCADE;
    '''.format(table=table, cols=query_cols)

    create_indices = ';\n'.join(_generate_indices(downstream_db, table))
    remap_constraints = ';\n'.join(_generate_constraints(downstream_db, table))
    go_live = '''
        DROP TABLE {table};
        ALTER TABLE temp_import_{table} RENAME TO {table};
    '''.format(table=table)

    with downstream_db.cursor() as downstream_cur:
        log.info('Copying upstream data...')
        downstream_cur.execute(init_fdw)
        downstream_cur.execute(copy_data)
    downstream_db.commit()
    downstream_db.close()

    upstream_info = {
        'port': UPSTREAM_DB_PORT,
        'password': UPSTREAM_DB_PASSWORD,
        'host': UPSTREAM_DB_HOST
    }
    clean_image_data(table, upstream_info)
    log.info('Cleaning step finished.')

    downstream_db = database_connect()
    with downstream_db.cursor() as downstream_cur:
        log.info('Copying finished! Recreating database indices...')
        _update_progress(progress, 50.0)
        if create_indices != '':
            downstream_cur.execute(create_indices)
        _update_progress(progress, 70.0)
        log.info('Done creating indices! Remapping constraints...')
        if remap_constraints != '':
            downstream_cur.execute(remap_constraints)
        _update_progress(progress, 99.0)
        log.info('Done remapping constraints! Going live with new table...')
        downstream_cur.execute(go_live)
    downstream_db.commit()
    downstream_db.close()
    log.info('Finished refreshing table \'{}\'.'.format(table))
    _update_progress(progress, 100.0)
    if finish_time:
        finish_time.value = datetime.datetime.utcnow().timestamp()
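# --- Helper sketch (assumption) ---
# Both functions above report progress through ``_update_progress``, which is
# defined elsewhere in this module. An implementation consistent with how it
# is called (a no-op when no shared value is passed) would look like the
# following; the name ``_update_progress_sketch`` is hypothetical and chosen
# to avoid shadowing the real helper.
def _update_progress_sketch(progress, new_value):
    if progress is not None:
        progress.value = new_value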