Example 1
def refresh_api_table(
    table: str,
    progress: multiprocessing.Value = None,
    approach: ApproachType = "advanced",
):
    """
    Import updates from the upstream catalog database into the API. The
    process involves the following steps.

    1. Get the list of overlapping columns: ``_get_shared_cols``
    2. Create the FDW extension if it does not exist
    3. Create FDW for the data transfer: ``get_fdw_query``
    4. Import data into a temporary table: ``get_copy_data_query``
    5. Clean the data: ``clean_image_data``

    This is the main function of this module.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param approach: whether to use advanced logic specific to media ingestion
    """

    # Step 1: Get the list of overlapping columns
    slack.info(f"`{table}`: Starting data refresh | _Next: copying data from upstream_")
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(
        dbname=UPSTREAM_DB_NAME,
        user=UPSTREAM_DB_USER,
        port=UPSTREAM_DB_PORT,
        password=UPSTREAM_DB_PASSWORD,
        host=UPSTREAM_DB_HOST,
        connect_timeout=5,
    )
    shared_cols = _get_shared_cols(downstream_db, upstream_db, table)
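    # Note: ``_get_shared_cols`` is not shown in this example; judging from the
    # same helper's use in Example 2 below, it presumably returns the column
    # names present in both the upstream and downstream copies of ``table``.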
    upstream_db.close()

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 2: Create the FDW extension if it does not exist
        log.info("(Re)initializing foreign data wrapper")
        try:
            create_ext = get_create_ext_query()
            downstream_cur.execute(create_ext)
        except psycopg2.errors.UniqueViolation:
            log.warning("Extension already exists, possible race condition.")

    with downstream_db, downstream_db.cursor() as downstream_cur:
        # Step 3: Create FDW for the data transfer
        init_fdw = get_fdw_query(
            RELATIVE_UPSTREAM_DB_HOST,
            RELATIVE_UPSTREAM_DB_PORT,
            UPSTREAM_DB_NAME,
            UPSTREAM_DB_USER,
            UPSTREAM_DB_PASSWORD,
            f"{table}_view",
        )
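        # ``get_fdw_query`` is not shown here; the SQL it builds presumably
        # resembles the inline statement in Example 2 below: create the
        # upstream FDW server and user mapping, then import the foreign
        # table into a local schema under the name ``{table}_view``.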
        downstream_cur.execute(init_fdw)

        # Step 4: Import data into a temporary table
        log.info("Copying upstream data...")
        environment = config("ENVIRONMENT", default="local").lower()
        limit_default = 100_000
        if environment in {"prod", "production"}:
            # If we're in production, turn off limits unless it's explicitly provided
            limit_default = 0
        limit = config("DATA_REFRESH_LIMIT", cast=int, default=limit_default)
        copy_data = get_copy_data_query(
            table, shared_cols, approach=approach, limit=limit
        )
        log.info(f"Running copy-data query: \n{copy_data.as_string(downstream_cur)}")
        downstream_cur.execute(copy_data)

    next_step = (
        "_Next: starting data cleaning_"
        if table == "image"
        else "Finished refreshing table"
    )
    slack.verbose(f"`{table}`: Data copy complete | {next_step}")

    if table == "image":
        # Step 5: Clean the data
        log.info("Cleaning data...")
        clean_image_data(table)
        log.info("Cleaning completed!")
        slack.verbose(
            f"`{table}`: Data cleaning complete | Finished refreshing table"
        )

    downstream_db.close()
    log.info(f"Finished refreshing table '{table}'.")
    _update_progress(progress, 100.0)
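
Both examples report progress through an _update_progress helper that is not shown. The sketch below is only an assumption of what such a helper might look like, treating progress as a multiprocessing.Value("d") holding a percentage; it is not the project's actual implementation.

def _update_progress(progress, new_value):
    # Hypothetical helper: write a percentage into the shared value, if any.
    if progress is not None:
        progress.value = new_value
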
Example 2
def reload_upstream(table, progress=None, finish_time=None):
    """
    Import updates from the upstream CC Catalog database into the API.

    :param table: The upstream table to copy.
    :param progress: multiprocessing.Value float for sharing task progress
    :param finish_time: multiprocessing.Value int for sharing finish timestamp
    :return:
    """
    downstream_db = database_connect()
    upstream_db = psycopg2.connect(dbname='openledger',
                                   user='******',
                                   port=UPSTREAM_DB_PORT,
                                   password=UPSTREAM_DB_PASSWORD,
                                   host=UPSTREAM_DB_HOST,
                                   connect_timeout=5)
    query_cols = ','.join(_get_shared_cols(downstream_db, upstream_db, table))
    upstream_db.close()
    # Connect to upstream database and create references to foreign tables.
    log.info('(Re)initializing foreign data wrapper')
    init_fdw = '''
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS upstream CASCADE;
        CREATE SERVER upstream FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (host '{host}', dbname 'openledger', port '{port}');

        CREATE USER MAPPING IF NOT EXISTS FOR deploy SERVER upstream
        OPTIONS (user 'deploy', password '{passwd}');
        DROP SCHEMA IF EXISTS upstream_schema CASCADE;
        CREATE SCHEMA upstream_schema AUTHORIZATION deploy;

        IMPORT FOREIGN SCHEMA public
        LIMIT TO ({table}) FROM SERVER upstream INTO upstream_schema;
    '''.format(host=UPSTREAM_DB_HOST,
               passwd=UPSTREAM_DB_PASSWORD,
               table=table,
               port=UPSTREAM_DB_PORT)
    # 1. Import data into a temporary table
    # 2. Recreate indices from the original table
    # 3. Recreate constraints from the original table.
    # 4. Delete orphaned foreign key references.
    # 5. Clean the data.
    # 6. Promote the temporary table and delete the original.
    copy_data = '''
        DROP TABLE IF EXISTS temp_import_{table};
        CREATE TABLE temp_import_{table} (LIKE {table} INCLUDING CONSTRAINTS);
        INSERT INTO temp_import_{table} ({cols})
        SELECT {cols} from upstream_schema.{table};
        ALTER TABLE temp_import_{table} ADD PRIMARY KEY (id);
        DROP SERVER upstream CASCADE;
    '''.format(table=table, cols=query_cols)
    create_indices = ';\n'.join(_generate_indices(downstream_db, table))
    remap_constraints = ';\n'.join(_generate_constraints(downstream_db, table))
    go_live = '''
        DROP TABLE {table};
        ALTER TABLE temp_import_{table} RENAME TO {table};
    '''.format(table=table)

    with downstream_db.cursor() as downstream_cur:
        log.info('Copying upstream data...')
        downstream_cur.execute(init_fdw)
        downstream_cur.execute(copy_data)
    downstream_db.commit()
    downstream_db.close()
    upstream_info = {
        'port': UPSTREAM_DB_PORT,
        'password': UPSTREAM_DB_PASSWORD,
        'host': UPSTREAM_DB_HOST
    }
    clean_image_data(table, upstream_info)
    log.info('Cleaning step finished.')
    downstream_db = database_connect()
    with downstream_db.cursor() as downstream_cur:
        log.info('Copying finished! Recreating database indices...')
        _update_progress(progress, 50.0)
        if create_indices != '':
            downstream_cur.execute(create_indices)
        _update_progress(progress, 70.0)
        log.info('Done creating indices! Remapping constraints...')
        if remap_constraints != '':
            downstream_cur.execute(remap_constraints)
        _update_progress(progress, 99.0)
        log.info('Done remapping constraints! Going live with new table...')
        downstream_cur.execute(go_live)
    downstream_db.commit()
    downstream_db.close()
    log.info('Finished refreshing table \'{}\'.'.format(table))
    _update_progress(progress, 100.0)
    if finish_time:
        finish_time.value = datetime.datetime.utcnow().timestamp()
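
Both functions are meant to be driven from a separate worker process that shares progress with its caller. As a usage illustration only (the driver below is an assumption, not part of either example), refresh_api_table from Example 1 could be launched like this:

import multiprocessing

if __name__ == "__main__":
    # Share a double with the worker; the refresh task writes 100.0 when done.
    progress = multiprocessing.Value("d", 0.0)
    worker = multiprocessing.Process(
        target=refresh_api_table,
        args=("image",),
        kwargs={"progress": progress, "approach": "advanced"},
    )
    worker.start()
    worker.join()
    print(f"Refresh finished; reported progress: {progress.value}%")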