def json_to_postgres(data, table):
    """Dump parsed JSON data to a local file, then bulk-copy it into an
    unlogged Postgres staging table with a single JSON column ``doc``.

    :param data: parsed JSON payload from the API call
    :param table: table name (string)
    """
    json_path = '{}/data/{}.json'.format(root_dir, table)

    # Persist the parsed payload so COPY can stream it back from disk
    with open(json_path, 'w') as fh:
        json.dump(data, fh)

    logging.info('Creating unlogged table to store JSON {} data...'.format(table))
    staging_json_table = AsIs('staging.' + table + '_json')

    # Recreate the unlogged staging table; the whole blob lands in one
    # field named doc
    execute_from_text('''
        DROP TABLE IF EXISTS {unlogged};
        CREATE UNLOGGED TABLE {unlogged} (doc JSON);
    '''.format(unlogged=staging_json_table))

    # Stream the local JSON file into the unlogged table via COPY
    with open(json_path, 'r') as fh:
        _, cur = cursor()
        logging.info('Copying JSON {} data into table...'.format(table))
        cur.copy_expert('COPY {} FROM STDIN'.format(staging_json_table), fh)

    # Verify the staging JSON table actually received data
    data_check_json(staging_json_table)
def load_table(table, backfill, new_data_field, json_data=None):
    """Load a production table from its staging data.

    :param table: table name (string)
    :param backfill: if True, drop and recreate the production table first
    :param new_data_field: field used to report newly inserted records
        (e.g. 'date' for prices, 'stock_code' for company)
    :param json_data: parsed JSON data to load the table from, or None for
        tables derived purely from existing tables
    """
    # If backfilling, run the DDL script (to drop and recreate prod table)
    if backfill:
        execute_from_file(root_dir + '/sql/' + table + '_ddl.sql')

    # PEP 8: compare against None with identity, not !=
    if json_data is not None:
        # Copy JSON data to Postgres
        json_to_postgres(json_data, table)

    # Create staging table with data from latest request and insert into prod table
    execute_from_file(root_dir + '/sql/' + table + '_insert.sql')

    # Make sure staging and prod tables actually have data
    # Staging table may not have data if there is nothing new to insert
    data_check_row_count(table)

    if not backfill:
        # Output the new values to be inserted from the current execution
        new_data = execute_from_text(
            'SELECT DISTINCT {} FROM staging.{}'.format(new_data_field, table),
            full_output=True)
        # Flatten single-column result rows
        new_data_clean = [value[0] for value in new_data]
        if 'date' in new_data_field:
            # Render date values as ISO strings for readable logging
            new_data_clean = [
                value.strftime('%Y-%m-%d') for value in new_data_clean
            ]
        if not new_data:
            logging.info('No new data to insert into public.{}'.format(table))
        else:
            logging.info('New {} values inserted into public.{}: {}'.format(
                new_data_field, table, sorted(new_data_clean)))
def json_to_postgres(data, table): """ Write parsed JSON data to unlogged Postgres table Entire JSON blob will be copied to one record in the table This format allows for subsequent parsing into final SQL table format (using json_populate_recordset) :param data: parsed JSON data from API call (e.g. for company, prices tables) :param table: table name (string) """ # Write parsed JSON data to a local file with open('{}/data/{}.json'.format(root_dir, table), 'w') as f: json.dump(data, f) # Create an unlogged Postgres table to copy the local JSON file to. # Table has one column: doc logging.info('Creating unlogged table to store JSON {} data...'.format(table)) staging_json_table = AsIs('staging.'+table+'_json') execute_from_text(''' DROP TABLE IF EXISTS {unlogged}; CREATE UNLOGGED TABLE {unlogged} (doc JSON); '''.format(unlogged=staging_json_table)) # Copy from local JSON file to unlogged table with open('{}/data/{}.json'.format(root_dir, table), 'r') as f: _, cur = cursor() logging.info('Copying JSON {} data into table...'.format(table)) cur.copy_expert('COPY {} FROM STDIN'.format(staging_json_table), f) # Make sure new unlogged table has data query_check_length = 'SELECT LENGTH(doc::VARCHAR) FROM {}'.format(staging_json_table) json_char_length = execute_from_text(query_check_length, aggregate_output=True) # Log success if the doc field is populated if json_char_length != 0: logging.info('JSON data successfully copied to {}. Char length: {}'.format(staging_json_table, json_char_length)) else: logging.error('No JSON data copied to {}. Char length: {}'.format(staging_json_table, json_char_length))
def data_check_row_count(table):
    """Log the row count of the staging and production copies of a table,
    noting when either one is empty.

    :param table: table name (string)
    """
    for schema in ('staging', 'public'):
        count_query = 'SELECT COUNT(*) FROM {}.{}'.format(schema, table)
        total_rows = execute_from_text(count_query, single_output=True)
        # An empty table is logged (not raised) — staging may legitimately
        # have nothing new to insert
        if total_rows == 0:
            logging.info('Table {}.{} is empty'.format(schema, table))
        else:
            logging.info('Row count for {}.{}: {}'.format(
                schema, table, total_rows))
def load_table(table, backfill, new_data_field, from_json_data=None):
    """
    Loads each production table
    If backfilling, drops and recreates the table
    If not backfilling, logs the new values to be inserted
    :param table: table name (string)
    :param backfill: perform backfill? (boolean)
    :param new_data_field: field to check for newly inserted records
    (e.g. 'date' for prices table, 'stock_code' for company table)
    :param from_json_data: parsed JSON data to load the table with
    (e.g. company, prices tables). The orders table is derived FROM the
    prices table, so it requires no JSON data processing
    """
    # If backfilling, run the DDL script (to drop and recreate prod table)
    if backfill:
        execute_from_file(root_dir+'/sql/'+table+'_ddl.sql')

    # If the table requires loading from JSON data ('company', 'prices'), load it
    # 'orders' table is derived from existing tables, so it does not require loading from JSON
    # PEP 8: compare against None with identity, not !=
    if from_json_data is not None:
        # Write parsed JSON data to unlogged Postgres table
        json_to_postgres(from_json_data, table)

    # Create staging table with data from latest request and insert into prod table
    execute_from_file(root_dir+'/sql/'+table+'_insert.sql')

    # Make sure staging and prod tables actually have data
    # Note: staging table may not have data if there is nothing new to insert
    data_check_row_count(table)

    # If not backfilling, output the new values to be inserted from the current execution
    if not backfill:
        new_data = execute_from_text(
            'SELECT DISTINCT {} FROM staging.{}'.format(new_data_field, table),
            full_output=True)
        # Flatten single-column result rows
        new_data_clean = [value[0] for value in new_data]
        if 'date' in new_data_field:
            # Render date values as ISO strings for readable logging
            new_data_clean = [value.strftime('%Y-%m-%d') for value in new_data_clean]
        if not new_data:
            logging.info('No new data to insert into public.{}'.format(table))
        else:
            # Log the new stock codes or dates to be appended
            logging.info('New {} values inserted into public.{}: {}'.format(
                new_data_field, table, sorted(new_data_clean)))
def data_check_json(table):
    """Verify that the unlogged staging JSON table was populated.

    json_to_postgres saves the parsed JSON response into a one-column
    ("doc") unlogged Postgres table; this check confirms the copy_expert
    call actually wrote data by measuring the character length of the
    stored blob.

    :param table: staging JSON table identifier
    """
    length_query = 'SELECT LENGTH(doc::VARCHAR) FROM {}'.format(table)
    json_char_length = execute_from_text(length_query, single_output=True)
    # Zero length means COPY wrote nothing — surface that as an error
    if json_char_length == 0:
        logging.error('No JSON data copied to {}. Char length: {}'.format(
            table, json_char_length))
    else:
        logging.info(
            'JSON data successfully copied to {}. Char length: {}'.format(
                table, json_char_length))