CLI.add_argument( "--move_after_loading", type=str, default= 'no' # default is to append new records instead of dropping and reloading everything ) CLI.add_argument( "--move_to_folder", type=str, default= '' # default is to append new records instead of dropping and reloading everything ) # parse the command line args = CLI.parse_args() folders_to_load = args.folders format = args.format.lower() header = args.header.lower() mode = args.mode.lower() move_after_loading = args.move_after_loading.lower() move_to_folder = args.move_to_folder.lower() # call function with command line arguments for folder in folders_to_load: s3_to_postGIS(folder_to_load=folder, AWS_Credentials=get_connection_strings("AWS_DEV"), format=format, header=header, mode=mode, move_after_loading=move_after_loading, move_to_folder=move_to_folder)
def s3_to_postGIS(folder_to_load: str, AWS_Credentials: dict, format: str,
                  header: str, mode: str, move_after_loading: str,
                  move_to_folder: str):

    # set up S3 and RDS connections
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    bucket_name = AWS_Credentials['s3_bucket']
    bucket = s3_resource.Bucket(bucket_name)
    region = AWS_Credentials['region']
    dbname = 'postgres'
    env = "DEV"
    engine = create_postgres_engine(destination="AWS_PostGIS",
                                    target_db=dbname,
                                    env=env)
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_uid = db_credentials[env]['UID']
    db_pwd = db_credentials[env]['PWD']
    db_host = db_credentials[env]['HOST']
    db_port = db_credentials[env]['PORT']

    # add psql install location to default path
    psql_path = subprocess.check_output(['which',
                                         'psql']).strip().decode('utf-8')
    sys.path.append(psql_path)

    # grab list of all files in the target folder that have a target table
    # url encode the file key so the ones with semicolons don't throw an error
    # update january 2021: the script now throws an error when fed URL-encoded
    # object keys, so the plain-text key is used for the actual load
    files_to_load = [(urllib.parse.quote(obj.key), obj.key,
                      obj.Object().metadata['target_schema'],
                      obj.Object().metadata['target_table'])
                     for obj in bucket.objects.filter(Prefix=folder_to_load)
                     if 'target_table' in obj.Object().metadata.keys()
                     if format in obj.key]

    # generate distinct list of target tables so each one is dropped and
    # recreated/truncated only once
    target_tables = [(target_schema, target_table)
                     for (file_name, file_name_native, target_schema,
                          target_table) in files_to_load]
    target_tables = list(set(target_tables))

    # drop and recreate and/or truncate each target table
    for (target_schema, target_table) in target_tables:
        generate_table(engine=engine,
                       target_schema=target_schema,
                       target_table=target_table,
                       mode=mode)

    # set table import parameters that are the same for every file
    copy_parameters = '\'(FORMAT {}, HEADER {})\''.format(format, header)
    columns_to_copy = '\'\''
    aws_credentials_param = '\'{}\', \'{}\',\'\''.format(
        AWS_Credentials['aws_access_key_id'],
        AWS_Credentials['aws_secret_access_key'])

    # create file-specific table import parameters
    for (file_name, file_name_native, target_schema,
         target_table) in files_to_load:
        destination_table = '\'{}.{}\''.format(target_schema, target_table)
        create_s3_uri_param = '\'{}\', \'{}\',\'{}\''.format(
            AWS_Credentials['s3_bucket'], file_name_native, region)
        base_file_name = os.path.basename(file_name_native)

        # create import statement
        import_table_query = ('SELECT aws_s3.table_import_from_s3({}, {}, {}, '
                              'aws_commons.create_s3_uri({}), '
                              'aws_commons.create_aws_credentials({}));').format(
                                  destination_table, columns_to_copy,
                                  copy_parameters, create_s3_uri_param,
                                  aws_credentials_param)

        # create arg to pass to os.system
        os_system_arg = ('PGPASSWORD=\'{}\' psql --host={} --port={} '
                         '--username={} --dbname={} --no-password '
                         '--command=\"{}\"').format(db_pwd, db_host, db_port,
                                                    db_uid, dbname,
                                                    import_table_query)

        # execute, then optionally move the loaded file to another folder
        if move_after_loading != 'yes':
            os.system(os_system_arg)
        elif move_after_loading == 'yes' and move_to_folder != '':
            os.system(os_system_arg)
            try:
                s3_resource.Object(bucket_name,
                                   move_to_folder + base_file_name).copy_from(
                                       CopySource={
                                           'Bucket': bucket_name,
                                           'Key': file_name_native
                                       })
                s3_resource.Object(bucket_name, file_name_native).delete()
            except Exception:
                print(file_name_native, " could not be copied and/or deleted")
                continue
        else:
            print("please provide move-to folder")
            continue

    # after data is loaded, update the geographies
    for (target_schema, target_table) in target_tables:
        correct_geo(engine=engine,
                    target_schema=target_schema,
                    target_table=target_table,
                    mode=mode)
import sqlalchemy
from connect_to_rds import get_connection_strings, create_postgres_engine
from add_location_info import add_location_info, add_school_info, add_roadway_info, add_intersection_info, create_final_table
from get_address import GeoLoc
import urllib

dbname = 'postgres'
env = "DEV"
engine = create_postgres_engine(destination="AWS_PostGIS",
                                target_db=dbname,
                                env=env)
db_credentials = get_connection_strings("AWS_PostGIS")
GOOGLE_API_KEY = get_connection_strings("GOOGLE_MAPS")["API_Key"]

# First move all source data records to a temp table
step_1_query = """
DROP TABLE IF EXISTS tmp.moving_violations;
CREATE TABLE tmp.moving_violations AS
    SELECT * FROM source_data.moving_violations;
CREATE INDEX IF NOT EXISTS mv_location_index ON tmp.moving_violations (location);
"""
engine.execute(step_1_query)
print("temp table created")

# create the geocodes table if it doesn't already exist
create_tables_query = """
CREATE TABLE IF NOT EXISTS source_data.mv_geocodes (
    location varchar null
    ,lat numeric null
    ,long numeric null
import sqlalchemy
from connect_to_rds import get_connection_strings, create_postgres_engine
from add_location_info import add_location_info, add_school_info, create_final_table

dbname = 'postgres'
env = "DEV"
engine = create_postgres_engine(destination="AWS_PostGIS",
                                target_db=dbname,
                                env=env)
db_credentials = get_connection_strings("AWS_PostGIS")

geography_levels = {
    'comp_plan_area': {
        'geo_boundaries_source_table': 'source_data.comp_plan_areas',
        'orig_field_name': 'name'
    },
    'census_tract': {
        'geo_boundaries_source_table': 'source_data.census_tracts',
        'orig_field_name': 'tract'
    },
    'nbh_cluster_names': {
        'geo_boundaries_source_table': 'source_data.neighborhood_clusters',
        'orig_field_name': 'nbh_names'
    },
    'ward_name': {
        'geo_boundaries_source_table': 'source_data.ward_boundaries',
        'orig_field_name': 'name'
    },
    'anc_id': {
        'geo_boundaries_source_table': 'source_data.anc_boundaries',
        'orig_field_name': 'anc_id'
                                    partition_by_field='objectid',
                                    within_distance=0.001)
    print("roadway info query complete")

    next_tables = add_intersection_info(engine=engine,
                                        target_schema='tmp',
                                        target_table='crashes_intersection_info',
                                        from_schema=next_tables[0],
                                        from_table=next_tables[1],
                                        partition_by_field='objectid',
                                        within_distance=10)
    print("intersection info query complete")

    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='dc_crashes_w_details',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)

# command line arguments
CLI = argparse.ArgumentParser()
CLI.add_argument("--env", type=str)

# parse the command line
args = CLI.parse_args()
env = args.env

if __name__ == "__main__":
    generate_crashes_table(AWS_Credentials=get_connection_strings("AWS_DEV"),
                           env=env)
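# Hypothetical invocation (the script filename is illustrative):
#
#   python generate_crashes_table.py --env DEV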
import datetime
from datetime import timezone
import time
from connect_to_rds import get_connection_strings
import pprint
import json
import boto3
import os
import pandas as pd
from pathlib import Path
import csv
from get_address import GeoLoc
import argparse

# get aws credentials
AWS_Credentials = get_connection_strings("AWS_DEV")
client = boto3.client(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
bucket = s3_resource.Bucket(AWS_Credentials['s3_bucket'])
bucket_name = AWS_Credentials['s3_bucket']
region = AWS_Credentials['region']
GOOGLE_API_KEY = get_connection_strings("GOOGLE_MAPS")["API_Key"]

# set script variables
home = os.path.expanduser('~')
def create_test_db(env: str, test_db_name: str):

    # connect to the prod db
    engine = create_postgres_engine(destination="AWS_PostGIS", env="PROD")
    db_credentials = get_connection_strings("AWS_PostGIS")

    # get prod master credentials
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_db_uid = db_credentials['PROD']['UID']
    prod_db_pwd = db_credentials['PROD']['PWD']

    # get test db credentials
    # note: this overwrites the test_db_name argument with the DB name
    # stored in the credentials file
    test_db_host = db_credentials[env.upper()]['HOST']
    test_db_port = db_credentials[env.upper()]['PORT']
    test_db_name = db_credentials[env.upper()]['DB']
    test_db_uid = db_credentials[env.upper()]['UID']
    test_db_pwd = db_credentials[env.upper()]['PWD']
    test_db_users = db_credentials[env.upper()]['USERS']

    # kill open connections to the test db so it can be dropped
    kill_db_query = """
        SELECT pg_terminate_backend (pid)
        FROM pg_stat_activity
        WHERE pg_stat_activity.datname = '{0}';
        """.format(test_db_name)
    engine.execute(kill_db_query)

    # drop
    command = 'DROP DATABASE IF EXISTS {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create
    command = 'CREATE DATABASE {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create users on new db
    for user_pwd in test_db_users:
        user = list(user_pwd.keys())[0]
        pwd = user_pwd[user]
        command = 'CREATE ROLE {} WITH LOGIN ENCRYPTED PASSWORD \'{}\';'.format(
            user, pwd)
        os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
            test_db_pwd, test_db_host, test_db_port, test_db_uid,
            test_db_name, command)
        print(os_system_arg)
        os.system(os_system_arg)

    # install PostGIS extensions
    command = """
        CREATE EXTENSION postgis;
        CREATE EXTENSION fuzzystrmatch;
        CREATE EXTENSION postgis_tiger_geocoder;
        CREATE EXTENSION postgis_topology;
        """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # alter schemas
    command = """
        ALTER SCHEMA tiger OWNER TO rds_superuser;
        ALTER SCHEMA tiger_data OWNER TO rds_superuser;
        ALTER SCHEMA topology OWNER TO rds_superuser;
        """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # create a helper function that executes dynamic SQL; the dollar signs
    # are backslash-escaped so the shell doesn't expand them inside the
    # double-quoted --command string
    command = 'CREATE FUNCTION exec(text) returns text language plpgsql volatile AS \\$f\\$ BEGIN EXECUTE \\$1; RETURN \\$1; END; \\$f\\$;'
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # execute the function to hand every tiger/topology relation to rds_superuser
    command = """
        SELECT exec('ALTER TABLE ' || quote_ident(s.nspname) || '.' || quote_ident(s.relname) || ' OWNER TO rds_superuser;')
        FROM (
            SELECT nspname, relname
            FROM pg_class c JOIN pg_namespace n ON (c.relnamespace = n.oid)
            WHERE nspname IN ('tiger','topology')
            AND relkind IN ('r','S','v')
            ORDER BY relkind = 'S') s;
        """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # get all schemas on prod db
    schemas = [
        r for (r, ) in engine.execute(
            "select distinct table_schema from information_schema.tables "
            "where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
        ).fetchall()
    ]

    # create engine on test db
    test_engine = create_postgres_engine(destination="AWS_PostGIS",
                                         env=env.upper())

    # create schemas
    for schema in schemas:
        create_schema_query = """
            CREATE SCHEMA IF NOT EXISTS {0};
            GRANT ALL PRIVILEGES ON SCHEMA {0} TO PUBLIC;
            """.format(schema)
        test_engine.execute(create_schema_query)
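# A minimal usage sketch (hypothetical values; env must match an entry in the
# AWS_PostGIS credentials file, and test_db_name is superseded by the DB name
# stored there):
#
#   create_test_db(env="TEST", test_db_name="test_db")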
import tweepy
import datetime
from datetime import timezone
import time
from connect_to_rds import get_connection_strings
import pprint
import json
import boto3
import os
import pandas as pd
from pathlib import Path

# get aws credentials
AWS_Credentials = get_connection_strings("AWS_DEV")
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
bucket = s3_resource.Bucket(AWS_Credentials['s3_bucket'])
region = AWS_Credentials['region']

# set script variables
home = os.path.expanduser('~')
destination_folder = 'source-data/twitter/all_time/'
filename_stem = 'twitter_search_'
date_tuples = [("201501010000", "202012310000")]
environment_name = 'dev'
current_date = datetime.datetime.now(timezone.utc).strftime("%Y-%m-%d")
def refresh_test_db(env: str):

    engine = create_postgres_engine(destination="AWS_PostGIS",
                                    env=env.upper())
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_users = db_credentials[env.upper()]['USERS']
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_engine = create_postgres_engine(destination="AWS_PostGIS",
                                         env="PROD")

    # point a foreign data wrapper at the prod database
    create_fdw_query = """
        BEGIN;
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS prod CASCADE;
        CREATE SERVER prod
            FOREIGN DATA WRAPPER postgres_fdw
            OPTIONS (host '{prod_db_host}', dbname '{prod_db_name}');
        COMMIT;
        """.format(prod_db_name=prod_db_name, prod_db_host=prod_db_host)
    engine.execute(create_fdw_query)

    # create user mappings
    for user_pwd in db_users:
        user = list(user_pwd.keys())[0]
        pwd = user_pwd[user]
        map_user_query = """
            CREATE USER MAPPING FOR {user}
            SERVER prod
            OPTIONS (user '{user}', password '{pwd}');
            """.format(user=user, pwd=pwd)
        engine.execute(map_user_query)

    # pull the schemas off the viz copy of the prod database
    schemas = [(r, 'prod_' + r) for (r, ) in prod_engine.execute(
        "select distinct table_schema from information_schema.tables "
        "where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
    ).fetchall()]

    # map schemas
    for source_schema, destination_schema in schemas:
        create_schema_query = """
            CREATE SCHEMA IF NOT EXISTS {destination_schema};
            GRANT ALL PRIVILEGES ON SCHEMA {destination_schema} TO PUBLIC;
            IMPORT FOREIGN SCHEMA {source_schema}
                FROM SERVER prod INTO {destination_schema};
            """.format(source_schema=source_schema,
                       destination_schema=destination_schema)
        engine.execute(create_schema_query)

    # pull all the tables from prod db
    schemas_tables = [(schema, table)
                      for (schema, table) in prod_engine.execute(
                          "select distinct table_schema,table_name "
                          "from information_schema.tables "
                          "where is_insertable_into = 'YES' "
                          "and table_schema not like 'pg_%%' "
                          "and table_name not like '[%%]'").fetchall()]

    # create and populate tables
    for schema, table in schemas_tables:
        create_populate_tables_query = """
            CREATE TABLE IF NOT EXISTS {schema}."{table}" (LIKE prod_{schema}."{table}");
            DELETE FROM {schema}."{table}";
            INSERT INTO {schema}."{table}" SELECT * FROM prod_{schema}."{table}";
            GRANT ALL PRIVILEGES ON {schema}."{table}" TO PUBLIC;
            """.format(schema=schema, table=table)
        print(create_populate_tables_query)
        engine.execute(create_populate_tables_query)
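# Usage sketch (hypothetical env label; assumes the env's USERS entry in the
# credentials file matches roles that exist on the prod server, since the
# foreign-data-wrapper user mappings reuse those names and passwords):
#
#   refresh_test_db(env="TEST")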
import boto3
import os
from pathlib import Path
from connect_to_rds import get_connection_strings
import datetime
from datetime import timezone
import requests
import json
import sys
import csv

Census_API_Key = get_connection_strings('CENSUS')['API_Key']
AWS_Credentials = get_connection_strings('AWS_DEV')
sys.path.append(os.path.expanduser('~'))
s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
client = boto3.client(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
bucket_name = AWS_Credentials['s3_bucket']
region = AWS_Credentials['region']

census_endpoint = "https://api.census.gov/data/2019/acs/acs5"
metadata = {
def geocode_text(engine, records_to_geocode: list, administrative_area: str,
                 **kwargs):

    text_type = kwargs.get('text_type', 'Unknown')

    # create tables if they don't already exist
    create_geocodes_location_table = """
    CREATE TABLE IF NOT EXISTS source_data.geocoded_text (
        GET_DATETIME TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
        ,text varchar null
        ,text_type varchar null
        ,point_type varchar null
        ,point_geography geography null
        ,polygon_geography geography null
        ,administrative_area varchar null
        )
    """
    engine.execute(create_geocodes_location_table)

    create_missing_geocodes_location_table = """
    CREATE TABLE IF NOT EXISTS source_data.geocoded_text_fails (
        GET_DATETIME TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
        ,text varchar null
        ,text_type varchar null
        ,administrative_area varchar null
        )
    """
    engine.execute(create_missing_geocodes_location_table)

    # grab the google API key
    GOOGLE_API_KEY = get_connection_strings("GOOGLE_MAPS")["API_Key"]

    # pull records that were already geocoded and either succeeded or failed
    already_geocoded_records = [
        r for (r, ) in engine.execute(
            "select distinct text from source_data.geocoded_text").fetchall()
    ]
    already_failed_records = [
        r for (r, ) in engine.execute(
            "select distinct text from source_data.geocoded_text_fails").
        fetchall()
    ]

    # limit records to only those that haven't already been checked
    new_records = [
        r for r in records_to_geocode if r not in already_failed_records
        if r not in already_geocoded_records
    ]
    print(len(records_to_geocode), " records provided, ", len(new_records),
          " of them need to be geocoded")

    # then, using the google maps API, add a lat and long for addresses that
    # don't have them
    for record in new_records:
        address = str(record)
        try:
            geo_loc_instance = GeoLoc(GOOGLE_API_KEY)
            geocode_location = geo_loc_instance.GetGeoLoc(
                test_address=address,
                test_administrative_area=administrative_area)

            # pull the point and viewport coordinates out of the response
            point_lat = geocode_location['geometry']['location']['lat']
            point_long = geocode_location['geometry']['location']['lng']
            point_type = geocode_location['geometry']['location_type']
            ne_lat = geocode_location['geometry']['viewport']['northeast']['lat']
            ne_long = geocode_location['geometry']['viewport']['northeast']['lng']
            sw_lat = geocode_location['geometry']['viewport']['southwest']['lat']
            sw_long = geocode_location['geometry']['viewport']['southwest']['lng']

            # insert the point plus a polygon built from the four viewport edges
            insert_record_query = """
            INSERT INTO source_data.geocoded_text
                (text, text_type, point_type, point_geography, polygon_geography, administrative_area)
            SELECT '{address}', '{text_type}', '{point_type}'
                ,ST_SetSRID(ST_MakePoint({point_long}, {point_lat}),4326)::geography
                ,ST_Polygonize(results)::geography
                ,'{administrative_area}'
            FROM (
                SELECT ST_MakeLine(
                    ST_SetSRID(ST_MakePoint({sw_long},{sw_lat}), 4326),
                    ST_SetSRID(ST_MakePoint({ne_long},{sw_lat}), 4326)
                    ) AS results
                UNION
                SELECT ST_MakeLine(
                    ST_SetSRID(ST_MakePoint({ne_long},{sw_lat}), 4326),
                    ST_SetSRID(ST_MakePoint({ne_long},{ne_lat}), 4326)
                    )
                UNION
                SELECT ST_MakeLine(
                    ST_SetSRID(ST_MakePoint({ne_long},{ne_lat}), 4326),
                    ST_SetSRID(ST_MakePoint({sw_long},{ne_lat}), 4326)
                    )
                UNION
                SELECT ST_MakeLine(
                    ST_SetSRID(ST_MakePoint({sw_long},{ne_lat}), 4326),
                    ST_SetSRID(ST_MakePoint({sw_long},{sw_lat}), 4326)
                    )
                ) as tmp;
            """.format(address=address,
                       text_type=text_type,
                       point_type=point_type,
                       point_long=point_long,
                       point_lat=point_lat,
                       sw_lat=sw_lat,
                       sw_long=sw_long,
                       ne_lat=ne_lat,
                       ne_long=ne_long,
                       administrative_area=administrative_area)
            engine.execute(insert_record_query)
        except Exception as error:
            # print("could not geocode record ", address)
            # replace single quotes to avoid insertion errors
            address = address.replace('\'', '')
            engine.execute(
                "INSERT INTO source_data.geocoded_text_fails "
                "(text, text_type, administrative_area) "
                "SELECT '{address}', '{text_type}', '{administrative_area}'"
                .format(address=address,
                        text_type=text_type,
                        administrative_area=administrative_area))
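# Usage sketch (hypothetical inputs): geocode a list of free-text locations,
# skipping anything already present in the geocoded_text or
# geocoded_text_fails tables.
#
#   geocode_text(engine=engine,
#                records_to_geocode=['14th St NW and U St NW'],
#                administrative_area='District of Columbia',
#                text_type='intersection')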
""" final_query=""" DROP TABLE IF EXISTS {0}.{1}; CREATE TABLE {0}.{1} AS SELECT * FROM tmp_pulsepoint_units_join; GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC; """.format(target_schema, target_table) engine.execute(step1_query) engine.execute(step_2_query) engine.execute(step_3_query) engine.execute(final_query) # command line arguments CLI=argparse.ArgumentParser() CLI.add_argument( "--env", type=str ) # parse the command line args = CLI.parse_args() env=args.env if __name__ == "__main__": generate_pulsepoint_table(AWS_Credentials=get_connection_strings("AWS_DEV"), env=env)
import sqlalchemy
from connect_to_rds import get_connection_strings, create_postgres_engine
import requests
import json

dbname = 'postgres'
env = "DEV"
engine = create_postgres_engine(destination="AWS_PostGIS",
                                target_db=dbname,
                                env=env)
db_credentials = get_connection_strings("AWS_PostGIS")
WALKSCORE_API_KEY = get_connection_strings("WALKSCORE")["API_Key"]

# walkscore-specific variables
walkscore_endpoint = 'https://api.walkscore.com/score'
format = 'json'

# First pick one representative address per census block and stash it in a temp table
step_1_query = """
CREATE TABLE IF NOT EXISTS tmp.address_walkscores as (
    SELECT * FROM (
        SELECT DISTINCT b.objectid as census_block_objectid
            , a.fulladdress
            , a.latitude
            , a.longitude
            , b.geography
            , ROW_NUMBER() over (PARTITION BY b.objectid ORDER BY a.fulladdress) AS row_num
        FROM source_data.address_points a
        INNER JOIN source_data.census_blocks b
            ON ST_Intersects(a.geography::geometry, b.geography::geometry)
    ) as tmp
    WHERE row_num = 1
);
CREATE INDEX IF NOT EXISTS address_index ON tmp.address_walkscores (fulladdress);