etl.py
import configparser
import psycopg2
from psycopg2.extras import DictCursor, DictConnection
from sql_queries import COPY_TABLE_QUERIES, INSERT_TABLE_QUERIES


def load_staging_tables(cur: DictCursor, conn: DictConnection):
    """
    Load all data from the source S3 bucket into the staging Redshift
    tables by executing every statement in `COPY_TABLE_QUERIES`.

    Parameters
    ----------
    cur
        Cursor on the open Redshift connection
    conn
        Connection to the Redshift cluster, used to commit each COPY
    """
    for query in COPY_TABLE_QUERIES:
        cur.execute(query)
        conn.commit()
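
# The COPY statements themselves live in sql_queries.py, which is not shown
# here. A minimal sketch of what one such Redshift COPY is assumed to look
# like (table name, bucket path, IAM role ARN, and region are placeholders):
#
#   COPY staging_events FROM 's3://<bucket>/<prefix>'
#   CREDENTIALS 'aws_iam_role=<IAM_ROLE_ARN>'
#   REGION '<region>'
#   FORMAT AS JSON 'auto';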


def insert_tables(cur: DictCursor, conn: DictConnection):
    """
    Insert data from the staging Redshift tables into the Redshift
    analytical tables by executing every statement in
    `INSERT_TABLE_QUERIES`.

    Parameters
    ----------
    cur
        Cursor on the open Redshift connection
    conn
        Connection to the Redshift cluster, used to commit each INSERT
    """
    for query in INSERT_TABLE_QUERIES:
        cur.execute(query)
        conn.commit()
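
# The INSERT statements also live in sql_queries.py. Each is assumed to be
# an INSERT ... SELECT that moves rows from the staging tables into an
# analytical table, roughly along these lines (all table and column names
# are placeholders):
#
#   INSERT INTO <analytical_table> (<col_a>, <col_b>)
#   SELECT e.<col_a>, e.<col_b>
#   FROM <staging_table> e;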


def main():
    """
    Load configuration values from `dwh.cfg`, open a connection to
    Redshift, copy the data from S3 into the Redshift staging tables, and,
    once the staging data is loaded, fill the analytical tables with
    insertions from the staging tables.
    """
    config = configparser.ConfigParser()
    config.read('dwh.cfg')

    conn = psycopg2.connect(
        "host={} dbname={} user={} password={} port={}".format(
            *config['CLUSTER'].values()
        )
    )
    cur = conn.cursor()

    load_staging_tables(cur, conn)
    insert_tables(cur, conn)

    conn.close()


if __name__ == "__main__":
    main()
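
# The connection string in main() consumes the [CLUSTER] section of dwh.cfg
# in file order, so that section is assumed to define exactly these five
# keys, in this order (key names and values are placeholders):
#
#   [CLUSTER]
#   host=<redshift-cluster-endpoint>
#   db_name=<database>
#   db_user=<user>
#   db_password=<password>
#   db_port=5439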