def load(date=None):
    """
    Load the CSV file containing security data into the database.

    :param str date: date of security data collection, optional (default None)
    """
    if not date:
        date = datetime.today().strftime('%Y/%m/%d')
    else:
        date = utils.format_date(date)

    # collect function parameters from environment variables
    params = config.get_environment_variables()
    bucket = params['AWS']['data_bucket']
    profile = params['AWS']['profile']
    con_params = params['DATABASE']
    database = con_params['database']
    table_name = con_params['table']

    # create the table if it does not exist yet
    logging.info(f"creating table '{table_name}' if not exists")
    con = loading.get_connection(con_params)
    loading.execute_sqls([(CREATE_TABLE.format(table_name), ())], con)

    # to make the 'load' step idempotent, check whether data for the specified
    # date is already in the database; if it is, either stop or delete it and
    # load again
    con = loading.get_connection(con_params)
    loading.check_table_for_loaded_data(table_name, date, con)

    # download the CSV file to be loaded and remove its header
    csv_key = aws.get_s3_key(SECURITY_REPORT_S3_PREFIX,
                             SECURITY_REPORT_S3_SUFFIX,
                             date)
    logging.info(f"downloading '{csv_key}' from S3 bucket '{bucket}'")
    csv_data = aws.download_s3_object(bucket, csv_key, profile)
    csv_no_header = utils.remove_header(csv_data.decode())

    # load the whole CSV at once
    logging.info(f"loading data into database '{database}' "
                 f"in table '{table_name}'")
    con = loading.get_connection(con_params)
    loading.copy_into(file_object=StringIO(csv_no_header),
                      table_name=table_name,
                      connection=con)
    logging.info('loading finished')

    logging.info('creating clean view of data if not exists')
    con = loading.get_connection(con_params)
    loading.execute_sqls(
        [(CREATE_NO_NULL_VIEW.format(table_name=table_name), ())],
        con,
    )

    logging.info('refreshing clean view of data')
    con = loading.get_connection(con_params)
    loading.execute_sqls(
        [(REFRESH_NO_NULL_VIEW.format(table_name=table_name), ())],
        con,
    )
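# A minimal sketch of what the loading.copy_into helper used above could look
# like, assuming a psycopg2 connection and a header-less CSV file object; the
# project's real implementation may differ.
def copy_into(file_object, table_name, connection):
    """Bulk-load a CSV file object into table_name via PostgreSQL COPY."""
    copy_sql = f"COPY {table_name} FROM STDIN WITH (FORMAT csv)"
    try:
        with connection.cursor() as cur:
            cur.copy_expert(copy_sql, file_object)
        connection.commit()
    finally:
        connection.close()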
def query_db(sql, db_params):
    """Run a SQL query and return all result rows, closing the connection afterwards."""
    con = loading.get_connection(db_params)
    cur = con.cursor()
    try:
        cur.execute(sql)
        result = [row for row in cur]
    finally:
        con.close()
    return result
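# Example usage of query_db (hypothetical query; db_params comes from the same
# environment-driven configuration used elsewhere in this module):
#
#     db_params = config.get_db_env_vars()
#     row_count = query_db(
#         f"SELECT COUNT(*) FROM {db_params['table']}", db_params)[0][0]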
def get_all_data():
    """Return the full data table as a DataFrame indexed by collection_date."""
    db_params = config.get_db_env_vars()
    con = loading.get_connection(db_params)
    try:
        data = pd.read_sql(
            get_all_data_sql.format(db_params["table"]), con)
    finally:
        con.close()
    data.set_index(pd.to_datetime(data["collection_date"]), inplace=True)
    return data
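# The get_all_data_sql template referenced above is assumed to look roughly
# like the following (the real query may select specific columns or filter):
#
#     get_all_data_sql = "SELECT * FROM {}"
#
# Once indexed by collection_date, the frame can be sliced by date directly,
# e.g. data.loc[data.index.max()] for the latest collection.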
def test_get_connection(self, connect_mock):
    connect_mock.return_value = 'engine'
    parameters = {
        'database': 'database',
        'db_user': '******',
        'password': '******',
        'host': 'host',
        'port': 'port'
    }
    engine = get_connection(parameters)
    self.assertEqual(engine, 'engine')
    connect_mock.assert_called_with(database='database',
                                    user='******',
                                    password='******',
                                    host='host',
                                    port='port')
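# A sketch of the get_connection function exercised by the test above,
# assuming it wraps psycopg2.connect and maps the 'db_user' key to the 'user'
# keyword (an assumption based on the asserted call, not the project's code):
import psycopg2


def get_connection(params):
    """Open a database connection from a parameter dictionary."""
    return psycopg2.connect(database=params['database'],
                            user=params['db_user'],
                            password=params['password'],
                            host=params['host'],
                            port=params['port'])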
def top_ten_values(attribute):
    """Build a Dash HTML table of the top ten rows for the given attribute."""
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(top_val_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["yield_percent"] = df["yield_percent"] * 100
    if max(df[attribute]) > 1e6:
        df[attribute] = (df[attribute] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={attribute: f"{attribute} (M)"}, inplace=True)
    else:
        df[attribute] = df[attribute].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] +
        [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(10)
        ])
    return output_table
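# A plausible shape for the top_val_sql template formatted above (hypothetical
# table and column names; the real query lives elsewhere in the project):
#
#     top_val_sql = """
#         SELECT name, {attribute}
#         FROM security_data_no_null
#         ORDER BY {attribute} DESC
#         LIMIT 10
#     """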
def bottom_ten_prog(attribute):
    """Build a Dash HTML table of the bottom ten progressions ('diff') for the given attribute."""
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(bottom_prog_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["diff"] = df["diff"] * 100
    if max(df["diff"]) > 1e6:
        df["diff"] = (df["diff"] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (M)"}, inplace=True)
    elif max(df["diff"]) > 1e3:
        df["diff"] = (df["diff"] / 1e3).round(5).apply("{:,}".format)
        df.rename(columns={"diff": "diff (K)"}, inplace=True)
    else:
        df["diff"] = df["diff"].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] +
        [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(10)
        ])
    return output_table
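# top_ten_values and bottom_ten_prog build their Dash tables identically; a
# shared helper along these lines could remove the duplication (a sketch under
# that assumption, not part of the current codebase):
def dataframe_to_html_table(df, n_rows=10):
    """Render the first n_rows of a DataFrame as a Dash html.Table."""
    header = html.Tr([html.Th(capitalize(col)) for col in df.columns])
    body = [
        html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
        for i in range(min(n_rows, len(df)))
    ]
    return html.Table([header] + body)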