Example #1
0
def load(date=None):
    """
    Load the CSV file containing security data into the database.

    :param str date: date of security data collection, optional (default None)
    """
    if not date:
        date = datetime.today().strftime('%Y/%m/%d')
    else:
        date = utils.format_date(date)

    # collect functions parameters from environment variables
    params = config.get_environment_variables()
    bucket = params['AWS']['data_bucket']
    profile = params['AWS']['profile']
    con_params = params['DATABASE']
    database = con_params['database']
    table_name = con_params['table']

    # create table if exists
    logging.info(f"creating table '{table_name}' if not exists")
    con = loading.get_connection(con_params)
    loading.execute_sqls([(CREATE_TABLE.format(table_name), ())], con)

    # to make the 'load' step idempotent, we check if data for the specified
    # date is already in the database, if it is we either stop or delete and
    # load again
    con = loading.get_connection(con_params)
    loading.check_table_for_loaded_data(table_name, date, con)

    # download CSV file to be loaded and remove header
    csv_key = aws.get_s3_key(SECURITY_REPORT_S3_PREFIX,
                             SECURITY_REPORT_S3_SUFFIX, date)
    logging.info(f"downloading '{csv_key}' from S3 bucket '{bucket}'")
    csv_data = aws.download_s3_object(bucket, csv_key, profile)
    csv_no_header = utils.remove_header(csv_data.decode())

    # load the whole CSV at once
    logging.info(f"loading data into database '{database}' "
                 f"in table '{table_name}'")
    con = loading.get_connection(con_params)
    loading.copy_into(file_object=StringIO(csv_no_header),
                      table_name=table_name,
                      connection=con)
    logging.info('loading finished')

    logging.info('creating clean view of data if no exists')
    con = loading.get_connection(con_params)
    loading.execute_sqls(
        [(CREATE_NO_NULL_VIEW.format(table_name=table_name), ())],
        con,
    )

    logging.info('refreshing clean view of data')
    con = loading.get_connection(con_params)
    loading.execute_sqls(
        [(REFRESH_NO_NULL_VIEW.format(table_name=table_name), ())],
        con,
    )
Example #2
0
def query_db(sql, db_params):
    con = loading.get_connection(db_params)
    cur = con.cursor()
    try:
        cur.execute(sql)
        result = [row for row in cur]
    finally:
        con.close()
    return result
Example #3
0
def get_all_data():
    db_params = config.get_db_env_vars()
    con = loading.get_connection(db_params)
    try:
        data = pd.read_sql(
            get_all_data_sql.format(db_params["table"]),
            con)
    finally:
        con.close()
    data.set_index(pd.to_datetime(data["collection_date"]), inplace=True)
    return data
 def test_get_connection(self, connect_mock):
     connect_mock.return_value = 'engine'
     parameters = {
         'database': 'database',
         'db_user': '******',
         'password': '******',
         'host': 'host',
         'port': 'port'
     }
     engine = get_connection(parameters)
     self.assertEqual(engine, 'engine')
     connect_mock.assert_called_with(database='database',
                                     user='******',
                                     password='******',
                                     host='host',
                                     port='port')
Example #5
0
def top_ten_values(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(top_val_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["yield_percent"] = df["yield_percent"] * 100
    if max(df[attribute]) > 1e6:
        df[attribute] = (df[attribute] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={attribute: f"{attribute} (M)"}, inplace=True)
    else:
        df[attribute] = df[attribute].round(5)
    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] + [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(10)
        ])
    return output_table
Example #6
0
def bottom_ten_prog(attribute):
    con = loading.get_connection(config.get_db_env_vars())
    df = pd.read_sql(bottom_prog_sql.format(attribute=attribute), con)
    con.close()
    if attribute == "yield_percent":
        df["diff"] = df["diff"] * 100
    if max(df["diff"]) > 1e6:
        df["diff"] = (df["diff"] / 1e6).round(5).apply("{:,}".format)
        df.rename(columns={"diff": f"diff (M)"}, inplace=True)
    elif max(df["diff"]) > 1e3:
        df["diff"] = (df["diff"] / 1e3).round(5).apply("{:,}".format)
        df.rename(columns={"diff": f"diff (K)"}, inplace=True)
    else:
        df["diff"] = df["diff"].round(5)

    output_table = html.Table(
        [html.Tr([html.Th(capitalize(col)) for col in df.columns])] + [
            html.Tr([html.Td(df.iloc[i][col]) for col in df.columns])
            for i in range(10)
        ])
    return output_table