def test_lbnl_tracking_the_sun_2019(): # config config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE) database_name = config.tracking_the_sun_db_name table_name = "tracking_the_sun_2019" # Partitions glue = OEDIGlue(region_name=config.region_name) partition_keys = glue.get_partition_keys(database_name, table_name, with_pandas=False) assert len(partition_keys) == 1 assert partition_keys[0]["Name"] == "state" partition_values = glue.get_partition_values(database_name, table_name) if "__HIVE_DEFAULT_PARTITION__" in partition_values: partition_values.remove("__HIVE_DEFAULT_PARTITION__") assert len(partition_values) == 28 # Query athena = OEDIAthena(staging_location=config.staging_location, region_name=config.region_name) sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE state='CO';" result = athena.run_query(query_string=sql, pandas_cursor=False) assert result[0][0] == 93930 sql = f"SELECT * FROM {database_name}.{table_name} WHERE state='CO' AND system_id_from_first_data_provider='SRO00164';" df = athena.run_query(query_string=sql, pandas_cursor=True) assert df.shape == (2, 60) assert pytest.approx(df["total_installed_price"].values[0]) == 22504.46
def test_nrel_pv_rooftops_aspects(): # config config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE) database_name = config.pv_rooftops_db_name table_name = "pv_rooftop_aspects" # Partitions glue = OEDIGlue(region_name=config.region_name) partition_keys = glue.get_partition_keys(database_name, table_name, with_pandas=False) assert len(partition_keys) == 1 assert partition_keys[0]["Name"] == "city_year" partition_values = glue.get_partition_values(database_name, table_name) if "__HIVE_DEFAULT_PARTITION__" in partition_values: partition_values.remove("__HIVE_DEFAULT_PARTITION__") assert len(partition_values) == 168 # Query athena = OEDIAthena(staging_location=config.staging_location, region_name=config.region_name) sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='augusta_me_08';" result = athena.run_query(query_string=sql, pandas_cursor=False) assert result[0][0] == 197547 sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08' AND gid=53283;" df = athena.run_query(query_string=sql, pandas_cursor=True) assert df.shape == (1, 9) assert df["aspect"].values[0] == 7
def test_nrel_pv_rooftops_buildings(): # config config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE) database_name = config.pv_rooftops_db_name table_name = "pv_rooftop_buildings" # Partitions glue = OEDIGlue(region_name=config.region_name) partition_keys = glue.get_partition_keys(database_name, table_name, with_pandas=False) assert len(partition_keys) == 1 assert partition_keys[0]["Name"] == "city_year" partition_values = glue.get_partition_values(database_name, table_name) if "__HIVE_DEFAULT_PARTITION__" in partition_values: partition_values.remove("__HIVE_DEFAULT_PARTITION__") assert len(partition_values) == 168 # Query athena = OEDIAthena(staging_location=config.staging_location, region_name=config.region_name) sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='augusta_me_08';" result = athena.run_query(query_string=sql, pandas_cursor=False) assert result[0][0] == 17431 sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08' AND gid=16039;" df = athena.run_query(query_string=sql, pandas_cursor=True) assert df.shape == (1, 8) assert df["the_geom_4326"].values[0].startswith( "MULTIPOLYGON(((-69.7528548573896 44.3860361743938,")
def test_nrel_pv_rooftops_rasd(): # config config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE) database_name = config.pv_rooftops_db_name table_name = "pv_rooftop_rasd" # Partitions glue = OEDIGlue(region_name=config.region_name) partition_keys = glue.get_partition_keys(database_name, table_name, with_pandas=False) assert len(partition_keys) == 1 assert partition_keys[0]["Name"] == "city_year" partition_values = glue.get_partition_values(database_name, table_name) if "__HIVE_DEFAULT_PARTITION__" in partition_values: partition_values.remove("__HIVE_DEFAULT_PARTITION__") assert len(partition_values) == 166 # Query athena = OEDIAthena(staging_location=config.staging_location, region_name=config.region_name) sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='topeka_ks_08';" result = athena.run_query(query_string=sql, pandas_cursor=False) assert result[0][0] == 1 sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08'" df = athena.run_query(query_string=sql, pandas_cursor=True) assert df.shape == (1, 7) assert df["the_geom_96703"].values[0].startswith( "MULTIPOLYGON Z (((2059331.26461854 2664063.99720305 86.4199981689453," ) #test_nrel_pv_rooftops_aspects()
def test_nrel_pv_rooftops_developable_planes(): # config config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE) database_name = config.database_name table_name = "pv_rooftops_developable_planes" # Partitions glue = OEDIGlue(region_name=config.region_name) partition_keys = glue.get_partition_keys(database_name, table_name, with_pandas=False) assert len(partition_keys) == 1 assert partition_keys[0]["Name"] == "partition_0" partition_values = glue.get_partition_values(database_name, table_name) if "__HIVE_DEFAULT_PARTITION__" in partition_values: partition_values.remove("__HIVE_DEFAULT_PARTITION__") assert len(partition_values) == 166 # Query athena = OEDIAthena(staging_location=config.staging_location, region_name=config.region_name) sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE partition_0='topeka_ks_08';" result = athena.run_query(query_string=sql, pandas_cursor=False) assert result[0][0] == 537304 sql = f"SELECT * FROM {database_name}.{table_name} WHERE partition_0='augusta_me_08' AND gid=15924;" df = athena.run_query(query_string=sql, pandas_cursor=True) assert df.shape == (1, 18) assert df["slope"].values[0] == 26
def test_oedi_glue__get_databasses(): # Setup client = boto3.client("glue", region_name="us-west-1") database_name = "test-database" client.create_database(DatabaseInput={"Name": database_name}) # Test glue = OEDIGlue() response = glue.get_databases() assert len(response) == 1 db = response[0] assert db["Name"] == database_name assert db["CreateTime"] is not None
def test_oedi_glue__get_table(): # Setup client = boto3.client("glue", region_name="us-west-1") database_name = "test-database" client.create_database(DatabaseInput={"Name": database_name}) table_name = "test-table" partition_keys = [{"Name": "state"}, {"Name": "city"}] table_input = create_table_input(database_name, table_name, partition_keys) client.create_table(DatabaseName=database_name, TableInput=table_input) # Test glue = OEDIGlue() table = glue.get_table(database_name, table_name) assert table["DatabaseName"] == "test-database" assert table["Name"] == "test-table" assert table["PartitionKeys"] == partition_keys
def test_oedi_glue__list_tables(): # Setup client = boto3.client("glue", region_name="us-west-1") database_name = "test-database" client.create_database(DatabaseInput={"Name": database_name}) table_name = "test-table-1" table_input = create_table_input(database_name, table_name) client.create_table(DatabaseName=database_name, TableInput=table_input) table_name = "test-table-2" table_input = create_table_input(database_name, table_name) client.create_table(DatabaseName=database_name, TableInput=table_input) # Test glue = OEDIGlue() tables = glue.list_tables(database_name) assert len(tables) == 2 assert tables[0]["CreateTime"] == ""
def test_oedi_glue__get_partition_values(): # Setup client = boto3.client("glue", region_name="us-west-1") database_name = "test-database" client.create_database(DatabaseInput={"Name": database_name}) table_name = "test-table" my_partition_keys = [{ "Name": "state" }, { "Name": "city" }, { "Name": "zipcode" }] table_input = create_table_input(database_name, table_name, my_partition_keys) client.create_table(DatabaseName=database_name, TableInput=table_input) # Test glue = OEDIGlue() values = glue.get_partition_values(database_name, table_name) assert len(values) == 0
def test_oedi_glue__get_table_columns(): # Setup client = boto3.client("glue", region_name="us-west-1") database_name = "test-database" client.create_database(DatabaseInput={"Name": database_name}) table_name = "test-table" mycolumns = [{ "Name": "state", "Type": "string" }, { "Name": "size", "Type": "integer" }] table_input = create_table_input(database_name, table_name, columns=mycolumns) client.create_table(DatabaseName=database_name, TableInput=table_input) # Test glue = OEDIGlue() columns = glue.get_table_columns(database_name, table_name) assert len(columns) == 2
import time import click from prettytable import PrettyTable from oedi.config import AWSDataLakeConfig from oedi.AWS.glue import OEDIGlue config = AWSDataLakeConfig() glue = OEDIGlue(region_name=config.region_name) @click.command() def list_databases(): """List available databases""" databases = glue.get_databases() pretty_table = PrettyTable() pretty_table.field_names = ["No.", "Name", "CreateTime"] for i, db in enumerate(databases): pretty_table.add_row([i, db["Name"], db["CreateTime"]]) print("All available databaes are:") print(pretty_table) @click.command() @click.option("-d", "--database-name", type=click.STRING, required=False,
def get_glue_client(): config = AWSDataLakeConfig() glue = OEDIGlue(region_name=config.region_name) return glue
def test_oedi_glue__get_crawler_state(mock_get_crawler): glue = OEDIGlue(config_file=OEDI_TEST_CONFIG_FILE) crawlers = glue.list_crawlers() assert len(crawlers) == 2
def test_oedi_glue__get_crawler(mock_get_crawler): glue = OEDIGlue() crawler_name = "bucket-name-folder2-another-dataset" crawler = glue.get_crawler(crawler_name)["Crawler"] assert crawler["Name"] == crawler_name
def test_oedi_glue__get_crawler(mock_get_crawler): glue = OEDIGlue() crawler_name = "bucket-name-data-one-part-a" crawler = glue.get_crawler(crawler_name)["Crawler"] assert crawler["Name"] == crawler_name