def test_lbnl_tracking_the_sun_2019():
    # config
    config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE)
    database_name = config.tracking_the_sun_db_name
    table_name = "tracking_the_sun_2019"

    # Partitions
    glue = OEDIGlue(region_name=config.region_name)
    partition_keys = glue.get_partition_keys(database_name,
                                             table_name,
                                             with_pandas=False)
    assert len(partition_keys) == 1
    assert partition_keys[0]["Name"] == "state"

    partition_values = glue.get_partition_values(database_name, table_name)
    if "__HIVE_DEFAULT_PARTITION__" in partition_values:
        partition_values.remove("__HIVE_DEFAULT_PARTITION__")
    assert len(partition_values) == 28

    # Query
    athena = OEDIAthena(staging_location=config.staging_location,
                        region_name=config.region_name)
    sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE state='CO';"
    result = athena.run_query(query_string=sql, pandas_cursor=False)
    assert result[0][0] == 93930

    sql = f"SELECT * FROM {database_name}.{table_name} WHERE state='CO' AND system_id_from_first_data_provider='SRO00164';"
    df = athena.run_query(query_string=sql, pandas_cursor=True)
    assert df.shape == (2, 60)
    assert pytest.approx(df["total_installed_price"].values[0]) == 22504.46
def test_nrel_pv_rooftops_aspects():
    # config
    config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE)
    database_name = config.pv_rooftops_db_name
    table_name = "pv_rooftop_aspects"

    # Partitions
    glue = OEDIGlue(region_name=config.region_name)
    partition_keys = glue.get_partition_keys(database_name,
                                             table_name,
                                             with_pandas=False)
    assert len(partition_keys) == 1
    assert partition_keys[0]["Name"] == "city_year"

    partition_values = glue.get_partition_values(database_name, table_name)
    if "__HIVE_DEFAULT_PARTITION__" in partition_values:
        partition_values.remove("__HIVE_DEFAULT_PARTITION__")
    assert len(partition_values) == 168

    # Query
    athena = OEDIAthena(staging_location=config.staging_location,
                        region_name=config.region_name)
    sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='augusta_me_08';"
    result = athena.run_query(query_string=sql, pandas_cursor=False)
    assert result[0][0] == 197547

    sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08' AND gid=53283;"
    df = athena.run_query(query_string=sql, pandas_cursor=True)
    assert df.shape == (1, 9)
    assert df["aspect"].values[0] == 7
def test_nrel_pv_rooftops_buildings():
    # config
    config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE)
    database_name = config.pv_rooftops_db_name
    table_name = "pv_rooftop_buildings"

    # Partitions
    glue = OEDIGlue(region_name=config.region_name)
    partition_keys = glue.get_partition_keys(database_name,
                                             table_name,
                                             with_pandas=False)
    assert len(partition_keys) == 1
    assert partition_keys[0]["Name"] == "city_year"

    partition_values = glue.get_partition_values(database_name, table_name)
    if "__HIVE_DEFAULT_PARTITION__" in partition_values:
        partition_values.remove("__HIVE_DEFAULT_PARTITION__")
    assert len(partition_values) == 168

    # Query
    athena = OEDIAthena(staging_location=config.staging_location,
                        region_name=config.region_name)
    sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='augusta_me_08';"
    result = athena.run_query(query_string=sql, pandas_cursor=False)
    assert result[0][0] == 17431

    sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08' AND gid=16039;"
    df = athena.run_query(query_string=sql, pandas_cursor=True)
    assert df.shape == (1, 8)
    assert df["the_geom_4326"].values[0].startswith(
        "MULTIPOLYGON(((-69.7528548573896 44.3860361743938,")
def test_nrel_pv_rooftops_rasd():
    # config
    config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE)
    database_name = config.pv_rooftops_db_name
    table_name = "pv_rooftop_rasd"

    # Partitions
    glue = OEDIGlue(region_name=config.region_name)
    partition_keys = glue.get_partition_keys(database_name,
                                             table_name,
                                             with_pandas=False)
    assert len(partition_keys) == 1
    assert partition_keys[0]["Name"] == "city_year"

    partition_values = glue.get_partition_values(database_name, table_name)
    if "__HIVE_DEFAULT_PARTITION__" in partition_values:
        partition_values.remove("__HIVE_DEFAULT_PARTITION__")
    assert len(partition_values) == 166

    # Query
    athena = OEDIAthena(staging_location=config.staging_location,
                        region_name=config.region_name)
    sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE city_year='topeka_ks_08';"
    result = athena.run_query(query_string=sql, pandas_cursor=False)
    assert result[0][0] == 1

    sql = f"SELECT * FROM {database_name}.{table_name} WHERE city_year='augusta_me_08'"
    df = athena.run_query(query_string=sql, pandas_cursor=True)
    assert df.shape == (1, 7)
    assert df["the_geom_96703"].values[0].startswith(
        "MULTIPOLYGON Z (((2059331.26461854 2664063.99720305 86.4199981689453,"
    )


#test_nrel_pv_rooftops_aspects()
def test_nrel_pv_rooftops_developable_planes():
    # config
    config = AWSDataLakeConfig(config_file=TEST_CONFIG_FILE)
    database_name = config.database_name
    table_name = "pv_rooftops_developable_planes"

    # Partitions
    glue = OEDIGlue(region_name=config.region_name)
    partition_keys = glue.get_partition_keys(database_name,
                                             table_name,
                                             with_pandas=False)
    assert len(partition_keys) == 1
    assert partition_keys[0]["Name"] == "partition_0"

    partition_values = glue.get_partition_values(database_name, table_name)
    if "__HIVE_DEFAULT_PARTITION__" in partition_values:
        partition_values.remove("__HIVE_DEFAULT_PARTITION__")
    assert len(partition_values) == 166

    # Query
    athena = OEDIAthena(staging_location=config.staging_location,
                        region_name=config.region_name)
    sql = f"SELECT COUNT(*) FROM {database_name}.{table_name} WHERE partition_0='topeka_ks_08';"
    result = athena.run_query(query_string=sql, pandas_cursor=False)
    assert result[0][0] == 537304

    sql = f"SELECT * FROM {database_name}.{table_name} WHERE partition_0='augusta_me_08' AND gid=15924;"
    df = athena.run_query(query_string=sql, pandas_cursor=True)
    assert df.shape == (1, 18)
    assert df["slope"].values[0] == 26
def test_oedi_glue__get_databasses():
    # Setup
    client = boto3.client("glue", region_name="us-west-1")

    database_name = "test-database"
    client.create_database(DatabaseInput={"Name": database_name})

    # Test
    glue = OEDIGlue()
    response = glue.get_databases()
    assert len(response) == 1

    db = response[0]
    assert db["Name"] == database_name
    assert db["CreateTime"] is not None
def test_oedi_glue__get_table():
    # Setup
    client = boto3.client("glue", region_name="us-west-1")

    database_name = "test-database"
    client.create_database(DatabaseInput={"Name": database_name})

    table_name = "test-table"
    partition_keys = [{"Name": "state"}, {"Name": "city"}]
    table_input = create_table_input(database_name, table_name, partition_keys)
    client.create_table(DatabaseName=database_name, TableInput=table_input)

    # Test
    glue = OEDIGlue()
    table = glue.get_table(database_name, table_name)
    assert table["DatabaseName"] == "test-database"
    assert table["Name"] == "test-table"
    assert table["PartitionKeys"] == partition_keys
def test_oedi_glue__list_tables():
    # Setup
    client = boto3.client("glue", region_name="us-west-1")

    database_name = "test-database"
    client.create_database(DatabaseInput={"Name": database_name})

    table_name = "test-table-1"
    table_input = create_table_input(database_name, table_name)
    client.create_table(DatabaseName=database_name, TableInput=table_input)

    table_name = "test-table-2"
    table_input = create_table_input(database_name, table_name)
    client.create_table(DatabaseName=database_name, TableInput=table_input)

    # Test
    glue = OEDIGlue()
    tables = glue.list_tables(database_name)
    assert len(tables) == 2
    assert tables[0]["CreateTime"] == ""
def test_oedi_glue__get_partition_values():
    # Setup
    client = boto3.client("glue", region_name="us-west-1")

    database_name = "test-database"
    client.create_database(DatabaseInput={"Name": database_name})

    table_name = "test-table"
    my_partition_keys = [{
        "Name": "state"
    }, {
        "Name": "city"
    }, {
        "Name": "zipcode"
    }]
    table_input = create_table_input(database_name, table_name,
                                     my_partition_keys)
    client.create_table(DatabaseName=database_name, TableInput=table_input)

    # Test
    glue = OEDIGlue()
    values = glue.get_partition_values(database_name, table_name)
    assert len(values) == 0
def test_oedi_glue__get_table_columns():
    # Setup
    client = boto3.client("glue", region_name="us-west-1")

    database_name = "test-database"
    client.create_database(DatabaseInput={"Name": database_name})

    table_name = "test-table"
    mycolumns = [{
        "Name": "state",
        "Type": "string"
    }, {
        "Name": "size",
        "Type": "integer"
    }]
    table_input = create_table_input(database_name,
                                     table_name,
                                     columns=mycolumns)
    client.create_table(DatabaseName=database_name, TableInput=table_input)

    # Test
    glue = OEDIGlue()
    columns = glue.get_table_columns(database_name, table_name)
    assert len(columns) == 2
Beispiel #11
0
import time

import click
from prettytable import PrettyTable

from oedi.config import AWSDataLakeConfig
from oedi.AWS.glue import OEDIGlue

config = AWSDataLakeConfig()
glue = OEDIGlue(region_name=config.region_name)


@click.command()
def list_databases():
    """List available databases"""
    databases = glue.get_databases()

    pretty_table = PrettyTable()
    pretty_table.field_names = ["No.", "Name", "CreateTime"]
    for i, db in enumerate(databases):
        pretty_table.add_row([i, db["Name"], db["CreateTime"]])

    print("All available databaes are:")
    print(pretty_table)


@click.command()
@click.option("-d",
              "--database-name",
              type=click.STRING,
              required=False,
Beispiel #12
0
def get_glue_client():
    config = AWSDataLakeConfig()
    glue = OEDIGlue(region_name=config.region_name)
    return glue
def test_oedi_glue__get_crawler_state(mock_get_crawler):
    glue = OEDIGlue(config_file=OEDI_TEST_CONFIG_FILE)
    crawlers = glue.list_crawlers()
    assert len(crawlers) == 2
def test_oedi_glue__get_crawler(mock_get_crawler):
    glue = OEDIGlue()
    crawler_name = "bucket-name-folder2-another-dataset"
    crawler = glue.get_crawler(crawler_name)["Crawler"]
    assert crawler["Name"] == crawler_name
def test_oedi_glue__get_crawler(mock_get_crawler):
    glue = OEDIGlue()
    crawler_name = "bucket-name-data-one-part-a"
    crawler = glue.get_crawler(crawler_name)["Crawler"]
    assert crawler["Name"] == crawler_name