import os
import logging
from pyspark.sql.types import StructField, StructType, StringType
from constants import dict_dbs_locations, edge_node_path
from helper_functions.initialize_spark_session import initialize_spark_session


def load_lz_standard_lookup(table_name):
    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')

    # Initializing a Spark session
    spark = initialize_spark_session('load_lz' + table_name)

    # Loading the standard lookups with the same schema in landing_zone
    try:

        # Standard schema for the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])

        df_file = spark \
            .read \
            .schema(schema_lookups_schema) \
            .option("header", "true") \
            .csv(os.path.join(edge_node_path, table_name, '*.csv'))

        df_file.write.format("csv") \
            .mode("overwrite") \
            .option("sep", ",") \
            .option('header', 'true') \
            .save(os.path.join(landing_zone_location, table_name))

        logging.info(f'{table_name} has been loaded in the landing zone.')

    except Exception as e:
        logging.error(f"Failed to load {table_name} in the landing zone,{e}")
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session


def load_l_airport(spark, integration_layer_loc, landing_zone_name):
    delta_l_airport = DeltaTable.forPath(spark,
                                         integration_layer_loc + '/L_AIRPORT')

    df_LZ_l_airport = spark.sql(f"""
        SELECT 
        CODE
        ,DESCRIPTION
        FROM {landing_zone_name}.L_AIRPORT
    """)

    delta_l_airport.alias("oldData") \
        .merge(df_LZ_l_airport.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"), "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()
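

# A shorthand variant (a sketch, not in the original example): updateAll and
# insertAll copy every column whose name matches across both sides, which here
# is exactly CODE and DESCRIPTION, so this behaves the same as the explicit
# merge in load_l_airport above.
def load_l_airport_update_all(spark, integration_layer_loc, landing_zone_name):
    delta_l_airport = DeltaTable.forPath(spark,
                                         integration_layer_loc + '/L_AIRPORT')

    df_lz_l_airport = spark.sql(
        f'SELECT CODE, DESCRIPTION FROM {landing_zone_name}.L_AIRPORT')

    delta_l_airport.alias("oldData") \
        .merge(df_lz_l_airport.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()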


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_airport')
    from delta.tables import *

    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')

    load_l_airport(spark, integration_layer_loc, landing_zone_name)
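
# For reference, a plausible shape for the constants consumed above (an
# assumption; the real values live in constants.py and depend on the
# deployment, so the paths below are purely illustrative):
#
# dict_dbs_locations = {
#     'LANDING_ZONE_LOC': 'hdfs:///data/landing_zone',
#     'INTEGRATION_LAYER_LOC': 'hdfs:///data/integration_layer',
#     'PRESENTATION_LAYER_LOC': 'hdfs:///data/presentation_layer',
# }
# dict_dbs_names = {
#     'LANDING_ZONE_NAME': 'landing_zone',
#     'INTEGRATION_LAYER_NAME': 'integration_layer',
#     'PRESENTATION_LAYER_NAME': 'presentation_layer',
# }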
import os
import logging
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
# Assumed module path, mirroring sql_queries.landing_zone_ddl used elsewhere.
from sql_queries.presentation_layer_ddl import ddl_create_presentation_layer_db

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_presentation_layer')

    from delta.tables import *

    # Creating the presentation_layer database in spark sql
    try:

        db_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('PRESENTATION_LAYER_LOC')

        spark.sql(
            ddl_create_presentation_layer_db.format(
                presentation_layer_db_name=db_name,
                presentation_layer_db_loc=db_loc))

        spark.sql(f'USE {db_name}')

        logging.info(f'{db_name} has been created.')

    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql: {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}: {e}')
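

# For reference, a plausible shape for the DDL template formatted above (an
# assumption; the real string lives in the sql_queries package and is not
# shown in this example):
#
# ddl_create_presentation_layer_db = """
#     CREATE DATABASE IF NOT EXISTS {presentation_layer_db_name}
#     LOCATION '{presentation_layer_db_loc}'
# """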


def load_pl_city_demographics(spark, presentation_layer_name,
                              presentation_layer_loc, integration_layer_name):
    try:
        # (The loading logic was cut off in the original example; it
        # presumably merges the integration layer data into the
        # CITY_DEMOGRAPHICS Delta table, as in the L_AIRPORT example above.)
        ...

        logging.info(
            'CITY_DEMOGRAPHICS has been loaded in the Presentation layer')

    except Exception as e:
        logging.error(
            'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer')
        spark.stop()
        raise Exception(
            f'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer: {e}')


if __name__ == '__main__':
    spark = initialize_spark_session('load_pl_city_demographics')
    from delta.tables import *

    try:

        presentation_layer_loc = dict_dbs_locations.get(
            'PRESENTATION_LAYER_LOC')
        presentation_layer_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')

    except Exception as e:
        logging.error('Failed to retrieve Environment variables')
        spark.stop()
        raise Exception(
            f'Failed to retrieve the environment variables: {e}')

    load_pl_city_demographics(spark, presentation_layer_name,
                              presentation_layer_loc, integration_layer_name)
import logging
import os
from pyspark.sql.types import StructField, StructType, StringType
from constants import dict_dbs_locations, edge_node_path
from sql_queries.landing_zone_ddl import list_landing_zone_standard_lookups
from helper_functions.zip_csv_to_gzip_parquet import zip_csv_to_gzip_parquet
from helper_functions.loop_files import loop_files
from helper_functions.initialize_spark_session import initialize_spark_session

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':

    # Initializing a Spark session
    spark = initialize_spark_session('load_landing_zone')

    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')

    # Loading the standard lookups with the same schema in landing_zone
    try:

        # Standard schema for the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])

        # Loops over all the standard lookups to load them
        for table_name in list_landing_zone_standard_lookups:
            df_file = spark \
                .read \
                .schema(schema_lookups_schema) \
                .option("header", "true") \
                .csv(os.path.join(edge_node_path, table_name, '*.csv'))
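            # Plausible write step (a sketch; the original loop body was cut
            # off here, and may instead use the zip_csv_to_gzip_parquet helper
            # imported above):
            df_file.write.format("csv") \
                .mode("overwrite") \
                .option("sep", ",") \
                .option('header', 'true') \
                .save(os.path.join(landing_zone_location, table_name))

            logging.info(f'{table_name} has been loaded in the landing zone.')

    # Handler reconstructed to close the try above, mirroring the
    # single-table loader earlier in this listing.
    except Exception as e:
        logging.error(f'Failed to load the standard lookups in the landing '
                      f'zone: {e}')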
import logging
from sql_queries.landing_zone_ddl import ddl_create_land_zone_db, dict_landing_zone_ddls
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':

    spark = initialize_spark_session('create_landing_zone')

    # Creating the landing_zone database in spark sql
    try:

        db_name = dict_dbs_names.get('LANDING_ZONE_NAME')
        db_loc = dict_dbs_locations.get('LANDING_ZONE_LOC')

        spark.sql(
            ddl_create_land_zone_db.format(landing_zone_db_name=db_name,
                                           landing_zone_db_loc=db_loc))

        logging.info(f'{db_name} has been created.')

    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql: {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}: {e}')

    # Creating the landing zone tables
    try:
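        # Sketch of the elided loop body (an assumption; the original example
        # was cut off here). Each table DDL registered in
        # dict_landing_zone_ddls is presumably executed via spark.sql; any
        # format parameters the DDLs expect are not visible in this scrape.
        for table_name, ddl in dict_landing_zone_ddls.items():
            spark.sql(ddl)
            logging.info(f'{table_name} has been created.')

    except Exception as e:
        logging.error(f'Failed to create the landing zone tables: {e}')
        spark.stop()
        raise Exception(f'Failed to create the landing zone tables, {e}')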
import os
import logging
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
# Assumed module path, mirroring sql_queries.landing_zone_ddl used above.
from sql_queries.integration_layer_ddl import ddl_create_integration_layer_db

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_integration_layer')

    from delta.tables import *

    # Creating the integration_layer database in spark sql
    try:

        db_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')

        spark.sql(
            ddl_create_integration_layer_db.format(
                integration_layer_db_name=db_name,
                integration_layer_db_loc=db_loc))

        spark.sql(f'USE {db_name}')

        logging.info(f'{db_name} has been created.')

    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql: {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}: {e}')
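

# For context, a minimal sketch of the initialize_spark_session helper used by
# every example above (an assumption; the real helper lives in
# helper_functions and is not shown in this scrape). Wiring the Delta
# extension into the session config would also explain why the examples defer
# `from delta.tables import *` until after the session exists.
from pyspark.sql import SparkSession


def initialize_spark_session_sketch(app_name):
    return SparkSession.builder \
        .appName(app_name) \
        .config("spark.sql.extensions",
                "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .enableHiveSupport() \
        .getOrCreate()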