import logging
import os

from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.types import StructField, StructType, StringType
from constants import dict_dbs_locations, edge_node_path


def load_lz_standard_lookup(table_name):
    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')
    # Initializing a Spark session
    spark = initialize_spark_session('load_lz_' + table_name)
    # Loading a standard lookup with the shared schema into the landing zone
    try:
        # Standard schema shared by all the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])
        df_file = spark \
            .read \
            .schema(schema_lookups_schema) \
            .option("header", "true") \
            .csv(os.path.join(edge_node_path, table_name, '*.csv'))
        df_file.write.format("csv") \
            .mode("overwrite") \
            .option("sep", ",") \
            .option("header", "true") \
            .save(os.path.join(landing_zone_location, table_name))
        logging.info(f'{table_name} has been loaded in the landing zone.')
    except Exception as e:
        logging.error(f"Failed to load {table_name} in the landing zone, {e}")
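# initialize_spark_session is imported throughout this project but its body is
# not shown in this section. A minimal sketch of what such a helper might look
# like, assuming the project builds sessions with Hive and Delta Lake support
# enabled (the actual configuration in
# helper_functions/initialize_spark_session.py may differ):
from pyspark.sql import SparkSession


def initialize_spark_session(app_name):
    # Assumed sketch: Hive support backs the spark.sql() DDL calls, and the
    # Delta extensions back DeltaTable.forPath and the "delta" write format.
    return SparkSession.builder \
        .appName(app_name) \
        .config("spark.sql.extensions",
                "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog",
                "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .enableHiveSupport() \
        .getOrCreate()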
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_ontime_delay_groups(spark, integration_layer_loc, landing_zone_name):
    delta_l_ontime_delay_groups = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_ONTIME_DELAY_GROUPS')
    df_LZ_l_ontime_delay_groups = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_ONTIME_DELAY_GROUPS
    """)
    delta_l_ontime_delay_groups.alias("oldData") \
        .merge(df_LZ_l_ontime_delay_groups.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_ontime_delay_groups')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_ontime_delay_groups(spark, integration_layer_loc, landing_zone_name)
import logging
from sql_queries.presentation_layer_ddl import ddl_create_presentation_layer_db, dict_pl_non_partitioned_tables, \
    schema_calendar, schema_flights
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
import os
from pyspark.sql.functions import col

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_presentation_layer')
    from delta.tables import *
    # Creating the presentation_layer database in spark sql
    try:
        db_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('PRESENTATION_LAYER_LOC')
        spark.sql(
            ddl_create_presentation_layer_db.format(
                presentation_layer_db_name=db_name,
                presentation_layer_db_loc=db_loc))
        spark.sql(f'USE {db_name}')
        logging.info(f'{db_name} has been created.')
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_weekdays(spark, integration_layer_loc, landing_zone_name):
    delta_l_weekdays = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_WEEKDAYS')
    df_LZ_l_weekdays = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_WEEKDAYS
    """)
    delta_l_weekdays.alias("oldData") \
        .merge(df_LZ_l_weekdays.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_weekdays')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_weekdays(spark, integration_layer_loc, landing_zone_name)
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_world_area_codes(spark, integration_layer_loc, landing_zone_name):
    delta_l_world_area_codes = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_WORLD_AREA_CODES')
    df_LZ_l_world_area_codes = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_WORLD_AREA_CODES
    """)
    delta_l_world_area_codes.alias("oldData") \
        .merge(df_LZ_l_world_area_codes.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_il_l_world_area_codes')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_world_area_codes(spark, integration_layer_loc, landing_zone_name)
import logging
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.types import StructField, StructType, StringType
from constants import dict_dbs_locations, edge_node_path
from sql_queries.landing_zone_ddl import list_landing_zone_standard_lookups
from helper_functions.zip_csv_to_gzip_parquet import zip_csv_to_gzip_parquet
from helper_functions.loop_files import loop_files
import os

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    # Initializing a Spark session
    spark = initialize_spark_session('load_landing_zone')
    landing_zone_location = dict_dbs_locations.get('LANDING_ZONE_LOC')
    # Loading the standard lookups with the same schema in landing_zone
    try:
        # Standard schema for the standard lookups
        schema_lookups_schema = StructType([
            StructField("Code", StringType(), True),
            StructField("Description", StringType(), True)
        ])
        # Loops over all the standard lookups to load them
        for table_name in list_landing_zone_standard_lookups:
            df_file = spark \
                .read \
                .schema(schema_lookups_schema) \
                .option("header", "true") \
                .csv(os.path.join(edge_node_path, table_name, '*.csv'))
import logging
from sql_queries.landing_zone_ddl import ddl_create_land_zone_db, dict_landing_zone_ddls
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_landing_zone')
    # Creating the landing_zone database in spark sql
    try:
        db_name = dict_dbs_names.get('LANDING_ZONE_NAME')
        db_loc = dict_dbs_locations.get('LANDING_ZONE_LOC')
        spark.sql(
            ddl_create_land_zone_db.format(landing_zone_db_name=db_name,
                                           landing_zone_db_loc=db_loc))
        logging.info(f'{db_name} has been created.')
    except Exception as e:
        logging.error(f'Failed to create the {db_name} db in spark sql, {e}')
        spark.stop()
        raise Exception(f'Failed to create the {db_name}, {e}')
    # Creating the landing zone tables
    try:
    except Exception as e:
        logging.error('Failed to load CALENDAR in the Presentation Layer')
        spark.stop()
        raise Exception(f'Failed to load CALENDAR in the Presentation Layer, {e}')


if __name__ == '__main__':
    start_date = None
    end_date = None
    # Finding and parsing the start_date & end_date args to be provided to the function
    for arg in sys.argv:
        if 'start_date' in arg:
            start_date = arg.split('=')[1]
        elif 'end_date' in arg:
            end_date = arg.split('=')[1]
    if (start_date is not None) and (end_date is not None):
        spark = initialize_spark_session('load_pl_calendar')
        from delta.tables import *
        presentation_layer_loc = dict_dbs_locations.get('PRESENTATION_LAYER_LOC')
        load_pl_calendar(spark, presentation_layer_loc, start_date, end_date)
    else:
        raise Exception(
            'start_date and end_date arguments are required to start loading, ex: start_date=2020-01-01 end_date=2022-12-31')
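# Example invocation of the calendar loader above, assuming the script is
# driven through spark-submit (the script filename is an assumption here):
#
#     spark-submit load_pl_calendar.py start_date=2020-01-01 end_date=2022-12-31
#
# The argv loop matches arguments by substring, so each key=value pair must be
# passed as its own argument, exactly in the key=value form shown.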
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_distance_group_250(spark, integration_layer_loc, landing_zone_name):
    delta_l_distance_group_250 = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_DISTANCE_GROUP_250')
    df_LZ_l_distance_group_250 = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_DISTANCE_GROUP_250
    """)
    delta_l_distance_group_250.alias("oldData") \
        .merge(df_LZ_l_distance_group_250.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_distance_group_250')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_distance_group_250(spark, integration_layer_loc, landing_zone_name)
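# All of the lookup loaders in this section repeat the same Delta Lake upsert:
# MERGE on CODE, update DESCRIPTION on a match, insert otherwise, which makes
# re-running a loader idempotent. A sketch of a shared helper the near-identical
# files could delegate to (upsert_lookup is hypothetical, not part of the repo):
from delta.tables import DeltaTable
from pyspark.sql.functions import col


def upsert_lookup(spark, delta_path, df_new):
    # Upsert a (CODE, DESCRIPTION) DataFrame into the Delta table at delta_path.
    delta_table = DeltaTable.forPath(spark, delta_path)
    delta_table.alias("oldData") \
        .merge(df_new.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()

# Illustrative behaviour on a throwaway table (hypothetical /tmp path):
#
#     spark.createDataFrame([(1, 'old')], ['CODE', 'DESCRIPTION']) \
#         .write.format('delta').save('/tmp/lookup_demo')
#     upsert_lookup(spark, '/tmp/lookup_demo',
#                   spark.createDataFrame([(1, 'new'), (2, 'added')],
#                                         ['CODE', 'DESCRIPTION']))
#     # Row 1's DESCRIPTION becomes 'new', row 2 is inserted; a second
#     # identical run changes nothing.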
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_months(spark, integration_layer_loc, landing_zone_name):
    delta_l_months = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_MONTHS')
    df_LZ_l_months = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_MONTHS
    """)
    delta_l_months.alias("oldData") \
        .merge(df_LZ_l_months.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_months')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_months(spark, integration_layer_loc, landing_zone_name)
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_state_fips(spark, integration_layer_loc, landing_zone_name):
    delta_l_state_fips = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_STATE_FIPS')
    df_LZ_l_state_fips = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_STATE_FIPS
    """)
    delta_l_state_fips.alias("oldData") \
        .merge(df_LZ_l_state_fips.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_il_l_state_fips')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_state_fips(spark, integration_layer_loc, landing_zone_name)
import logging
from helper_functions.initialize_spark_session import initialize_spark_session
from sql_queries.integration_layer_ddl import ddl_drop_integration_layer_db
from constants import dict_dbs_names

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s:%(levelname)s:%(message)s")

if __name__ == '__main__':
    spark = initialize_spark_session('drop_integration_layer')
    try:
        db_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
        spark.sql(
            ddl_drop_integration_layer_db.format(
                integration_layer_db_name=db_name))
        logging.info(f'The {db_name} Db has been Dropped')
    except Exception as e:
        logging.error(f"Failed to drop the {db_name}, {e}")
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_quarters(spark, integration_layer_loc, landing_zone_name):
    delta_l_quarters = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_QUARTERS')
    df_LZ_l_quarters = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_QUARTERS
    """)
    delta_l_quarters.alias("oldData") \
        .merge(df_LZ_l_quarters.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_quarters')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_quarters(spark, integration_layer_loc, landing_zone_name)
        0: 'NULL_COUNT'
    }, inplace=True)
    df_null_columns = pd_df_nulls[pd_df_nulls.NULL_COUNT != 0]
    if len(df_null_columns) > 0:
        raise Exception(
            f'{len(df_null_columns)} columns with NULLs exist in {schema_name}.{table_name}\n{df_null_columns.to_string()}'
        )
    else:
        logging.info(pd_df_nulls.to_string())


if __name__ == '__main__':
    spark = initialize_spark_session('spark_count_nulls')
    from delta.tables import *
    schema_name = None
    table_name = None
    query_args = None
    # Finding and parsing the schema_name, table_name and query_args arguments
    for arg in sys.argv:
        if 'schema_name' in arg:
            schema_name = arg.split('=')[1]
        if 'table_name' in arg:
            table_name = arg.split('=')[1]
        if 'query_args' in arg:
            query_args = arg.split('=', maxsplit=1)[1]
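# For context, a hedged sketch of how a per-column null-count frame like
# pd_df_nulls might be produced upstream of the rename above (count_nulls is
# hypothetical; the real query in this script is not shown in this section):
# count NULLs per column in Spark, then transpose the single-row result so
# pandas yields one row per column, with the counts in the column labeled 0.
from pyspark.sql.functions import col, count, when


def count_nulls(spark, schema_name, table_name):
    df = spark.table(f'{schema_name}.{table_name}')
    # One aggregate column per source column, counting its NULL rows
    agg = df.select([
        count(when(col(c).isNull(), c)).alias(c) for c in df.columns
    ])
    # Transposing turns the single row into an index of column names, with the
    # counts in pandas column 0 -- the column the rename relabels NULL_COUNT.
    return agg.toPandas().transpose()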
.format("delta") \ .mode("overwrite") \ .option("replaceWhere", f"FLIGHT_YEARMON = {yearmonth}") \ .save(pl_loc + '/FLIGHTS') logging.info('FLIGHTS has been loaded in the Presentation layer') except Exception as e: logging.error('Failed to load FLIGHTS in the Presentation Layer') spark.stop() raise Exception(f'Failed to load FLIGHTS in the Presentation Layer,{e}') if __name__ == '__main__': spark = initialize_spark_session('load_pl_flights') from delta.tables import * yearmonth = None # finding and parsing the yearmonth argument for flights table partition for arg in sys.argv: if 'yearmonth' in arg: yearmonth = arg.split('=')[1] if yearmonth is not None: presentation_layer_loc = dict_dbs_locations.get('PRESENTATION_LAYER_LOC') integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME') load_pl_flights(spark,presentation_layer_loc, integration_layer_name, yearmonth) else:
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_airport_seq_id(spark, integration_layer_loc, landing_zone_name):
    delta_l_airport_seq_id = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_AIRPORT_SEQ_ID')
    df_LZ_l_airport_seq_id = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_AIRPORT_SEQ_ID
    """)
    delta_l_airport_seq_id.alias("oldData") \
        .merge(df_LZ_l_airport_seq_id.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_airport_seq_id')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_airport_seq_id(spark, integration_layer_loc, landing_zone_name)
"STATE_ABR": col("newData.STATE_ABR"), "STATE_FIPS": col("newData.STATE_FIPS"), "STATE_NAME": col("newData.STATE_NAME"), "WAC_CODE": col("newData.WAC_CODE") }) \ .execute() logging.info('STATE has been loaded in the Presentation layer') except Exception as e: logging.error('Failed to load STATE in the Presentation Layer') spark.stop() raise Exception(f'Failed to load STATE in the Presentation Layer,{e}') if __name__ == '__main__': spark = initialize_spark_session('load_pl_state') from delta.tables import * try: presentation_layer_loc = dict_dbs_locations.get( 'PRESENTATION_LAYER_LOC') integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME') except Exception as e: logging.error('Failed to retrieve Environment variables') spark.stop() raise Exception(f'Failed to load STATE in the Presentation Layer,{e}') load_pl_state(spark, presentation_layer_loc, integration_layer_name)
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_yesno_resp(spark, integration_layer_loc, landing_zone_name):
    delta_l_yesno_resp = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_YESNO_RESP')
    df_LZ_l_yesno_resp = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_YESNO_RESP
    """)
    delta_l_yesno_resp.alias("oldData") \
        .merge(df_LZ_l_yesno_resp.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_yesno_resp')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_yesno_resp(spark, integration_layer_loc, landing_zone_name)
        df_LZ_city = df_LZ_city.withColumn("CITY_ID",
                                           monotonically_increasing_id())
        df_LZ_city.write.format("delta").mode("append").save(pl_loc + '/CITY')
        logging.info('CITY has been loaded in the Presentation layer')
    except Exception as e:
        logging.error('Failed to load CITY in the Presentation Layer')
        spark.stop()
        raise Exception(f'Failed to load CITY in the Presentation Layer, {e}')


if __name__ == '__main__':
    spark = initialize_spark_session('load_pl_city')
    from delta.tables import *
    try:
        presentation_layer_loc = dict_dbs_locations.get(
            'PRESENTATION_LAYER_LOC')
        presentation_layer_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
    except Exception as e:
        logging.error('Failed to retrieve Environment variables')
        spark.stop()
        raise Exception(f'Failed to load CITY in the Presentation Layer, {e}')
    load_pl_city(spark, presentation_layer_loc, presentation_layer_name,
                 integration_layer_name)
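# A note on the surrogate key above: monotonically_increasing_id() produces IDs
# that are unique and increasing but not consecutive (the partition ID is
# encoded in the upper bits), and since CITY is written with mode("append"), a
# re-run without clearing the table would append duplicate rows under fresh
# IDs. A quick illustrative way to inspect the generated keys:
#
#     spark.read.format('delta').load(pl_loc + '/CITY') \
#         .select('CITY_ID').summary('count', 'min', 'max').show()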
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_airline_id(spark, integration_layer_loc, landing_zone_name):
    delta_l_airline_id = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_AIRLINE_ID')
    df_LZ_l_airline_id = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_AIRLINE_ID
    """)
    delta_l_airline_id.alias("oldData") \
        .merge(df_LZ_l_airline_id.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_il_airline_id')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_airline_id(spark, integration_layer_loc, landing_zone_name)
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_city_market(spark, integration_layer_loc, landing_zone_name):
    delta_l_city_market = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_CITY_MARKET_ID')
    df_LZ_l_city_market = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_CITY_MARKET_ID
    """)
    delta_l_city_market.alias("oldData") \
        .merge(df_LZ_l_city_market.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_city_market')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_city_market(spark, integration_layer_loc, landing_zone_name)
"AIRLINE_NAME": col("newData.AIRLINE_NAME"), "AIRLINE_CODE": col("newData.AIRLINE_CODE") }) \ .execute() logging.info('AIRLINE has been loaded in the Presentation layer') except Exception as e: logging.error('Failed to load AIRLINE in the Presentation Layer') spark.stop() raise Exception( f'Failed to load Airline in the Presentation Layer,{e}') if __name__ == '__main__': spark = initialize_spark_session('load_pl_airline') from delta.tables import * try: presentation_layer_loc = dict_dbs_locations.get( 'PRESENTATION_LAYER_LOC') integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME') except Exception as e: logging.error('Failed to retrieve Environment variables') spark.stop() raise Exception( f'Failed to load Airline in the Presentation Layer,{e}') load_pl_airline(spark, presentation_layer_loc, integration_layer_name)
            }) \
            .execute()
        logging.info(
            'CITY_DEMOGRAPHICS has been loaded in the Presentation layer')
    except Exception as e:
        logging.error(
            'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer')
        spark.stop()
        raise Exception(
            f'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer, {e}')


if __name__ == '__main__':
    spark = initialize_spark_session('load_pl_city_demographics')
    from delta.tables import *
    try:
        presentation_layer_loc = dict_dbs_locations.get(
            'PRESENTATION_LAYER_LOC')
        presentation_layer_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        integration_layer_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
    except Exception as e:
        logging.error('Failed to retrieve Environment variables')
        spark.stop()
        raise Exception(
            f'Failed to load CITY_DEMOGRAPHICS in the Presentation Layer, {e}')
    load_pl_city_demographics(spark, presentation_layer_loc,
                              presentation_layer_name, integration_layer_name)
import logging
from helper_functions.initialize_spark_session import initialize_spark_session

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s:%(levelname)s:%(message)s")

if __name__ == '__main__':
    # TODO: Update this as in drop_integration_layer.py
    spark = initialize_spark_session('drop_landing_zone')
    try:
        spark.sql("""DROP DATABASE LANDING_ZONE CASCADE""")
        logging.info('The Landing Zone Db has been Dropped')
    except Exception as e:
        logging.error(f"Failed to drop the Landing Zone, {e}")
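# A hedged sketch of what the TODO above could look like once aligned with
# drop_integration_layer.py, assuming sql_queries.landing_zone_ddl exposes a
# drop template (ddl_drop_landing_zone_db is a hypothetical name) and that
# dict_dbs_names maps 'LANDING_ZONE_NAME' as it does for the other layers:
#
#     db_name = dict_dbs_names.get('LANDING_ZONE_NAME')
#     spark.sql(ddl_drop_landing_zone_db.format(landing_zone_db_name=db_name))
#     logging.info(f'The {db_name} Db has been Dropped')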
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_cancellation(spark, integration_layer_loc, landing_zone_name):
    delta_l_cancellation = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_CANCELLATION')
    # Unlike the numeric lookups, CODE is not cast here; cancellation codes
    # are letters, so they stay strings.
    df_LZ_l_cancellation = spark.sql(f"""
        SELECT CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_CANCELLATION
    """)
    delta_l_cancellation.alias("oldData") \
        .merge(df_LZ_l_cancellation.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_il_l_cancellation')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_cancellation(spark, integration_layer_loc, landing_zone_name)
import logging
from helper_functions.initialize_spark_session import initialize_spark_session
from sql_queries.integration_layer_ddl import ddl_drop_integration_layer_db
from constants import dict_dbs_names

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s:%(levelname)s:%(message)s")

if __name__ == '__main__':
    spark = initialize_spark_session('drop_presentation_layer')
    try:
        db_name = dict_dbs_names.get('PRESENTATION_LAYER_NAME')
        # Reuses the integration layer's DROP DDL template; it only needs a
        # database name, so it works for the presentation layer as well.
        spark.sql(
            ddl_drop_integration_layer_db.format(
                integration_layer_db_name=db_name))
        logging.info(f'The {db_name} Db has been Dropped')
    except Exception as e:
        logging.error(f"Failed to drop the {db_name}, {e}")
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_diversions(spark, integration_layer_loc, landing_zone_name):
    delta_l_diversions = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_DIVERSIONS')
    df_LZ_l_diversions = spark.sql(f"""
        SELECT CAST(CODE AS INTEGER) AS CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_DIVERSIONS
    """)
    delta_l_diversions.alias("oldData") \
        .merge(df_LZ_l_diversions.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_diversions')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_diversions(spark, integration_layer_loc, landing_zone_name)
import logging
from sql_queries.integration_layer_ddl import ddl_create_integration_layer_db, dict_integration_layer_standard_lookups, \
    schema_flights, schema_city_demographics
from constants import dict_dbs_locations, dict_dbs_names
from helper_functions.initialize_spark_session import initialize_spark_session
import os
from pyspark.sql.functions import col

logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s: %(levelname)s: %(message)s ")

if __name__ == '__main__':
    spark = initialize_spark_session('create_integration_layer')
    from delta.tables import *
    # Creating the integration_layer database in spark sql
    try:
        db_name = dict_dbs_names.get('INTEGRATION_LAYER_NAME')
        db_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
        spark.sql(
            ddl_create_integration_layer_db.format(
                integration_layer_db_name=db_name,
                integration_layer_db_loc=db_loc))
        spark.sql(f'USE {db_name}')
        logging.info(f'{db_name} has been created.')
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_airport(spark, integration_layer_loc, landing_zone_name):
    delta_l_airport = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_AIRPORT')
    df_LZ_l_airport = spark.sql(f"""
        SELECT CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_AIRPORT
    """)
    delta_l_airport.alias("oldData") \
        .merge(df_LZ_l_airport.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_l_airport')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_airport(spark, integration_layer_loc, landing_zone_name)
from helper_functions.initialize_spark_session import initialize_spark_session
from pyspark.sql.functions import col
from constants import dict_dbs_locations, dict_dbs_names


def load_l_state_abr_aviation(spark, integration_layer_loc, landing_zone_name):
    delta_l_state_abr_aviation = DeltaTable.forPath(
        spark, integration_layer_loc + '/L_STATE_ABR_AVIATION')
    df_LZ_l_state_abr_aviation = spark.sql(f"""
        SELECT CODE
            ,DESCRIPTION
        FROM {landing_zone_name}.L_STATE_ABR_AVIATION
    """)
    delta_l_state_abr_aviation.alias("oldData") \
        .merge(df_LZ_l_state_abr_aviation.alias("newData"), "oldData.CODE = newData.CODE") \
        .whenMatchedUpdate(set={"DESCRIPTION": col("newData.DESCRIPTION")}) \
        .whenNotMatchedInsert(values={"CODE": col("newData.CODE"),
                                      "DESCRIPTION": col("newData.DESCRIPTION")}) \
        .execute()


if __name__ == '__main__':
    spark = initialize_spark_session('load_il_l_state_abr_aviation')
    from delta.tables import *
    integration_layer_loc = dict_dbs_locations.get('INTEGRATION_LAYER_LOC')
    landing_zone_name = dict_dbs_names.get('LANDING_ZONE_NAME')
    load_l_state_abr_aviation(spark, integration_layer_loc, landing_zone_name)