def run(input_directory):
    # create logger
    today = datetime.datetime.now().strftime('%Y%m%d')
    logger = logging.getLogger('dwetl')
    file_handler = logging.FileHandler(f'logs/dwetl.log.{today}')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    time_started = datetime.datetime.now()
    logger.info('DWETL.py started')

    '''
    create job_info for current process
    '''
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        job_info = JobInfoFactory.create_job_info_from_db(
            session, job_info_table_class)

    '''
    load_stage_1
    '''
    load_stage_1.load_stage_1(job_info, input_directory, logger)

    '''
    load_stage_2
    load 'in_' values from stg1 to stg2 tables
    '''
    load_stage_2.load_stage_2(job_info, logger)

    '''
    stg 2 intertable processing
    '''
    stage_2_intertable_processing.stage_2_intertable_processing(
        job_info, logger)

    '''
    end of job metadata writing
    '''
    endtime = datetime.datetime.now()
    # write end time to the processing cycle table
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        # find the row for the current processing cycle id and set its end time
        session.query(job_info_table_class).\
            filter(job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id).\
            update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime})

    elapsed_time = endtime - time_started
    print("elapsed time: ", str(elapsed_time))
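
# A minimal sketch of how run() might be invoked from the command line.
# The argparse wiring below is an assumption for illustration; the actual
# entry point script in the repository may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run the dwetl pipeline.')
    parser.add_argument('input_directory',
                        help='directory containing the Aleph TSV export files')
    args = parser.parse_args()
    run(args.input_directory)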
def load_fact_table(job_info, logger):
    print('EZProxy loading fact table...')
    logger.info('Loading to the fact table....')
    stage2_table = dwetl.Base.classes['dw_stg_2_ezp_sessns_snap']
    fact_table = dwetl.Base.classes['fact_ezp_sessns_snap']
    processing_cycle_id = job_info.prcsng_cycle_id

    # get max value for the fact key from the reporting db
    with dwetl.reporting_database_session() as session2:
        reporting_fact_table = dwetl.ReportingBase.classes['fact_ezp_sessns_snap']
        max_ezp_sessns_snap_fact_key = session2.query(
            func.max(reporting_fact_table.ezp_sessns_snap_fact_key)).scalar()
        if max_ezp_sessns_snap_fact_key is None:
            max_ezp_sessns_snap_fact_key = 1

    # load the ETL EZproxy fact table
    with dwetl.database_session() as session:
        reader = SqlAlchemyReader(session, stage2_table,
                                  'em_create_dw_prcsng_cycle_id', processing_cycle_id)
        writer = SqlAlchemyWriter(session, fact_table)
        processor = EzproxyFactProcessor(reader, writer, job_info, logger,
                                         max_ezp_sessns_snap_fact_key)
        processor.execute()
    logger.info('Finished loading to the fact table....')
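
# Illustrative sketch (an assumption, not from the source): EzproxyFactProcessor
# is presumed to seed new surrogate keys from max_ezp_sessns_snap_fact_key so
# that freshly loaded facts continue the reporting db's key sequence, roughly:
#
#   next_key = max_ezp_sessns_snap_fact_key + 1
#   for row in reader:
#       row['ezp_sessns_snap_fact_key'] = next_key
#       next_key += 1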
def load_stage_1(job_info, input_directory, logger):
    print('Loading stage 1...')
    logger.info('Loading stage 1...')

    '''
    file to table mapping
    '''
    ALEPH_TSV_TABLE_MAPPING = {
        "mai01_z00_data": "dw_stg_1_mai01_z00",
        "mai39_z00_data": "dw_stg_1_mai39_z00",
        "mai01_z13_data": "dw_stg_1_mai01_z13",
        "mai39_z13_data": "dw_stg_1_mai39_z13",
        "mai01_z13u_data": "dw_stg_1_mai01_z13u",
        "mai39_z13u_data": "dw_stg_1_mai39_z13u",
        # "mai60_z00_data": "dw_stg_1_mai60_z00",
        # "mai60_z13_data": "dw_stg_1_mai60_z13",
        # "mai60_z13u_data": "dw_stg_1_mai60_z13u",
        # "mai60_z103_bib_data": "dw_stg_1_mai50_z103_bib",
        # "mai50_z30_data": "dw_stg_1_mai50_z30",
        # "mai50_z35_data": "dw_stg_1_mai50_z35",
        # "mai50_z30_full_data": "dw_stg_1_mai50_z30_full",
        # "mai50_z103_bib_full_data": "dw_stg_2_lbry_item_z103_bib_full",
    }

    # Z00_FIELD_TABLE_MAPPING = {
    #     "mai01_z00_field_data": "dw_stg_1_mai01_z00_field",
    #     "mai39_z00_field_data": "dw_stg_1_mai39_z00_field",
    #     "mai60_z00_field_data": "dw_stg_1_mai60_z00_field",
    # }
    #
    # MPF_TABLE_MAPPING = {
    #     "mpf_member-library-dimension.txt": "dw_stg_1_mpf_mbr_lbry",
    #     "mpf_library-entity-dimension.txt": "dw_stg_1_mpf_lbry_entity",
    #     "mpf_library-collection-dimension.txt": "dw_stg_1_mpf_collection",
    #     "mpf_item-status-dimension.txt": "dw_stg_1_mpf_item_status",
    #     "mpf_item-process-status-dimension.txt": "dw_stg_1_mpf_item_prcs_status",
    #     "mpf_material-form-dimension.txt": "dw_stg_1_mpf_matrl_form"
    # }

    '''
    load aleph tsv files minus z00_field tables
    '''
    for file, table in ALEPH_TSV_TABLE_MAPPING.items():
        file_path = os.path.join(input_directory, file)
        print(file_path)
        logger.info(file_path)
        with dwetl.database_session() as session:
            reader = TsvFileReader(file_path)
            # writer = PrintWriter()
            writer = SqlAlchemyWriter(session, dwetl.Base.classes[table])
            processor = LoadAlephTsv(reader, writer, job_info, logger)
            processor.execute()
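
# Hypothetical pre-flight helper (a sketch, not part of the source): verify
# that every file named in the mapping exists under input_directory before
# the load starts, so a missing Aleph export fails fast instead of mid-run.
def check_input_files(input_directory, file_table_mapping):
    """Return the expected Aleph TSV file names missing from input_directory."""
    return [file_name for file_name in file_table_mapping
            if not os.path.exists(os.path.join(input_directory, file_name))]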
def load_stage_1(job_info, input_file, logger):
    print('EZProxy Loading stage 1...')
    logger.info('EZProxy Loading stage 1...')
    table = 'dw_stg_1_ezp_sessns_snap'
    with dwetl.database_session() as session:
        reader = EzproxyReader(input_file)
        # writer = PrintWriter()
        writer = SqlAlchemyWriter(session, dwetl.Base.classes[table])
        # the generic Aleph TSV load processor also handles EZproxy rows
        processor = LoadAlephTsv(reader, writer, job_info, logger)
        processor.execute()
def intertable_processing(job_info, logger):
    print('EZProxy transformations started...')
    logger.info('EZProxy intertable processing started...')
    stage2_table = dwetl.Base.classes['dw_stg_2_ezp_sessns_snap']
    processing_cycle_id = job_info.prcsng_cycle_id
    with dwetl.database_session() as session:
        # read and write the same stage 2 table: transformations happen in place
        reader = SqlAlchemyReader(session, stage2_table,
                                  'em_create_dw_prcsng_cycle_id', processing_cycle_id)
        writer = SqlAlchemyWriter(session, stage2_table)
        processor = EzproxyProcessor(reader, writer, job_info, logger)
        processor.execute()
    logger.info('Finished EZProxy intertable processing....')
def load_stage_2(job_info, logger):
    print('Loading stage 2...')
    logger.info('Loading stage 2...')
    stage1_to_stage2_table_mappings = {
        "dw_stg_1_mai39_z13": "dw_stg_2_bib_rec_z13",
        "dw_stg_1_mai01_z13": "dw_stg_2_bib_rec_z13",
        "dw_stg_1_mai01_z13u": "dw_stg_2_bib_rec_z13u",
        "dw_stg_1_mai01_z00": "dw_stg_2_bib_rec_z00",
        "dw_stg_1_mai39_z00": "dw_stg_2_bib_rec_z00",
        "dw_stg_1_mai39_z13u": "dw_stg_2_bib_rec_z13u",
        # "dw_stg_1_mai60_z00": "dw_stg_2_lbry_holding_z00",
        # "dw_stg_1_mai60_z13": "dw_stg_2_lbry_holding_z13",
        # "dw_stg_1_mai60_z13u": "dw_stg_2_lbry_holding_z13u",
        # "dw_stg_1_mai50_z30": "dw_stg_2_lbry_item_z30",
        # "dw_stg_1_mai50_z35": "dw_stg_2_lbry_item_event_z35",
        # "dw_stg_1_mai01_z00_field": "dw_stg_2_bib_rec_z00_field",
        # "dw_stg_1_mai39_z00_field": "dw_stg_2_bib_rec_z00_field",
        # "dw_stg_1_mai60_z00_field": "dw_stg_2_lbry_holding_z00_field",
        # "dw_stg_1_mpf_mbr_lbry": "dw_stg_2_mpf_mbr_lbry",
        # "dw_stg_1_mpf_lbry_entity": "dw_stg_2_mpf_lbry_entity",
        # "dw_stg_1_mpf_collection": "dw_stg_2_mpf_collection",
        # "dw_stg_1_mpf_item_status": "dw_stg_2_mpf_item_status",
        # "dw_stg_1_mpf_item_prcs_status": "dw_stg_2_mpf_item_prcs_status",
        # "dw_stg_1_mpf_matrl_form": "dw_stg_2_mpf_matrl_form"
    }
    processing_cycle_id = job_info.prcsng_cycle_id

    for stage1_table, stage2_table in stage1_to_stage2_table_mappings.items():
        print(stage1_table)
        logger.info(stage1_table)
        library = aleph_library(stage1_table)
        with dwetl.database_session() as session:
            stage1_table_class = dwetl.Base.classes[stage1_table]
            stage2_table_class = dwetl.Base.classes[stage2_table]
            reader = SqlAlchemyReader(session, stage1_table_class,
                                      'em_create_dw_prcsng_cycle_id', processing_cycle_id)
            writer = SqlAlchemyWriter(session, stage2_table_class)
            processor = CopyStage1ToStage2.create(reader, writer, job_info,
                                                  logger, library)
            processor.execute()
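
# For reference, what aleph_library() is assumed to return (the real helper
# lives elsewhere in the repository and may differ): the Aleph library code
# embedded in a stage 1 table name, or '' when there is none.
#
#   >>> aleph_library('dw_stg_1_mai01_z13')
#   'mai01'
#   >>> aleph_library('dw_stg_1_ezp_sessns_snap')
#   ''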
def load_stage_2(job_info, logger):
    print('EZProxy Loading stage 2...')
    logger.info('EZProxy Loading stage 2...')
    processing_cycle_id = job_info.prcsng_cycle_id
    with dwetl.database_session() as session:
        stage1_table_class = dwetl.Base.classes["dw_stg_1_ezp_sessns_snap"]
        stage2_table_class = dwetl.Base.classes["dw_stg_2_ezp_sessns_snap"]
        reader = SqlAlchemyReader(session, stage1_table_class,
                                  'em_create_dw_prcsng_cycle_id', processing_cycle_id)
        writer = SqlAlchemyWriter(session, stage2_table_class)
        # there is no Aleph library for EZproxy data, but CopyStage1ToStage2 still works
        library = ''
        processor = CopyStage1ToStage2.create(reader, writer, job_info,
                                              logger, library)
        processor.execute()
    logger.info('Finished EZProxy loading stage 2....')
def copy_new_facts_to_reporting_db(job_info, logger):
    etl_fact_table = dwetl.Base.classes['fact_ezp_sessns_snap']
    processing_cycle_id = job_info.prcsng_cycle_id

    # query and select records from the ETL fact table
    # TODO: should we use the create processing cycle id or the update processing cycle id?
    with dwetl.database_session() as session:
        reader = SqlAlchemyReader(session, etl_fact_table,
                                  'em_create_dw_prcsng_cycle_id', processing_cycle_id)
        # detach the loaded rows from this session so the reader can still
        # serve them after the session closes
        session.expunge_all()

    # insert records into the reporting db EZproxy fact table
    with dwetl.reporting_database_session() as session2:
        reporting_fact_table = dwetl.ReportingBase.classes['fact_ezp_sessns_snap']
        writer = SqlAlchemyWriter(session2, reporting_fact_table)
        processor = EzproxyReportingFactProcessor(reader, writer, job_info, logger)
        processor.execute()
def stage_2_intertable_processing(job_info, logger):
    print("Stage 2 Intertable Processing...")
    logger.info("Stage 2 Intertable Processing...")
    STG_2_TABLE_CONFIG_MAPPING = {
        'dw_stg_2_bib_rec_z00': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z13': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z13u': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z00_field': 'bibliographic_record_dimension'
    }
    processing_cycle_id = job_info.prcsng_cycle_id

    for table, dimension in STG_2_TABLE_CONFIG_MAPPING.items():
        print(table)
        logger.info(table)
        # get json config for the current dimension
        table_config_path = os.path.join('table_config', dimension + '.json')
        json_config = load_table_config(table_config_path)
        with dwetl.database_session() as session:
            # SQLAlchemy base class for the current table
            stage2_table_class = dwetl.Base.classes[table]
            # list of primary key column names for the current table
            pk_list = [pk.name for pk in stage2_table_class.__table__.primary_key]
            reader = SqlAlchemyReader(session, stage2_table_class,
                                      'em_create_dw_prcsng_cycle_id', processing_cycle_id)
            writer = SqlAlchemyWriter(session, stage2_table_class)

            '''
            Preprocessing
            '''
            preprocessor = Preprocess(reader, writer, job_info, logger,
                                      json_config, pk_list)
            preprocessor.execute()

            '''
            Data Quality Checks
            '''
            data_quality_checker = DataQualityProcessor(reader, writer, job_info,
                                                        logger, json_config, pk_list)
            data_quality_checker.execute()
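
# Invented illustration of the table_config JSON shape (an assumption; the
# real structure lives in table_config/bibliographic_record_dimension.json
# and may differ). Preprocess and DataQualityProcessor are presumed to be
# driven by per-column entries along these lines:
#
#   {
#       "z13_rec_key": {
#           "preprocessing_info": {"pre_action": "trim"},
#           "dataquality_info": [{"specific_dq_function": "is_numeric"}]
#       }
#   }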
def run(input_file):
    # create logger
    today = datetime.datetime.now().strftime('%Y%m%d')
    logger = logging.getLogger('dwetl')
    file_handler = logging.FileHandler(f'logs/ezproxy.log.{today}')
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    time_started = datetime.datetime.now()
    logger.info('EzProxy ETL started')

    '''
    check current hostname against the configured environment to prevent errors
    '''
    hostname = socket.gethostname()
    configured_host = database_credentials.configured_host()
    if hostname != configured_host:
        # quit if the .env hostname doesn't match the current hostname
        print('ERROR: EzProxy ETL ended because .env contained an error. '
              'Please double-check the configured host and db configuration.')
        logger.error('EzProxy ETL ended because .env contained an error. '
                     'Please double-check the configured host and db configuration.')
        sys.exit(1)

    '''
    create job_info for current process
    '''
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        job_info = JobInfoFactory.create_job_info_from_db(
            session, job_info_table_class)

    '''
    load ezproxy stage 1
    '''
    ezproxy_load.load_stage_1(job_info, input_file, logger)

    '''
    load ezproxy stage 2
    '''
    ezproxy_load.load_stage_2(job_info, logger)

    '''
    stg 2 intertable processing
    '''
    ezproxy_load.intertable_processing(job_info, logger)

    '''
    fact table load
    '''
    ezproxy_load.load_fact_table(job_info, logger)

    '''
    copy new ezproxy data to reporting database
    '''
    ezproxy_load.copy_new_facts_to_reporting_db(job_info, logger)

    '''
    end of job metadata writing
    '''
    endtime = datetime.datetime.now()
    # write end time to the processing cycle table
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        # find the row for the current processing cycle id and set its end time
        session.query(job_info_table_class).\
            filter(job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id).\
            update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime})

    elapsed_time = endtime - time_started
    print("EzProxy ETL elapsed time: ", str(elapsed_time))
    logger.info(f'EzProxy ETL elapsed time: {elapsed_time}')
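
# A minimal sketch of how the EZproxy run() might be invoked from the command
# line. The argparse wiring is an assumption for illustration; the actual
# entry point script in the repository may differ.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Run the EZproxy ETL.')
    parser.add_argument('input_file',
                        help='path to the EZproxy session snapshot file')
    args = parser.parse_args()
    run(args.input_file)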