def test_job_info_factory_create_from_db_existing(self): '''test when there are existing entries in the dw_prcsng_cycle table''' with dwetl.test_database_session() as session: table_base_class = dwetl.Base.classes.dw_prcsng_cycle prcsng_cycle_id = self.get_max_prcsng_cycle_id( session, table_base_class) job_info = JobInfoFactory.create_job_info_from_db( session, table_base_class) self.assertEqual(prcsng_cycle_id + 1, job_info.prcsng_cycle_id) expected_version = dwetl.version self.assertEqual(expected_version, job_info.job_version_no) self.assertEqual(1, job_info.job_exectn_id) job_info = JobInfoFactory.create_job_info_from_db( session, table_base_class) self.assertEqual(prcsng_cycle_id + 2, job_info.prcsng_cycle_id) expected_version = dwetl.version self.assertEqual(expected_version, job_info.job_version_no)
def run(input_directory): #create logger today = datetime.datetime.now().strftime('%Y%m%d') logger = logging.getLogger('dwetl') file_handler = logging.FileHandler(f'logs/dwetl.log.{today}') formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') file_handler.setFormatter(formatter) logger.addHandler(file_handler) logger.setLevel(logging.INFO) time_started = datetime.datetime.now() logger.info(f'DWETL.py started') ''' create job_info for current process ''' with dwetl.database_session() as session: job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle'] job_info = JobInfoFactory.create_job_info_from_db( session, job_info_table_class) ''' load_stage_1 ''' load_stage_1.load_stage_1(job_info, input_directory, logger) ''' load_stage_2 load 'in_' values from stg1 to stg 2 tables load 'in_' values ''' load_stage_2.load_stage_2(job_info, logger) ''' stg 2 intertable processing ''' stage_2_intertable_processing.stage_2_intertable_processing( job_info, logger) ''' end of job metadata writing ''' endtime = datetime.datetime.now() # write end time to processing cycle table with dwetl.database_session() as session: job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle'] # get row for current id and write end time to it max_prcsng_id = session.query(job_info_table_class).\ filter(job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id).\ update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime}) elapsed_time = endtime - time_started print("elapsed time: ", str(elapsed_time))
def test_job_info_factory_create_from_db(self): # expected user expected_user = getpass.getuser() with dwetl.test_database_session() as session: table_base_class = dwetl.Base.classes.dw_prcsng_cycle prcsng_cycle_id = self.get_max_prcsng_cycle_id( session, table_base_class) next_pcid = prcsng_cycle_id + 1 job_info = JobInfoFactory.create_job_info_from_db( session, table_base_class) self.assertEqual(next_pcid, job_info.prcsng_cycle_id) self.assertEqual(1, job_info.job_exectn_id) self.assertEqual(dwetl.version, job_info.job_version_no) self.assertEqual(expected_user, job_info.user_id) added_record = session.query(table_base_class).filter( table_base_class.dw_prcsng_cycle_id == next_pcid).one().__dict__ self.assertEqual(next_pcid, added_record['dw_prcsng_cycle_id']) self.assertEqual(expected_user, added_record['em_create_user_id'])
# #writer = PrintWriter() # writer = SqlAlchemyWriter(session, dwetl.Base.classes[table]) # logger = None # processor = LoadMpfTsv(reader, writer, job_info, logger) # # processor.execute() ''' main function for running script from the command line ''' if __name__=='__main__': arguments = sys.argv if len(arguments) < 2 or len(arguments) > 3: print('Usage: ') print('\tload_stage_1.py [prcsng_cycle_id] [data_directory] ') sys.exit(1) prcsng_cycle_id = arguments[1] input_directory = os.path.dirname(os.path.realpath(__file__)) today = datetime.datetime.now().strftime('%Y%m%d') # if 2nd argument isn't provided use today as data directory data_directory = os.path.join(input_directory,'data', today) # data directory can be specified as 2nd argument if len(arguments) == 3: data_directory = arguments[2] job_info = JobInfoFactory.create_from_prcsng_cycle_id(prcsng_cycle_id) load_stage_1(job_info, data_directory)
def test_job_info_factory_create_from_prcsng_cycle_id(self): job_info = JobInfoFactory.create_from_prcsng_cycle_id(1) self.assertEqual(1, job_info.prcsng_cycle_id)
def setUp(self): self.writer = ListWriter() self.job_info = JobInfoFactory.create_from_prcsng_cycle_id(-1) self.logger = None
def run(input_file): #create logger today = datetime.datetime.now().strftime('%Y%m%d') logger = logging.getLogger('dwetl') file_handler = logging.FileHandler(f'logs/ezproxy.log.{today}') formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') file_handler.setFormatter(formatter) logger.addHandler(file_handler) logger.setLevel(logging.INFO) time_started = datetime.datetime.now() logger.info(f'EzProxy ETL started') ''' check current hostname environment configuration to prevent errors ''' hostname = socket.gethostname() configured_host = database_credentials.configured_host() if hostname != configured_host: # quit program if env file hostname doesn't match with the current hostname print( 'ERROR: EzProxy ETL ended because .env contained an error. Please double check the configured host and db configuration.' ) logger.error( f'EzProxy ETL ended because .env contained an error. please double check the configured host and db configuration.' ) sys.exit() ''' create job_info for current process ''' with dwetl.database_session() as session: job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle'] job_info = JobInfoFactory.create_job_info_from_db( session, job_info_table_class) ''' load ezproxy stage 1 ''' ezproxy_load.load_stage_1(job_info, input_file, logger) ''' load ezproxy stage 2 ''' ezproxy_load.load_stage_2(job_info, logger) ''' stg 2 intertable processing ''' ezproxy_load.intertable_processing(job_info, logger) ''' fact table load ''' ezproxy_load.load_fact_table(job_info, logger) ''' copy new ezproxy data to reporting database ''' ezproxy_load.copy_new_facts_to_reporting_db(job_info, logger) ''' end of job metadata writing ''' endtime = datetime.datetime.now() # write end time to processing cycle table with dwetl.database_session() as session: job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle'] # get row for current id and write end time to it max_prcsng_id = session.query(job_info_table_class).\ filter(job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id).\ update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime}) elapsed_time = endtime - time_started print("Ezproxy ETL elapsed time: ", str(elapsed_time)) logger.info(f'EzProxy ETL elapsed time: {str(elapsed_time)}')