Esempio n. 1
0
def run(input_directory):
    """Run the DWETL job end to end for one input directory.

    Steps, in order: attach a dated file handler to the 'dwetl' logger,
    create the job_info for this processing cycle from the
    ``dw_prcsng_cycle`` table, run stage 1 load, stage 2 load and stage 2
    intertable processing, then write the end timestamp to the
    ``dw_prcsng_cycle`` row of the current cycle and report elapsed time.

    Args:
        input_directory: Passed unchanged to ``load_stage_1.load_stage_1``.
    """
    # create logger; the log file name carries today's date (YYYYMMDD)
    today = datetime.datetime.now().strftime('%Y%m%d')
    logger = logging.getLogger('dwetl')
    file_handler = logging.FileHandler(f'logs/dwetl.log.{today}')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    time_started = datetime.datetime.now()
    logger.info('DWETL.py started')

    # create job_info for current process
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        job_info = JobInfoFactory.create_job_info_from_db(
            session, job_info_table_class)

    # load_stage_1
    load_stage_1.load_stage_1(job_info, input_directory, logger)

    # load_stage_2: load 'in_' values from stg 1 to stg 2 tables
    load_stage_2.load_stage_2(job_info, logger)

    # stg 2 intertable processing
    stage_2_intertable_processing.stage_2_intertable_processing(
        job_info, logger)

    # end of job metadata writing
    endtime = datetime.datetime.now()
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        # write end time to the row whose id matches the current cycle;
        # the value returned by update() is not needed here
        session.query(job_info_table_class).filter(
            job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id
        ).update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime})

    elapsed_time = endtime - time_started
    print("elapsed time: ", str(elapsed_time))
    # keep the duration in the log file as well, not only on stdout
    logger.info('DWETL.py elapsed time: %s', elapsed_time)
Esempio n. 2
0
def load_fact_table(job_info, logger):
    """Load the current cycle's EZProxy stage 2 rows into the ETL fact table.

    The highest ``ezp_sessns_snap_fact_key`` is first queried from the
    reporting database (1 is used when the query returns None) and handed to
    ``EzproxyFactProcessor`` together with a reader over
    ``dw_stg_2_ezp_sessns_snap`` and a writer to ``fact_ezp_sessns_snap``.

    Args:
        job_info: Supplies ``prcsng_cycle_id`` used to select stage 2 rows.
        logger: Logger receiving start/finish messages.
    """
    print('EZProxy loading fact table...')
    logger.info('Loading to the fact table.... ')

    source_table = dwetl.Base.classes['dw_stg_2_ezp_sessns_snap']
    target_table = dwetl.Base.classes['fact_ezp_sessns_snap']
    cycle_id = job_info.prcsng_cycle_id

    # highest fact key currently present in the reporting db
    with dwetl.reporting_database_session() as reporting_session:
        reporting_table = dwetl.ReportingBase.classes['fact_ezp_sessns_snap']
        key_column = reporting_table.ezp_sessns_snap_fact_key
        highest_key = reporting_session.query(func.max(key_column)).scalar()

    # the query returned None: fall back to 1
    starting_key = 1 if highest_key is None else highest_key

    # read stage 2 rows of this cycle and write them to the etl fact table
    with dwetl.database_session() as etl_session:
        stage2_reader = SqlAlchemyReader(
            etl_session, source_table,
            'em_create_dw_prcsng_cycle_id', cycle_id)
        fact_writer = SqlAlchemyWriter(etl_session, target_table)
        EzproxyFactProcessor(
            stage2_reader, fact_writer, job_info, logger, starting_key,
        ).execute()

    logger.info('Finished loading to the fact table.... ')
Esempio n. 3
0
def load_stage_1(job_info, input_directory, logger):
    """Load the Aleph TSV files found in ``input_directory`` into stage 1.

    Each enabled file name is joined onto ``input_directory``, read with
    ``TsvFileReader`` and written to its stage 1 table through
    ``LoadAlephTsv``; every file gets its own database session.

    Args:
        job_info: Passed through to ``LoadAlephTsv``.
        input_directory: Directory containing the TSV input files.
        logger: Logger receiving progress messages.
    """
    print('Loading stage 1...')
    logger.info('Loading stage 1...')

    # file to table mapping (aleph tsv files minus z00_field tables)
    tsv_to_stage1_table = {
        'mai01_z00_data': 'dw_stg_1_mai01_z00',
        'mai39_z00_data': 'dw_stg_1_mai39_z00',
        'mai01_z13_data': 'dw_stg_1_mai01_z13',
        'mai39_z13_data': 'dw_stg_1_mai39_z13',
        'mai01_z13u_data': 'dw_stg_1_mai01_z13u',
        'mai39_z13u_data': 'dw_stg_1_mai39_z13u',
        # Currently disabled:
        # "mai60_z00_data": "dw_stg_1_mai60_z00",
        # "mai60_z13_data": "dw_stg_1_mai60_z13",
        # "mai60_z13u_data": "dw_stg_1_mai60_z13u",
        # "mai60_z103_bib_data": "dw_stg_1_mai50_z103_bib",
        # "mai50_z30_data": "dw_stg_1_mai50_z30",
        # "mai50_z35_data": "dw_stg_1_mai50_z35",
        # "mai50_z30_full_data": "dw_stg_1_mai50_z30_full",
        # "mai50_z103_bib_full_data": "dw_stg_2_lbry_item_z103_bib_full",
    }

    # Currently disabled mappings for z00_field and MPF files:
    # Z00_FIELD_TABLE_MAPPING = {
    #     "mai01_z00_field_data": "dw_stg_1_mai01_z00_field",
    #     "mai39_z00_field_data": "dw_stg_1_mai39_z00_field",
    #     "mai60_z00_field_data": "dw_stg_1_mai60_z00_field",
    # }
    #
    # MPF_TABLE_MAPPING = {
    #     "mpf_member-library-dimension.txt": "dw_stg_1_mpf_mbr_lbry",
    #     "mpf_library-entity-dimension.txt": "dw_stg_1_mpf_lbry_entity",
    #     "mpf_library-collection-dimension.txt": "dw_stg_1_mpf_collection",
    #     "mpf_item-status-dimension.txt": "dw_stg_1_mpf_item_status",
    #     "mpf_item-process-status-dimension.txt": "dw_stg_1_mpf_item_prcs_status",
    #     "mpf_material-form-dimension.txt": "dw_stg_1_mpf_matrl_form"
    # }

    for tsv_name, table_name in tsv_to_stage1_table.items():
        tsv_path = os.path.join(input_directory, tsv_name)
        print(tsv_path)
        logger.info(tsv_path)

        # one session per input file
        with dwetl.database_session() as db_session:
            tsv_reader = TsvFileReader(tsv_path)
            table_writer = SqlAlchemyWriter(
                db_session, dwetl.Base.classes[table_name])
            LoadAlephTsv(tsv_reader, table_writer, job_info, logger).execute()
Esempio n. 4
0
def load_stage_1(job_info, input_file, logger):
    """Load one EZProxy input file into ``dw_stg_1_ezp_sessns_snap``.

    Args:
        job_info: Passed through to the ``LoadAlephTsv`` processor.
        input_file: Path handed to ``EzproxyReader``.
        logger: Logger receiving progress messages.
    """
    print('EZProxy Loading stage 1...')
    logger.info('EZ Proxy Loading stage 1...')

    stage1_table_name = 'dw_stg_1_ezp_sessns_snap'

    with dwetl.database_session() as db_session:
        ezproxy_reader = EzproxyReader(input_file)
        stage1_writer = SqlAlchemyWriter(
            db_session, dwetl.Base.classes[stage1_table_name])
        # the same LoadAlephTsv processor is used for EZProxy data
        LoadAlephTsv(ezproxy_reader, stage1_writer, job_info, logger).execute()
Esempio n. 5
0
def intertable_processing(job_info, logger):
    """Run ``EzproxyProcessor`` over this cycle's EZProxy stage 2 rows.

    Rows are read from and written back to ``dw_stg_2_ezp_sessns_snap``,
    selected by ``em_create_dw_prcsng_cycle_id`` equal to the job's
    processing cycle id.

    Args:
        job_info: Supplies ``prcsng_cycle_id``; also passed to the processor.
        logger: Logger receiving start/finish messages.
    """
    print('EZProxy transformations started...')
    logger.info('EZProxy intertable processing starts...')

    ezp_stage2_class = dwetl.Base.classes['dw_stg_2_ezp_sessns_snap']
    cycle_id = job_info.prcsng_cycle_id

    with dwetl.database_session() as db_session:
        # reader and writer both target the same stage 2 table
        row_reader = SqlAlchemyReader(
            db_session, ezp_stage2_class,
            'em_create_dw_prcsng_cycle_id', cycle_id)
        row_writer = SqlAlchemyWriter(db_session, ezp_stage2_class)
        EzproxyProcessor(row_reader, row_writer, job_info, logger).execute()

    logger.info('Finished EZProxy intertable processing .... ')
Esempio n. 6
0
def load_stage_2(job_info, logger):
    """Copy this cycle's rows from each stage 1 table to its stage 2 table.

    For every enabled stage 1 table, the library value is obtained from
    ``aleph_library(stage1_table)`` and a ``CopyStage1ToStage2`` processor is
    created and executed inside its own database session.

    Args:
        job_info: Supplies ``prcsng_cycle_id``; also passed to the processor.
        logger: Logger receiving progress messages.
    """
    print('Loading stage 2...')
    logger.info('Loading stage 2...')

    # stage 1 table -> stage 2 table (several stage 1 tables share a target)
    stage1_to_stage2 = {
        'dw_stg_1_mai39_z13': 'dw_stg_2_bib_rec_z13',
        'dw_stg_1_mai01_z13': 'dw_stg_2_bib_rec_z13',
        'dw_stg_1_mai01_z13u': 'dw_stg_2_bib_rec_z13u',
        'dw_stg_1_mai01_z00': 'dw_stg_2_bib_rec_z00',
        'dw_stg_1_mai39_z00': 'dw_stg_2_bib_rec_z00',
        'dw_stg_1_mai39_z13u': 'dw_stg_2_bib_rec_z13u',
        # Currently disabled:
        # "dw_stg_1_mai60_z00": "dw_stg_2_lbry_holding_z00",
        # "dw_stg_1_mai60_z13": "dw_stg_2_lbry_holding_z13",
        # "dw_stg_1_mai60_z13u": "dw_stg_2_lbry_holding_z13u",
        # "dw_stg_1_mai50_z30": "dw_stg_2_lbry_item_z30",
        # "dw_stg_1_mai50_z35": "dw_stg_2_lbry_item_event_z35",
        # "dw_stg_1_mai01_z00_field": "dw_stg_2_bib_rec_z00_field",
        # "dw_stg_1_mai39_z00_field": "dw_stg_2_bib_rec_z00_field",
        # "dw_stg_1_mai60_z00_field": "dw_stg_2_lbry_holding_z00_field",
        # "dw_stg_1_mpf_mbr_lbry": "dw_stg_2_mpf_mbr_lbry",
        # "dw_stg_1_mpf_lbry_entity": "dw_stg_2_mpf_lbry_entity",
        # "dw_stg_1_mpf_collection": "dw_stg_2_mpf_collection",
        # "dw_stg_1_mpf_item_status": "dw_stg_2_mpf_item_status",
        # "dw_stg_1_mpf_item_prcs_status": "dw_stg_2_mpf_item_prcs_status",
        # "dw_stg_1_mpf_matrl_form": "dw_stg_2_mpf_matrl_form"
    }

    cycle_id = job_info.prcsng_cycle_id

    for source_name, target_name in stage1_to_stage2.items():
        print(source_name)
        logger.info(source_name)

        source_library = aleph_library(source_name)

        with dwetl.database_session() as db_session:
            source_class = dwetl.Base.classes[source_name]
            target_class = dwetl.Base.classes[target_name]
            stage1_reader = SqlAlchemyReader(
                db_session, source_class,
                'em_create_dw_prcsng_cycle_id', cycle_id)
            stage2_writer = SqlAlchemyWriter(db_session, target_class)
            copier = CopyStage1ToStage2.create(
                stage1_reader, stage2_writer, job_info, logger, source_library)
            copier.execute()
Esempio n. 7
0
def load_stage_2(job_info, logger):
    """Copy this cycle's EZProxy rows from stage 1 to stage 2.

    Reads ``dw_stg_1_ezp_sessns_snap`` rows whose
    ``em_create_dw_prcsng_cycle_id`` equals the job's processing cycle id and
    writes them to ``dw_stg_2_ezp_sessns_snap`` via ``CopyStage1ToStage2``.

    Args:
        job_info: Supplies ``prcsng_cycle_id``; also passed to the processor.
        logger: Logger receiving start/finish messages.
    """
    print('EZProxy Loading stage 2...')
    logger.info('EZProxy Loading stage 2...')

    cycle_id = job_info.prcsng_cycle_id

    with dwetl.database_session() as db_session:
        source_class = dwetl.Base.classes['dw_stg_1_ezp_sessns_snap']
        target_class = dwetl.Base.classes['dw_stg_2_ezp_sessns_snap']
        stage1_reader = SqlAlchemyReader(
            db_session, source_class,
            'em_create_dw_prcsng_cycle_id', cycle_id)
        stage2_writer = SqlAlchemyWriter(db_session, target_class)
        # there is no aleph library for ez proxy data; CopyStage1ToStage2 is
        # given an empty library string instead
        no_library = ''
        copier = CopyStage1ToStage2.create(
            stage1_reader, stage2_writer, job_info, logger, no_library)
        copier.execute()

    logger.info('Finished EZProxy loading stage 2 .... ')
Esempio n. 8
0
def copy_new_facts_to_reporting_db(job_info, logger):
    """Copy this cycle's EZProxy fact rows from the ETL db to the reporting db.

    A reader over the ETL ``fact_ezp_sessns_snap`` table (filtered on
    ``em_create_dw_prcsng_cycle_id`` equal to the job's processing cycle id)
    is paired with a writer to the reporting ``fact_ezp_sessns_snap`` table
    and run through ``EzproxyReportingFactProcessor``.

    Args:
        job_info: Supplies ``prcsng_cycle_id``; also passed to the processor.
        logger: Passed to the processor.
    """
    etl_fact_table = dwetl.Base.classes['fact_ezp_sessns_snap']
    processing_cycle_id = job_info.prcsng_cycle_id

    # query and select records from etl fact table
    # Open question: should rows be selected by the create processing cycle
    # id (as done here) or by the update processing cycle id?
    with dwetl.database_session() as session:
        reader = SqlAlchemyReader(session, etl_fact_table,
                                  'em_create_dw_prcsng_cycle_id',
                                  processing_cycle_id)
        # NOTE(review): expunge_all() runs before the processor below has
        # consumed the reader, and the reader is used only after this
        # with-block has exited. Presumably this detaches the fact rows so
        # they can be written through the reporting session - confirm against
        # SqlAlchemyReader and dwetl.database_session.
        session.expunge_all()

    # insert records into reporting db ezp fact table
    with dwetl.reporting_database_session() as session2:
        reporting_fact_table = dwetl.ReportingBase.classes[
            'fact_ezp_sessns_snap']
        writer = SqlAlchemyWriter(session2, reporting_fact_table)
        # reader comes from the ETL session above; writer targets session2
        processor = EzproxyReportingFactProcessor(reader, writer, job_info,
                                                  logger)
        processor.execute()
def stage_2_intertable_processing(job_info, logger):
    """Run preprocessing and data quality checks on each stage 2 table.

    For every configured stage 2 table, the JSON config of its dimension is
    loaded from ``table_config/<dimension>.json``; then, inside one database
    session, ``Preprocess`` and ``DataQualityProcessor`` are executed in that
    order with the same reader and writer over the table.

    Args:
        job_info: Supplies ``prcsng_cycle_id``; also passed to the processors.
        logger: Logger receiving progress messages.
    """
    print("Stage 2 Intertable Processing...")
    logger.info("Stage 2 Intertable Processing...")

    # stage 2 table -> name of the dimension whose JSON config applies
    table_to_dimension = {
        'dw_stg_2_bib_rec_z00': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z13': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z13u': 'bibliographic_record_dimension',
        'dw_stg_2_bib_rec_z00_field': 'bibliographic_record_dimension',
    }

    cycle_id = job_info.prcsng_cycle_id

    for table_name, dimension_name in table_to_dimension.items():
        print(table_name)
        logger.info(table_name)

        # json config for the current dimension
        config_path = os.path.join('table_config', f'{dimension_name}.json')
        dimension_config = load_table_config(config_path)

        with dwetl.database_session() as db_session:
            # SA base class of the current table and its primary key names
            table_class = dwetl.Base.classes[table_name]
            primary_keys = [
                column.name for column in table_class.__table__.primary_key
            ]

            row_reader = SqlAlchemyReader(
                db_session, table_class,
                'em_create_dw_prcsng_cycle_id', cycle_id)
            row_writer = SqlAlchemyWriter(db_session, table_class)

            # Preprocessing
            Preprocess(
                row_reader, row_writer, job_info, logger,
                dimension_config, primary_keys,
            ).execute()

            # Data Quality Checks
            DataQualityProcessor(
                row_reader, row_writer, job_info, logger,
                dimension_config, primary_keys,
            ).execute()
Esempio n. 10
0
def run(input_file):
    """Run the EZProxy ETL job end to end for one input file.

    Steps, in order: attach a dated file handler to the 'dwetl' logger,
    verify that the local hostname equals the configured host, create the
    job_info for this processing cycle, run EZProxy stage 1 load, stage 2
    load, intertable processing, fact table load and the copy to the
    reporting database, then write the end timestamp to the
    ``dw_prcsng_cycle`` row of the current cycle and report elapsed time.

    Args:
        input_file: Passed unchanged to ``ezproxy_load.load_stage_1``.

    Raises:
        SystemExit: With exit status 1 when ``socket.gethostname()`` differs
            from ``database_credentials.configured_host()``.
    """
    # create logger; the log file name carries today's date (YYYYMMDD)
    today = datetime.datetime.now().strftime('%Y%m%d')
    logger = logging.getLogger('dwetl')
    file_handler = logging.FileHandler(f'logs/ezproxy.log.{today}')
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(file_handler)
    logger.setLevel(logging.INFO)

    time_started = datetime.datetime.now()
    logger.info('EzProxy ETL started')

    # check current hostname environment configuration to prevent errors
    hostname = socket.gethostname()
    configured_host = database_credentials.configured_host()

    if hostname != configured_host:
        # quit program if env file hostname doesn't match the current hostname
        print(
            'ERROR: EzProxy ETL ended because .env contained an error. Please double check the configured host and db configuration.'
        )
        logger.error(
            'EzProxy ETL ended because .env contained an error. please double check the configured host and db configuration.'
        )
        # non-zero status so the caller sees the run as failed; a bare
        # sys.exit() would report success (status 0)
        sys.exit(1)

    # create job_info for current process
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        job_info = JobInfoFactory.create_job_info_from_db(
            session, job_info_table_class)

    # load ezproxy stage 1
    ezproxy_load.load_stage_1(job_info, input_file, logger)

    # load ezproxy stage 2
    ezproxy_load.load_stage_2(job_info, logger)

    # stg 2 intertable processing
    ezproxy_load.intertable_processing(job_info, logger)

    # fact table load
    ezproxy_load.load_fact_table(job_info, logger)

    # copy new ezproxy data to reporting database
    ezproxy_load.copy_new_facts_to_reporting_db(job_info, logger)

    # end of job metadata writing
    endtime = datetime.datetime.now()
    with dwetl.database_session() as session:
        job_info_table_class = dwetl.Base.classes['dw_prcsng_cycle']
        # write end time to the row whose id matches the current cycle;
        # the value returned by update() is not needed here
        session.query(job_info_table_class).filter(
            job_info_table_class.dw_prcsng_cycle_id == job_info.prcsng_cycle_id
        ).update({'dw_prcsng_cycle_exectn_end_tmstmp': endtime})

    elapsed_time = endtime - time_started
    print("Ezproxy ETL elapsed time: ", str(elapsed_time))
    logger.info(f'EzProxy ETL elapsed time: {elapsed_time}')