Example #1
    def start(self):
        """
        Override: Code called on initial start and subsequent restarts.

        Must set:
        - self.option = Option()
        - self.schedule = Schedule()

        Note: We don't load resources here; resources are loaded on demand.
        """

        # load standard config
        config = ConfigSectionKey('../conf', '../local')
        self.config = config
        config.load('bootstrap.ini', 'bootstrap')
        config.load('init.ini')
        config.load('connect.ini')

        # load project specific config
        self.config.load(self.project_file)

        # load project specific options from optional project specific environ var
        environ_var = just_file_stem(self.project_file).lower()
        self.option = Option(environ_var, options=config('project').options)

        # load project specific schedule
        self.schedule = Schedule(config('schedule'))

        # diagnostics
        self.option.dump()
        self.config('project').dump(False)
        self.config('schedule').dump(False)
    def setup(self):
        """Generic setup code."""

        # load standard config
        config = ConfigSectionKey('../conf', '../local')
        self.config = config
        config.load('bootstrap.ini', 'bootstrap')
        config.load('init.ini')
        config.load('connect.ini')

        # load project specific config
        self.config.load(self.project_file)
        self.project = self.config('project')

        # load project specific options from optional project specific environ var
        environ_var = just_file_stem(self.project_file).lower()
        self.option = Option(environ_var, options=config('project').options)

        # load project namespace
        self.namespace = self.config('namespace')

        # load project specific schedule
        self.schedule = Schedule(config('schedule'))

        # job specific folders
        self.state_folder = f'{self.session_folder}/{self.namespace.dataset}/state'
        self.work_folder = f'{self.session_folder}/{self.namespace.dataset}/work'
        self.publish_folder = f'{self.session_folder}/{self.namespace.dataset}/publish'
    def __init__(self, project_file=None):
        # session folder (acts as root path for job specific folders)
        self.session_folder = '../sessions'

        # configuration engines
        self.config = None
        self.option = None

        # project metadata
        self.project = None
        self.namespace = None

        # project resources
        self.database = None
        self.schedule = None

        # project dataset specific working folders
        self.state_folder = None
        self.work_folder = None
        self.publish_folder = None

        # project database connections (db_conn)
        self.source_db_conn = None
        self.target_db_conn = None

        # project file and name
        self.project_file = ''
        self.project_name = ''

        # if optional project file supplied use it; otherwise try command line
        if project_file:
            self.project_file = project_file
        elif len(sys.argv) > 1:
            self.project_file = sys.argv[1]

        # make sure we have a valid project file
        app_name = script_name()
        if not self.project_file:
            print(f'{app_name}: error - must specify project file')
        elif not is_file(f'../conf/{self.project_file}'):
            print(
                f'{app_name}: error - project file not found ({self.project_file})')
        else:
            # project file controls configuration
            self.project_name = just_file_stem(self.project_file)
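
A minimal usage sketch for Example #1, assuming the methods above belong to a job class named UdpJob (the class name, module path, and project file name below are hypothetical; only the __init__/setup()/start() contract is taken from the code above):

# usage sketch -- UdpJob and its module are hypothetical stand-ins for the class above
from udp_job import UdpJob

def run_project(project_file='project_example.ini'):
    # project file is resolved against ../conf by __init__; the name here is illustrative
    job = UdpJob(project_file)
    if not job.project_name:
        return  # __init__ already printed a "missing/not found" error
    job.setup()   # loads config, option, namespace, schedule, and job-specific folders
    job.start()   # override point, re-run on restarts; reloads config, option, schedule
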
Example #4
    def stage_file(self, archive_capture_file_name):
        logger.info(
            f"Getting {archive_capture_file_name} from archive blob store")

        # make sure work folder exists and is empty
        clear_folder(self.work_folder)

        # connect to the archive blobstore
        resource = self.config(self.project.blobstore_archive)
        bs_archive = BlobStore()
        bs_archive.connect(resource)

        # extract dataset name and job id from archive capture file name
        dataset_name, _, job_id = just_file_stem(
            archive_capture_file_name).partition("#")

        # copy archive_capture_file_name to our local working folder
        capture_file_name = just_file_name(archive_capture_file_name)
        local_work_file_name = f"{self.work_folder}/{capture_file_name}"
        archive_capture_file_blob_name = f"{archive_capture_file_name}"
        bs_archive.get(local_work_file_name, archive_capture_file_blob_name)
        bs_archive.disconnect()

        # unzip the capture file we retrieved from archive
        with zipfile.ZipFile(local_work_file_name) as zf:
            zf.extractall(self.work_folder)

        # create the file's dataset_name schema if missing
        self.target_db_conn.create_schema(dataset_name)

        # process all table files in our work folder
        for file_name in sorted(glob.glob(f"{self.work_folder}/*.table")):
            table_name = just_file_stem(file_name)
            logger.info(f"Processing {table_name} ...")

            # always load table objects
            table_object = load_jsonpickle(
                f"{self.work_folder}/{table_name}.table")

            # skip table if no schema file exists
            schema_file_name = f"{self.work_folder}/{table_name}.schema"
            if not is_file(schema_file_name):
                logger.warning(
                    f"Table skipped ({table_name}); schema file not found")
                continue

            # always load table schema
            table_schema = load_jsonpickle(schema_file_name)

            # always load table pk
            table_pk = load_text(f"{self.work_folder}/{table_name}.pk").strip()

            # extend table object with table name and column names from table_schema object
            table_object.table_name = table_name
            table_object.column_names = [
                column_name for column_name in table_schema.columns
            ]

            # if drop_table, drop table and exit
            if table_object.drop_table:
                logger.info(f"Table drop request; table_drop=1")
                self.target_db_conn.drop_table(dataset_name, table_name)
                return

            # convert table schema to our target database and add extended column definitions
            extended_definitions = "udp_jobid int, udp_timestamp datetime2".split(
                ",")
            convert_to_mssql(table_schema, extended_definitions)

            # Future: support custom staging table type overrides
            # [table].table_type = <blank> | standard, columnar, memory, columnar-memory

            # handle cdc vs non-cdc table workflows differently
            logger.debug(
                f"{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}"
            )
            if (not table_object.cdc or table_object.cdc.lower() == "none"
                    or not table_pk):
                # if cdc is none/unset or the table has no pk, drop the target table
                logger.info(
                    f"Table cdc=[{table_object.cdc}]; rebuilding table")
                self.target_db_conn.drop_table(dataset_name, table_name)

                # then re-create target table with latest schema
                # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                logger.info(
                    f"Re-creating non-CDC table: {dataset_name}.{table_name}")
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, table_name, table_schema,
                    extended_definitions)

                # no cdc in effect for this table - insert directly to target table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, table_name, table_schema, rows)

            else:
                # table has cdc updates

                # create target table if it doesn't exist
                if not self.target_db_conn.does_table_exist(
                        dataset_name, table_name):
                    # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                    logger.info(f"Creating table: {dataset_name}.{table_name}")
                    self.target_db_conn.create_table_from_table_schema(
                        dataset_name, table_name, table_schema,
                        extended_definitions)

                # create temp table to receive captured changes
                # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
                temp_table_name = f"_{table_name}"
                self.target_db_conn.drop_table(dataset_name, temp_table_name)
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, temp_table_name, table_schema,
                    extended_definitions)

                # insert captured updates into temp table
                work_folder_obj = pathlib.Path(self.work_folder)
                batch_number = 0
                for json_file in sorted(
                        work_folder_obj.glob(f"{table_name}#*.json")):
                    # load rows from json file
                    # input_stream = open(json_file)
                    # rows = json.load(input_stream)
                    # input_stream.close()
                    rows = load_jsonpickle(json_file)

                    # insert/upsert/merge *.json into target tables
                    if not rows:
                        logger.info(
                            f"Table {table_name} has 0 rows; no updates")
                        break
                    else:
                        batch_number += 1
                        logger.info(
                            f"Job {job_id}, batch {batch_number}, table {table_name}"
                        )
                        self.progress_message(
                            f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ..."
                        )

                        # convert date/datetime columns to date/datetime values
                        convert_data_types(rows, table_schema)
                        self.target_db_conn.bulk_insert_into_table(
                            dataset_name, temp_table_name, table_schema, rows)
                else:
                    # for-else: runs only when the batch loop completed without hitting an empty file
                    # merge (upsert) temp table to target table
                    merge_cdc = cdc_merge.MergeCDC(table_object,
                                                   extended_definitions)
                    sql_command = merge_cdc.merge(dataset_name, table_pk)

                    # TODO: Capture SQL commands in a sql specific log.
                    logger.debug(sql_command)
                    self.target_db_conn.cursor.execute(sql_command)

                # drop temp table after merge
                self.target_db_conn.drop_table(dataset_name, temp_table_name)
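
A small, self-contained sketch of the capture-file naming convention stage_file relies on (the values are illustrative only; the per-table artifact names are inferred from the globs and loads above):

# archive capture files are named <dataset_name>#<job_id>.zip
capture_stem = "sales#000123"   # e.g. just_file_stem of the archive blob name (illustrative)
dataset_name, _, job_id = capture_stem.partition("#")
print(dataset_name, job_id)     # -> sales 000123

# per-table artifacts expected in the unzipped work folder:
#   <table>.table     jsonpickled table object (cdc, timestamp, drop_table, ...)
#   <table>.schema    jsonpickled table schema (column definitions)
#   <table>.pk        text file naming the primary key column(s)
#   <table>#NNN.json  jsonpickled row batches, loaded in sorted order
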
def main():
    sdlc = 'dev'

    # ref_wb_dict maps '<directory>.<table name>' keys to reference definition Excel workbooks
    ref_wb_dict = {}

    # masterdata directories to scan for reference definition workbooks
    masterdata_directory_list = ['access', 'common', 'reference']

    # Detect and load audit reference definition files into the dict
    # ToDo: make glob case insensitive
    for masterdata_directory in masterdata_directory_list:
        for ref_file in sorted(
                pathlib.Path(f'../ref_docs/{masterdata_directory}/').glob(
                    '*.xlsx')):
            ref_wb = openpyxl.load_workbook(ref_file, data_only=True)
            ref_table_name = just_file_stem(str(ref_file))

            # Add '<directory>.<table name>' as key and the workbook object as value.
            ref_wb_dict[f'{masterdata_directory}.{ref_table_name}'] = ref_wb

    # Delete all output files so new ones can be generated
    for output_file in sorted(
            pathlib.Path('../ref_docs/ddl_output/').glob('*.sql')):
        os.remove(output_file)

    config = ConfigSectionKey('../conf', '../local')
    config.load('connect.ini')

    # ToDo: Fill in uat and prod connection names when added to connect.ini
    if sdlc == 'dev':
        connection_name = 'database:amc_dsg_udp_01_stage_dev'
    elif sdlc == 'uat':
        connection_name = 'unknown connection point'
    elif sdlc == 'prod':
        connection_name = 'database:amc_dsg_udp_01_stage_prod'
    else:
        connection_name = 'unknown connection point'

    udp_conn_config = config(connection_name)
    udp_conn_config = MSSQL(udp_conn_config)

    # Instantiate DatabaseExcel object using mssql_excel_upload.cfg as platform and udp_conn_config
    udp_db = DatabaseExcel('mssql_excel_upload', udp_conn_config.conn)

    for key, value in ref_wb_dict.items():
        # key format is '<schema>.<table name>'; value is the openpyxl workbook loaded above

        sql_file = open(f"../ref_docs/ddl_output/{key}.sql",
                        "x",
                        encoding='utf8')

        sql_use_statement = udp_db.use_database_sql(f'udp_masterdata_{sdlc}')
        sql_drop_table = udp_db.drop_table_sql(
            key.split('.')[0],
            key.split('.')[1])
        sql_create_schema = udp_db.create_schema_sql(key.split('.')[0])

        # sql_create_table = udp_db.create_table_sql(schema_name='udp_ref', table_name=key, workbook=value)
        sql_create_table = udp_db.create_table_sql_v2(
            schema_name=key.split('.')[0],
            table_name=key.split('.')[1],
            worksheet=value.worksheets[0])

        sql_file.write(sql_use_statement)
        # sql_file.write('\n begin transaction \n')
        sql_file.write(sql_create_schema)
        sql_file.write(sql_drop_table)
        sql_file.write(sql_create_table)

        # print(sql_use_statement)
        # udp_db.direct_execute(sql_use_statement)
        # print(sql_create_schema)
        # udp_db.direct_execute(sql_create_schema)
        # print(sql_drop_table)
        # udp_db.direct_execute(sql_drop_table)
        # print(sql_create_table)
        # udp_db.direct_execute(sql_create_table)
        # udp_db.direct_execute('commit')

        for sheet in [
                x for x in value.worksheets
                if x.title.lower() not in ('documentation', 'change log',
                                           'changelog')
        ]:
            sql_insert_values = udp_db.insert_into_table_sql(
                schema_name=key.split('.')[0],
                table_name=key.split('.')[1],
                worksheet=sheet)
            sql_file.write(sql_insert_values)
            # print(sql_insert_values)
            # udp_db.direct_execute(sql_insert_values)

        # sql_file.write('\n end transaction \n')
        # sql_file.write('\n commit \n')

        # close the generated DDL script so it is flushed before being re-read below
        sql_file.close()

    # Clear all err_files
    for err_file in sorted(pathlib.Path(f'../ref_docs/log/').glob('*_error*')):
        os.remove(err_file)

    # Publish directly to udp_masterdata_<SDLC>
    for ddl_file in sorted(
            pathlib.Path(f'../ref_docs/ddl_output/').glob('*.sql')):
        print(f'executing {ddl_file}')
        with open(ddl_file, mode='r', encoding='utf8') as ddl_stream:
            ddl_sql = ddl_stream.read()
        try:
            # print(f'SQL Code: \n {ddl_sql}')
            udp_db.direct_execute(ddl_sql)
            udp_db.direct_execute('\n commit')
            print('execution successful!')
        except Exception as e:
            err_sql_file = open(f'../ref_docs/log/{ddl_file.stem}_error.sql',
                                'x',
                                encoding='utf8')
            err_log_file = open(f'../ref_docs/log/{ddl_file.stem}_error.log',
                                'x',
                                encoding='utf8')
            err_sql_file.write(ddl_sql)
            err_log_file.write(str(e))
            err_sql_file.close()
            err_log_file.close()