def stage_archive_file(archive_object_store, notification):
    logger.info(f'Archive object store update notification: {notification}')

    # TODO: Add activity_log (vs job_log/stat_log) references.

    # get file key and name
    work_folder = 'stage_work'
    object_key = notification.object_key
    archive_file_name = f'{work_folder}/' + just_file_name(object_key)
    logger.info(
        f'Getting {archive_file_name} from archive_object_store::{object_key}')
    archive_object_store.get(archive_file_name, object_key)

    # TODO: Insert stage file code block here
    # unzip archive_file_name
    # load schema info so we can build target tables if missing
    # loop through all <table>.json files
    # conditional: cdc = timestamp vs none vs ...
    # note: if no nk/pk then cdc must be none - override and log warning
    # load json data
    # create target table if missing with capture (udp_jobid, udp_jobpk, udp_timestamp) and stage (udp_pk, udp_nk)
    # create #source table
    # insert into #source via insertmany()
    # update #source with udp_nk using pk from capture
    # merge #source to target on udp_nk
    # update job_log, table_log with stats

    # return name of processed file
    # note: rsplit() returns a list, so take its first element before concatenating
    archive_file_name = object_key.rsplit('-', 1)[0] + '/' + object_key
    return archive_file_name
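# A minimal sketch of the first two TODO steps above (unzip the archive, then
# loop over the <table>#<batch>.json files); the helper name and return value
# are illustrative assumptions - the full staging logic lives in stage_file() below.
import glob
import shutil


def _unzip_and_list_table_batches(archive_file_name, work_folder='stage_work'):
    # unzip archive_file_name into the work folder
    shutil.unpack_archive(archive_file_name, extract_dir=work_folder)

    # return the table batch files in deterministic (sorted) order
    return sorted(glob.glob(f'{work_folder}/*.json'))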
def archive_capture_file(self, notification):
    # get name of file (key) that triggered this call
    source_object_key = notification.object_key

    # extract out the file name
    source_file_name = just_file_name(source_object_key)

    # if source_file_name is empty, ignore notification
    if not source_file_name:
        logger.debug(
            f'Ignoring notification without object key (file name): {notification}')
        return

    # if source_file_name is capture_state.zip, ignore it
    # Note: This keeps the latest capture_state.zip file in each capture folder for recovery purposes.
    if source_file_name == 'capture_state.zip':
        logger.debug('Ignoring capture_state.zip file notification')
        return

    # TODO: Add activity_log (vs job_log/stat_log) references.

    # make sure work folder exists and is empty
    work_folder = 'sessions/archive_work'
    clear_folder(work_folder)

    # get file, copy to archive then delete from capture
    source_objectstore_name = notification.objectstore_name
    source_file_name = f'{work_folder}/' + source_file_name

    # get the posted file
    logger.info(
        f'Getting {source_file_name} from {source_objectstore_name}::{source_object_key}')
    source_objectstore = Objectstore(source_objectstore_name, self.cloud)
    source_objectstore.get(source_file_name, source_object_key)

    # move (copy) the posted file to the archive object_store
    logger.info(
        f'Moving {source_file_name} to archive_object_store::{source_object_key}')
    archive_objectstore_name = self.cloud.archive_objectstore
    archive_objectstore = Objectstore(archive_objectstore_name, self.cloud)
    archive_objectstore.put(source_file_name, source_object_key)

    # then delete file from source object_store and local work folder
    logger.info(
        f'Deleting {source_object_key} from {source_objectstore_name}')
    source_objectstore.delete(source_object_key)
    delete_file(source_file_name)

    # TODO: update stat_log
    # TODO: update stage queue
    return
def upload_to_blobstore(self):
    """Upload publish_folder's <dataset_name>-<job_id>.zip to landing blobstore."""

    # don't upload captured data if we're in --notransfer mode
    if self.option('notransfer'):
        logger.warning('Not uploading data to landing per --notransfer option')
        return

    # upload capture file to landing blobstore
    self.events.start('upload', 'step')
    resource = self.config(self.project.blobstore_landing)
    bs_landing = BlobStore()
    bs_landing.connect(resource)
    bs_landing.put(self.zip_file_name, just_file_name(self.zip_file_name))
    bs_landing.disconnect()

    # finish
    self.events.stop('upload', 0, file_size(self.zip_file_name))
def stage_file(db_conn, archive_objectstore, object_key):
    # make sure work folder exists and is empty
    work_folder = 'stage_work'
    clear_folder(work_folder)
    if not os.path.exists(work_folder):
        os.mkdir(work_folder)

    # get the posted file
    source_file_name = f'{work_folder}/' + just_file_name(object_key)
    logger.info(f'Getting {source_file_name} from archive::{object_key}')
    archive_objectstore.get(source_file_name, object_key)

    # create the file's namespace schema if missing
    namespace = object_key.split('/')[0]
    job_id = object_key
    db_conn.create_schema(namespace)

    # unzip the file
    # shutil.unpack_archive(source_file_name, extract_dir=work_folder)
    file_names = FileList(source_file_name)
    file_names.include('*')
    extract_archive(source_file_name, work_folder, file_names)

    # process all table files in our work folder
    for file_name in sorted(glob.glob(f'{work_folder}/*.table')):
        table_name = pathlib.Path(file_name).stem
        logger.info(f'Processing {table_name} ...')

        # TODO: rename files to use a _table, _schema suffix and .json file extension

        # always load table objects
        # input_stream = open(f'{work_folder}/{table_name}.table', 'rb')
        # table_object = pickle.load(input_stream)
        # input_stream.close()
        table_object = load_json(f'{work_folder}/{table_name}.table')

        # always load table schema
        # input_stream = open(f'{work_folder}/{table_name}.schema', 'rb')
        # table_schema = pickle.load(input_stream)
        # input_stream.close()
        table_schema = load_json(f'{work_folder}/{table_name}.schema')

        # always load table pk
        # input_stream = open(f'{work_folder}/{table_name}.pk')
        # table_pk = input_stream.read().strip()
        # input_stream.close()
        table_pk = load_text(f'{work_folder}/{table_name}.pk').strip()

        # extend table object with table and column names from table_schema object
        table_object.table_name = table_name
        table_object.column_names = [
            column_name for column_name in table_schema.columns]

        # if drop_table, drop table and exit
        if table_object.drop_table:
            logger.info('Table drop request; table_drop=1')
            db_conn.drop_table(namespace, table_name)
            return

        # convert table schema to our target database and add extended column definitions
        extended_definitions = 'udp_jobid int, udp_timestamp datetime2'.split(',')
        convert_to_mssql(table_schema, extended_definitions)

        # 2018-09-12 support custom staging table type overrides
        # [table].table_type = <blank> | standard, columnar, memory, columnar-memory

        # create target table if it doesn't exist
        if not db_conn.does_table_exist(namespace, table_name):
            # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
            logger.info(f'Creating table: {namespace}.{table_name}')
            db_conn.create_table_from_table_schema(
                namespace, table_name, table_schema, extended_definitions)

        # handle cdc vs non-cdc table workflows differently
        logger.debug(
            f'{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}')
        if not table_object.cdc or table_object.cdc.lower() == 'none' or not table_pk:
            # if table cdc=none, drop the target table
            logger.info(f'Table cdc=[{table_object.cdc}]; rebuilding table')
            db_conn.drop_table(namespace, table_name)

            # then re-create target table with latest schema so the inserts below have a target
            db_conn.create_table_from_table_schema(
                namespace, table_name, table_schema, extended_definitions)

            # no cdc in effect for this table - insert directly to target table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}')

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many(namespace, table_name, rows)
                    db_conn.bulk_insert_into_table(
                        namespace, table_name, table_schema, rows)
        else:
            # table has cdc updates

            # create temp table to receive captured changes
            # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
            temp_table_name = f'_{table_name}'
            db_conn.drop_table(namespace, temp_table_name)
            # print(f'namespace = {namespace}')
            # print(f'temp_table_name = {temp_table_name}')
            # print(f'table_object = {dir(table_object)}')
            # print(f'extended definitions = {extended_definitions}')
            db_conn.create_table_from_table_schema(
                namespace, temp_table_name, table_schema, extended_definitions)

            # insert captured updates into temp table
            work_folder_obj = pathlib.Path(work_folder)
            batch_number = 0
            for json_file in sorted(work_folder_obj.glob(f'{table_name}#*.json')):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_json(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f'Table {table_name} has 0 rows; no updates')
                    break
                else:
                    batch_number += 1
                    logger.info(
                        f'Job {job_id}, batch {batch_number}, table {table_name}')

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)

                    # db_conn.insert_many(namespace, temp_table_name, rows)
                    db_conn.bulk_insert_into_table(
                        namespace, temp_table_name, table_schema, rows)
            else:
                # merge (upsert) temp table to target table
                merge_cdc = cdc_merge.MergeCDC(table_object, extended_definitions)
                sql_command = merge_cdc.merge(namespace, table_pk)

                # TODO: Capture SQL commands in a sql specific log.
                logger.debug(sql_command)
                db_conn.cursor.execute(sql_command)

            # drop temp table after merge
            db_conn.drop_table(namespace, temp_table_name)
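# Illustrative only: the general shape of the statement assumed to come out of
# MergeCDC.merge() above. The schema, table, and column names below are
# hypothetical; the real SQL is generated from table_object, table_pk, and
# extended_definitions.
EXAMPLE_CDC_MERGE_SQL = """
merge udp_demo.customer as target
using udp_demo._customer as source
    on target.customer_id = source.customer_id
when matched then
    update set target.name = source.name,
               target.udp_jobid = source.udp_jobid,
               target.udp_timestamp = source.udp_timestamp
when not matched then
    insert (customer_id, name, udp_jobid, udp_timestamp)
    values (source.customer_id, source.name, source.udp_jobid, source.udp_timestamp);
"""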
def stage_file(self, archive_capture_file_name):
    logger.info(f"Getting {archive_capture_file_name} from archive blob store")

    # make sure work folder exists and is empty
    clear_folder(self.work_folder)

    # connect to the archive blobstore
    resource = self.config(self.project.blobstore_archive)
    bs_archive = BlobStore()
    bs_archive.connect(resource)

    # extract dataset name and job id from archive capture file name
    dataset_name, _, job_id = just_file_stem(archive_capture_file_name).partition("#")

    # copy archive_capture_file_name to our local working folder
    capture_file_name = just_file_name(archive_capture_file_name)
    local_work_file_name = f"{self.work_folder}/{capture_file_name}"
    archive_capture_file_blob_name = f"{archive_capture_file_name}"
    bs_archive.get(local_work_file_name, archive_capture_file_blob_name)
    bs_archive.disconnect()

    # unzip the capture file we retrieved from archive
    with zipfile.ZipFile(local_work_file_name) as zf:
        zf.extractall(self.work_folder)

    # create the file's dataset_name schema if missing
    self.target_db_conn.create_schema(dataset_name)

    # process all table files in our work folder
    for file_name in sorted(glob.glob(f"{self.work_folder}/*.table")):
        table_name = just_file_stem(file_name)
        logger.info(f"Processing {table_name} ...")

        # always load table objects
        table_object = load_jsonpickle(f"{self.work_folder}/{table_name}.table")

        # skip table if no schema file exists
        schema_file_name = f"{self.work_folder}/{table_name}.schema"
        if not is_file(schema_file_name):
            logger.warning(f"Table skipped ({table_name}); schema file not found")
            continue

        # always load table schema
        table_schema = load_jsonpickle(schema_file_name)

        # always load table pk
        table_pk = load_text(f"{self.work_folder}/{table_name}.pk").strip()

        # extend table object with table and column names from table_schema object
        table_object.table_name = table_name
        table_object.column_names = [
            column_name for column_name in table_schema.columns]

        # if drop_table, drop table and exit
        if table_object.drop_table:
            logger.info("Table drop request; table_drop=1")
            self.target_db_conn.drop_table(dataset_name, table_name)
            return

        # convert table schema to our target database and add extended column definitions
        extended_definitions = "udp_jobid int, udp_timestamp datetime2".split(",")
        convert_to_mssql(table_schema, extended_definitions)

        # FUTURE: support custom staging table type overrides
        # [table].table_type = <blank> | standard, columnar, memory, columnar-memory

        # handle cdc vs non-cdc table workflows differently
        logger.debug(
            f"{table_name}.cdc={table_object.cdc}, timestamp={table_object.timestamp}")
        if not table_object.cdc or table_object.cdc.lower() == "none" or not table_pk:
            # if table cdc=none, drop the target table
            logger.info(f"Table cdc=[{table_object.cdc}]; rebuilding table")
            self.target_db_conn.drop_table(dataset_name, table_name)

            # then re-create target table with latest schema
            # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
            logger.info(f"Re-creating non-CDC table: {dataset_name}.{table_name}")
            self.target_db_conn.create_table_from_table_schema(
                dataset_name, table_name, table_schema, extended_definitions)

            # no cdc in effect for this table - insert directly to target table
            work_folder_obj = pathlib.Path(self.work_folder)
            batch_number = 0
            for json_file in sorted(work_folder_obj.glob(f"{table_name}#*.json")):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_jsonpickle(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f"Table {table_name} has 0 rows; no updates")
                else:
                    batch_number += 1
                    logger.info(
                        f"Job {job_id}, batch {batch_number}, table {table_name}")
                    self.progress_message(
                        f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ...")

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)
                    self.target_db_conn.bulk_insert_into_table(
                        dataset_name, table_name, table_schema, rows)
        else:
            # table has cdc updates

            # create target table if it doesn't exist
            if not self.target_db_conn.does_table_exist(dataset_name, table_name):
                # FUTURE: Add udp_pk, udp_nk, udp_nstk and other extended columns
                logger.info(f"Creating table: {dataset_name}.{table_name}")
                self.target_db_conn.create_table_from_table_schema(
                    dataset_name, table_name, table_schema, extended_definitions)

            # create temp table to receive captured changes
            # FUTURE: Create a database wrapper function for creating 'portable' temp table names vs hard-coding '#'.
            temp_table_name = f"_{table_name}"
            self.target_db_conn.drop_table(dataset_name, temp_table_name)
            self.target_db_conn.create_table_from_table_schema(
                dataset_name, temp_table_name, table_schema, extended_definitions)

            # insert captured updates into temp table
            work_folder_obj = pathlib.Path(self.work_folder)
            batch_number = 0
            for json_file in sorted(work_folder_obj.glob(f"{table_name}#*.json")):
                # load rows from json file
                # input_stream = open(json_file)
                # rows = json.load(input_stream)
                # input_stream.close()
                rows = load_jsonpickle(json_file)

                # insert/upsert/merge *.json into target tables
                if not rows:
                    logger.info(f"Table {table_name} has 0 rows; no updates")
                    break
                else:
                    batch_number += 1
                    logger.info(
                        f"Job {job_id}, batch {batch_number}, table {table_name}")
                    self.progress_message(
                        f"loading {just_file_stem(capture_file_name)}({table_name}.{batch_number:04}) ...")

                    # convert date/datetime columns to date/datetime values
                    convert_data_types(rows, table_schema)
                    self.target_db_conn.bulk_insert_into_table(
                        dataset_name, temp_table_name, table_schema, rows)
            else:
                # merge (upsert) temp table to target table
                merge_cdc = cdc_merge.MergeCDC(table_object, extended_definitions)
                sql_command = merge_cdc.merge(dataset_name, table_pk)

                # TODO: Capture SQL commands in a sql specific log.
                logger.debug(sql_command)
                self.target_db_conn.cursor.execute(sql_command)

            # drop temp table after merge
            self.target_db_conn.drop_table(dataset_name, temp_table_name)
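# Hypothetical wiring of the two steps above; this driver does not exist in the
# code shown and the archiver/stager parameter names are assumptions. It simply
# archives the posted capture file, then stages the archived copy, which keeps
# the same object key in the archive store.
def process_capture_notification(archiver, stager, notification):
    # move the capture file from the landing object store to the archive store
    archiver.archive_capture_file(notification)

    # stage the archived capture file into the target database
    stager.stage_file(notification.object_key)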