def update_stat_log(self, source_file_name):
    # OLD: get references to stage database and catalog schema
    # data_stage_database = udp.udp_stage_database
    # data_catalog_schema = udp.udp_catalog_schema

    db = database.MSSQL(self.config(self.project.database))
    conn = db.conn
    # cursor = conn.cursor()

    db_conn = database.Database('mssql', conn)
    db_conn.use_database('udp_stage')

    # TODO: Will json_pickle restore datetime values without explicit conversion ???
    # TODO: Wrapper for stat insert that does intersection of json and target record's schema
    #       and explicitly inserts specific column/value pairs they have in common; this will
    #       require that our json send includes column names, not just rows of column values !!!!

    # extract job.log/last_job.log from capture zip and merge these into stat_log table
    job_log_data = read_archived_file(source_file_name, 'job.log', default=None)
    if job_log_data:
        job_log_json = json.loads(job_log_data)
        for row in job_log_json:
            row['start_time'] = iso_to_datetime(row['start_time']).datetime
            row['end_time'] = iso_to_datetime(row['end_time']).datetime

            # skip capture stats which only have intermediate end_time and run_time values;
            # the next capture file will include an accurate version of this stat in last_job.log
            if row['stat_name'] != 'capture':
                db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

    # if 'last_job.log' in archive.namelist():
    job_log_data = read_archived_file(source_file_name, 'last_job.log', default=None)
    if job_log_data:
        last_job_log_json = json.loads(job_log_data)
        for row in last_job_log_json:
            row['start_time'] = iso_to_datetime(row['start_time']).datetime
            row['end_time'] = iso_to_datetime(row['end_time']).datetime
            if row['stat_name'] in ('capture', 'compress', 'upload'):
                db_conn.insert_into_table('udp_catalog', 'stat_log', **row)
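# The helper below is NOT part of the original source; it is a minimal sketch of what the
# read_archived_file() call used above could look like, assuming the capture file is a
# standard zip archive and that a missing member falls back to the supplied default.
def read_archived_file_sketch(archive_file_name, member_name, default=None):
    """Hypothetical helper: return the decoded text of a zip archive member, or default if absent."""
    import zipfile

    with zipfile.ZipFile(archive_file_name) as archive:
        if member_name not in archive.namelist():
            return default
        with archive.open(member_name) as member:
            return member.read().decode('utf-8')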
def test_iso_to_datetime():
    iso_date = '2018-12-01'
    iso_datetime = '2018-11-21T19:53:12+00:00'
    iso_datetime_utc = '2018-11-21T19:53:12Z'

    converted_iso_date = iso_to_datetime(iso_date)
    converted_iso_datetime = iso_to_datetime(iso_datetime)
    converted_iso_datetime_utc = iso_to_datetime(iso_datetime_utc)

    # converted_iso_date assertions
    assert converted_iso_date.year == 2018
    assert converted_iso_date.month == 12
    assert converted_iso_date.day == 1

    # converted_iso_datetime assertions
    assert converted_iso_datetime.year == 2018
    assert converted_iso_datetime.month == 11
    assert converted_iso_datetime.day == 21

    # converted_iso_datetime_utc assertions
    assert converted_iso_datetime_utc.year == 2018
    assert converted_iso_datetime_utc.month == 11
    assert converted_iso_datetime_utc.day == 21
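# The function below is NOT part of the original source; it is a minimal sketch of an
# iso_to_datetime() implementation that would satisfy the test above, assuming the arrow
# library is used (the .datetime attribute accessed in update_stat_log() matches arrow's API;
# the actual implementation may differ).
def iso_to_datetime_sketch(iso_value):
    """Hypothetical helper: parse an ISO 8601 date or datetime string into an arrow object."""
    import arrow

    # arrow.get() accepts plain dates, offset-aware datetimes, and 'Z'-suffixed UTC datetimes
    return arrow.get(iso_value)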
def process_table(self, db, db_engine, schema_name, table_name, table_object, table_history, current_timestamp):
    """Process a specific table."""

    # skip default table and ignored tables
    if table_name == 'default':
        return
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, eg. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here, then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    self.stats.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    output_stream = open(f'{self.work_folder_name}/{table_name}.table', 'wb')
    pickle.dump(table_object, output_stream)
    output_stream.close()

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                # use fnmatch() to provide glob style matching
                if fnmatch.fnmatch(column_name.lower(), pattern.lower()):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    output_stream = open(f'{self.work_folder_name}/{table_name}.schema', 'wb')
    pickle.dump(table_schema, output_stream)
    output_stream.close()

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    output_stream = open(f'{self.work_folder_name}/{table_name}.pk', 'w')
    output_stream.write(pk_columns)
    output_stream.close()

    # clear cdc if it doesn't match timestamp/rowversion
    table_object.cdc = table_object.cdc.lower()
    if not table_object.cdc or table_object.cdc not in ('timestamp', 'rowversion'):
        table_object.cdc = ''

    # if no pk_columns, then clear table cdc
    if not pk_columns:
        if table_object.cdc and table_object.cdc != 'none':
            logger.info(f'Warning: {table_name} cdc={table_object.cdc} but table has no pk column(s)')
        table_object.cdc = 'none'

        # we still keep timestamp because it's required for filtering first_timestamp - current_timestamp
        # if table_object.timestamp:
        #     logger.info(f'Warning: {table_name} timestamp={table_object.timestamp} but table has no pk column(s)')
        #     table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names

    select_cdc = cdc_select.SelectCDC(table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)
    # logger.info(f'Capture SQL:\n{sql}\n')

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 1_000_000

    batch_number = 0
    row_count = 0
    file_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}')

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]
        output_file = f'{self.work_folder_name}/{table_name}#{batch_number:04}.json'
        with open(output_file, 'w') as output_stream:
            # indent=2 for debugging
            json.dump(json_rows, output_stream, indent=2, default=json_serializer)

        # track stats
        row_count += len(json_rows)
        file_size += pathlib.Path(output_file).stat().st_size

    # if no cdc, but order is set, hash the output files to see if they match the last file hash
    if (not table_object.cdc or table_object.cdc == 'none') and table_object.order:
        print(f'Checking {table_name} file hash based on cdc={table_object.cdc} and order={table_object.order}')
        table_data_files = f'{self.work_folder_name}/{table_name}#*.json'
        current_filehash = hash_files(table_data_files)
        if table_history.last_filehash == current_filehash:
            # suppress this update
            print(f'Table({table_name}): identical file hash, update suppressed')
            logger.info(f'Table({table_name}): identical file hash, update suppressed')
            row_count = 0
            file_size = 0

            # delete exported json files
            delete_files(table_data_files)
        else:
            print(f'Table({table_name}): {table_history.last_filehash} != {current_filehash}')
            table_history.last_filehash = current_filehash

    # update table history with new last timestamp value
    table_history.last_timestamp = current_timestamp

    # track total row count and file size across all of a table's batched json files
    self.stats.stop(table_name, row_count, file_size)

    # save interim state of stats for diagnostics
    self.stats.save()

    self.job_row_count += row_count
    self.job_file_size += file_size

    # explicitly close cursor when finished
    # cursor.close()
    return
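# The helpers below are NOT part of the original source; they are minimal sketches of the
# hash_files()/delete_files() calls used in the file-hash suppression branch above, assuming
# glob-style patterns and that a single digest over all matching files (in sorted order) is
# what the comparison against table_history.last_filehash expects.
def hash_files_sketch(file_pattern):
    """Hypothetical helper: return one SHA-256 hex digest across all files matching a glob pattern."""
    import glob
    import hashlib

    digest = hashlib.sha256()
    for file_name in sorted(glob.glob(file_pattern)):
        with open(file_name, 'rb') as input_stream:
            digest.update(input_stream.read())
    return digest.hexdigest()


def delete_files_sketch(file_pattern):
    """Hypothetical helper: delete all files matching a glob pattern."""
    import glob
    import pathlib

    for file_name in glob.glob(file_pattern):
        pathlib.Path(file_name).unlink()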
def process_table(self, db, db_engine, schema_name, table_name, table_object, table_history, current_timestamp, current_sequence=0):
    """Process a specific table."""

    # skip default table and ignored tables
    if table_name == 'default':
        return

    # TODO: Allow ignore and drop table conditions to be passed to archive (log table state)
    #       and stage (to drop table and table references).
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, eg. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here, then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    # initialize table's last_sequence to first_sequence if not set yet
    if not table_history.last_sequence:
        if not table_object.first_sequence:
            table_object.first_sequence = 0
        table_history.last_sequence = table_object.first_sequence

    self.events.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    table_file_name = f'{self.work_folder}/{table_name}.table'
    save_jsonpickle(table_file_name, table_object)

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # handle non-existent tables
    if table_schema is None:
        if table_object.optional_table:
            logger.info(f'Optional table not found; skipped ({table_name})')
        else:
            logger.warning(f'Table not found; skipped ({table_name})')
        return

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                if is_glob_match(column_name, pattern):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    schema_table_name = f'{self.work_folder}/{table_name}.schema'
    save_jsonpickle(schema_table_name, table_schema)

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

    # normalize cdc setting
    table_object.cdc = table_object.cdc.lower()
    if table_object.cdc == 'none':
        table_object.cdc = ''

    # clear unknown cdc settings
    if table_object.cdc and table_object.cdc not in ('filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
        logger.warning(f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # clear cdc setting when no pk_columns are present
    # NOTE: filehash cdc does not require pk_columns.
    if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
        logger.warning(f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # if no cdc, then clear cdc related attributes
    if not table_object.cdc:
        table_object.filehash = ''
        table_object.rowhash = ''
        table_object.rowversion = ''
        table_object.sequence = ''
        table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names

    select_cdc = cdc_select.SelectCDC(db_engine, table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

    # save generated SQL to work folder for documentation purposes
    sql_file_name = f'{self.work_folder}/{table_name}.sql'
    save_text(sql_file_name, sql)

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 250_000

    batch_number = 0
    row_count = 0
    data_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}')
        self.progress_message(f'extracting({table_name}.{batch_number:04}) ...')

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]
        output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
        save_jsonpickle(output_file, json_rows)

        # track metrics
        row_count += len(json_rows)
        data_size += file_size(output_file)

    # update table history with new last timestamp and sequence values
    table_history.last_timestamp = current_timestamp
    table_history.last_sequence = current_sequence

    # track total row count and file size across all of a table's batched json files
    self.events.stop(table_name, row_count, data_size)

    # save interim metrics for diagnostics
    self.events.save()

    self.job_row_count += row_count
    self.job_data_size += data_size

    # explicitly close cursor when finished
    # cursor.close()
    return
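# The helpers below are NOT part of the original source; they are minimal sketches of the
# save_jsonpickle(), save_text(), file_size(), and is_glob_match() calls used above, assuming
# the jsonpickle library for serialization and fnmatch for case-insensitive glob matching.
def save_jsonpickle_sketch(file_name, obj):
    """Hypothetical helper: serialize an object to a file as jsonpickle-encoded JSON."""
    import jsonpickle

    with open(file_name, 'w') as output_stream:
        output_stream.write(jsonpickle.encode(obj))


def save_text_sketch(file_name, text):
    """Hypothetical helper: write text to a file."""
    with open(file_name, 'w') as output_stream:
        output_stream.write(text)


def file_size_sketch(file_name):
    """Hypothetical helper: return a file's size in bytes."""
    import pathlib

    return pathlib.Path(file_name).stat().st_size


def is_glob_match_sketch(value, pattern):
    """Hypothetical helper: case-insensitive glob-style match."""
    import fnmatch

    return fnmatch.fnmatch(value.lower(), pattern.lower())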