def encrypt_text_file(key, source_file_name, target_file_name=None):
    """Encrypt text file to target file name."""
    if not target_file_name:
        # default target file name is source file name with '_e' added to file extension
        target_file_name = source_file_name + '_e'
    decrypted_data = load_text(source_file_name)
    encrypted_data = encrypt_data(key, decrypted_data)
    save_text(encrypted_data, target_file_name)
def test():
    # configuration driven support
    config = ConfigSectionKey("../conf", "../local")
    config.load("bootstrap.ini", "bootstrap")
    config.load("init.ini")
    config.load("connect.ini")

    bs_test = BlobStore()
    resource = config("resource:bs_test_local")
    bs_test.create(resource)
    bs_test.connect(resource)
    bs_test.remove(resource)
    bs_test.create(resource)

    # good things
    save_text("testfile-1.txt", "test file")
    delete_file("testfile-2.txt", ignore_errors=True)

    # expected ConnectionError from a non-connected resource
    try:
        bs_test.put("testfile-1.txt", "downloads/testfile-1.txt")
    except ConnectionError as e:
        logger.info(f"Non-connected resource raised ConnectionError as expected: {e}")

    bs_test.connect(resource)
    assert bs_test.put("testfile-1.txt", "downloads/testfile-1.txt")
    assert bs_test.put("testfile-1.txt", "downloads/testfile-2.txt")
    assert bs_test.put("testfile-1.txt", "downloads/testfile-3.txt")
    assert bs_test.get("testfile-2.txt", "downloads/testfile-2.txt")

    downloads_folder_only = ["downloads"]
    downloads_folder_files = [
        "downloads/testfile-1.txt",
        "downloads/testfile-2.txt",
        "downloads/testfile-3.txt",
    ]

    # assert bs_test.list() == downloads_folder_only
    # assert bs_test.list('*') == downloads_folder_only
    # assert bs_test.list('/') == downloads_folder_only
    # assert bs_test.list('/downloads') == downloads_folder_files
    # assert bs_test.list('downloads') == downloads_folder_files
    # assert bs_test.list('downloads/') == downloads_folder_files

    bs_test.list("downloads")
    bs_test.list("downloads/")
    bs_test.list("downloads/*")
    bs_test.delete("downloads/testfile-1.txt")
    bs_test.list("downloads/*")

    # bad things
    assert not bs_test.list("bad-path*")
    assert not bs_test.put("bad-file-1.txt", "downloads/bad-file.txt")
    assert not bs_test.get("bad-file-2.txt", "downloads/bad-file.txt")
    assert not bs_test.delete("downloads/bad-file.txt")

    bs_test.clear()
def main():
    from config import ConfigSectionKey

    # test data
    config = ConfigSectionKey('conf', 'local')
    config.load('bootstrap.ini', 'bootstrap')
    config.load('init.ini')
    config.load('connect.ini')

    # for testing purposes:
    # - test with both cloud connection values (*capture and *archive)
    # - these connections have different permissions and will yield different results
    # cloud_connection_name = 'cloud:amc_aws_capture_01_etl'
    cloud_connection_name = 'cloud:udp_aws_archive_01_etl'
    cloud = config(cloud_connection_name)
    capture_objectstore_name = cloud.capture_objectstore
    capture_queue_name = cloud.capture_queue
    cloud.dump()

    # create test files (must have *.zip extension for S3:SQS notification)
    test_folder = 'test_folder_1'
    test_file_1 = f'{test_folder}/test1.zip'
    test_file_2 = f'{test_folder}/test2.zip'
    save_text(f'Test @{now()}', test_file_1)

    # object store put, get, delete
    objectstore = Objectstore(capture_objectstore_name, cloud)
    objectstore.put(test_file_1, 'test/test1.zip')
    objectstore.get(test_file_2, 'test/test1.zip')
    objectstore.delete('test/test1.zip')

    # sleep for 3 seconds to give notification message time to post to queue
    time.sleep(3)

    # queue put, get, delete
    queue = Queue(capture_queue_name, cloud)
    queue.put('Test message 1')
    time.sleep(2)
    queue.put('Test message 2')
    time.sleep(2)
    while True:
        time.sleep(1)
        response = queue.get()
        notification = ObjectstoreNotification(response)
        queue.delete(notification.message_id)
        if notification.message_id:
            logger.info(f'Test mode: notification message = {notification}')
        else:
            break

    # debugging info
    logger.info(f'Available queues: {queue._list_queue_names()}')
    queue._dump()
def decrypt_text_file(key, source_file_name, target_file_name=None):
    """Decrypt text file to target file name."""
    if not target_file_name:
        # if source file extension ends with '_e', decrypt to source file name minus this suffix
        if source_file_name.lower().endswith('_e'):
            target_file_name = source_file_name[:-2]
        else:
            raise Exception('target_file_name not specified')
    encrypted_data = load_text(source_file_name)
    decrypted_data = decrypt_data(key, encrypted_data)
    save_text(decrypted_data, target_file_name)
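# Note: encrypt_data() and decrypt_data() are referenced by encrypt_text_file()/
# decrypt_text_file() above but are not defined in this section. Below is a minimal
# sketch of compatible helpers, assuming a passphrase-style key and the cryptography
# package's Fernet recipe (an illustrative assumption, not the project's actual
# implementation).

import base64
import hashlib

from cryptography.fernet import Fernet


def _derive_fernet_key(key):
    """Derive a urlsafe base64-encoded 32-byte Fernet key from an arbitrary passphrase."""
    digest = hashlib.sha256(key.encode()).digest()
    return base64.urlsafe_b64encode(digest)


def encrypt_data(key, text):
    """Encrypt str text; return the encrypted token as str."""
    return Fernet(_derive_fernet_key(key)).encrypt(text.encode()).decode()


def decrypt_data(key, text):
    """Decrypt text produced by encrypt_data(); return the original str."""
    return Fernet(_derive_fernet_key(key)).decrypt(text.encode()).decode()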
def setup_test_files():
    # ensure clean (empty) ../tmp folder
    teardown_test_files()

    # set up files
    create_folder(test_folder_path)
    readonly_file_name = f'{test_folder_path}/readonly.txt'
    readwrite_file_name = f'{test_folder_path}/readwrite.txt'

    # leverage common's save_text()
    save_text(readwrite_file_name, 'Hello world')

    # create a read-only file; apply chmod after the file is closed
    save_text(readonly_file_name, 'Hello world')
    os.chmod(readonly_file_name, S_IREAD | S_IRGRP | S_IROTH)

    # create a working dir in tmp folder
    create_folder(f'{test_folder_path}/working')
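# Note: teardown_test_files() is called above but not shown in this section. A
# minimal sketch, assuming it only needs to clear read-only flags and remove the
# tmp test folder (a hypothetical helper, not necessarily the project's actual code).

import os
import shutil
from stat import S_IWRITE


def teardown_test_files():
    """Remove the tmp test folder, clearing read-only flags so deletes succeed."""

    def _on_rm_error(func, path, exc_info):
        # clear the read-only attribute and retry the failed operation
        os.chmod(path, S_IWRITE)
        func(path)

    if os.path.exists(test_folder_path):
        shutil.rmtree(test_folder_path, onerror=_on_rm_error)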
def test_encrypt_decrypt_text_file():
    """Test text file encryption/decryption."""
    # setup
    key = 'abc'
    original = 'This is text file data.'
    original_file_name = 'test_encryption_file.txt'
    encrypted_file_name = original_file_name + '_encrypted'
    decrypted_file_name = original_file_name + '_decrypted'
    save_text(original, original_file_name)

    # test
    encrypt_text_file(key, original_file_name, encrypted_file_name)
    encrypted = load_text(encrypted_file_name)
    decrypt_text_file(key, encrypted_file_name, decrypted_file_name)
    decrypted = load_text(decrypted_file_name)
    _output_encrypt_decrypt_results(original, encrypted, decrypted)

    # cleanup
    delete_file(original_file_name)
    delete_file(encrypted_file_name)
    delete_file(decrypted_file_name)
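# Note: _output_encrypt_decrypt_results() is referenced above but not shown in this
# section. A minimal sketch of what such a helper might do, assuming the module's
# logger is available (an illustrative assumption, not the project's actual helper).

def _output_encrypt_decrypt_results(original, encrypted, decrypted):
    """Log round-trip results and verify the decrypted text matches the original."""
    logger.info(f'original  = {original}')
    logger.info(f'encrypted = {encrypted}')
    logger.info(f'decrypted = {decrypted}')
    assert decrypted == original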
def test():
    # secret prefix used to indicate secret lookup
    secret_prefix = 'secret'

    # non-existent key vault
    try:
        key_vault = KeyVault()
        key_vault.connect('missing', '$password$')
    except KeyVaultException as e:
        logger.debug(f'Expected exception: {e}')

    # corrupt key vault
    save_text('corrupt.vault', '~!@#$%^&*')
    key_vault = KeyVault()
    try:
        key_vault.connect('corrupt', '$password$')
    except KeyVaultException as e:
        logger.debug(f'Expected exception: {e}')

    # remove the corrupt vault we just created
    key_vault.remove('corrupt')

    # valid key vault testing

    # key vault name and password
    key_vault_name = 'test'
    os.environ[f'{secret_prefix.upper()}_UDP_KV'] = '$key-vault-password$'
    logger.debug(f'os.environ(SECRET_UDP_KV) = {os.environ.get("SECRET_UDP_KV", "")}')
    key_vault_password = Secret()(f'{secret_prefix}:UDP_KV')
    logger.debug(f'key_vault_password = {key_vault_password}')

    # create a key vault
    key_vault = KeyVault()
    key_vault.create(key_vault_name, key_vault_password)
    key_vault.remove(key_vault_name)
    key_vault.create(key_vault_name, key_vault_password)

    # key vault testing
    key_vault = KeyVault(key_vault_name, key_vault_password)
    key_vault.set('date_password', now())
    key_vault.set('user-name', 'Malcolm')
    key_vault.disconnect()

    # test authentication failure (bad password)
    try:
        bad_key_vault = KeyVault(key_vault_name, 'bad-password')
        bad_key_vault.disconnect()
    except KeyVaultException as e:
        logger.debug(f'Expected exception: {e}')

    # test using a disconnected key vault
    try:
        key_vault.list()
    except KeyVaultException as e:
        logger.debug(f'Expected exception: {e}')

    with KeyVault(key_vault_name, key_vault_password) as key_vault:
        # key_vault.clear()
        key_vault.set('AMP_database_password', '$amp-password$')
        key_vault.set('RTP_DATABASE_PASSWORD', '$rtp-password$')
        key_vault.set('bye_database_password', 'bad-password$')
        key_vault.list()
        key_vault.delete('bye_database_password')
        key_vault.delete('bad_secret_name')
        key_vault.get('amp-database-PASSWORD')
        key_vault.get('$rtp-database-password$')
        key_vault.list()

    # test JIT secret expansion
    secret = Secret(key_vault_name, key_vault_password)
    secret('$amp_password$')
    secret(f'{secret_prefix}:amp-DATABASE-password$$$')
    secret(f'{secret_prefix}:@rtp-DATABASE_password')
    secret(f'{secret_prefix}:@USER-NAME:')
    secret(f'{secret_prefix}:bad_secret_name')
def process_table(self, db, db_engine, schema_name, table_name, table_object, table_history,
                  current_timestamp, current_sequence=0):
    """Process a specific table."""
    # skip default table and ignored tables
    if table_name == 'default':
        return
    # TODO: Allow ignore and drop table conditions to be passed to archive (log table state)
    #       and stage (to drop table and table references).
    elif table_object.ignore_table:
        logger.info(f'Skipping table: {table_name} (ignore_table=1)')
        return
    elif table_object.drop_table:
        logger.info(f'Skipping table: {table_name} (drop_table=1)')
        return

    # initialize table history's last timestamp to first timestamp if not set yet
    if not table_history.last_timestamp:
        # default first timestamp to 1900-01-01 if project has no first timestamp
        if not table_object.first_timestamp:
            table_object.first_timestamp = '1900-01-01'
        table_history.last_timestamp = iso_to_datetime(table_object.first_timestamp)

    # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
    if table_history.last_timestamp > current_timestamp:
        explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
        logger.info(f'Skipping table: {table_name} ({explanation})')
        return

    # if we're here then we have a legit last timestamp value to use for CDC
    last_timestamp = table_history.last_timestamp

    # initialize table's last_sequence to first_sequence if not set yet
    if not table_history.last_sequence:
        if not table_object.first_sequence:
            table_object.first_sequence = 0
        table_history.last_sequence = table_object.first_sequence

    self.events.start(table_name, 'table')
    # logger.info(f'Processing {table_name} ...')

    # create a fresh cursor for each table
    cursor = db.conn.cursor()

    # save table object for stage
    table_file_name = f'{self.work_folder}/{table_name}.table'
    save_jsonpickle(table_file_name, table_object)

    # discover table schema
    table_schema = db_engine.select_table_schema(schema_name, table_name)

    # handle non-existent tables
    if table_schema is None:
        if table_object.optional_table:
            logger.info(f'Optional table not found; skipped ({table_name})')
        else:
            logger.warning(f'Table not found; skipped ({table_name})')
        return

    # remove ignored columns from table schema
    if table_object.ignore_columns:
        # find columns to ignore (remove) based on ignore column names/glob-style patterns
        ignore_columns = []
        for column_name in table_schema.columns:
            for pattern in split(table_object.ignore_columns):
                if is_glob_match(column_name, pattern):
                    ignore_columns.append(column_name)

        # delete ignored columns from our table schema
        for column_name in ignore_columns:
            logger.info(f'Ignore_column: {table_name}.{column_name}')
            table_schema.columns.pop(column_name)

    # save table schema for stage to use
    schema_table_name = f'{self.work_folder}/{table_name}.schema'
    save_jsonpickle(schema_table_name, table_schema)

    # save table pk for stage to use
    pk_columns = db_engine.select_table_pk(schema_name, table_name)
    if not pk_columns and table_object.primary_key:
        pk_columns = table_object.primary_key
    save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

    # normalize cdc setting
    table_object.cdc = table_object.cdc.lower()
    if table_object.cdc == 'none':
        table_object.cdc = ''

    # clear unknown cdc settings
    if table_object.cdc and table_object.cdc not in ('filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
        logger.warning(f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # clear cdc setting when no pk_columns are present
    # NOTE: filehash cdc does not require pk_columns.
    if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
        logger.warning(f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})')
        table_object.cdc = ''

    # if no cdc, then clear cdc related attributes
    if not table_object.cdc:
        table_object.filehash = ''
        table_object.rowhash = ''
        table_object.rowversion = ''
        table_object.sequence = ''
        table_object.timestamp = ''

    # update table object properties for cdc select build
    column_names = list(table_schema.columns.keys())
    table_object.schema_name = schema_name
    table_object.table_name = table_name
    table_object.column_names = column_names

    select_cdc = cdc_select.SelectCDC(db_engine, table_object)
    sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

    # save generated SQL to work folder for documentation purposes
    sql_file_name = f'{self.work_folder}/{table_name}.sql'
    save_text(sql_file_name, sql)

    # run sql here vs via db_engine.capture_select
    # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
    cursor.execute(sql)

    # capture rows in fixed size batches to support unlimited size record counts
    # Note: Batching on capture side allows stage to insert multiple batches in parallel.
    if self.project.batch_size:
        batch_size = int(self.project.batch_size)
        # logger.info(f'Using project specific batch size: {self.project.batch_size}')
    else:
        batch_size = 250_000

    batch_number = 0
    row_count = 0
    data_size = 0
    while True:
        batch_number += 1
        rows = cursor.fetchmany(batch_size)
        if not rows:
            break

        logger.info(f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}')
        self.progress_message(f'extracting({table_name}.{batch_number:04}) ...')

        # flatten rows to list of column values
        json_rows = [list(row) for row in rows]
        output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
        save_jsonpickle(output_file, json_rows)

        # track metrics
        row_count += len(json_rows)
        data_size += file_size(output_file)

    # update table history with new last timestamp and sequence values
    table_history.last_timestamp = current_timestamp
    table_history.last_sequence = current_sequence

    # track total row count and file size across all of a table's batched json files
    self.events.stop(table_name, row_count, data_size)

    # save interim metrics for diagnostics
    self.events.save()

    self.job_row_count += row_count
    self.job_data_size += data_size

    # explicitly close cursor when finished
    # cursor.close()
    return
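# Note: split() and is_glob_match() are used in the ignore-columns logic above but
# are not defined in this section. A minimal sketch of what such helpers might look
# like, using only the standard library (illustrative assumptions, not the project's
# actual helpers).

import fnmatch


def split(value, delimiter=','):
    """Split a delimited setting value into a list of trimmed, non-empty items."""
    return [item.strip() for item in value.split(delimiter) if item.strip()]


def is_glob_match(name, pattern):
    """Return True if name matches the glob-style pattern, ignoring case."""
    return fnmatch.fnmatch(name.lower(), pattern.lower())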