Example #1
def encrypt_text_file(key, source_file_name, target_file_name=None):
    """Encrypt text file to target file name."""
    if not target_file_name:
        # default target file name is source file name with '_e' added to file extension
        target_file_name = source_file_name + '_e'

    decrypted_data = load_text(source_file_name)
    encrypted_data = encrypt_data(key, decrypted_data)
    save_text(encrypted_data, target_file_name)
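
The helpers used above (load_text, save_text, encrypt_data, decrypt_data) come from the project's common modules and are not shown in these examples. A minimal sketch of what they might look like, assuming Fernet from the cryptography package and a passphrase-derived key; the real implementations may differ:

# Minimal sketch of the assumed text/encryption helpers (not the project's actual common module).
import base64
import hashlib

from cryptography.fernet import Fernet


def _fernet(key):
    # derive a urlsafe base64-encoded 32-byte Fernet key from an arbitrary passphrase (assumption)
    digest = hashlib.sha256(key.encode()).digest()
    return Fernet(base64.urlsafe_b64encode(digest))


def load_text(file_name):
    with open(file_name, encoding='utf-8') as input_stream:
        return input_stream.read()


def save_text(text, file_name):
    # argument order follows encrypt_text_file() above; other snippets call save_text(file_name, text)
    with open(file_name, 'w', encoding='utf-8') as output_stream:
        output_stream.write(text)


def encrypt_data(key, data):
    return _fernet(key).encrypt(data.encode()).decode()


def decrypt_data(key, data):
    return _fernet(key).decrypt(data.encode()).decode()
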
def test():

    # configuration driven support
    config = ConfigSectionKey("../conf", "../local")
    config.load("bootstrap.ini", "bootstrap")
    config.load("init.ini")
    config.load("connect.ini")

    bs_test = BlobStore()
    resource = config("resource:bs_test_local")
    bs_test.create(resource)
    bs_test.connect(resource)
    bs_test.remove(resource)
    bs_test.create(resource)

    # good things
    save_text("testfile-1.txt", "test file")
    delete_file("testfile-2.txt", ignore_errors=True)

    # expected Connection exception
    try:
        bs_test.put("testfile-1.txt", "downloads/testfile-1.txt")
    except ConnectionError as e:
        logger.info(f"Non-connected resource raised ConnectionError as expected: {e}")

    bs_test.connect(resource)
    assert bs_test.put("testfile-1.txt", "downloads/testfile-1.txt")
    assert bs_test.put("testfile-1.txt", "downloads/testfile-2.txt")
    assert bs_test.put("testfile-1.txt", "downloads/testfile-3.txt")
    assert bs_test.get("testfile-2.txt", "downloads/testfile-2.txt")

    downloads_folder_only = ["downloads"]
    downloads_folder_files = [
        "downloads/testfile-1.txt",
        "downloads/testfile-2.txt",
        "downloads/testfile-3.txt",
    ]
    # assert bs_test.list() == downloads_folder_only
    # assert bs_test.list('*') == downloads_folder_only
    # assert bs_test.list('/') == downloads_folder_only
    # assert bs_test.list('/downloads') == downloads_folder_files
    # assert bs_test.list('downloads') == downloads_folder_files
    # assert bs_test.list('downloads/') == downloads_folder_files

    bs_test.list("downloads")
    bs_test.list("downloads/")
    bs_test.list("downloads/*")
    bs_test.delete("downloads/testfile-1.txt")
    bs_test.list("downloads/*")

    # bad things
    assert not bs_test.list("bad-path*")
    assert not bs_test.put("bad-file-1.txt", "downloads/bad-file.txt")
    assert not bs_test.get("bad-file-2.txt", "downloads/bad-file.txt")
    assert not bs_test.delete("downloads/bad-file.txt")
    bs_test.clear()
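
The BlobStore class and the resource:bs_test_local configuration are not shown here. A minimal local-filesystem sketch of the interface this test exercises, assuming the semantics implied by the assertions above (falsy returns on bad paths, ConnectionError when not connected) and a hypothetical resource.blob_root attribute; the real BlobStore may differ:

# Minimal local-filesystem sketch of the interface exercised above (assumption; not the real BlobStore).
# 'resource.blob_root' is a hypothetical attribute naming the backing folder.
import fnmatch
import os
import shutil


class LocalBlobStore:
    def __init__(self):
        self.root = None
        self.is_connected = False

    def create(self, resource):
        os.makedirs(resource.blob_root, exist_ok=True)

    def remove(self, resource):
        shutil.rmtree(resource.blob_root, ignore_errors=True)
        self.is_connected = False

    def connect(self, resource):
        self.root = resource.blob_root
        self.is_connected = True

    def _check_connected(self):
        if not self.is_connected:
            raise ConnectionError('blob store is not connected')

    def put(self, local_file_name, blob_path):
        self._check_connected()
        if not os.path.isfile(local_file_name):
            return False
        target = os.path.join(self.root, blob_path)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        shutil.copyfile(local_file_name, target)
        return True

    def get(self, local_file_name, blob_path):
        self._check_connected()
        source = os.path.join(self.root, blob_path)
        if not os.path.isfile(source):
            return False
        shutil.copyfile(source, local_file_name)
        return True

    def list(self, pattern='*'):
        self._check_connected()
        # treat bare folder names ('downloads', 'downloads/') as prefix matches
        if '*' not in pattern:
            pattern = pattern + '*'
        matches = []
        for folder, _, file_names in os.walk(self.root):
            for file_name in file_names:
                blob_path = os.path.relpath(os.path.join(folder, file_name), self.root).replace(os.sep, '/')
                if fnmatch.fnmatch(blob_path, pattern):
                    matches.append(blob_path)
        return matches

    def delete(self, blob_path):
        self._check_connected()
        target = os.path.join(self.root, blob_path)
        if not os.path.isfile(target):
            return False
        os.remove(target)
        return True

    def clear(self):
        self._check_connected()
        shutil.rmtree(self.root, ignore_errors=True)
        os.makedirs(self.root, exist_ok=True)
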
def main():
	from config import ConfigSectionKey

	# test data
	config = ConfigSectionKey('conf', 'local')
	config.load('bootstrap.ini', 'bootstrap')
	config.load('init.ini')
	config.load('connect.ini')

	# for testing purposes:
	# - test with both cloud connection values (*capture and *archive)
	# - these connections have different permissions and will yield different results

	# cloud_connection_name = 'cloud:amc_aws_capture_01_etl'
	cloud_connection_name = 'cloud:udp_aws_archive_01_etl'
	cloud = config(cloud_connection_name)
	capture_objectstore_name = cloud.capture_objectstore
	capture_queue_name = cloud.capture_queue
	cloud.dump()

	# create test files (must have *.zip extension for S3:SQS notification)
	test_folder = 'test_folder_1'
	test_file_1 = f'{test_folder}/test1.zip'
	test_file_2 = f'{test_folder}/test2.zip'
	save_text(f'Test @{now()}', test_file_1)

	# object store put, get, delete
	objectstore = Objectstore(capture_objectstore_name, cloud)
	objectstore.put(test_file_1, 'test/test1.zip')
	objectstore.get(test_file_2, 'test/test1.zip')
	objectstore.delete('test/test1.zip')

	# sleep for 3 seconds to give notification message time to post to queue
	time.sleep(3)

	# queue get, remove
	queue = Queue(capture_queue_name, cloud)
	queue.put('Test message 1')
	time.sleep(2)
	queue.put('Test message 2')
	time.sleep(2)

	while True:
		time.sleep(1)
		response = queue.get()
		notification = ObjectstoreNotification(response)
		queue.delete(notification.message_id)
		if notification.message_id:
			logger.info(f'Test mode: notification message = {notification}')
		else:
			break

	# debugging info
	logger.info(f'Available queues: {queue._list_queue_names()}')
	queue._dump()
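
ObjectstoreNotification wraps the raw queue response; the loop above relies on message_id being empty when no message is waiting. A minimal sketch, assuming an SQS-style receive_message() response carrying S3 event bodies; the real class may differ:

# Minimal sketch of the notification wrapper the loop above depends on (assumption; not the real class).
import json


class ObjectstoreNotification:
    def __init__(self, response):
        # response is assumed to be an SQS-style receive_message() result
        self.message_id = ''
        self.objectstore_name = ''
        self.object_key = ''
        messages = (response or {}).get('Messages', [])
        if messages:
            message = messages[0]
            # the loop above deletes by message_id, so expose the receipt handle here (assumption)
            self.message_id = message.get('ReceiptHandle', '')
            try:
                body = json.loads(message.get('Body', '{}'))
            except ValueError:
                # plain-text test messages (e.g. 'Test message 1') have no JSON body
                body = {}
            records = body.get('Records', []) if isinstance(body, dict) else []
            if records:
                s3_info = records[0].get('s3', {})
                self.objectstore_name = s3_info.get('bucket', {}).get('name', '')
                self.object_key = s3_info.get('object', {}).get('key', '')

    def __str__(self):
        return f'{self.objectstore_name}:{self.object_key} (message_id={self.message_id[:24]})'
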
Example #4
def decrypt_text_file(key, source_file_name, target_file_name=None):
    """Decrypt text file to target file name."""
    if not target_file_name:
        # if source file extension ends with '_e', decrypt to source file name minus this suffix
        if source_file_name.lower().endswith('_e'):
            target_file_name = source_file_name[:-2]
        else:
            raise Exception('target_file_name not specified')

    encrypted_data = load_text(source_file_name)
    decrypted_data = decrypt_data(key, encrypted_data)
    save_text(decrypted_data, target_file_name)
def setup_test_files():
    # ensure clean (empty) ../tmp folder
    teardown_test_files()

    # set up files
    create_folder(test_folder_path)
    readonly_file_name = f'{test_folder_path}/readonly.txt'
    readwrite_file_name = f'{test_folder_path}/readwrite.txt'

    # create a read/write file via common's save_text()
    save_text(readwrite_file_name, 'Hello world')

    # create a read only file
    save_text(readonly_file_name, 'Hello world')

    # do this type of operation after a file is closed
    # (requires: import os; from stat import S_IREAD, S_IRGRP, S_IROTH)
    os.chmod(readonly_file_name, S_IREAD | S_IRGRP | S_IROTH)

    # create a working dir in tmp folder
    create_folder(f'{test_folder_path}/working')
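
teardown_test_files() is called above but not shown. A minimal sketch, assuming it removes test_folder_path and clears the read-only flag set above so removal succeeds; the real helper may differ:

# Minimal sketch of the matching teardown helper (assumption; the real helper is not shown above).
import os
import shutil
from stat import S_IWRITE


def teardown_test_files():
    def _clear_readonly(func, path, _exc_info):
        # chmod read-only files (e.g. readonly.txt) back to writable, then retry the removal
        os.chmod(path, S_IWRITE)
        func(path)

    if os.path.isdir(test_folder_path):
        shutil.rmtree(test_folder_path, onerror=_clear_readonly)
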
Example #6
def test_encrypt_decrypt_text_file():
    """Test text file encryption/decryption."""

    # setup
    key = 'abc'
    original = 'This is text file data.'
    original_file_name = 'test_encryption_file.txt'
    encrypted_file_name = original_file_name + '_encrypted'
    decrypted_file_name = original_file_name + '_decrypted'
    save_text(original, original_file_name)

    # test
    encrypt_text_file(key, original_file_name, encrypted_file_name)
    encrypted = load_text(encrypted_file_name)
    decrypt_text_file(key, encrypted_file_name, decrypted_file_name)
    decrypted = load_text(decrypted_file_name)
    _output_encrypt_decrypt_results(original, encrypted, decrypted)

    # cleanup
    delete_file(original_file_name)
    delete_file(encrypted_file_name)
    delete_file(decrypted_file_name)
def test():
	# secret prefix used to indicate secret lookup
	secret_prefix = 'secret'

	# non-existent key vault
	try:
		key_vault = KeyVault()
		key_vault.connect('missing', '$password$')
	except KeyVaultException as e:
		logger.debug(f'Expected exception: {e}')

	# corrupt key vault
	save_text('corrupt.vault', '~!@#$%^&*')
	key_vault = KeyVault()
	try:
		key_vault.connect('corrupt', '$password$')
	except KeyVaultException as e:
		logger.debug(f'Expected exception: {e}')

	# remove the corrupt vault we just created
	key_vault.remove('corrupt')

	# valid key vault testing

	# key vault name and password
	key_vault_name = 'test'
	os.environ[f'{secret_prefix.upper()}_UDP_KV'] = '$key-vault-password$'
	logger.debug(f'os.environ(SECRET_UDP_KV) = {os.environ.get("SECRET_UDP_KV", "")}')
	key_vault_password = Secret()(f'{secret_prefix}:UDP_KV')
	logger.debug(f'key_vault_password = {key_vault_password}')

	# create a key vault
	key_vault = KeyVault()
	key_vault.create(key_vault_name, key_vault_password)
	key_vault.remove(key_vault_name)
	key_vault.create(key_vault_name, key_vault_password)

	# key vault testing
	key_vault = KeyVault(key_vault_name, key_vault_password)
	key_vault.set('date_password', now())
	key_vault.set('user-name', 'Malcolm')
	key_vault.disconnect()

	# test authentication failure (bad password)
	try:
		bad_key_vault = KeyVault(key_vault_name, 'bad-password')
		bad_key_vault.disconnect()
	except KeyVaultException as e:
		logger.debug(f'Expected exception: {e}')

	# test using a disconnected key vault
	try:
		key_vault.list()
	except KeyVaultException as e:
		logger.debug(f'Expected exception: {e}')

	with KeyVault(key_vault_name, key_vault_password) as key_vault:
		# key_vault.clear()
		key_vault.set('AMP_database_password', '$amp-password$')
		key_vault.set('RTP_DATABASE_PASSWORD', '$rtp-password$')
		key_vault.set('bye_database_password', 'bad-password$')
		key_vault.list()
		key_vault.delete('bye_database_password')
		key_vault.delete('bad_secret_name')
		key_vault.get('amp-database-PASSWORD')
		key_vault.get('$rtp-database-password$')
		key_vault.list()

	# test JIT secret expansion
	secret = Secret(key_vault_name, key_vault_password)
	secret('$amp_password$')
	secret(f'{secret_prefix}:amp-DATABASE-password$$$')
	secret(f'{secret_prefix}:@rtp-DATABASE_password')
	secret(f'{secret_prefix}:@USER-NAME:')
	secret(f'{secret_prefix}:bad_secret_name')
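
Secret resolves 'secret:' references just in time: values without the prefix pass through unchanged, prefixed names are checked against SECRET_* environment variables (as with SECRET_UDP_KV above), and then fall back to the key vault when a vault name and password were supplied. A minimal sketch of that lookup order with rough name normalization; the real class may differ:

# Minimal sketch of the JIT secret expansion exercised above (assumption; not the real Secret class).
import os


class SimpleSecret:
    def __init__(self, key_vault_name=None, key_vault_password=None):
        self.key_vault_name = key_vault_name
        self.key_vault_password = key_vault_password

    def __call__(self, value):
        # values without the 'secret:' prefix pass through unchanged
        if not value.lower().startswith('secret:'):
            return value

        # normalize the secret name (case, '-' vs '_', stray decoration)
        secret_name = value.split(':', 1)[1].strip('$@: ').replace('-', '_').upper()

        # environment variables take precedence (e.g. SECRET_UDP_KV above)
        environment_value = os.environ.get(f'SECRET_{secret_name}', '')
        if environment_value:
            return environment_value

        # otherwise fall back to the key vault, if one was provided
        if self.key_vault_name:
            with KeyVault(self.key_vault_name, self.key_vault_password) as key_vault:
                return key_vault.get(secret_name.lower())
        return ''
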
Example #8
    def process_table(self,
                      db,
                      db_engine,
                      schema_name,
                      table_name,
                      table_object,
                      table_history,
                      current_timestamp,
                      current_sequence=0):
        """Process a specific table."""

        # skip default table and ignored tables
        if table_name == 'default':
            return

        # TODO: Allow ignore and drop table conditions to be passed to archive (log table state) and stage (to drop table and table references)
        elif table_object.ignore_table:
            logger.info(f'Skipping table: {table_name} (ignore_table=1)')
            return
        elif table_object.drop_table:
            logger.info(f'Skipping table: {table_name} (drop_table=1)')
            return

        # initialize table history's last time stamp to first timestamp if not set yet
        if not table_history.last_timestamp:
            # default first timestamp to 1900-01-01 if project has no first timestamp
            if not table_object.first_timestamp:
                table_object.first_timestamp = '1900-01-01'
            table_history.last_timestamp = iso_to_datetime(
                table_object.first_timestamp)

        # skip table if last timestamp > current timestamp, e.g. tables pre-configured for the future
        if table_history.last_timestamp > current_timestamp:
            explanation = f'first/last timestamp {table_history.last_timestamp} > current timestamp {current_timestamp}'
            logger.info(f'Skipping table: {table_name} ({explanation})')
            return

        # if we're here then we have a legit last timestamp value to use for CDC
        last_timestamp = table_history.last_timestamp

        # initialize table's last_sequence to first_sequence if not set yet
        if not table_history.last_sequence:
            if not table_object.first_sequence:
                table_object.first_sequence = 0
            table_history.last_sequence = table_object.first_sequence

        self.events.start(table_name, 'table')
        # logger.info(f'Processing {table_name} ...')

        # create a fresh cursor for each table
        cursor = db.conn.cursor()

        # save table object for stage
        table_file_name = f'{self.work_folder}/{table_name}.table'
        save_jsonpickle(table_file_name, table_object)

        # discover table schema
        table_schema = db_engine.select_table_schema(schema_name, table_name)

        # handle non-existent tables
        if table_schema is None:
            if table_object.optional_table:
                logger.info(
                    f'Optional table not found; skipped ({table_name})')
            else:
                logger.warning(f'Table not found; skipped ({table_name})')
            return

        # remove ignored columns from table schema
        if table_object.ignore_columns:
            # find columns to ignore (remove) based on ignore column names/glob-style patterns
            ignore_columns = []
            for column_name in table_schema.columns:
                for pattern in split(table_object.ignore_columns):
                    if is_glob_match(column_name, pattern):
                        ignore_columns.append(column_name)

            # delete ignored columns from our table schema
            for column_name in ignore_columns:
                logger.info(f'Ignore_column: {table_name}.{column_name}')
                table_schema.columns.pop(column_name)

        # save table schema for stage to use
        schema_table_name = f'{self.work_folder}/{table_name}.schema'
        save_jsonpickle(schema_table_name, table_schema)

        # save table pk for stage to use
        pk_columns = db_engine.select_table_pk(schema_name, table_name)
        if not pk_columns and table_object.primary_key:
            pk_columns = table_object.primary_key
        save_text(f'{self.work_folder}/{table_name}.pk', pk_columns)

        # normalize cdc setting
        table_object.cdc = table_object.cdc.lower()
        if table_object.cdc == 'none':
            table_object.cdc = ''

        # clear unknown cdc settings
        if table_object.cdc and table_object.cdc not in (
                'filehash', 'rowhash', 'rowversion', 'sequence', 'timestamp'):
            logger.warning(
                f'Warning: Unknown CDC setting; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # clear cdc setting when no pk_columns are present
        # NOTE: filehash cdc does not require pk_columns.
        if table_object.cdc and table_object.cdc != 'filehash' and not pk_columns:
            logger.warning(
                f'Warning: CDC enabled but no PK; CDC setting cleared ({table_name}.cdc={table_object.cdc})'
            )
            table_object.cdc = ''

        # if no cdc, then clear cdc related attributes
        if not table_object.cdc:
            table_object.filehash = ''
            table_object.rowhash = ''
            table_object.rowversion = ''
            table_object.sequence = ''
            table_object.timestamp = ''

        # update table object properties for cdc select build
        column_names = list(table_schema.columns.keys())
        table_object.schema_name = schema_name
        table_object.table_name = table_name
        table_object.column_names = column_names
        select_cdc = cdc_select.SelectCDC(db_engine, table_object)
        sql = select_cdc.select(self.job_id, current_timestamp, last_timestamp)

        # save generated SQL to work folder for documentation purposes
        sql_file_name = f'{self.work_folder}/{table_name}.sql'
        save_text(sql_file_name, sql)

        # run sql here vs via db_engine.capture_select
        # cursor = db_engine.capture_select(schema_name, table_name, column_names, last_timestamp, current_timestamp)
        cursor.execute(sql)

        # capture rows in fixed size batches to support unlimited size record counts
        # Note: Batching on capture side allows stage to insert multiple batches in parallel.

        if self.project.batch_size:
            batch_size = int(self.project.batch_size)
            # logger.info(f'Using project specific batch size: {self.project.batch_size}')
        else:
            batch_size = 250_000

        batch_number = 0
        row_count = 0
        data_size = 0
        while True:
            batch_number += 1
            rows = cursor.fetchmany(batch_size)
            if not rows:
                break

            logger.info(
                f'Table({table_name}): batch={batch_number} using batch size {batch_size:,}'
            )
            self.progress_message(
                f'extracting({table_name}.{batch_number:04}) ...')

            # flatten rows to list of column values
            json_rows = [list(row) for row in rows]
            output_file = f'{self.work_folder}/{table_name}#{batch_number:04}.json'
            save_jsonpickle(output_file, json_rows)

            # track metrics
            row_count += len(json_rows)
            data_size += file_size(output_file)

        # update table history with new last timestamp and sequence values
        table_history.last_timestamp = current_timestamp
        table_history.last_sequence = current_sequence

        # track total row count and file size across all of a table's batched json files
        self.events.stop(table_name, row_count, data_size)

        # save interim metrics for diagnostics
        self.events.save()

        self.job_row_count += row_count
        self.job_data_size += data_size

        # explicitly close cursor when finished
        cursor.close()
        return
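
split() and is_glob_match(), used for ignore_columns above, come from the project's common module. A minimal sketch, assuming comma/whitespace-delimited patterns and fnmatch-style case-insensitive globbing; the real helpers may differ:

# Minimal sketch of the helpers used for ignore_columns above (assumption; not the real common module).
import fnmatch
import re


def split(text, delimiters=r'[,\s]+'):
    """Split a delimited string (e.g. 'col_a, col_b, temp_*') into a list of non-empty tokens."""
    return [token for token in re.split(delimiters, text or '') if token]


def is_glob_match(value, pattern):
    """Case-insensitive glob-style match (e.g. is_glob_match('create_date', '*_date') -> True)."""
    return fnmatch.fnmatch(value.lower(), pattern.lower())
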