Example #1
def test():

	# test data
	connect_name = 'aws_amc_capture_dev'
	object_store_name = 'udp-s3-capture-amc-sandbox'

	# get connection info
	connect_config = config.Config('conf/_connect.ini', config.ConnectionSection)
	cloud_connection = connect_config.sections[connect_name]

	# push test files to capture bucket to give polling loop something to test
	object_store = cloud.ObjectStore(object_store_name, cloud_connection)
	object_store.put('test1.data', 'test/test1.data')
	time.sleep(2)
	object_store.put('test1.data', 'test/test2.data')
	time.sleep(2)
	object_store.put('test1.data', 'test/test3.data')
	time.sleep(2)
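The cloud.ObjectStore class exercised above is project-specific. For orientation, below is a minimal sketch of a wrapper with the same call shape (local file first, object key second) built on boto3; the class body is an assumption inferred from the calls in these examples, not the project's actual implementation.

# Illustrative sketch only -- assumes the ObjectStore(bucket_name, connection)
# call shape seen in these examples; the project's real cloud module may differ.
import boto3


class ObjectStore:

	def __init__(self, bucket_name, connection=None):
		# credentials/region are assumed to come from the _connect.ini section;
		# this sketch simply falls back to boto3's default session
		self.bucket_name = bucket_name
		self.s3 = boto3.client('s3')

	def put(self, local_file_name, object_key):
		# upload a local file to <bucket>/<object_key>
		self.s3.upload_file(local_file_name, self.bucket_name, object_key)

	def get(self, local_file_name, object_key):
		# download <bucket>/<object_key> to a local file
		self.s3.download_file(self.bucket_name, object_key, local_file_name)

	def delete(self, object_key):
		# remove <bucket>/<object_key>
		self.s3.delete_object(Bucket=self.bucket_name, Key=object_key)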
Example #2
    def upload_to_objectstore(self):
        """Upload publish_folder's <namespace>-<job_id>.zip to objectstore."""

        # don't upload captured data if we're in --notransfer mode
        if self.options.get('notransfer') == '1':
            return

        # setup
        self.stats.start('upload', 'step')
        cloud_connection = self.connect_config.sections[self.project.cloud]
        capture_objectstore = cloud.ObjectStore(
            self.project.capture_objectstore, cloud_connection)
        objectstore_file_name = f'{self.namespace}/{self.capture_file_name}.zip'

        # upload
        # capture_objectstore.put(f'{self.publish_folder_name}/{self.capture_file_name}.zip', objectstore_file_name)
        capture_objectstore.put(self.zip_file_name, objectstore_file_name)

        # finish
        zip_file_size = pathlib.Path(self.zip_file_name).stat().st_size
        self.stats.stop('upload', 0, zip_file_size)
Example #3
    def save_recovery_state_file(self):
        """Zip the state folder and upload it to the objectstore as <namespace>/capture_state.zip."""

        # don't upload the recovery state file if we're in --notransfer mode
        if self.options.get('notransfer') == '1':
            return

        # FUTURE: Save recovery file in capture.zip file and have archive extract and push back to namespace folder.
        # This way capture_state.zip is only updated AFTER its container file has been successfully archived.

        # setup
        cloud_connection = self.connect_config.sections[self.project.cloud]
        capture_objectstore = cloud.ObjectStore(
            self.project.capture_objectstore, cloud_connection)
        objectstore_file_name = f'{self.namespace}/capture_state.zip'

        # create capture_state archive file
        zip_file_name = f'{self.publish_folder_name}/capture_state'
        zip_file_name = shutil.make_archive(zip_file_name,
                                            format='zip',
                                            root_dir=self.state_folder_name)

        # upload
        capture_objectstore.put(zip_file_name, objectstore_file_name)
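A side note on the archive step: shutil.make_archive() appends the format's extension and returns the full path of the archive it created, which is why the example reassigns zip_file_name to its return value. A small stand-alone illustration (the paths here are hypothetical):

# minimal illustration of shutil.make_archive's return value (paths are hypothetical)
import shutil

archive_path = shutil.make_archive('publish/capture_state',  # base name without extension
                                   format='zip',
                                   root_dir='state')         # folder whose contents get zipped
print(archive_path)  # e.g. '.../publish/capture_state.zip' -- the extension is added for you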
Example #4
def main():

    # bootstrap configuration settings
    bootstrap = config.Bootstrap()
    bootstrap.debug_flag = False
    bootstrap.load('conf/init.ini')
    bootstrap.load('conf/bootstrap.ini')

    # project
    project_name = 'udp_aws_stage_01_etl'
    project_config = config.Config(f'conf/{project_name}.project',
                                   config.ProjectSection, bootstrap)
    project_config.debug_flag = False
    project_config.dump()
    project_object = project_config.sections['stage_project']

    # make sure core database environment is in place
    udp.setup()

    # get references to stage database and catalog schema
    # udp_stage_database = udp.udp_stage_database
    # udp_catalog_schema = udp.udp_catalog_schema

    # connections
    database_connect_name = f'{project_object.database}'
    cloud_connect_name = f'{project_object.cloud}'

    # SQL Server
    connect_config = config.Config('conf/_connect.ini',
                                   config.ConnectionSection, bootstrap)
    sql_server_connect = connect_config.sections[database_connect_name]

    db = database.SQLServerConnection(sql_server_connect)
    db.debug_flag = True
    conn = db.conn
    # cursor = conn.cursor()

    # wrap the connection and select the udp_stage database
    db_conn = database.Database('sqlserver', conn)
    db_conn.use_database('udp_stage')

    # Todo: These names should come from project file
    # Todo: queue_name should be input_queue_name, output_queue_name
    # archive_object_store_name = 'udp-s3-archive-sandbox'
    # queue_name = 'udp-sqs-archive-sandbox'
    archive_object_store_name = f'{project_object.archive_objectstore}'
    archive_queue_name = f'{project_object.archive_queue}'
    stage_queue_name = f'{project_object.stage_queue}'

    # get connection info
    connect_config = config.Config('conf/_connect.ini',
                                   config.ConnectionSection, bootstrap)
    cloud_connection = connect_config.sections[cloud_connect_name]

    archive_object_store = cloud.ObjectStore(archive_object_store_name,
                                             cloud_connection)
    if project_object.archive_queue:
        archive_queue = cloud.Queue(archive_queue_name, cloud_connection)
    else:
        archive_queue = None

    if project_object.stage_queue:
        stage_queue = cloud.Queue(stage_queue_name, cloud_connection)
    else:
        stage_queue = None

    # main poll loop
    while True:
        print(
            f'{datetime.datetime.today():%Y-%m-%d %H:%M:%S}: Polling for archive updates ...'
        )
        archive_file_found = process_next_file_to_stage(
            db_conn, archive_object_store, stage_queue)

        # clear archive queue messages
        # TODO: Drop archive queue except as a diagnostic monitoring tool?
        if archive_queue:
            response = archive_queue.get()
            notification = cloud.ObjectStoreNotification(response)
            if notification.message_id:
                if not notification.objectstore_name:
                    # unexpected notification - log it, then delete it
                    print(f'Ignoring message: {notification.message}')
                    archive_queue.delete(notification.message_id)
                else:
                    # archive_file_name = stage_archive_file(archive_object_store, notification)
                    archive_queue.delete(notification.message_id)

        # poll if we didn't find an archived file, otherwise keep processing
        if not archive_file_found:
            # poll
            time.sleep(int(project_object.poll_frequency))
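The polling loop above leans on cloud.Queue.get() returning a raw queue response and on cloud.ObjectStoreNotification extracting the message id, source objectstore name, object key, and raw message body. Below is a rough sketch of such a parser for an S3 event delivered through SQS; the parsing is an assumption based on the attributes these examples read (Example #4 reads objectstore_name while Example #6 reads object_store_name), not the project's actual class.

# Illustrative sketch of the notification parsing assumed by the polling loops;
# attribute names mirror what the examples read, while the parsing itself assumes
# the standard S3-to-SQS event payload.
import json


class ObjectStoreNotification:

    def __init__(self, response):
        # response is assumed to be the dict returned by SQS receive_message()
        messages = (response or {}).get('Messages', [])
        self.message_id = messages[0]['MessageId'] if messages else None
        self.message = messages[0]['Body'] if messages else None
        self.objectstore_name = None
        self.object_key = None
        if self.message:
            body = json.loads(self.message)
            for record in body.get('Records', []):
                self.objectstore_name = record['s3']['bucket']['name']
                self.object_key = record['s3']['object']['key']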
Example #5
def main():
	# TODO: Create a project file for all udp cloud based ETL scripts (archive, stage, udp, etc)

	# bootstrap configuration settings
	bootstrap = config.Bootstrap()
	bootstrap.debug_flag = False
	bootstrap.load('conf/init.ini')
	bootstrap.load('conf/bootstrap.ini')

	# project info
	project_name = 'udp_aws_archive_01_etl'
	project_config = config.Config(f'conf/{project_name}.project', config.ProjectSection, bootstrap)
	project_config.debug_flag = True
	project_config.dump()
	project_object = project_config.sections['archive_project']

	# make sure core database environment is in place
	udp.setup()

	# get references to stage database and catalog schema
	# data_stage_database = udp.udp_stage_database
	# data_catalog_schema = udp.udp_catalog_schema

	database_connect_name = f'{project_object.database}'
	cloud_connect_name = f'{project_object.cloud}'
	# print(f'database_connect_name = {database_connect_name}')

	# SQL Server
	connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
	sql_server_connect = connect_config.sections[database_connect_name]

	db = database.SQLServerConnection(sql_server_connect)
	conn = db.conn
	# cursor = conn.cursor()

	db_conn = database.Database('sqlserver', conn)
	db_conn.debug_flag = False
	db_conn.use_database('udp_stage')

	# create udp_staging database if not present; then use
	# db_conn.create_database('udp_staging')
	# db_conn.use_database('udp_staging')
	# db_conn.create_schema('udp_admin')
	# db_conn.create_named_table('udp_admin', 'nst_lookup')

	# Todo: These names should come from project file
	# Todo: queue_name should be input_queue_name, output_queue_name
	# archive_objectstore_name = 'udp-s3-archive-sandbox'
	# queue_name = 'udp-sqs-capture-sandbox'
	archive_objectstore_name = f'{project_object.archive_objectstore}'
	capture_queue_name = f'{project_object.capture_queue}'

	# get connection info
	connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
	cloud_connection = connect_config.sections[cloud_connect_name]

	# main poll loop
	archive_object_store = cloud.ObjectStore(archive_objectstore_name, cloud_connection)
	queue = cloud.Queue(capture_queue_name, cloud_connection)
	while True:
		print(f'{datetime.datetime.today():%Y-%m-%d %H:%M:%S}: Polling for capture updates ...')
		response = queue.get()
		notification = cloud.ObjectStoreNotification(response)
		if notification.message_id:
			archive_capture_file(archive_object_store, cloud_connection, notification, db_conn)
			queue.delete(notification.message_id)

		# poll if we didn't find a captured file, otherwise keep processing
		if not notification.message_id:
			# poll
			time.sleep(int(project_object.poll_frequency))
Example #6
def archive_capture_file(archive_object_store, cloud_connection, notification, db_conn):

	log(f'Notification: {notification}')

	# get name of file (key) that triggered this call
	source_object_key = notification.object_key

	# extract out the file name
	source_file_name = pathlib.Path(source_object_key).name

	# if source_file_name is empty, ignore notification
	if not source_file_name:
		log('Ignoring notification without object key (file name)')
		return

	# if source_file_name is capture_state.zip, ignore it
	# Note: This keeps the latest capture_state.zip file in each capture folder for recovery purposes.
	if source_file_name == 'capture_state.zip':
		log('Ignoring capture_state.zip file notification')
		return

	# TODO: Add activity_log (vs job_log/stat_log) references.

	# make sure work folder exists
	work_folder = 'archive_work'
	if not os.path.exists(work_folder):
		os.mkdir(work_folder)

	# make sure work folder is empty
	for file_name in glob.glob(f'{work_folder}/*'):
		# print(f'Deleting: {file_name}')
		os.remove(file_name)

	# get file, copy to archive then delete from capture
	source_object_store_name = notification.object_store_name
	source_file_name = f'{work_folder}/{pathlib.Path(source_object_key).name}'

	# _connect to source object_store
	# TODO: Cache these object_store connections vs reconnecting each time.
	source_object_store = cloud.ObjectStore(source_object_store_name, cloud_connection)

	# get the posted file
	log(f'Getting {source_file_name} from {source_object_store_name}::{source_object_key}')
	source_object_store.get(source_file_name, source_object_key)

	# move (copy) the posted file to the archive object_store
	log(f'Moving {source_file_name} to archive_object_store::{source_object_key}')
	archive_object_store.put(source_file_name, source_object_key)

	# extract job.log/last_job.log from capture zip and merge these into stat_log table
	archive = zipfile.ZipFile(source_file_name, 'r')
	if 'job.log' in archive.namelist():
		job_log_json = json.loads(archive.read('job.log'))
		for row in job_log_json:
			row['start_time'] = arrow.get(row['start_time']).datetime
			row['end_time'] = arrow.get(row['end_time']).datetime

			# skip capture stats which only have intermediate end_time and run_time values
			# next capture file will include an accurate version of this stat in the last_job.log file
			if row['stat_name'] != 'capture':
				db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

	if 'last_job.log' in archive.namelist():
		last_job_log_json = json.loads(archive.read('last_job.log'))
		for row in last_job_log_json:
			row['start_time'] = arrow.get(row['start_time']).datetime
			row['end_time'] = arrow.get(row['end_time']).datetime
			if row['stat_name'] in ('capture', 'compress', 'upload'):
				db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

	# close archive when done
	archive.close()

	# then delete file from source object_store and local work folder
	log(f'Deleting {source_object_key} from {source_object_store_name}')
	source_object_store.delete(source_object_key)
	pathlib.Path(source_file_name).unlink()

	# TODO: Move tested component code here
	# extract stat.log from capture*.zip
	# update nst_lookup, job_log, table_log

	# register new file in stage_arrival_queue table
	file_name = pathlib.Path(source_object_key).name
	job_id = int(file_name.split('.')[0].rsplit('#', 1)[-1])
	new_file = dict(archive_file_name=file_name, job_id=job_id)
	db_conn.insert_into_table('udp_catalog', 'stage_arrival_queue', **new_file)
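The job id extraction at the end appears to assume archive file names that carry the job id after a '#' and before the file extension; here is a quick stand-alone check of that expression (the file name below is hypothetical):

# quick illustration of the job_id parsing above (the file name is hypothetical)
file_name = 'some_namespace#1042.zip'
job_id = int(file_name.split('.')[0].rsplit('#', 1)[-1])
print(job_id)  # 1042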