def test():
    # test data
    connect_name = 'aws_amc_capture_dev'
    object_store_name = 'udp-s3-capture-amc-sandbox'

    # get connection info
    connect_config = config.Config('conf/_connect.ini', config.ConnectionSection)
    cloud_connection = connect_config.sections[connect_name]

    # push test files to capture bucket to give polling loop something to test
    object_store = cloud.ObjectStore(object_store_name, cloud_connection)
    object_store.put('test1.data', 'test/test1.data')
    time.sleep(2)
    object_store.put('test1.data', 'test/test2.data')
    time.sleep(2)
    object_store.put('test1.data', 'test/test3.data')
    time.sleep(2)
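# A minimal sketch of invoking the smoke test above directly; assumes this module is run on its
# own (hypothetical wiring - the project may launch its tests through a different entry point):
if __name__ == '__main__':
    test()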
def upload_to_objectstore(self):
    """Upload publish_folder's <namespace>-<job_id>.zip to objectstore."""
    # don't upload captured data if we're in --notransfer mode
    if self.options.get('notransfer') == '1':
        return

    # setup
    self.stats.start('upload', 'step')
    cloud_connection = self.connect_config.sections[self.project.cloud]
    capture_objectstore = cloud.ObjectStore(self.project.capture_objectstore, cloud_connection)
    objectstore_file_name = f'{self.namespace}/{self.capture_file_name}.zip'

    # upload
    # capture_objectstore.put(f'{self.publish_folder_name}/{self.capture_file_name}.zip', objectstore_file_name)
    capture_objectstore.put(self.zip_file_name, objectstore_file_name)

    # finish
    zip_file_size = pathlib.Path(self.zip_file_name).stat().st_size
    self.stats.stop('upload', 0, zip_file_size)
def save_recovery_state_file(self):
    """Zip the capture state folder to capture_state.zip and upload it to the objectstore."""
    # don't upload captured data if we're in --notransfer mode
    if self.options.get('notransfer') == '1':
        return

    # FUTURE: Save recovery file in capture.zip file and have archive extract and push back to namespace folder.
    # This way capture_state.zip is only updated AFTER its container file has been successfully archived.

    # setup
    cloud_connection = self.connect_config.sections[self.project.cloud]
    capture_objectstore = cloud.ObjectStore(self.project.capture_objectstore, cloud_connection)
    objectstore_file_name = f'{self.namespace}/capture_state.zip'

    # create capture_state archive file
    zip_file_name = f'{self.publish_folder_name}/capture_state'
    zip_file_name = shutil.make_archive(zip_file_name, format='zip', root_dir=self.state_folder_name)

    # upload
    capture_objectstore.put(zip_file_name, objectstore_file_name)
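# A minimal sketch (not part of the original code) of the recovery counterpart to
# save_recovery_state_file(); the restore_recovery_state_file name is hypothetical, and the
# ObjectStore.get(local_file_name, object_key) call mirrors how get() is used elsewhere in this code.
def restore_recovery_state_file(self):
    """Download capture_state.zip from the objectstore and unpack it over the state folder."""
    cloud_connection = self.connect_config.sections[self.project.cloud]
    capture_objectstore = cloud.ObjectStore(self.project.capture_objectstore, cloud_connection)
    objectstore_file_name = f'{self.namespace}/capture_state.zip'

    # download to the publish folder, then unpack into the local state folder
    zip_file_name = f'{self.publish_folder_name}/capture_state.zip'
    capture_objectstore.get(zip_file_name, objectstore_file_name)
    shutil.unpack_archive(zip_file_name, extract_dir=self.state_folder_name, format='zip')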
def main():
    # bootstrap configuration settings
    bootstrap = config.Bootstrap()
    bootstrap.debug_flag = False
    bootstrap.load('conf/init.ini')
    bootstrap.load('conf/bootstrap.ini')

    # project
    project_name = 'udp_aws_stage_01_etl'
    project_config = config.Config(f'conf/{project_name}.project', config.ProjectSection, bootstrap)
    project_config.debug_flag = False
    project_config.dump()
    project_object = project_config.sections['stage_project']

    # make sure core database environment in place
    udp.setup()

    # get references to stage database and catalog schema
    # udp_stage_database = udp.udp_stage_database
    # udp_catalog_schema = udp.udp_catalog_schema

    # connections
    database_connect_name = f'{project_object.database}'
    cloud_connect_name = f'{project_object.cloud}'

    # SQL Server
    connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
    sql_server_connect = connect_config.sections[database_connect_name]
    db = database.SQLServerConnection(sql_server_connect)
    db.debug_flag = True
    conn = db.conn
    # cursor = conn.cursor()

    # create udp_staging database if not present; then use
    db_conn = database.Database('sqlserver', conn)
    db_conn.use_database('udp_stage')

    # TODO: These names should come from project file
    # TODO: queue_name should be input_queue_name, output_queue_name
    # archive_object_store_name = 'udp-s3-archive-sandbox'
    # queue_name = 'udp-sqs-archive-sandbox'
    archive_object_store_name = f'{project_object.archive_objectstore}'
    archive_queue_name = f'{project_object.archive_queue}'
    stage_queue_name = f'{project_object.stage_queue}'

    # get connection info
    connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
    cloud_connection = connect_config.sections[cloud_connect_name]
    archive_object_store = cloud.ObjectStore(archive_object_store_name, cloud_connection)
    if project_object.archive_queue:
        archive_queue = cloud.Queue(archive_queue_name, cloud_connection)
    else:
        archive_queue = None
    if project_object.stage_queue:
        stage_queue = cloud.Queue(stage_queue_name, cloud_connection)
    else:
        stage_queue = None

    # main poll loop
    while True:
        print(f'{datetime.datetime.today():%Y-%m-%d %H:%M:%S}: Polling for archive updates ...')

        # stage the next archived file if one is waiting; truthy when a file was found and staged
        archive_file_found = process_next_file_to_stage(db_conn, archive_object_store, stage_queue)

        # clear archive queue messages
        # TODO: Drop archive queue except as a diagnostic monitoring tool?
        if archive_queue:
            response = archive_queue.get()
            notification = cloud.ObjectStoreNotification(response)
            if notification.message_id:
                if not notification.objectstore_name:
                    # unexpected notification - log it, then delete it
                    print(f'Ignoring message: {notification.message}')
                    archive_queue.delete(notification.message_id)
                else:
                    # archive_file_name = stage_archive_file(archive_object_store, notification)
                    archive_queue.delete(notification.message_id)

        # poll if we didn't find an archived file, otherwise keep processing
        if not archive_file_found:
            time.sleep(int(project_object.poll_frequency))
def main():
    # TODO: Create a project file for all udp cloud based ETL scripts (archive, stage, udp, etc)

    # bootstrap configuration settings
    bootstrap = config.Bootstrap()
    bootstrap.debug_flag = False
    bootstrap.load('conf/init.ini')
    bootstrap.load('conf/bootstrap.ini')

    # project info
    project_name = 'udp_aws_archive_01_etl'
    project_config = config.Config(f'conf/{project_name}.project', config.ProjectSection, bootstrap)
    project_config.debug_flag = True
    project_config.dump()
    project_object = project_config.sections['archive_project']

    # make sure core database environment in place
    udp.setup()

    # get references to stage database and catalog schema
    # data_stage_database = udp.udp_stage_database
    # data_catalog_schema = udp.udp_catalog_schema

    database_connect_name = f'{project_object.database}'
    cloud_connect_name = f'{project_object.cloud}'
    # print(f'database_connect_name = {database_connect_name}')

    # SQL Server
    connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
    sql_server_connect = connect_config.sections[database_connect_name]
    db = database.SQLServerConnection(sql_server_connect)
    conn = db.conn
    # cursor = conn.cursor()

    db_conn = database.Database('sqlserver', conn)
    db_conn.debug_flag = False
    db_conn.use_database('udp_stage')

    # create udp_staging database if not present; then use
    # db_conn.create_database('udp_staging')
    # db_conn.use_database('udp_staging')
    # db_conn.create_schema('udp_admin')
    # db_conn.create_named_table('udp_admin', 'nst_lookup')

    # TODO: These names should come from project file
    # TODO: queue_name should be input_queue_name, output_queue_name
    # archive_objectstore_name = 'udp-s3-archive-sandbox'
    # queue_name = 'udp-sqs-capture-sandbox'
    archive_objectstore_name = f'{project_object.archive_objectstore}'
    capture_queue_name = f'{project_object.capture_queue}'

    # get connection info
    connect_config = config.Config('conf/_connect.ini', config.ConnectionSection, bootstrap)
    cloud_connection = connect_config.sections[cloud_connect_name]

    # main poll loop
    archive_object_store = cloud.ObjectStore(archive_objectstore_name, cloud_connection)
    queue = cloud.Queue(capture_queue_name, cloud_connection)
    while True:
        print(f'{datetime.datetime.today():%Y-%m-%d %H:%M:%S}: Polling for capture updates ...')
        response = queue.get()
        notification = cloud.ObjectStoreNotification(response)
        if notification.message_id:
            archive_capture_file(archive_object_store, cloud_connection, notification, db_conn)
            queue.delete(notification.message_id)

        # poll if we didn't find a captured file, otherwise keep processing
        if not notification.message_id:
            time.sleep(int(project_object.poll_frequency))
def archive_capture_file(archive_object_store, cloud_connection, notification, db_conn):
    log(f'Notification: {notification}')

    # get name of file (key) that triggered this call
    source_object_key = notification.object_key

    # extract out the file name
    source_file_name = pathlib.Path(source_object_key).name

    # if source_file_name is empty, ignore notification
    if not source_file_name:
        log('Ignoring notification without object key (file name)')
        return

    # if source_file_name is capture_state.zip, ignore it
    # Note: This keeps the latest capture_state.zip file in each capture folder for recovery purposes.
    if source_file_name == 'capture_state.zip':
        log('Ignoring capture_state.zip file notification')
        return

    # TODO: Add activity_log (vs job_log/stat_log) references.

    # make sure work folder exists
    work_folder = 'archive_work'
    if not os.path.exists(work_folder):
        os.mkdir(work_folder)

    # make sure work folder is empty
    for file_name in glob.glob(f'{work_folder}/*'):
        # print(f'Deleting: {file_name}')
        os.remove(file_name)

    # get file, copy to archive then delete from capture
    source_object_store_name = notification.object_store_name
    source_file_name = f'{work_folder}/' + pathlib.Path(source_object_key).name

    # connect to source object_store
    # TODO: Cache these object_store connections vs reconnecting each time.
    source_object_store = cloud.ObjectStore(source_object_store_name, cloud_connection)

    # get the posted file
    log(f'Getting {source_file_name} from {source_object_store_name}::{source_object_key}')
    source_object_store.get(source_file_name, source_object_key)

    # move (copy) the posted file to the archive object_store
    log(f'Moving {source_file_name} to archive_object_store::{source_object_key}')
    archive_object_store.put(source_file_name, source_object_key)

    # extract job.log/last_job.log from capture zip and merge these into stat_log table
    archive = zipfile.ZipFile(source_file_name, 'r')
    if 'job.log' in archive.namelist():
        job_log_json = json.loads(archive.read('job.log'))
        for row in job_log_json:
            row['start_time'] = arrow.get(row['start_time']).datetime
            row['end_time'] = arrow.get(row['end_time']).datetime

            # skip capture stats which only have intermediate end_time and run_time values;
            # next capture file will include an accurate version of this stat in last_job.log
            if row['stat_name'] != 'capture':
                db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

    if 'last_job.log' in archive.namelist():
        last_job_log_json = json.loads(archive.read('last_job.log'))
        for row in last_job_log_json:
            row['start_time'] = arrow.get(row['start_time']).datetime
            row['end_time'] = arrow.get(row['end_time']).datetime
            if row['stat_name'] in ('capture', 'compress', 'upload'):
                db_conn.insert_into_table('udp_catalog', 'stat_log', **row)

    # close archive when done
    archive.close()

    # then delete file from source object_store and local work folder
    log(f'Deleting {source_object_key} from {source_object_store_name}')
    source_object_store.delete(source_object_key)
    pathlib.Path(source_file_name).unlink()

    # TODO: Move tested component code here
    # extract stat.log from capture*.zip
    # update nst_lookup, job_log, table_log

    # register new file in stage_arrival_queue table
    file_name = pathlib.Path(source_object_key).name
    job_id = int(file_name.split('.')[0].rsplit('#', 1)[-1])
    new_file = dict(archive_file_name=file_name, job_id=job_id)
    db_conn.insert_into_table('udp_catalog', 'stage_arrival_queue', **new_file)
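# For illustration only: the job_id parse above assumes archived capture file names follow a
# '<namespace>#<job_id>.zip' convention (inferred from the rsplit('#', 1) call, not confirmed
# elsewhere in this code). Example:
#   'amc_heroku_sales#0000000042.zip'.split('.')[0].rsplit('#', 1)[-1] -> '0000000042' -> 42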