def run(self):
    app.logger.info('Starting SIS Advising Notes schema creation job...')
    daily_path = get_s3_sis_sysadm_daily_path()
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
        daily_path = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
            raise BackgroundJobError('No timely SIS advising notes data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s SIS advising notes data')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    self.create_historical_tables(external_schema)
    self.create_internal_schema(external_schema, daily_path)
    app.logger.info('Redshift schema created. Creating RDS indexes...')
    self.create_indexes()
    app.logger.info('RDS indexes created.')
    return 'SIS Advising Notes schema creation job completed.'
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
def test_file_upload_and_delete(self, app, cleanup_s3):
    """Can upload and delete files in S3."""
    url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
    url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
    key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-xlii.html'
    assert s3.object_exists(key1) is False
    assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
    assert s3.object_exists(key1) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001') == [key1]
    assert s3.object_exists(key2) is False
    assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
    assert s3.object_exists(key2) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002') == [key2]
    client = s3.get_client()
    contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key1)['Body'].read().decode('utf-8')
    assert 'These present-absent with swift motion slide' in contents1
    contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key2)['Body'].read().decode('utf-8')
    assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
def verify_attachment_migration(self, source_prefix, dest_prefix):
    s3_attachment_sync_failures = []
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    source_attachments = sorted(s3.get_keys_with_prefix(source_prefix, False, bucket))
    dest_attachments = sorted(s3.get_keys_with_prefix(dest_prefix, False, bucket))
    for source_key in source_attachments:
        file_name = source_key.split('/')[-1]
        sid = file_name.split('_')[0]
        dest_key = f'{dest_prefix}/{sid}/{file_name}'
        if dest_key not in dest_attachments:
            s3_attachment_sync_failures.append(source_key)
    if s3_attachment_sync_failures:
        app.logger.error(
            f'Total number of failed attachment syncs from {source_prefix} is {len(s3_attachment_sync_failures)}:'
            f'\n{s3_attachment_sync_failures}',
        )
    else:
        app.logger.info(f'No attachment sync failures found from {source_prefix}.')
    return s3_attachment_sync_failures
def run(self):
    app.logger.info('Starting Canvas Caliper analytics generation...')
    redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'

    # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
    # may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info('Falling back to S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info('Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError('Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get('timestamp')
        if not timestamp:
            raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
        if isinstance(timestamp, str):
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False

    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )

    resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError('Caliper analytics tables creation failed.')
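# Illustrative note on the sanity check above: LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE
# unpacks to a (lower, upper) pair of bounds, in seconds, on the gap between the latest transformed
# event and the earliest untransformed one. For example, a value of [-3600, 172800] would tolerate
# the new batch starting up to an hour before, or up to two days after, the last transformed event.
# These numbers are examples only, not the deployed configuration.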
def generate_canvas_path(self):
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    return canvas_path
def s3_path(self):
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 advisor data found')
        else:
            app.logger.info('Falling back to SIS S3 daily advisor data for yesterday')
    return '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        s3_sis_daily,
        'advisors',
    ])
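# The "use today's extract, else fall back to yesterday's" check in s3_path() above recurs in
# several jobs (Canvas, Caliper, SIS advising notes, SIS manifests). A minimal sketch of a shared
# helper that could factor it out; find_timely_prefix is hypothetical and not part of the codebase.
def find_timely_prefix(path_for_date, describe, bucket=None):
    """Return today's S3 prefix if it holds data, else yesterday's; raise if neither does."""
    kwargs = {'bucket': bucket} if bucket else {}
    prefix = path_for_date()
    if s3.get_keys_with_prefix(prefix, **kwargs):
        return prefix
    prefix = path_for_date(datetime.now() - timedelta(days=1))
    if not s3.get_keys_with_prefix(prefix, **kwargs):
        raise BackgroundJobError(f'No timely {describe} data found, aborting')
    app.logger.info(f"Falling back to yesterday's {describe} data")
    return prefix


# Example (hypothetical): s3_sis_daily = find_timely_prefix(get_s3_sis_sysadm_daily_path, 'SIS S3 advisor')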
def test_import_student_photos(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_student_photos import ImportStudentPhotos
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportStudentPhotos().run_wrapped()
            assert result == 'Student photo import completed: 1 succeeded, 9 had no photo available, 0 failed.'
            response = s3.get_keys_with_prefix('cal1card-data/photos')
            assert len(response) == 1
            assert response[0] == 'cal1card-data/photos/61889.jpg'
        success_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'success'",
        )
        assert len(success_rows) == 1
        assert success_rows[0]['sid'] == '11667051'
        failure_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'failure'",
        )
        assert len(failure_rows) == 0
        not_found_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'photo_not_found'",
        )
        assert len(not_found_rows) == 9
def run(self):
    app.logger.info('Starting migration task on LRS incrementals...')
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.get_pre_transform_statement_count()
    if not self.pre_transform_statement_count:
        raise BackgroundJobError('Failed to retrieve pre-transform statement count.')
    self.source_output_path = app.config['LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH']
    if self.source_output_path.endswith('/'):
        self.source_output_path = self.source_output_path[:-1]
    output_url = 's3://' + self.transient_bucket + '/' + self.source_output_path
    self.verify_post_transform_statement_count(output_url)
    etl_output_keys = s3.get_keys_with_prefix(self.source_output_path, bucket=self.transient_bucket)
    if not etl_output_keys:
        raise BackgroundJobError('Could not retrieve S3 keys from transient bucket.')
    timestamped_destination_path = self.source_output_path + '/' + localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        self.migrate_transient_to_destination(
            etl_output_keys,
            destination_bucket,
            timestamped_destination_path,
        )
    return (
        f'Migrated {self.pre_transform_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={timestamped_destination_path})"
    )
def find_missing_notes_view_attachments(self, dest_prefix):
    # Check for attachments listed in the SIS notes view that are not on S3.
    missing_s3_attachments = []
    sis_attachment_file_names = []
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    sis_notes_view_attachments = sorted(self.get_all_notes_attachments())
    sis_s3_attachments = sorted(s3.get_keys_with_prefix(dest_prefix, False, bucket))
    for dest_key in sis_s3_attachments:
        file_name = dest_key.split('/')[-1]
        sis_attachment_file_names.append(file_name)
    for file_name in sis_notes_view_attachments:
        if file_name not in sis_attachment_file_names:
            missing_s3_attachments.append(file_name)
    if missing_s3_attachments:
        app.logger.error(
            f'Attachments missing on S3 when compared against SIS notes views: {len(missing_s3_attachments)}:'
            f'\n{missing_s3_attachments}',
        )
    else:
        app.logger.info('No attachments missing on S3 when compared against the view.')
    return missing_s3_attachments
def _get_yesterdays_advisor_data():
    s3_sis_daily = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
    if not s3.get_keys_with_prefix(s3_sis_daily):
        raise BackgroundJobError('No timely SIS S3 advisor data found')
    app.logger.info('Falling back to SIS S3 daily advisor data for yesterday')
    return s3_sis_daily
def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
def migrate_oua_sftp_data(self):
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'
    # List keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if not keys:
        raise BackgroundJobError('No OUA files found in SFTP location today. Skipping OUA data refresh')
    for source_key in keys:
        if source_key.endswith('.csv'):
            destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
            if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                raise BackgroundJobError(
                    f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.',
                )
def delete_old_unloads(self):
    old_unloads = s3.get_keys_with_prefix(app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT'], bucket=self.transient_bucket)
    if old_unloads is None:
        raise BackgroundJobError('Error listing old unloads, aborting job.')
    if len(old_unloads) > 0:
        delete_response = s3.delete_objects(old_unloads, bucket=self.transient_bucket)
        if not delete_response:
            raise BackgroundJobError(f'Error deleting old unloads from {self.transient_bucket}, aborting job.')
        else:
            app.logger.info(f'Deleted {len(old_unloads)} old unloads from {self.transient_bucket}.')
def delete_old_incrementals(self):
    old_incrementals = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if old_incrementals is None:
        raise BackgroundJobError('Error listing old incrementals, aborting job.')
    if len(old_incrementals) > 0:
        delete_response = s3.delete_objects(old_incrementals, bucket=self.transient_bucket)
        if not delete_response:
            raise BackgroundJobError(f'Error deleting old incremental files from {self.transient_bucket}, aborting job.')
        else:
            app.logger.info(f'Deleted {len(old_incrementals)} old incremental files from {self.transient_bucket}.')
def run(self, truncate_lrs=True):
    app.logger.info('Starting DMS replication task...')
    task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']
    self.delete_old_incrementals()

    response = dms.start_replication_task(task_id)
    if not response:
        raise BackgroundJobError(f'Failed to start DMS replication task (response={response}).')

    while True:
        response = dms.get_replication_task(task_id)
        if response.get('Status') == 'stopped':
            if response.get('StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                app.logger.info('DMS replication task completed.')
                break
            else:
                raise BackgroundJobError(f'Replication task stopped for unexpected reason: {response}')
        sleep(10)

    lrs_response = lrs.fetch('select count(*) from statements')
    if lrs_response:
        self.lrs_statement_count = lrs_response[0][0]
    else:
        raise BackgroundJobError('Failed to retrieve LRS statements for comparison.')

    transient_keys = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if not transient_keys:
        raise BackgroundJobError('Could not retrieve S3 keys from transient bucket.')
    self.verify_and_unload_transient()

    timestamp_path = localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        self.migrate_transient_to_destination(
            transient_keys,
            destination_bucket,
            destination_path,
        )

    if truncate_lrs:
        if lrs.execute('TRUNCATE statements'):
            app.logger.info('Truncated incremental LRS table.')
        else:
            raise BackgroundJobError('Failed to truncate incremental LRS table.')

    return (
        f'Migrated {self.lrs_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
    )
def copy_to_destination(self, source_prefix, dest_prefix):
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    objects = s3.get_keys_with_prefix(source_prefix, bucket=bucket)
    for o in objects:
        file_name = normalize_sis_note_attachment_file_name(o)
        sid = file_name.split('_')[0]
        dest_key = f'{dest_prefix}/{sid}/{file_name}'
        app.logger.info(f'Copying attachment to {dest_key}')
        if not s3.copy(bucket, o, bucket, dest_key):
            raise BackgroundJobError(f'Copy from source to destination {dest_key} failed.')
    app.logger.info(f'Copied {len(objects) if objects else 0} attachments to the destination folder.')
def transform(self, s3_source, s3_dest, job_id):
    objects = s3.get_keys_with_prefix(s3_source)
    if len(objects) == 0:
        message = f'Zero objects found in {s3_source}. Quitting.'
        app.logger.info(message)
        return message
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    objects_updated = 0
    new_objects = 0
    objects_in_error = 0
    total_objects = 0
    for o in objects:
        file_name = o.split('/')[-1]  # file_name is like 'daily_2020-08-14.zip'
        app.logger.debug(f'Processing {file_name}')
        piazza_zip_file = s3.get_object_compressed_text_reader(o)
        for subfile in piazza_zip_file.namelist():
            if '.json' not in subfile:
                # Not a JSON file, so we skip it.
                continue
            try:
                json_file = subfile.split('/')[-1]
                course_id = subfile.split('/')[-2]
                file_type = json_file.split('_')[0]
                record = piazza_zip_file.read(subfile)
                with tempfile.TemporaryFile() as result:
                    s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                    if s3.object_exists(s3_object):
                        objects_updated += 1
                    else:
                        new_objects += 1
                    result.write(record)
                    s3.upload_file(result, s3_object)
                total_objects += 1
                # Update the job queue every 1000 files.
                if total_objects % 1000 == 0:
                    message = f'{subfile}, {total_objects} so far; ' \
                        + f'{new_objects} new files; ' \
                        + f'{objects_updated} existing files; {objects_in_error} files in error ' \
                        + f'({len(objects)} objects in all)'
                    update_background_job_status(job_id, 'transforming', details=message)
            except Exception as e:
                app.logger.error(f'Could not extract {subfile}')
                app.logger.error(e)
                objects_in_error += 1
    message = f'Transformed {len(objects)} input files; created {new_objects} new objects; ' \
        + f'updated {objects_updated} existing objects. {objects_in_error} objects in error.'
    app.logger.info(message)
    return message
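# For orientation, the Piazza transform above assumes zip entries of the form
# '<course_id>/<file_type>_<n>.json' inside archives named like 'daily_2020-08-14.zip':
# an entry '123456/users_1.json' would land at '{s3_dest}/users/123456/users_1.json'.
# The example entry name is illustrative, not taken from a real extract.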
def create_schema(self):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(self.external_schema)
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = _get_yesterdays_advisor_data()
    s3_path = '/'.join([f"s3://{app.config['LOCH_S3_BUCKET']}", s3_sis_daily, 'advisors'])
    sql_filename = 'edl_create_advisor_schema.template.sql' if self.feature_flag_edl else 'create_advisor_schema.template.sql'
    resolved_ddl = resolve_sql_template(sql_filename, advisor_data_path=s3_path)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError(f'Redshift execute_ddl_script failed on {sql_filename}')
    verify_external_schema(self.external_schema, resolved_ddl)
    app.logger.info('Redshift schema created.')
def delete_objects_with_prefix(prefix, whitelist=()):
    # An immutable default for whitelist avoids the shared-mutable-default pitfall.
    keys_to_delete = []
    existing_keys = s3.get_keys_with_prefix(prefix)
    if existing_keys is None:
        app.logger.error('Error listing keys, aborting job.')
        return False
    for key in existing_keys:
        filename = key.split('/')[-1]
        if filename not in whitelist:
            keys_to_delete.append(key)
    app.logger.info(
        f'Found {len(existing_keys)} key(s) matching prefix "{prefix}", {len(existing_keys) - len(keys_to_delete)} '
        f'key(s) in whitelist, will delete {len(keys_to_delete)} object(s)')
    if not keys_to_delete:
        return True
    if s3.delete_objects(keys_to_delete):
        metadata.delete_canvas_snapshots(keys_to_delete)
        return True
    else:
        return False
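# A hedged usage sketch for delete_objects_with_prefix above; the prefix and whitelist values
# are illustrative, not taken from a real job.
def _prune_requests_example():
    prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_CURRENT_TERM'] + '/requests'
    # Keep the named snapshot; delete everything else under the prefix.
    if not delete_objects_with_prefix(prefix, whitelist=['requests-aaa.gz']):
        raise BackgroundJobError(f'Failed to delete objects with prefix {prefix}.')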
def update_manifests(self):
    app.logger.info('Updating manifests...')
    # Because the SIS S3 copy is managed by a different application running on a different schedule,
    # it may have been made before midnight by Nessie-time.
    s3_sis_daily = get_s3_sis_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 data found')
        else:
            app.logger.info('Falling back to SIS S3 daily data for yesterday')
    courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses', full_objects=True)
    courses_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses', full_objects=True)
    enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily + '/enrollments', full_objects=True)
    enrollments_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments', full_objects=True)

    def deduplicate(prefix, s3list):
        filename_map = {}
        for s3obj in s3list:
            m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
            if m:
                filename_map[m[1]] = s3obj
        for term_id in reverse_term_ids(include_future_terms=True):
            filename = f'{prefix}-{term_id}.gz'
            if filename not in filename_map:
                raise BackgroundJobError(f'Expected filename {filename} not found in S3, aborting')
        return list(filename_map.values())

    all_courses = deduplicate('courses', courses_daily + courses_historical)
    all_enrollments = deduplicate('enrollments', enrollments_daily + enrollments_historical)

    def to_manifest_entry(_object):
        return {
            'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
            'meta': {'content_length': _object['Size']},
        }

    def to_manifest(objects):
        return {
            'entries': [to_manifest_entry(o) for o in objects],
        }

    courses_manifest = json.dumps(to_manifest(all_courses))
    enrollments_manifest = json.dumps(to_manifest(all_enrollments))
    courses_result = s3.upload_data(courses_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
    enrollments_result = s3.upload_data(enrollments_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/enrollments.json')
    return courses_result and enrollments_result
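# For reference, the manifest JSON written above has this shape (key names from the code,
# values illustrative):
# {
#     "entries": [
#         {
#             "url": "s3://<LOCH_S3_BUCKET>/<daily-or-historical-path>/courses-2212.gz",
#             "meta": {"content_length": 1048576}
#         }
#     ]
# }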
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'
    # List keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if not keys:
        return 'No OUA files found in SFTP location today. Skipping OUA data refresh'
    for source_key in keys:
        if source_key.endswith('.csv'):
            destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
            if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                raise BackgroundJobError(
                    f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.',
                )
    external_schema = app.config['REDSHIFT_SCHEMA_OUA']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        self.create_rds_tables_and_indexes()
        app.logger.info('OUA Slate RDS indexes created.')
        return 'OUA schema creation job completed.'
    else:
        raise BackgroundJobError('OUA Slate schema creation job failed.')
def test_list_keys_matching_prefix(self, app):
    """Lists keys matching prefix."""
    bucket = app.config['LOCH_S3_BUCKET']
    prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_CURRENT_TERM'] + '/requests'
    with mock_s3(app) as m:
        m.Object(bucket, f'{prefix}/requests-aaa.gz').put(Body=b'some data')
        m.Object(bucket, f'{prefix}/requests-bbb.gz').put(Body=b'some more data')
        m.Object(bucket, f'{prefix}/requests-ccc.gz').put(Body=b'yet more data')
        m.Object(bucket, 'another-prefix/requests-ddd.gz').put(Body=b'utterly unrelated data')
        response = s3.get_keys_with_prefix(prefix)
        assert len(response) == 3
        assert f'{prefix}/requests-aaa.gz' in response
        assert f'{prefix}/requests-bbb.gz' in response
        assert f'{prefix}/requests-ccc.gz' in response
def transform(self, s3_source, s3_dest, key=None):
    objects = s3.get_keys_with_prefix(s3_source)
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    skip_count = 0
    for o in objects:
        file_name = o.split('/')[-1]
        if s3.object_exists(f'{s3_dest}/{file_name}'):
            skip_count += 1
            continue
        canvas_api_data = s3.get_object_json(o).get(key) if key else s3.get_object_json(o)
        with tempfile.TemporaryFile() as result:
            course_id = int(file_name.split('_')[-2])
            for record in canvas_api_data:
                record['course_id'] = course_id
                result.write(json.dumps(record).encode() + b'\n')
            s3.upload_file(result, f'{s3_dest}/{file_name}')
    app.logger.info(f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.')
def cleanup_s3(app):
    yield
    from nessie.externals import s3
    keys = s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'])
    s3.delete_objects(keys)
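# A minimal sketch of a test leaning on the cleanup_s3 fixture above: objects written under
# LOCH_S3_PREFIX_TESTEXT are deleted after the yield, so the test needs no teardown of its own.
# The test body is illustrative, not an existing test.
def test_upload_is_cleaned_up(self, app, cleanup_s3):
    key = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00003/example.txt'
    assert s3.upload_data('example payload', key)
    assert s3.object_exists(key) is True
    # cleanup_s3 removes everything under LOCH_S3_PREFIX_TESTEXT once the test completes.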