def run(self):
    app.logger.info('Starting SIS Advising Notes schema creation job...')
    daily_path = get_s3_sis_sysadm_daily_path()
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
        daily_path = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
            raise BackgroundJobError('No timely SIS advising notes data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s SIS advising notes data')
    app.logger.info('Executing SQL...')
    external_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    self.create_historical_tables(external_schema)
    self.create_internal_schema(external_schema, daily_path)
    app.logger.info('Redshift schema created. Creating RDS indexes...')
    self.create_indexes()
    app.logger.info('RDS indexes created.')
    return 'SIS Advising Notes schema creation job completed.'
def run(self):
    app.logger.info('Starting Canvas schema creation job...')
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
    s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
    s3_canvas_data_url = s3_prefix + canvas_path
    s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template(
        'create_canvas_schema.template.sql',
        loch_s3_canvas_data_path_today=s3_canvas_data_url,
        loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
    )
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        return 'Canvas schema creation job completed.'
    else:
        raise BackgroundJobError('Canvas schema creation job failed.')
def test_file_upload_and_delete(self, app, cleanup_s3):
    """Can upload and delete files in S3."""
    url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
    key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
    url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
    key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-xlii.html'
    assert s3.object_exists(key1) is False
    assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
    assert s3.object_exists(key1) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001') == [key1]
    assert s3.object_exists(key2) is False
    assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
    assert s3.object_exists(key2) is True
    assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002') == [key2]
    client = s3.get_client()
    contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key1)['Body'].read().decode('utf-8')
    assert 'These present-absent with swift motion slide' in contents1
    contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'], Key=key2)['Body'].read().decode('utf-8')
    assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
def verify_attachment_migration(self, source_prefix, dest_prefix):
    s3_attachment_sync_failures = []
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    source_attachments = sorted(s3.get_keys_with_prefix(source_prefix, False, bucket))
    dest_attachments = sorted(s3.get_keys_with_prefix(dest_prefix, False, bucket))
    for source_key in source_attachments:
        file_name = source_key.split('/')[-1]
        sid = file_name.split('_')[0]
        dest_key = f'{dest_prefix}/{sid}/{file_name}'
        if dest_key not in dest_attachments:
            s3_attachment_sync_failures.append(source_key)
    if s3_attachment_sync_failures:
        app.logger.error(
            f'Total number of failed attachment syncs from {source_prefix} is {len(s3_attachment_sync_failures)}:'
            f'\n{s3_attachment_sync_failures}',
        )
    else:
        app.logger.info(f'No attachment sync failures found from {source_prefix}.')
    return s3_attachment_sync_failures
def run(self):
    app.logger.info('Starting Canvas Caliper analytics generation...')
    redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
    redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
    canvas_caliper_explode_table = 'canvas_caliper_explode'

    # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
    # may have been delivered before last midnight UTC.
    s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
    if not s3.get_keys_with_prefix(s3_caliper_daily_path):
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            raise BackgroundJobError('No timely S3 Caliper extracts found')
        else:
            app.logger.info('Falling back to S3 Caliper extracts for yesterday')
    s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

    resolved_ddl_caliper_explode = resolve_sql_template(
        'create_lrs_canvas_explode_table.template.sql',
        canvas_caliper_explode_table=canvas_caliper_explode_table,
        loch_s3_caliper_explode_url=s3_caliper_daily_url,
    )
    redshift.drop_external_schema(redshift_schema_lrs_external)
    if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
        app.logger.info('Caliper explode schema and table successfully created.')
    else:
        raise BackgroundJobError('Caliper explode schema and table creation failed.')

    # Sanity-check event times from the latest Caliper batch against previously transformed event times.
    def datetime_from_query(query):
        response = redshift.fetch(query)
        timestamp = response and response[0] and response[0].get('timestamp')
        if not timestamp:
            raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
        if isinstance(timestamp, str):
            timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
        return timestamp

    earliest_untransformed = datetime_from_query(
        f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
    )
    latest_transformed = datetime_from_query(
        f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
    )
    if not earliest_untransformed or not latest_transformed:
        return False

    timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
    lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
    if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
        raise BackgroundJobError(
            f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
            f'earliest untransformed {earliest_untransformed}',
        )

    resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
    if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
        return 'Caliper analytics tables successfully created.'
    else:
        raise BackgroundJobError('Caliper analytics tables creation failed.')
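# Illustrative note on the sanity check above: LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE
# unpacks to a (lower, upper) pair of bounds, in seconds, on the gap between the latest transformed
# event and the earliest untransformed one. For example, a value of [-3600, 172800] would tolerate
# the new batch starting up to an hour before, or up to two days after, the last transformed event.
# These numbers are examples only, not the deployed configuration.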
def generate_canvas_path(self):
    canvas_path = get_s3_canvas_daily_path()
    if not s3.get_keys_with_prefix(canvas_path):
        canvas_path = get_s3_canvas_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(canvas_path):
            raise BackgroundJobError('No timely Canvas data found, aborting')
        else:
            app.logger.info('Falling back to yesterday\'s Canvas data')
    return canvas_path
def s3_path(self):
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 advisor data found')
        else:
            app.logger.info('Falling back to SIS S3 daily advisor data for yesterday')
    return '/'.join([
        f"s3://{app.config['LOCH_S3_BUCKET']}",
        s3_sis_daily,
        'advisors',
    ])
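# The "use today's extract, else fall back to yesterday's" check in s3_path() above recurs in
# several jobs (Canvas, Caliper, SIS advising notes, SIS manifests). A minimal sketch of a shared
# helper that could factor it out; find_timely_prefix is hypothetical and not part of the codebase.
def find_timely_prefix(path_for_date, describe, bucket=None):
    """Return today's S3 prefix if it holds data, else yesterday's; raise if neither does."""
    kwargs = {'bucket': bucket} if bucket else {}
    prefix = path_for_date()
    if s3.get_keys_with_prefix(prefix, **kwargs):
        return prefix
    prefix = path_for_date(datetime.now() - timedelta(days=1))
    if not s3.get_keys_with_prefix(prefix, **kwargs):
        raise BackgroundJobError(f'No timely {describe} data found, aborting')
    app.logger.info(f"Falling back to yesterday's {describe} data")
    return prefix


# Example (hypothetical): s3_sis_daily = find_timely_prefix(get_s3_sis_sysadm_daily_path, 'SIS S3 advisor')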
def test_import_student_photos(self, app, metadata_db, student_tables, caplog):
    from nessie.jobs.import_student_photos import ImportStudentPhotos
    caplog.set_level(logging.DEBUG)
    with capture_app_logs(app):
        with mock_s3(app):
            result = ImportStudentPhotos().run_wrapped()
            assert result == 'Student photo import completed: 1 succeeded, 9 had no photo available, 0 failed.'
            response = s3.get_keys_with_prefix('cal1card-data/photos')
            assert len(response) == 1
            assert response[0] == 'cal1card-data/photos/61889.jpg'
        success_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'success'",
        )
        assert len(success_rows) == 1
        assert success_rows[0]['sid'] == '11667051'
        failure_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'failure'",
        )
        assert len(failure_rows) == 0
        not_found_rows = rds.fetch(
            f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'photo_not_found'",
        )
        assert len(not_found_rows) == 9
def run(self):
    app.logger.info('Starting migration task on LRS incrementals...')
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.get_pre_transform_statement_count()
    if not self.pre_transform_statement_count:
        raise BackgroundJobError('Failed to retrieve pre-transform statement count.')
    self.source_output_path = app.config['LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH']
    if self.source_output_path.endswith('/'):
        self.source_output_path = self.source_output_path[:-1]
    output_url = 's3://' + self.transient_bucket + '/' + self.source_output_path
    self.verify_post_transform_statement_count(output_url)
    etl_output_keys = s3.get_keys_with_prefix(self.source_output_path, bucket=self.transient_bucket)
    if not etl_output_keys:
        raise BackgroundJobError('Could not retrieve S3 keys from transient bucket.')
    timestamped_destination_path = self.source_output_path + '/' + localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        self.migrate_transient_to_destination(
            etl_output_keys,
            destination_bucket,
            timestamped_destination_path,
        )
    return (
        f'Migrated {self.pre_transform_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={timestamped_destination_path})"
    )
def find_missing_notes_view_attachments(self, dest_prefix):
    # Check for attachments listed in the SIS notes view that are not on S3.
    missing_s3_attachments = []
    sis_attachment_file_names = []
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    sis_notes_view_attachments = sorted(self.get_all_notes_attachments())
    sis_s3_attachments = sorted(s3.get_keys_with_prefix(dest_prefix, False, bucket))
    for dest_key in sis_s3_attachments:
        file_name = dest_key.split('/')[-1]
        sis_attachment_file_names.append(file_name)
    for file_name in sis_notes_view_attachments:
        if file_name not in sis_attachment_file_names:
            missing_s3_attachments.append(file_name)
    if missing_s3_attachments:
        app.logger.error(
            f'Attachments missing on S3 when compared against SIS notes views: {len(missing_s3_attachments)}:'
            f'\n{missing_s3_attachments}',
        )
    else:
        app.logger.info('No attachments missing on S3 when compared against the view.')
    return missing_s3_attachments
def _get_yesterdays_advisor_data():
    s3_sis_daily = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
    if not s3.get_keys_with_prefix(s3_sis_daily):
        raise BackgroundJobError('No timely SIS S3 advisor data found')
    app.logger.info('Falling back to SIS S3 daily advisor data for yesterday')
    return s3_sis_daily
def create_schema(self):
    base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
    external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
    redshift.drop_external_schema(external_schema)
    # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
    notes = []
    topics = []
    for key in s3.get_keys_with_prefix(base_s3_key):
        if key.endswith('.json'):
            notes_json = s3.get_object_json(key)
            if notes_json and 'notes' in notes_json:
                notes += notes_json['notes']
    for note in notes:
        topics += _extract_topics(note)
    if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
            and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
        # Create schema
        app.logger.info('Executing SQL...')
        resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
    else:
        raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
def migrate_oua_sftp_data(self):
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'
    # List keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if not keys:
        raise BackgroundJobError('No OUA files found in SFTP location today. Skipping OUA data refresh')
    for source_key in keys:
        if source_key.endswith('.csv'):
            destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
            if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                raise BackgroundJobError(
                    f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.',
                )
def delete_old_unloads(self):
    old_unloads = s3.get_keys_with_prefix(app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT'], bucket=self.transient_bucket)
    if old_unloads is None:
        raise BackgroundJobError('Error listing old unloads, aborting job.')
    if len(old_unloads) > 0:
        delete_response = s3.delete_objects(old_unloads, bucket=self.transient_bucket)
        if not delete_response:
            raise BackgroundJobError(f'Error deleting old unloads from {self.transient_bucket}, aborting job.')
        else:
            app.logger.info(f'Deleted {len(old_unloads)} old unloads from {self.transient_bucket}.')
def delete_old_incrementals(self):
    old_incrementals = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if old_incrementals is None:
        raise BackgroundJobError('Error listing old incrementals, aborting job.')
    if len(old_incrementals) > 0:
        delete_response = s3.delete_objects(old_incrementals, bucket=self.transient_bucket)
        if not delete_response:
            raise BackgroundJobError(f'Error deleting old incremental files from {self.transient_bucket}, aborting job.')
        else:
            app.logger.info(f'Deleted {len(old_incrementals)} old incremental files from {self.transient_bucket}.')
def run(self, truncate_lrs=True):
    app.logger.info('Starting DMS replication task...')
    task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']
    self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
    self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']
    self.delete_old_incrementals()

    response = dms.start_replication_task(task_id)
    if not response:
        raise BackgroundJobError(f'Failed to start DMS replication task (response={response}).')

    while True:
        response = dms.get_replication_task(task_id)
        if response.get('Status') == 'stopped':
            if response.get('StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                app.logger.info('DMS replication task completed.')
                break
            else:
                raise BackgroundJobError(f'Replication task stopped for unexpected reason: {response}')
        sleep(10)

    lrs_response = lrs.fetch('select count(*) from statements')
    if lrs_response:
        self.lrs_statement_count = lrs_response[0][0]
    else:
        raise BackgroundJobError('Failed to retrieve LRS statements for comparison.')

    transient_keys = s3.get_keys_with_prefix(self.transient_path, bucket=self.transient_bucket)
    if not transient_keys:
        raise BackgroundJobError('Could not retrieve S3 keys from transient bucket.')
    self.verify_and_unload_transient()

    timestamp_path = localize_datetime(datetime.now()).strftime('%Y/%m/%d/%H%M%S')
    destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
    for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
        self.migrate_transient_to_destination(
            transient_keys,
            destination_bucket,
            destination_path,
        )

    if truncate_lrs:
        if lrs.execute('TRUNCATE statements'):
            app.logger.info('Truncated incremental LRS table.')
        else:
            raise BackgroundJobError('Failed to truncate incremental LRS table.')

    return (
        f'Migrated {self.lrs_statement_count} statements to S3 '
        f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
    )
def copy_to_destination(self, source_prefix, dest_prefix):
    bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    objects = s3.get_keys_with_prefix(source_prefix, bucket=bucket)
    for o in objects:
        file_name = normalize_sis_note_attachment_file_name(o)
        sid = file_name.split('_')[0]
        dest_key = f'{dest_prefix}/{sid}/{file_name}'
        app.logger.info(f'Copying attachment to {dest_key}')
        if not s3.copy(bucket, o, bucket, dest_key):
            raise BackgroundJobError(f'Copy from source to destination {dest_key} failed.')
    app.logger.info(f'Copied {len(objects) if objects else 0} attachments to the destination folder.')
def transform(self, s3_source, s3_dest, job_id):
    objects = s3.get_keys_with_prefix(s3_source)
    if len(objects) == 0:
        message = f'Zero objects found in {s3_source}. Quitting.'
        app.logger.info(message)
        return message
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    objects_updated = 0
    new_objects = 0
    objects_in_error = 0
    total_objects = 0
    for o in objects:
        file_name = o.split('/')[-1]  # file_name is like 'daily_2020-08-14.zip'
        app.logger.debug(f'Processing {file_name}')
        piazza_zip_file = s3.get_object_compressed_text_reader(o)
        for subfile in piazza_zip_file.namelist():
            if '.json' not in subfile:
                # Not a JSON file, so we skip it.
                continue
            try:
                json_file = subfile.split('/')[-1]
                course_id = subfile.split('/')[-2]
                file_type = json_file.split('_')[0]
                record = piazza_zip_file.read(subfile)
                with tempfile.TemporaryFile() as result:
                    s3_object = f'{s3_dest}/{file_type}/{course_id}/{json_file}'
                    if s3.object_exists(s3_object):
                        objects_updated += 1
                    else:
                        new_objects += 1
                    result.write(record)
                    s3.upload_file(result, s3_object)
                total_objects += 1
                # Update the job queue every 1000 files.
                if total_objects % 1000 == 0:
                    message = f'{subfile}, {total_objects} so far; ' \
                        + f'{new_objects} new files; ' \
                        + f'{objects_updated} existing files; {objects_in_error} files in error ' \
                        + f'({len(objects)} objects in all)'
                    update_background_job_status(job_id, 'transforming', details=message)
            except Exception as e:
                app.logger.error(f'Could not extract {subfile}')
                app.logger.error(e)
                objects_in_error += 1
    message = f'Transformed {len(objects)} input files; created {new_objects} new objects; ' \
        + f'updated {objects_updated} existing objects. {objects_in_error} objects in error.'
    app.logger.info(message)
    return message
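# For orientation, the Piazza transform above assumes zip entries of the form
# '<course_id>/<file_type>_<n>.json' inside archives named like 'daily_2020-08-14.zip':
# an entry '123456/users_1.json' would land at '{s3_dest}/users/123456/users_1.json'.
# The example entry name is illustrative, not taken from a real extract.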
def create_schema(self):
    app.logger.info('Executing SQL...')
    redshift.drop_external_schema(self.external_schema)
    s3_sis_daily = get_s3_sis_sysadm_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = _get_yesterdays_advisor_data()
    s3_path = '/'.join([f"s3://{app.config['LOCH_S3_BUCKET']}", s3_sis_daily, 'advisors'])
    sql_filename = 'edl_create_advisor_schema.template.sql' if self.feature_flag_edl else 'create_advisor_schema.template.sql'
    resolved_ddl = resolve_sql_template(sql_filename, advisor_data_path=s3_path)
    if not redshift.execute_ddl_script(resolved_ddl):
        raise BackgroundJobError(f'Redshift execute_ddl_script failed on {sql_filename}')
    verify_external_schema(self.external_schema, resolved_ddl)
    app.logger.info('Redshift schema created.')
def delete_objects_with_prefix(prefix, whitelist=()):
    # An immutable default for whitelist avoids the shared-mutable-default pitfall.
    keys_to_delete = []
    existing_keys = s3.get_keys_with_prefix(prefix)
    if existing_keys is None:
        app.logger.error('Error listing keys, aborting job.')
        return False
    for key in existing_keys:
        filename = key.split('/')[-1]
        if filename not in whitelist:
            keys_to_delete.append(key)
    app.logger.info(
        f'Found {len(existing_keys)} key(s) matching prefix "{prefix}", {len(existing_keys) - len(keys_to_delete)} '
        f'key(s) in whitelist, will delete {len(keys_to_delete)} object(s)')
    if not keys_to_delete:
        return True
    if s3.delete_objects(keys_to_delete):
        metadata.delete_canvas_snapshots(keys_to_delete)
        return True
    else:
        return False
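# A hedged usage sketch for delete_objects_with_prefix above; the prefix and whitelist values
# are illustrative, not taken from a real job.
def _prune_requests_example():
    prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_CURRENT_TERM'] + '/requests'
    # Keep the named snapshot; delete everything else under the prefix.
    if not delete_objects_with_prefix(prefix, whitelist=['requests-aaa.gz']):
        raise BackgroundJobError(f'Failed to delete objects with prefix {prefix}.')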
def update_manifests(self):
    app.logger.info('Updating manifests...')
    # Because the SIS S3 copy is managed by a different application running on a different schedule,
    # it may have been made before midnight by Nessie-time.
    s3_sis_daily = get_s3_sis_daily_path()
    if not s3.get_keys_with_prefix(s3_sis_daily):
        s3_sis_daily = get_s3_sis_daily_path(datetime.now() - timedelta(days=1))
        if not s3.get_keys_with_prefix(s3_sis_daily):
            raise BackgroundJobError('No timely SIS S3 data found')
        else:
            app.logger.info('Falling back to SIS S3 daily data for yesterday')
    courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses', full_objects=True)
    courses_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses', full_objects=True)
    enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily + '/enrollments', full_objects=True)
    enrollments_historical = s3.get_keys_with_prefix(app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments', full_objects=True)

    def deduplicate(prefix, s3list):
        filename_map = {}
        for s3obj in s3list:
            m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
            if m:
                filename_map[m[1]] = s3obj
        for term_id in reverse_term_ids(include_future_terms=True):
            filename = f'{prefix}-{term_id}.gz'
            if filename not in filename_map:
                raise BackgroundJobError(f'Expected filename {filename} not found in S3, aborting')
        return list(filename_map.values())

    all_courses = deduplicate('courses', courses_daily + courses_historical)
    all_enrollments = deduplicate('enrollments', enrollments_daily + enrollments_historical)

    def to_manifest_entry(_object):
        return {
            'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
            'meta': {'content_length': _object['Size']},
        }

    def to_manifest(objects):
        return {
            'entries': [to_manifest_entry(o) for o in objects],
        }

    courses_manifest = json.dumps(to_manifest(all_courses))
    enrollments_manifest = json.dumps(to_manifest(all_enrollments))
    courses_result = s3.upload_data(courses_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
    enrollments_result = s3.upload_data(enrollments_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/enrollments.json')
    return courses_result and enrollments_result
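# For reference, the manifest JSON written above has this shape (key names from the code,
# values illustrative):
# {
#     "entries": [
#         {
#             "url": "s3://<LOCH_S3_BUCKET>/<daily-or-historical-path>/courses-2212.gz",
#             "meta": {"content_length": 1048576}
#         }
#     ]
# }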
def run(self):
    app.logger.info('Starting OUA Slate schema creation job...')
    app.logger.info('Executing SQL...')
    s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
    oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
    oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'
    # List keys under the SFTP prefix and migrate any CSV files to the daily OUA location.
    keys = s3.get_keys_with_prefix(oua_slate_sftp_path, full_objects=False, bucket=s3_protected_bucket)
    if not keys:
        return 'No OUA files found in SFTP location today. Skipping OUA data refresh'
    for source_key in keys:
        if source_key.endswith('.csv'):
            destination_key = source_key.replace(oua_slate_sftp_path, oua_daily_dest_path)
            if not s3.copy(s3_protected_bucket, source_key, s3_protected_bucket, destination_key):
                raise BackgroundJobError(
                    f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.',
                )
    external_schema = app.config['REDSHIFT_SCHEMA_OUA']
    redshift.drop_external_schema(external_schema)
    resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
    if redshift.execute_ddl_script(resolved_ddl):
        verify_external_schema(external_schema, resolved_ddl)
        self.create_rds_tables_and_indexes()
        app.logger.info('OUA Slate RDS indexes created.')
        return 'OUA schema creation job completed.'
    else:
        raise BackgroundJobError('OUA Slate schema creation job failed.')
def test_list_keys_matching_prefix(self, app):
    """Lists keys matching prefix."""
    bucket = app.config['LOCH_S3_BUCKET']
    prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_CURRENT_TERM'] + '/requests'
    with mock_s3(app) as m:
        m.Object(bucket, f'{prefix}/requests-aaa.gz').put(Body=b'some data')
        m.Object(bucket, f'{prefix}/requests-bbb.gz').put(Body=b'some more data')
        m.Object(bucket, f'{prefix}/requests-ccc.gz').put(Body=b'yet more data')
        m.Object(bucket, 'another-prefix/requests-ddd.gz').put(Body=b'utterly unrelated data')
        response = s3.get_keys_with_prefix(prefix)
        assert len(response) == 3
        assert f'{prefix}/requests-aaa.gz' in response
        assert f'{prefix}/requests-bbb.gz' in response
        assert f'{prefix}/requests-ccc.gz' in response
def transform(self, s3_source, s3_dest, key=None):
    objects = s3.get_keys_with_prefix(s3_source)
    app.logger.info(f'Will transform {len(objects)} objects from {s3_source} and put results to {s3_dest}.')
    skip_count = 0
    for o in objects:
        file_name = o.split('/')[-1]
        if s3.object_exists(f'{s3_dest}/{file_name}'):
            skip_count += 1
            continue
        canvas_api_data = s3.get_object_json(o).get(key) if key else s3.get_object_json(o)
        with tempfile.TemporaryFile() as result:
            course_id = int(file_name.split('_')[-2])
            for record in canvas_api_data:
                record['course_id'] = course_id
                result.write(json.dumps(record).encode() + b'\n')
            s3.upload_file(result, f'{s3_dest}/{file_name}')
    app.logger.info(f'Transformed {len(objects) - skip_count} new objects; skipped {skip_count} existing objects.')
def cleanup_s3(app):
    yield
    from nessie.externals import s3
    keys = s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'])
    s3.delete_objects(keys)
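# A minimal sketch of a test leaning on the cleanup_s3 fixture above: objects written under
# LOCH_S3_PREFIX_TESTEXT are deleted after the yield, so the test needs no teardown of its own.
# The test body is illustrative, not an existing test.
def test_upload_is_cleaned_up(self, app, cleanup_s3):
    key = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00003/example.txt'
    assert s3.upload_data('example payload', key)
    assert s3.object_exists(key) is True
    # cleanup_s3 removes everything under LOCH_S3_PREFIX_TESTEXT once the test completes.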