Code example #1
def _import_calnet_attributes(advisor_ids):
    calnet_attributes = calnet.client(app).search_csids(advisor_ids)
    calnet_row_count = len(calnet_attributes)
    if len(advisor_ids) != calnet_row_count:
        ldap_csids = [person['csid'] for person in calnet_attributes]
        missing = set(advisor_ids) - set(ldap_csids)
        app.logger.warning(f'Looked for {len(advisor_ids)} advisor CSIDs but only found {calnet_row_count}: missing {missing}')

    advisor_rows = []
    for index, a in enumerate(calnet_attributes):
        sid = a['csid']
        app.logger.info(f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {calnet_row_count})')
        first_name, last_name = calnet.split_sortable_name(a)
        data = [
            a['uid'],
            sid,
            first_name,
            last_name,
            a['title'],
            calnet.get_dept_code(a),
            a['email'],
            a['campus_email'],
        ]
        advisor_rows.append(encoded_tsv_row(data))

    s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
    app.logger.info(f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(advisor_rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')

    app.logger.info('Will copy S3 feeds into Redshift...')
    query = resolve_sql_template_string(
        """
        TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
        COPY {redshift_schema_advisor_internal}.advisor_attributes
            FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
            IAM_ROLE '{redshift_iam_role}'
            DELIMITER '\\t';
        """,
    )
    was_successful = redshift.execute(query)
    app.logger.info('Advisor attributes imported.' if was_successful else 'Error on Redshift copy: aborting job.')
    return was_successful
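
The encoded_tsv_row helper used above (and in most of the examples below) is not shown on this page. A minimal sketch of what such a helper might look like, assuming it simply stringifies each value, tab-joins the fields, and returns UTF-8 bytes; the project's actual implementation may differ:

def encoded_tsv_row(values):
    # Assumed behavior: None becomes an empty field, everything else is stringified,
    # fields are tab-joined, and the row is returned as UTF-8 bytes with no trailing newline.
    return '\t'.join('' if v is None else str(v) for v in values).encode('utf-8')


# encoded_tsv_row(['123', 'Jane', 'Doe']) == b'123\tJane\tDoe'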
Code example #2
    def run(self):
        app.logger.info('Starting CalNet schema creation job...')
        external_schema = app.config['REDSHIFT_SCHEMA_CALNET']
        redshift.drop_external_schema(external_schema)
        sid_snapshot_path = '/'.join([
            f"s3://{app.config['LOCH_S3_BUCKET']}",
            app.config['LOCH_S3_CALNET_DATA_PATH'],
            'sids',
        ])
        resolved_ddl = resolve_sql_template(
            'create_calnet_schema.template.sql',
            sid_snapshot_path=sid_snapshot_path,
        )

        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'CalNet schema creation job completed.'
        else:
            raise BackgroundJobError('CalNet schema creation job failed.')
Code example #3
    def refresh_current_term_index(self):
        today = datetime.now(pytz.utc).astimezone(
            pytz.timezone(app.config['TIMEZONE'])).date()
        current_term = self.get_sis_current_term(today)

        if current_term:
            term_id = current_term['term_id']

            # Check if the advance enrollment period has started for the next two upcoming terms.
            future_term_id = term_id
            for _ in range(2):
                term_id = next_term_id(term_id)
                term = self.get_sis_term_for_id(term_id)
                advance_enrollment_period = 0
                if term_id[3] == '2':
                    advance_enrollment_period = 95
                elif term_id[3] == '5':
                    advance_enrollment_period = 124
                elif term_id[3] == '8':
                    advance_enrollment_period = 140
                if term['term_begins'] - timedelta(
                        days=advance_enrollment_period) < today:
                    future_term_id = term_id

            with rds.transaction() as transaction:
                transaction.execute(
                    f'TRUNCATE {rds_schema}.current_term_index')
                columns = ['current_term_name', 'future_term_name']
                values = tuple([
                    current_term['term_name'],
                    term_name_for_sis_id(future_term_id)
                ])
                if transaction.execute(
                        f'INSERT INTO {rds_schema}.current_term_index ({", ".join(columns)}) VALUES {values} '
                ):
                    transaction.commit()
                else:
                    transaction.rollback()
                    raise BackgroundJobError(
                        'Error refreshing RDS current term index.')
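
The branching on term_id[3] above encodes how many days before a term's start date its advance enrollment period opens. Assuming the usual SIS convention that a trailing '2', '5', or '8' identifies spring, summer, and fall terms, the same lookup could be written table-driven:

# Assumed SIS term-code convention: last digit 2 = spring, 5 = summer, 8 = fall.
ADVANCE_ENROLLMENT_DAYS = {'2': 95, '5': 124, '8': 140}


def advance_enrollment_period(term_id):
    return ADVANCE_ENROLLMENT_DAYS.get(term_id[3], 0)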
Code example #4
    def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
        with tempfile.TemporaryFile() as enrollment_term_file:
            for (sid, sid_term_feed) in enrollment_term_map.items():
                enrollment_term_file.write(
                    encoded_tsv_row([sid, term_id,
                                     json.dumps(sid_term_feed)]) + b'\n')

            drop_staged_enrollment_term(term_id)
            write_file_to_staging('student_enrollment_terms',
                                  enrollment_term_file,
                                  len(enrollment_term_map), term_id)

        with redshift.transaction() as transaction:
            refresh_from_staging('student_enrollment_terms',
                                 term_id,
                                 None,
                                 transaction,
                                 truncate_staging=False)
            if not transaction.commit():
                raise BackgroundJobError(
                    f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).'
                )
Code example #5
    def run(self):
        app.logger.info('Starting SIS Advising Notes schema creation job...')

        daily_path = get_s3_sis_sysadm_daily_path()
        bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
        if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
            daily_path = get_s3_sis_sysadm_daily_path(datetime.now() - timedelta(days=1))
            if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes', bucket=bucket):
                raise BackgroundJobError('No timely SIS advising notes data found, aborting')
            else:
                app.logger.info('Falling back to yesterday\'s SIS advising notes data')

        app.logger.info('Executing SQL...')
        external_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
        redshift.drop_external_schema(external_schema)
        self.create_historical_tables(external_schema)
        self.create_internal_schema(external_schema, daily_path)
        app.logger.info('Redshift schema created. Creating RDS indexes...')
        self.create_indexes()
        app.logger.info('RDS indexes created.')

        return 'SIS Advising Notes schema creation job completed.'
Code example #6
    def generate_feeds(self):
        non_advisee_sids = queries.get_fetched_non_advisees()
        non_advisee_sids = [r['sid'] for r in non_advisee_sids]

        profile_count = self.generate_student_profile_table(non_advisee_sids)
        enrollment_count = self.generate_student_enrollments_table(
            non_advisee_sids)

        if profile_count and enrollment_count:
            resolved_ddl_rds = resolve_sql_template(
                'update_rds_indexes_student_profiles_hist_enr.template.sql')
            if rds.execute(resolved_ddl_rds):
                app.logger.info('RDS indexes updated.')
            else:
                raise BackgroundJobError(
                    'Failed to refresh RDS copies of non-advisee data.')
        else:
            app.logger.warning(
                'No non-advisee data loaded into Redshift; will not refresh RDS copies.'
            )

        return f'Generated {profile_count} non-advisee profiles, {enrollment_count} enrollments.'
Code example #7
    def create_lrs_caliper_relationalize_job(self):
        job_name = app.config['LRS_CANVAS_GLUE_JOB_NAME']
        glue_role = app.config['LRS_GLUE_SERVICE_ROLE']
        job_command = {
            'Name': 'glueetl',
            'ScriptLocation': 's3://{}/{}'.format(
                app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET'],
                app.config['LRS_CANVAS_GLUE_JOB_SCRIPT_PATH'],
            ),
        }
        default_arguments = {
            '--LRS_INCREMENTAL_TRANSIENT_BUCKET': app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET'],
            '--LRS_CANVAS_CALIPER_SCHEMA_PATH': app.config['LRS_CANVAS_CALIPER_SCHEMA_PATH'],
            '--LRS_CANVAS_CALIPER_INPUT_DATA_PATH': app.config['LRS_CANVAS_CALIPER_INPUT_DATA_PATH'],
            '--LRS_GLUE_TEMP_DIR': app.config['LRS_GLUE_TEMP_DIR'],
            '--LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH': app.config['LRS_CANVAS_CALIPER_EXPLODE_OUTPUT_PATH'],
            '--job-bookmark-option': 'job-bookmark-disable',
        }

        response = glue.create_glue_job(job_name, glue_role, job_command, default_arguments)
        if not response:
            raise BackgroundJobError('Failed to create Glue job.')
        elif response['Name']:
            app.logger.info(f'Response: {response}')
            app.logger.info(f"Glue job created successfully with job name: {response['Name']}")
            return True
        else:
            return False
Code example #8
    def unload_to_etl(self, schema, bucket, timestamped=True):
        s3_url = 's3://' + bucket + '/' + app.config['LRS_CANVAS_INCREMENTAL_ETL_PATH_REDSHIFT']
        if timestamped:
            s3_url += '/' + localize_datetime(datetime.now()).strftime('%Y/%m/%d/statements_%Y%m%d_%H%M%S_')
        else:
            s3_url += '/statements'

        redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
        if not redshift.execute(
            f"""
                UNLOAD ('SELECT statement FROM {schema}.statements')
                TO '{s3_url}'
                IAM_ROLE '{redshift_iam_role}'
                ENCRYPTED
                DELIMITER AS '  '
                NULL AS ''
                ALLOWOVERWRITE
                PARALLEL OFF
                MAXFILESIZE 1 gb
            """,
        ):
            raise BackgroundJobError(f'Error executing Redshift unload to {s3_url}.')
        self.verify_unloaded_count(s3_url)
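
A hypothetical invocation with placeholder schema and bucket names. Because the UNLOAD runs with PARALLEL OFF and MAXFILESIZE 1 gb, Redshift writes the statements serially and rolls over to a new output file at roughly one-gigabyte boundaries:

# 'job' stands for an instance of the class defining unload_to_etl; names are placeholders.
job.unload_to_etl(schema='lrs_statements', bucket='example-etl-bucket', timestamped=False)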
Code example #9
    def run(self, datestamp=None):
        s3_attachment_sync_failures = []
        missing_s3_attachments = []
        app.logger.info('Starting SIS Advising Note attachments validation job...')

        dest_prefix = app.config['LOCH_S3_ADVISING_NOTE_ATTACHMENT_DEST_PATH']

        for source_prefix in self.source_paths(datestamp):
            app.logger.info(f'Will validate files from {source_prefix}.')
            s3_attachment_sync_failures.extend(self.verify_attachment_migration(source_prefix, dest_prefix))

        missing_s3_attachments = self.find_missing_notes_view_attachments(dest_prefix)

        if s3_attachment_sync_failures or missing_s3_attachments:
            verification_results = {
                'attachment_sync_failure_count': len(s3_attachment_sync_failures),
                'missing_s3_attachments_count': len(missing_s3_attachments),
                'attachment_sync_failures': s3_attachment_sync_failures,
                'missing_s3_attachments': missing_s3_attachments,
            }
            raise BackgroundJobError(f'Attachments verification found missing attachments or sync failures: {verification_results}.')
        else:
            return 'Note attachment verification completed successfully. No missing attachments or sync failures found.'
Code example #10
    def run(self):
        app.logger.info('Starting ASC profile generation job...')
        asc_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER by sid, UPPER(team_name)',
            schema=asc_schema_identifier,
        )

        profile_rows = []
        sids_for_inactive_deletion = []

        for sid, rows_for_student in groupby(asc_rows,
                                             operator.itemgetter('sid')):
            rows_for_student = list(rows_for_student)
            # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
            # a team membership, a bit of brutal simplification is needed. Students who are active in at least
            # one sport have inactive team memberships dropped.
            any_active_athletics = reduce(
                operator.or_, [r['active'] for r in rows_for_student], False)
            if any_active_athletics:
                rows_for_student = [r for r in rows_for_student if r['active']]
                sids_for_inactive_deletion.append(sid)
            athletics_profile = {
                'athletics': [],
                'inIntensiveCohort': rows_for_student[0]['intensive'],
                'isActiveAsc': rows_for_student[0]['active'],
                'statusAsc': rows_for_student[0]['status_asc'],
            }
            for row in rows_for_student:
                athletics_profile['athletics'].append({
                    'groupCode': row['group_code'],
                    'groupName': row['group_name'],
                    'name': row['group_name'],
                    'teamCode': row['team_code'],
                    'teamName': row['team_name'],
                })

            profile_rows.append(
                encoded_tsv_row([sid, json.dumps(athletics_profile)]))

        s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(profile_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_asc}.student_profiles;
            COPY {redshift_schema_asc}.student_profiles
                FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(asc_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        if sids_for_inactive_deletion:
            redshift.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion, ),
            )
            rds.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion, ),
            )

        return 'ASC profile generation complete.'
Code example #11
 def index_appointment_advisors(self):
     resolved_ddl = resolve_sql_template('index_sis_appointment_advisors.template.sql')
     if rds.execute(resolved_ddl):
         app.logger.info('Indexed appointment advisors.')
     else:
         raise BackgroundJobError('Failed to index appointment advisors.')
Code example #12
 def _rollback():
     transaction.rollback()
     raise BackgroundJobError(
         f'Failed to populate table {student_schema()}.{table} from staging schema.'
     )
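
This _rollback is a nested helper: transaction, table, and student_schema() are closed over from the enclosing function. A sketch of the kind of context it assumes, matching the refresh_from_staging(table, term_id, sids, transaction, ...) calls seen in the other examples on this page; the body shown here is hypothetical:

def refresh_from_staging(table, term_id, sids, transaction, truncate_staging=True):
    def _rollback():
        transaction.rollback()
        raise BackgroundJobError(
            f'Failed to populate table {student_schema()}.{table} from staging schema.',
        )

    # Hypothetical step: move staged rows into the destination table, rolling back on failure.
    if not transaction.execute(f'INSERT INTO {student_schema()}.{table} (SELECT * FROM {student_schema()}_staging.{table})'):
        _rollback()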
Code example #13
File: create_coe_schema.py (project: dkawase/nessie)
    def run(self):
        app.logger.info('Starting COE schema creation job...')
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
        # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
        # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
        # external schema isn't needed.
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info('COE external schema created.')
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('COE external schema creation failed.')
        coe_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER by sid',
            schema=internal_schema_identifier,
        )

        profile_rows = []
        index = 1
        for sid, rows_for_student in groupby(coe_rows,
                                             operator.itemgetter('sid')):
            app.logger.info(
                f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})'
            )
            index += 1
            row_for_student = list(rows_for_student)[0]
            coe_profile = {
                'advisorUid': row_for_student.get('advisor_ldap_uid'),
                'gender': row_for_student.get('gender'),
                'ethnicity': row_for_student.get('ethnicity'),
                'minority': row_for_student.get('minority'),
                'didPrep': row_for_student.get('did_prep'),
                'prepEligible': row_for_student.get('prep_eligible'),
                'didTprep': row_for_student.get('did_tprep'),
                'tprepEligible': row_for_student.get('tprep_eligible'),
                'sat1read': row_for_student.get('sat1read'),
                'sat1math': row_for_student.get('sat1math'),
                'sat2math': row_for_student.get('sat2math'),
                'inMet': row_for_student.get('in_met'),
                'gradTerm': row_for_student.get('grad_term'),
                'gradYear': row_for_student.get('grad_year'),
                'probation': row_for_student.get('probation'),
                'status': row_for_student.get('status'),
            }
            profile_rows.append(encoded_tsv_row([sid,
                                                 json.dumps(coe_profile)]))

        s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(profile_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            COPY {redshift_schema_coe}.student_profiles
                FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(coe_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        return 'COE internal schema created.'
Code example #14
    def update_manifests(self):
        app.logger.info('Updating manifests...')

        # Because the SIS S3 copy is managed by a different application running on a different schedule,
        # it may have been made before midnight by Nessie-time.
        s3_sis_daily = get_s3_sis_daily_path()
        if not s3.get_keys_with_prefix(s3_sis_daily):
            s3_sis_daily = get_s3_sis_daily_path(datetime.now() -
                                                 timedelta(days=1))
            if not s3.get_keys_with_prefix(s3_sis_daily):
                raise BackgroundJobError('No timely SIS S3 data found')
            else:
                app.logger.info(
                    'Falling back to SIS S3 daily data for yesterday')

        courses_daily = s3.get_keys_with_prefix(s3_sis_daily + '/courses',
                                                full_objects=True)
        courses_historical = s3.get_keys_with_prefix(
            app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/courses',
            full_objects=True)
        enrollments_daily = s3.get_keys_with_prefix(s3_sis_daily +
                                                    '/enrollments',
                                                    full_objects=True)
        enrollments_historical = s3.get_keys_with_prefix(
            app.config['LOCH_S3_SIS_DATA_PATH'] + '/historical/enrollments',
            full_objects=True)

        def deduplicate(prefix, s3list):
            filename_map = {}
            for s3obj in s3list:
                m = re.match(r'.+/(.+\.gz)', s3obj['Key'])
                if m:
                    filename_map[m[1]] = s3obj
            for term_id in reverse_term_ids(include_future_terms=True):
                filename = f'{prefix}-{term_id}.gz'
                if filename not in filename_map:
                    raise BackgroundJobError(
                        f'Expected filename {filename} not found in S3, aborting'
                    )
            return list(filename_map.values())

        all_courses = deduplicate('courses',
                                  courses_daily + courses_historical)
        all_enrollments = deduplicate(
            'enrollments', enrollments_daily + enrollments_historical)

        def to_manifest_entry(_object):
            return {
                'url': f"s3://{app.config['LOCH_S3_BUCKET']}/{_object['Key']}",
                'meta': {
                    'content_length': _object['Size']
                },
            }

        def to_manifest(objects):
            return {
                'entries': [to_manifest_entry(o) for o in objects],
            }

        courses_manifest = json.dumps(to_manifest(all_courses))
        enrollments_manifest = json.dumps(to_manifest(all_enrollments))
        courses_result = s3.upload_data(
            courses_manifest,
            app.config['LOCH_S3_SIS_DATA_PATH'] + '/manifests/courses.json')
        enrollments_result = s3.upload_data(
            enrollments_manifest, app.config['LOCH_S3_SIS_DATA_PATH'] +
            '/manifests/enrollments.json')
        return courses_result and enrollments_result
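
For reference, to_manifest emits the JSON manifest format that Redshift's COPY ... MANIFEST option expects, one entry per S3 object. With hypothetical bucket and key names, the uploaded document looks roughly like this:

import json

# Bucket and key names below are illustrative placeholders.
manifest = {
    'entries': [
        {
            'url': 's3://example-loch-bucket/sis-data/daily/courses/courses-2198.gz',
            'meta': {'content_length': 1048576},
        },
    ],
}
print(json.dumps(manifest))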
Code example #15
    def run(self, load_mode='new'):
        all_sids = [row['sid'] for row in get_all_student_ids()]
        previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}

        if load_mode == 'new':
            sids = list(set(all_sids).difference(previous_backfills))
        elif load_mode == 'batch':
            new_sids = list(set(all_sids).difference(previous_backfills))
            limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
            if limit > 0:
                oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
                sids = new_sids + oldest_backfills
            else:
                sids = new_sids
        elif load_mode == 'all':
            sids = all_sids

        app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')

        rows = {
            'term_gpas': [],
            'last_registrations': [],
            'api_demographics': [],
        }
        successes, failures = self.get_registration_data_per_sids(rows, sids)
        if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
            raise BackgroundJobError('Failed to import registration histories: aborting job.')

        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.student_{table_key}
                    WHERE sid IN
                    (SELECT sid FROM {student_schema}_staging.student_{table_key});
                INSERT INTO {student_schema}.student_{table_key}
                    (SELECT * FROM {student_schema}_staging.student_{table_key});
                TRUNCATE TABLE {student_schema}_staging.student_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Failed to refresh RDS indexes.')

        update_registration_import_status(successes, failures)

        return (
            f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
        )
Code example #16
File: import_asc_athletes.py (project: dkawase/nessie)
    def run(self):
        app.logger.info(
            'ASC import: Fetch team and student athlete data from ASC API')
        api_results = get_asc_feed()
        if 'error' in api_results:
            raise BackgroundJobError(
                'ASC import: Error from external API: {}'.format(
                    api_results['error']))
        elif not api_results:
            raise BackgroundJobError('ASC import: API returned zero students')
        sync_date = api_results[0]['SyncDate']
        if sync_date != api_results[-1]['SyncDate']:
            raise BackgroundJobError(
                f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}'
            )
        rows = []
        for r in api_results:
            if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
                asc_code = r['SportCodeCore']
                if asc_code in SPORT_TRANSLATIONS:
                    group_code = r['SportCode']
                    data = [
                        r['SID'],
                        str(r.get('ActiveYN', 'No') == 'Yes'),
                        str(r.get('IntensiveYN', 'No') == 'Yes'),
                        r.get('SportStatus', ''),
                        group_code,
                        _unambiguous_group_name(r['Sport'], group_code),
                        SPORT_TRANSLATIONS[asc_code],
                        r['SportCore'],
                    ]
                    rows.append(encoded_tsv_row(data))
                else:
                    sid = r['SID']
                    app.logger.error(
                        f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}'
                    )

        s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Copy data in S3 file to Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_asc}.students;
            COPY {redshift_schema_asc}.students
                FROM 's3://{s3_bucket}/{s3_key}'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """,
            s3_bucket=app.config['LOCH_S3_BUCKET'],
            s3_key=s3_key,
        )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        status = {
            'this_sync_date': sync_date,
            'api_results_count': len(api_results),
        }
        app.logger.info(
            f'ASC import: Successfully completed import job: {str(status)}')
        return status
Code example #17
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            # For tables other than requests, sync all snapshots.
            # For the requests table, sync snapshots that are partial or later than the configured cutoff date.
            def after_cutoff_date(url):
                match = re.search(r'requests/(20\d{6})', url)
                return match is not None and match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE']

            return snapshot['table'] != 'requests' or snapshot['partial'] is True or after_cutoff_date(snapshot['url'])

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )
            if snapshot['table'] == 'requests':
                key_components = [
                    berkeley.s3_canvas_data_path_current_term(),
                    snapshot['table'], snapshot['filename']
                ]
            else:
                key_components = [
                    get_s3_canvas_daily_path(), snapshot['table'],
                    snapshot['filename']
                ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error('Failed to dispatch S3 sync of snapshot ' +
                                 snapshot['filename'])
                metadata.update_canvas_sync_status(
                    job_id,
                    key,
                    'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info('Dispatched S3 sync of snapshot ' +
                                snapshot['filename'])
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [
                s['filename'] for s in snapshots_to_sync
            ]
            requests_prefix = berkeley.s3_canvas_data_path_current_term(
            ) + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
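
The after_cutoff_date check relies on the requests snapshot URLs embedding a YYYYMMDD datestamp, so the comparison against the configured cutoff is a plain lexicographic string comparison. A small demonstration with a hypothetical URL and cutoff value:

import re

# Hypothetical snapshot URL and cutoff; real values come from the Canvas Data API and app config.
url = 'https://portal.example.com/api/account/self/file/requests/20190801_part_00.gz'
cutoff = '20190101'

match = re.search(r'requests/(20\d{6})', url)
assert match is not None and match[1] >= cutoff  # '20190801' >= '20190101'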
Code example #18
    def run(self, load_mode='batch'):
        new_sids = [
            row['sid']
            for row in get_non_advisees_without_registration_imports()
        ]

        # The size of the non-advisee population means a one-shot load of all these slow feeds may not
        # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
        # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
        #
        # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
        # likely to be triggered.)
        if load_mode == 'new':
            sids = new_sids
        elif load_mode == 'batch':
            max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
            if max_batch >= len(new_sids):
                sids = new_sids
            else:
                sids = new_sids[:max_batch]

        app.logger.info(
            f'Starting import of historical registration data for {len(sids)} students...'
        )
        redshift.execute('VACUUM; ANALYZE;')

        rows = {
            'term_gpas': [],
            'last_registrations': [],
        }
        successes, failures = self.get_registration_data_per_sids(
            rows, sids, include_demographics=False)
        for key in rows.keys():
            if len(rows[key]) > 0:
                s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
                app.logger.info(
                    f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.'
                )
                if not s3.upload_tsv_rows(rows[key], s3_key):
                    raise BackgroundJobError(
                        f'Error during S3 upload: {s3_key}. Aborting job.')

                staging_table = f'{student_schema()}_staging.hist_enr_{key}'
                if not redshift.execute(f'TRUNCATE {staging_table}'):
                    raise BackgroundJobError(
                        'Error truncating old staging rows: aborting job.')

                app.logger.info(
                    f'Populate {staging_table} (Redshift table) with s3:{s3_key}'
                )
                if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                    raise BackgroundJobError(
                        'Error on Redshift copy: aborting job.')

                app.logger.info(
                    f'Insert student data into {student_schema()}.hist_enr_{key}'
                )
                staging_to_destination_query = resolve_sql_template_string(
                    """
                    DELETE FROM {student_schema}.hist_enr_{table_key}
                        WHERE sid IN
                        (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                    INSERT INTO {student_schema}.hist_enr_{table_key}
                        (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                    TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                    """,
                    table_key=key,
                    student_schema=student_schema(),
                )
                if not redshift.execute(staging_to_destination_query):
                    raise BackgroundJobError(
                        'Error inserting staging entries into destination: aborting job.'
                    )

        redshift.execute('VACUUM; ANALYZE;')
        return (
            f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'
        )
Code example #19
    def run(self, csids=None):
        if not csids:
            all_sids = get_all_student_ids()
            if all_sids:
                csids = [row['sid'] for row in all_sids]
        app.logger.info(
            f'Starting SIS degree progress API import job for {len(csids)} students...'
        )

        rows = []
        success_count = 0
        no_information_count = 0
        failure_count = 0
        index = 1

        # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
        # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
        # alternative way to filter out non-UGRD students?
        for csid in csids:
            app.logger.info(
                f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})'
            )
            feed = sis_degree_progress_api.parsed_degree_progress(csid)
            if feed:
                success_count += 1
                rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
            elif feed == {}:
                app.logger.info(
                    f'No degree progress information found for SID {csid}.')
                no_information_count += 1
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS get_degree_progress failed for SID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'
        ):
            raise BackgroundJobError(
                'Error truncating old staging rows: aborting job.')

        query = resolve_sql_template_string(
            """
            CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
                DATABASE '{redshift_schema_student}_staging_ext_tmp'
                IAM_ROLE '{redshift_iam_role}'
                CREATE EXTERNAL DATABASE IF NOT EXISTS;
            CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
                sid VARCHAR,
                feed VARCHAR(MAX)
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY '\\t'
            STORED AS TEXTFILE
            LOCATION '{loch_s3_sis_api_data_path}/degree_progress';

            DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
            DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
            DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;

            DELETE FROM {redshift_schema_student}.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
            """, )
        if not redshift.execute_ddl_script(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        redshift.execute('VACUUM; ANALYZE;')

        return (
            f'SIS degree progress API import job completed: {success_count} succeeded, '
            f'{no_information_count} returned no information, {failure_count} failed.'
        )
Code example #20
def refresh_all_from_staging(tables):
    with redshift.transaction() as transaction:
        for table in tables:
            refresh_from_staging(table, None, None, transaction)
        if not transaction.commit():
            raise BackgroundJobError(f'Final transaction commit failed for {redshift_schema()}.')
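
A hypothetical call with placeholder table names; all per-table refreshes share one Redshift transaction, so a failed final commit raises BackgroundJobError and the whole batch rolls back together:

# Table names are illustrative placeholders.
refresh_all_from_staging(['student_profiles', 'student_academic_status', 'student_enrollment_terms'])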
Code example #21
    def run(self):
        # Retrieve latest schema definitions from Canvas data API
        response = canvas_data.get_canvas_data_schema()
        external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
        redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
        canvas_schema = []

        # Parse and isolate table and column details
        for key, value in response['schema'].items():
            for column in value['columns']:
                # Not every column has description and length.
                description = column.get('description')
                length = column.get('length')

                canvas_schema.append([
                    value['tableName'],
                    column['name'],
                    column['type'],
                    description,
                    length,
                ])
        # Create a dataframe
        schema_df = pd.DataFrame(canvas_schema)
        schema_df.columns = [
            'table_name',
            'column_name',
            'column_type',
            'column_description',
            'column_length',
        ]

        # The schema definitions received from Canvas are Redshift compliant. We update
        # certain column types to match Glue and Spectrum data types.
        schema_df['glue_type'] = schema_df['column_type'].replace({
            'enum': 'varchar',
            'guid': 'varchar',
            'text': 'varchar(max)',
            'date': 'timestamp',
            'datetime': 'timestamp',
        })

        schema_df['transformed_column_name'] = schema_df['column_name'].replace({
            'default': '"default"',
            'percent': '"percent"',
        })
        # Create Hive compliant storage descriptors
        canvas_external_catalog_ddl = self.generate_external_catalog(
            external_schema, schema_df)

        # Clean up and recreate refreshed tables on Glue using Spectrum
        redshift.drop_external_schema(external_schema)
        redshift.create_external_schema(external_schema, redshift_iam_role)

        if redshift.execute_ddl_script(canvas_external_catalog_ddl):
            app.logger.info('Canvas schema creation job completed.')
        else:
            app.logger.error('Canvas schema creation job failed.')
            raise BackgroundJobError('Canvas schema creation job failed.')

        self.verify_external_data_catalog()
        return 'Canvas external schema created and verified.'
Code example #22
    def run(self):
        app.logger.info('Start generating canvas caliper analytics')
        redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
        redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
        canvas_caliper_explode_table = 'canvas_caliper_explode'

        # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
        # may have been delivered before last midnight UTC.
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(
                datetime.now() - timedelta(days=1))
            if not s3.get_keys_with_prefix(s3_caliper_daily_path):
                raise BackgroundJobError('No timely S3 Caliper extracts found')
            else:
                app.logger.info(
                    'Falling back to S3 Caliper extracts for yesterday')
        s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

        resolved_ddl_caliper_explode = resolve_sql_template(
            'create_lrs_canvas_explode_table.template.sql',
            canvas_caliper_explode_table=canvas_caliper_explode_table,
            loch_s3_caliper_explode_url=s3_caliper_daily_url,
        )
        redshift.drop_external_schema(redshift_schema_lrs_external)
        if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
            app.logger.info(
                'Caliper explode schema and table successfully created.')
        else:
            raise BackgroundJobError(
                'Caliper explode schema and table creation failed.')

        # Sanity-check event times from the latest Caliper batch against previously transformed event times.
        def datetime_from_query(query):
            response = redshift.fetch(query)
            timestamp = response and response[0] and response[0].get(
                'timestamp')
            if not timestamp:
                raise BackgroundJobError(
                    f'Timestamp query failed to return data for comparison; aborting job: {query}'
                )
            if isinstance(timestamp, str):
                timestamp = datetime.strptime(timestamp,
                                              '%Y-%m-%dT%H:%M:%S.%fZ')
            return timestamp

        earliest_untransformed = datetime_from_query(
            f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
        )
        latest_transformed = datetime_from_query(
            f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
        )
        if not earliest_untransformed or not latest_transformed:
            return False
        timestamp_diff = (earliest_untransformed -
                          latest_transformed).total_seconds()
        lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
        if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
            raise BackgroundJobError(
                f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
                f'earliest untransformed {earliest_untransformed}', )

        resolved_ddl_caliper_analytics = resolve_sql_template(
            'generate_caliper_analytics.template.sql')
        if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
            return 'Caliper analytics tables successfully created.'
        else:
            raise BackgroundJobError(
                'Caliper analytics tables creation failed.')
Code example #23
 def create_historical_tables(self, external_schema):
     resolved_ddl = resolve_sql_template('create_sis_advising_notes_historical_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS Advising Notes schema creation job failed to load historical data.')
Code example #24
    def run(self, term_id=None):
        if not term_id:
            term_id = current_term_id()
        canvas_course_ids = [
            row['canvas_course_id']
            for row in get_enrolled_canvas_sites_for_term(term_id)
        ]

        app.logger.info(
            f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...'
        )

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for course_id in canvas_course_ids:
            app.logger.info(
                f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})'
            )
            feed = canvas_api.get_course_enrollments(course_id)
            if feed:
                success_count += 1
                for enrollment in feed:
                    user_id = enrollment.get('user_id')
                    last_activity_at = enrollment.get('last_activity_at') or ''
                    rows.append(
                        encoded_tsv_row([
                            course_id, user_id, term_id, last_activity_at,
                            json.dumps(enrollment)
                        ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'Canvas enrollments API import failed for course id {course_id}.'
                )
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
                DATABASE '{redshift_schema_student}_staging_ext_tmp'
                IAM_ROLE '{redshift_iam_role}'
                CREATE EXTERNAL DATABASE IF NOT EXISTS;
            CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
                course_id VARCHAR,
                user_id VARCHAR,
                term_id VARCHAR,
                last_activity_at TIMESTAMP,
                feed VARCHAR
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY '\\t'
            STORED AS TEXTFILE
            LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';

            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
            INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
            DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
            DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;

            DELETE FROM {redshift_schema_student}.canvas_api_enrollments
                WHERE term_id = '{term_id}'
                AND course_id IN
                (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            INSERT INTO {redshift_schema_student}.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
                WHERE term_id = '{term_id}';
            """,
            term_id=term_id,
        )
        if not redshift.execute_ddl_script(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        return (
            f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
            f'{failure_count} failed.')
Code example #25
 def create_indexes(self):
     resolved_ddl = resolve_sql_template('index_e_i_advising_notes.template.sql')
     if rds.execute(resolved_ddl):
         app.logger.info('Created E&I Advising Notes RDS indexes.')
     else:
         raise BackgroundJobError('E&I Advising Notes schema creation job failed to create indexes.')
Code example #26
    def run(self, truncate_lrs=True):
        app.logger.info('Starting DMS replication task...')
        task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']

        self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
        self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']

        self.delete_old_incrementals()

        response = dms.start_replication_task(task_id)
        if not response:
            raise BackgroundJobError(
                f'Failed to start DMS replication task (response={response}).')

        while True:
            response = dms.get_replication_task(task_id)
            if response.get('Status') == 'stopped':
                if response.get(
                        'StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                    app.logger.info('DMS replication task completed')
                    break
                else:
                    raise BackgroundJobError(
                        f'Replication task stopped for unexpected reason: {response}'
                    )
            sleep(10)

        lrs_response = lrs.fetch('select count(*) from statements')
        if lrs_response:
            self.lrs_statement_count = lrs_response[0][0]
        else:
            raise BackgroundJobError(
                'Failed to retrieve LRS statements for comparison.')

        transient_keys = s3.get_keys_with_prefix(self.transient_path,
                                                 bucket=self.transient_bucket)
        if not transient_keys:
            raise BackgroundJobError(
                'Could not retrieve S3 keys from transient bucket.')
        self.verify_and_unload_transient()

        timestamp_path = localize_datetime(
            datetime.now()).strftime('%Y/%m/%d/%H%M%S')
        destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
        for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
            self.migrate_transient_to_destination(
                transient_keys,
                destination_bucket,
                destination_path,
            )

        if truncate_lrs:
            if lrs.execute('TRUNCATE statements'):
                app.logger.info('Truncated incremental LRS table.')
            else:
                raise BackgroundJobError(
                    'Failed to truncate incremental LRS table.')

        return (
            f'Migrated {self.lrs_statement_count} statements to S3 '
            f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
        )
Code example #27
    def generate_feeds(self):
        # Translation between canvas_user_id and UID/SID is needed to merge Canvas analytics data and SIS enrollment-based data.
        advisees_by_canvas_id = {}
        advisees_by_sid = {}
        self.successes = []
        self.failures = []
        profile_tables = self.generate_student_profile_tables(
            advisees_by_canvas_id, advisees_by_sid)
        if not profile_tables:
            raise BackgroundJobError(
                'Failed to generate student profile tables.')

        feed_path = app.config['LOCH_S3_BOAC_ANALYTICS_DATA_PATH'] + '/feeds/'
        s3.upload_json(advisees_by_canvas_id,
                       feed_path + 'advisees_by_canvas_id.json')

        upload_student_term_maps(advisees_by_sid)

        # Avoid processing Canvas analytics data for future terms and pre-CS terms.
        for term_id in (future_term_ids() + legacy_term_ids()):
            enrollment_term_map = s3.get_object_json(
                feed_path + f'enrollment_term_map_{term_id}.json')
            if enrollment_term_map:
                GenerateMergedEnrollmentTerm().refresh_student_enrollment_term(
                    term_id, enrollment_term_map)

        canvas_integrated_term_ids = reverse_term_ids()
        app.logger.info(
            f'Will queue analytics generation for {len(canvas_integrated_term_ids)} terms on worker nodes.'
        )
        result = queue_merged_enrollment_term_jobs(self.job_id,
                                                   canvas_integrated_term_ids)
        if not result:
            raise BackgroundJobError('Failed to queue enrollment term jobs.')

        refresh_all_from_staging(profile_tables)
        self.update_redshift_academic_standing()
        self.update_rds_profile_indexes()

        app.logger.info(
            'Profile generation complete; waiting for enrollment term generation to finish.'
        )

        while True:
            sleep(1)
            enrollment_results = get_merged_enrollment_term_job_status(
                self.job_id)
            if not enrollment_results:
                raise BackgroundJobError('Failed to refresh RDS indexes.')
            any_pending_job = next(
                (row for row in enrollment_results
                 if row['status'] == 'created' or row['status'] == 'started'),
                None)
            if not any_pending_job:
                break

        app.logger.info('Exporting analytics data for archival purposes.')
        unload_enrollment_terms([current_term_id(), future_term_id()])

        app.logger.info('Refreshing enrollment terms in RDS.')
        with rds.transaction() as transaction:
            if self.refresh_rds_enrollment_terms(None, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS enrollment terms.')
            else:
                transaction.rollback()
                raise BackgroundJobError(
                    'Failed to refresh RDS enrollment terms.')

        status_string = f'Generated merged profiles ({len(self.successes)} successes, {len(self.failures)} failures).'
        errored = False
        for row in enrollment_results:
            status_string += f" {row['details']}"
            if row['status'] == 'error':
                errored = True

        truncate_staging_table('student_enrollment_terms')
        if errored:
            raise BackgroundJobError(status_string)
        else:
            return status_string
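
The loop above polls get_merged_enrollment_term_job_status once a second with no upper bound, so a stalled worker queue would block the job indefinitely. A minimal variant with a timeout guard, using the same 'created'/'started' status values; the four-hour ceiling and the error messages are arbitrary assumptions:

from time import sleep, time


def wait_for_enrollment_term_jobs(job_id, timeout_seconds=4 * 60 * 60):
    # Poll the worker job status until no jobs remain 'created' or 'started',
    # raising if the wait exceeds the (assumed) timeout.
    deadline = time() + timeout_seconds
    while True:
        sleep(1)
        enrollment_results = get_merged_enrollment_term_job_status(job_id)
        if not enrollment_results:
            raise BackgroundJobError('Failed to retrieve enrollment term job status.')
        if not any(row['status'] in ('created', 'started') for row in enrollment_results):
            return enrollment_results
        if time() > deadline:
            raise BackgroundJobError('Timed out waiting for merged enrollment term jobs.')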
Code example #28
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            return snapshot['table'] == 'requests' and snapshot['partial'] is False

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )

            key_components = [
                app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'],
                snapshot['table'], snapshot['filename']
            ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error('Failed to dispatch S3 sync of snapshot ' +
                                 snapshot['filename'])
                metadata.update_canvas_sync_status(
                    job_id,
                    key,
                    'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info('Dispatched S3 sync of snapshot ' +
                                snapshot['filename'])
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [
                s['filename'] for s in snapshots_to_sync
            ]
            requests_prefix = app.config[
                'LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
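
The s3.delete_objects_with_prefix call relies on a helper that is not shown here. A hedged boto3 sketch of the same idea, taking an explicit bucket argument; the project's own s3 module presumably reads the bucket from config and may differ in signature, batching, and return value:

import boto3


def delete_objects_with_prefix(bucket, prefix, whitelist=()):
    # List every object under the prefix, skip filenames in the whitelist,
    # and delete the rest in batches of up to 1000 keys (the S3 API limit).
    client = boto3.client('s3')
    keys_to_delete = []
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            if obj['Key'].split('/')[-1] not in whitelist:
                keys_to_delete.append({'Key': obj['Key']})
    for i in range(0, len(keys_to_delete), 1000):
        client.delete_objects(Bucket=bucket, Delete={'Objects': keys_to_delete[i:i + 1000]})
    return True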
Code example #29
    def create_rds_indexes(self):
        resolved_ddl = resolve_sql_template('index_advisors.template.sql')
        if rds.execute(resolved_ddl):
            app.logger.info('Created RDS indexes for advisor schema.')
        else:
            raise BackgroundJobError('Failed to create RDS indexes for advisor schema.')
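
The contents of index_advisors.template.sql are not included in this excerpt. Purely to illustrate the template pattern used throughout these examples, a hypothetical inline equivalent might look like the following; the schema placeholder, table names, and column names are all guesses, not the real template:

    # Hypothetical content; the real index_advisors.template.sql is not shown here.
    hypothetical_index_ddl = resolve_sql_template_string(
        """
        CREATE INDEX IF NOT EXISTS advisor_students_advisor_sid_idx
            ON {rds_schema_advisor}.advisor_students (advisor_sid);
        CREATE INDEX IF NOT EXISTS advisor_students_student_sid_idx
            ON {rds_schema_advisor}.advisor_students (student_sid);
        """,
    )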
Code example #30
    def run(self, load_mode='batch'):
        new_sids = [
            row['sid']
            for row in get_non_advisees_without_registration_imports()
        ]

        # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
        # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
        # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
        if load_mode == 'new':
            sids = new_sids
        elif load_mode == 'batch':
            max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
            sids = new_sids[:max_batch]

        app.logger.info(
            f'Starting registrations import job for {len(sids)} non-advisees...'
        )

        rows = {
            'term_gpas': [],
            'last_registrations': [],
        }
        successes, failures = self.load_concurrently(rows, sids)
        if len(successes) > 0:
            for key in rows.keys():
                s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
                app.logger.info(
                    f'Will stash {len(successes)} feeds in S3: {s3_key}')
                if not s3.upload_tsv_rows(rows[key], s3_key):
                    raise BackgroundJobError(
                        'Error on S3 upload: aborting job.')
                app.logger.info('Will copy S3 feeds into Redshift...')
                if not redshift.execute(
                        f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'
                ):
                    raise BackgroundJobError(
                        'Error truncating old staging rows: aborting job.')
                if not redshift.copy_tsv_from_s3(
                        f'{self.redshift_schema}_staging.hist_enr_{key}',
                        s3_key):
                    raise BackgroundJobError(
                        'Error on Redshift copy: aborting job.')
                staging_to_destination_query = resolve_sql_template_string(
                    """
                    DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                        WHERE sid IN
                        (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                    INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                        (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                    TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                    """,
                    table_key=key,
                )
                if not redshift.execute(staging_to_destination_query):
                    raise BackgroundJobError(
                        'Error inserting staging entries into destination: aborting job.'
                    )
        return (
            f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
        )
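
The load_concurrently helper invoked above is not shown. A hedged sketch of one way it could work, fanning the per-SID registration fetches out over a thread pool; fetch_registration_feed and the exact row layouts are hypothetical placeholders, while encoded_tsv_row is the same helper used in the earlier examples:

from concurrent.futures import ThreadPoolExecutor, as_completed


def load_concurrently(rows, sids, max_workers=8):
    # Fetch each SID's registration feed on a thread pool; append TSV rows for
    # the two target tables and track successes/failures per SID.
    successes, failures = [], []

    def _load(sid):
        feed = fetch_registration_feed(sid)  # hypothetical SIS API call
        return sid, feed

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(_load, sid) for sid in sids]
        for future in as_completed(futures):
            sid, feed = future.result()
            if feed:
                # Row layouts below are illustrative only.
                rows['term_gpas'].append(encoded_tsv_row([sid, feed.get('term_gpa')]))
                rows['last_registrations'].append(encoded_tsv_row([sid, feed.get('last_registration')]))
                successes.append(sid)
            else:
                failures.append(sid)
    return successes, failures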