Example #1
    def _query_edl(self, rows, sids):
        successes = []
        for edl_row in get_edl_student_registrations(sids):
            sid = edl_row['student_id']
            if sid not in successes:
                # Based on the SQL order_by, the first result per SID will be 'last_registration'.
                successes.append(sid)
                rows['last_registrations'].append(
                    encoded_tsv_row([sid, json.dumps(edl_registration_to_json(edl_row))]),
                )
            rows['term_gpas'].append(
                encoded_tsv_row(
                    [
                        sid,
                        edl_row['term_id'],
                        edl_row['current_term_gpa'] or '0',
                        edl_row.get('unt_taken_gpa') or '0',  # TODO: Does EDL give us 'unitsTakenForGpa'?
                    ],
                ),
            )
            if self.include_demographics:
                rows[self.demographics_key].append(
                    encoded_tsv_row([sid, json.dumps(edl_demographics_to_json(edl_row))]),
                )
        failures = list(np.setdiff1d(sids, successes))
        return successes, failures
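# Every loader in these examples serializes rows with encoded_tsv_row() before writing
# them to a feed file or passing them to s3.upload_tsv_rows(). A minimal sketch of what
# such a helper could look like -- an assumption, not the project's actual implementation:
# it joins stringified values with tabs, maps None to an empty field, and UTF-8-encodes
# the result so callers can append b'\n' as seen above.
def encoded_tsv_row_sketch(values):
    return '\t'.join('' if v is None else str(v) for v in values).encode()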
Example #2
    def collect_merged_profiles(self, sids, feed_file, index_file, names_file):
        successes = []
        sis_profile_feeds = queries.get_non_advisee_api_feeds(sids)
        for row in sis_profile_feeds:
            sid = row['sid']
            uid = row['uid']
            sis_api_feed = row['sis_feed']
            sis_profile = parse_merged_sis_profile({
                'sis_profile_feed': sis_api_feed,
                'last_registration_feed': row['last_registration_feed'],
            })
            merged_profile = {
                'sid': sid,
                'uid': uid,
                'sisProfile': sis_profile,
            }
            self.fill_names_from_sis_profile(sis_api_feed, merged_profile)
            feed_file.write(encoded_tsv_row([sid, uid, json.dumps(merged_profile)]) + b'\n')

            first_name = merged_profile.get('firstName', '')
            last_name = merged_profile.get('lastName', '')
            level = str(sis_profile.get('level', {}).get('code') or '')
            gpa = str(sis_profile.get('cumulativeGPA') or '')
            units = str(sis_profile.get('cumulativeUnits') or '')
            transfer = str(sis_profile.get('transfer') or False)
            expected_grad_term = str(sis_profile.get('expectedGraduationTerm', {}).get('id') or '')
            terms_in_attendance = str(sis_profile.get('termsInAttendance', {}) or '')
            index_file.write(
                encoded_tsv_row([
                    sid, uid, first_name, last_name, level, gpa, units,
                    transfer, expected_grad_term, terms_in_attendance,
                ]) + b'\n',
            )

            names_file.write(
                encoded_tsv_row([
                    sid,
                    merged_profile.get('uid'),
                    merged_profile.get('firstName'),
                    merged_profile.get('lastName'),
                ]) + b'\n',
            )
            successes.append(sid)
        return len(successes)
    def _load_from_student_api(self, all_sids):
        # Students API will not return 'unitsTransferEarned' and 'unitsTransferAccepted' data
        # for incoming transfer students unless we request an 'as-of-date' in their enrolled term.
        near_future = (datetime.now() + timedelta(days=60)).strftime('%Y-%m-%d')

        chunked_sids = [
            all_sids[i:i + 100] for i in range(0, len(all_sids), 100)
        ]
        rows = []
        failure_count = 0
        app_obj = app._get_current_object()
        start_loop = timer()
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            for result in executor.map(async_get_feeds, repeat(app_obj),
                                       chunked_sids, repeat(near_future)):
                remaining_sids = set(result['sids'])
                feeds = result['feeds']
                if feeds:
                    for feed in feeds:
                        sid = next(_id['id'] for _id in feed['identifiers']
                                   if _id['type'] == 'student-id')
                        remaining_sids.discard(sid)
                        rows.append(encoded_tsv_row([sid, json.dumps(feed)]))
                if remaining_sids:
                    failure_count += len(remaining_sids)
                    app.logger.error(
                        f'SIS student API import failed for SIDs {remaining_sids}.'
                    )
        app.logger.info(
            f'Wanted {len(all_sids)} students; got {len(rows)} in {timer() - start_loop} secs'
        )
        return rows, failure_count
    def load_concurrently(self, all_sids, feed_file):
        chunked_sids = [all_sids[i:i + 100] for i in range(0, len(all_sids), 100)]
        saved_sids = []
        failure_count = 0
        app_obj = app._get_current_object()
        start_loop = timer()

        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            for result in executor.map(async_get_feeds, repeat(app_obj), chunked_sids):
                remaining_sids = set(result['sids'])
                feeds = result['feeds']
                if feeds:
                    for feed in feeds:
                        sid = next((_id.get('id') for _id in feed['identifiers'] if _id.get('type') == 'student-id'), None)
                        uid = next((_id.get('id') for _id in feed['identifiers'] if _id.get('type') == 'campus-uid'), None)
                        if not sid or not uid:
                            continue
                        feed_file.write(encoded_tsv_row([sid, uid, json.dumps(feed)]) + b'\n')
                        remaining_sids.discard(sid)
                        saved_sids.append(sid)
                if remaining_sids:
                    failure_count = failure_count + len(remaining_sids)
                    app.logger.error(f'SIS student API import failed for non-advisees {remaining_sids}.')

        app.logger.info(f'Wanted {len(all_sids)} non-advisees; got {len(saved_sids)} in {timer() - start_loop} secs')
        return saved_sids, failure_count
    def _query_student_api(self, rows, sids):
        successes = []
        failures = []
        app_obj = app._get_current_object()
        start_loop = timer()

        with ThreadPoolExecutor(max_workers=app.config['STUDENT_API_MAX_THREADS']) as executor:
            for result in executor.map(self._async_get_feed, repeat(app_obj), sids):
                sid = result['sid']
                full_feed = result['feed']
                if full_feed:
                    successes.append(sid)
                    rows['last_registrations'].append(
                        encoded_tsv_row([sid, json.dumps(full_feed.get('last_registration', {}))]),
                    )
                    gpa_feed = full_feed.get('term_gpas', {})
                    if gpa_feed:
                        for term_id, term_data in gpa_feed.items():
                            row = [
                                sid,
                                term_id,
                                (term_data.get('gpa') or '0'),
                                (term_data.get('unitsTakenForGpa') or '0'),
                            ]
                            rows['term_gpas'].append(encoded_tsv_row(row))
                    else:
                        app.logger.info(
                            f'No past UGRD registrations found for SID {sid}.')
                    demographics = full_feed.get('demographics', {})
                    if demographics:
                        rows['api_demographics'].append(
                            encoded_tsv_row([sid, json.dumps(demographics)]),
                        )
                else:
                    failures.append(sid)
                    app.logger.error(
                        f'Registration history import failed for SID {sid}.')
        app.logger.info(
            f'Wanted {len(sids)} students; got {len(successes)} in {timer() - start_loop} secs'
        )
        return successes, failures
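# The thread-pool loaders above hand each worker the concrete Flask app object
# (repeat(app_obj), obtained via app._get_current_object()) because the current_app
# proxy is not usable from worker threads. A hedged sketch of what a worker such as
# async_get_feeds might look like -- not the project's actual helper, and
# sis_student_api.get_feeds_for_sids is a hypothetical client call:
def async_get_feeds_sketch(app_obj, up_to_100_sids, as_of=None):
    with app_obj.app_context():
        # Hypothetical API call; the real code would hit the SIS student API for this chunk.
        feeds = sis_student_api.get_feeds_for_sids(up_to_100_sids, as_of=as_of)
        # Return the requested SIDs alongside the feeds so the caller can diff out
        # the SIDs that came back empty and count them as failures.
        return {'sids': up_to_100_sids, 'feeds': feeds or []}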
Example #6
    def import_advisor_attributes(self):
        csid_results = redshift.fetch(
            resolve_sql_template_string(
                'SELECT DISTINCT advisor_sid FROM {redshift_schema_advisor_internal}.advisor_students',
            ),
        )
        csids = [r['advisor_sid'] for r in csid_results]
        all_attributes = calnet.client(app).search_csids(csids)
        if len(csids) != len(all_attributes):
            ldap_csids = [person['csid'] for person in all_attributes]
            missing = set(csids) - set(ldap_csids)
            app.logger.warning(
                f'Looked for {len(csids)} advisor CSIDs but found only {len(all_attributes)}; missing: {missing}',
            )

        advisor_rows = []
        total_count = len(all_attributes)
        for index, a in enumerate(all_attributes):
            sid = a['csid']
            app.logger.info(
                f'CalNet import: Fetch attributes of advisor {sid} ({index + 1} of {total_count})'
            )
            first_name, last_name = calnet.split_sortable_name(a)
            data = [
                a['uid'],
                sid,
                first_name,
                last_name,
                a['title'],
                calnet.get_dept_code(a),
                a['email'],
                a['campus_email'],
            ]
            advisor_rows.append(encoded_tsv_row(data))

        s3_key = f'{get_s3_calnet_daily_path()}/advisors/advisors.tsv'
        app.logger.info(
            f'Will stash {len(advisor_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(advisor_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_advisor_internal}.advisor_attributes;
            COPY {redshift_schema_advisor_internal}.advisor_attributes
                FROM '{loch_s3_calnet_data_path}/advisors/advisors.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False
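# The SQL passed to redshift.fetch()/redshift.execute() goes through
# resolve_sql_template_string(), which -- judging from the placeholders used in these
# examples -- interpolates configured names such as {redshift_schema_advisor_internal},
# {redshift_iam_role} and {loch_s3_calnet_data_path}, with explicit keyword arguments
# (e.g. s3_key=..., term_id=...) layered on top. A hedged sketch under those assumptions;
# the config keys below are illustrative, not the project's actual names:
def resolve_sql_template_string_sketch(template_string, **kwargs):
    interpolations = {
        'redshift_iam_role': app.config['REDSHIFT_IAM_ROLE'],  # assumed config key
        'redshift_schema_advisor_internal': app.config['REDSHIFT_SCHEMA_ADVISOR'],  # assumed config key
        'loch_s3_calnet_data_path': app.config['LOCH_S3_CALNET_DATA_PATH'],  # assumed config key
    }
    interpolations.update(kwargs)
    return template_string.format(**interpolations)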
Example #7
    def collect_merged_enrollments(self, sids, term_id, feed_file):
        rows = queries.get_non_advisee_sis_enrollments(sids, term_id)
        enrollments_by_student = map_sis_enrollments(rows)
        merge_dropped_classes(
            enrollments_by_student,
            queries.get_non_advisee_enrollment_drops(sids, term_id),
        )
        merge_term_gpas(
            enrollments_by_student,
            queries.get_non_advisee_term_gpas(sids, term_id),
        )
        enrollments_by_student = enrollments_by_student.get(term_id, {})
        for (sid, enrollments_feed) in enrollments_by_student.items():
            feed_file.write(encoded_tsv_row([sid, term_id, json.dumps(enrollments_feed)]) + b'\n')
        return len(enrollments_by_student)
    def generate_term_feeds(self, sids, feed_file):
        enrollment_stream = queries.stream_sis_enrollments(sids=sids)
        term_gpa_stream = queries.stream_term_gpas(sids=sids)
        term_gpa_tracker = {'term_id': '9999', 'sid': '', 'term_gpas': []}

        row_count = 0

        try:
            term_gpa_results = groupby(term_gpa_stream, lambda r: (str(r['term_id']), r['sid']))

            for term_id, term_enrollments_grp in groupby(
                    enrollment_stream, operator.itemgetter('sis_term_id')):
                term_id = str(term_id)
                term_name = berkeley.term_name_for_sis_id(term_id)
                for sid, enrollments_grp in groupby(
                        term_enrollments_grp, operator.itemgetter('sid')):
                    term_feed = None
                    for is_dropped, enrollments_subgroup in groupby(
                            enrollments_grp, operator.itemgetter('dropped')):
                        if not is_dropped:
                            term_feed = merge_enrollment(
                                enrollments_subgroup, term_id, term_name)
                        else:
                            if not term_feed:
                                term_feed = empty_term_feed(term_id, term_name)
                            append_drops(term_feed, enrollments_subgroup)

                    # Both streams are sorted the same way (term_id descending, then sid ascending),
                    # so a single forward-only cursor over the term GPA stream is enough.
                    while term_gpa_tracker['term_id'] > term_id or (
                            term_gpa_tracker['term_id'] == term_id and term_gpa_tracker['sid'] < sid):
                        next_gpa_key, term_gpa_tracker['term_gpas'] = next(term_gpa_results)
                        term_gpa_tracker['term_id'], term_gpa_tracker['sid'] = next_gpa_key
                    if term_gpa_tracker['term_id'] == term_id and term_gpa_tracker['sid'] == sid:
                        append_term_gpa(term_feed, term_gpa_tracker['term_gpas'])

                    feed_file.write(encoded_tsv_row([sid, term_id, json.dumps(term_feed)]) + b'\n')
                    row_count += 1

        finally:
            enrollment_stream.close()
            term_gpa_stream.close()

        return row_count
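# generate_term_feeds() merges two streamed result sets in a single pass: the while loop
# advances a cursor over the term-GPA stream until it catches up with the current
# (term_id, sid) from the enrollment stream, which only works if both streams are sorted
# identically (term_id descending, sid ascending within a term). A small self-contained
# illustration of the same cursor pattern over synthetic tuples -- an assumption-based
# sketch, not the production queries or feed shapes:
from itertools import groupby

enrollments = [  # (term_id, sid, course), sorted by term_id DESC, then sid ASC
    ('2188', '100', 'ANTHRO 1'),
    ('2188', '200', 'MATH 1A'),
    ('2182', '100', 'MATH 1B'),
]
term_gpas = [  # (term_id, sid, gpa), sorted the same way
    ('2188', '100', 3.7),
    ('2182', '100', 3.5),
]

gpa_groups = groupby(term_gpas, lambda r: (r[0], r[1]))
tracker = {'key': ('9999', ''), 'rows': []}
for (term_id, sid), rows in groupby(enrollments, lambda r: (r[0], r[1])):
    # Advance the GPA cursor while it still points at an earlier (term_id, sid).
    while tracker['key'][0] > term_id or (tracker['key'][0] == term_id and tracker['key'][1] < sid):
        try:
            tracker['key'], grp = next(gpa_groups)
            tracker['rows'] = list(grp)
        except StopIteration:
            break
    gpas = tracker['rows'] if tracker['key'] == (term_id, sid) else []
    print(term_id, sid, [r[2] for r in rows], gpas)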
Example #9
    def load_concurrently_v1(self, csids):
        rows = []
        failure_count = 0
        app_obj = app._get_current_object()
        start_loop = timer()
        with ThreadPoolExecutor(max_workers=self.max_threads) as executor:
            for result in executor.map(async_get_feed_v1, repeat(app_obj), csids):
                csid = result['sid']
                feed = result['feed']
                if feed:
                    rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
                else:
                    failure_count += 1
                    app.logger.error(f'SIS student API V1 import failed for CSID {csid}.')
        app.logger.info(f'Wanted {len(csids)} students; got {len(rows)} in {timer() - start_loop} secs')
        return rows, failure_count

    def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
        with tempfile.TemporaryFile() as enrollment_term_file:
            for (sid, sid_term_feed) in enrollment_term_map.items():
                enrollment_term_file.write(encoded_tsv_row([sid, term_id, json.dumps(sid_term_feed)]) + b'\n')

            drop_staged_enrollment_term(term_id)
            write_file_to_staging('student_enrollment_terms',
                                  enrollment_term_file,
                                  len(enrollment_term_map), term_id)

        with redshift.transaction() as transaction:
            refresh_from_staging('student_enrollment_terms',
                                 term_id,
                                 None,
                                 transaction,
                                 truncate_staging=False)
            if not transaction.commit():
                raise BackgroundJobError(
                    f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).'
                )
    def run(self, term_id=None):
        if not term_id:
            term_id = current_term_id()
        canvas_course_ids = [
            row['canvas_course_id']
            for row in get_enrolled_canvas_sites_for_term(term_id)
        ]

        app.logger.info(
            f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...'
        )

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for course_id in canvas_course_ids:
            app.logger.info(
                f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})'
            )
            feed = canvas_api.get_course_enrollments(course_id)
            if feed:
                success_count += 1
                for enrollment in feed:
                    user_id = enrollment.get('user_id')
                    last_activity_at = enrollment.get('last_activity_at') or ''
                    rows.append(
                        encoded_tsv_row([
                            course_id, user_id, term_id, last_activity_at,
                            json.dumps(enrollment)
                        ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'Canvas enrollments API import failed for course id {course_id}.'
                )
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments_{term_id}.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
            COPY {redshift_schema_student}_staging.canvas_api_enrollments
                FROM '{loch_s3_sis_api_data_path}/canvas_api_enrollments_{term_id}.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t'
                TIMEFORMAT 'YYYY-MM-DDTHH:MI:SSZ';
            DELETE FROM {redshift_schema_student}.canvas_api_enrollments
                WHERE term_id = '{term_id}'
                AND course_id IN
                (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            INSERT INTO {redshift_schema_student}.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
                WHERE term_id = '{term_id}';
            """,
            term_id=term_id,
        )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        return (
            f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
            f'{failure_count} failed.')
Example #12
    def run(self):
        app.logger.info('Starting ASC profile generation job...')
        asc_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER BY sid, UPPER(team_name)',
            schema=asc_schema_identifier,
        )

        profile_rows = []
        sids_for_inactive_deletion = []

        for sid, rows_for_student in groupby(asc_rows, operator.itemgetter('sid')):
            rows_for_student = list(rows_for_student)
            # Since BOAC believes (falsely) that isActiveAsc and statusAsc are attributes of a student, not
            # a team membership, a bit of brutal simplification is needed. Students who are active in at least
            # one sport have inactive team memberships dropped.
            any_active_athletics = reduce(operator.or_, [r['active'] for r in rows_for_student], False)
            if any_active_athletics:
                rows_for_student = [r for r in rows_for_student if r['active']]
                sids_for_inactive_deletion.append(sid)
            athletics_profile = {
                'athletics': [],
                'inIntensiveCohort': rows_for_student[0]['intensive'],
                'isActiveAsc': rows_for_student[0]['active'],
                'statusAsc': rows_for_student[0]['status_asc'],
            }
            for row in rows_for_student:
                athletics_profile['athletics'].append({
                    'groupCode': row['group_code'],
                    'groupName': row['group_name'],
                    'name': row['group_name'],
                    'teamCode': row['team_code'],
                    'teamName': row['team_name'],
                })

            profile_rows.append(
                encoded_tsv_row([sid, json.dumps(athletics_profile)]))

        s3_key = f'{get_s3_asc_daily_path()}/athletics_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(profile_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_asc}.student_profiles;
            COPY {redshift_schema_asc}.student_profiles
                FROM '{loch_s3_asc_data_path}/athletics_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            app.logger.error('Error on Redshift copy: aborting job.')
            return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(asc_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        if sids_for_inactive_deletion:
            redshift.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion, ),
            )
            rds.execute(
                f'DELETE FROM {asc_schema}.students WHERE active IS false AND sid = ANY(%s)',
                params=(sids_for_inactive_deletion, ),
            )

        return 'ASC profile generation complete.'
Example #13
    def run(self):
        app.logger.info('Starting COE schema creation job...')
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
        # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
        # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
        # external schema isn't needed.
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info('COE external schema created.')
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('COE external schema creation failed.')
        coe_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER BY sid',
            schema=internal_schema_identifier,
        )

        profile_rows = []
        index = 1
        for sid, rows_for_student in groupby(coe_rows, operator.itemgetter('sid')):
            app.logger.info(
                f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})'
            )
            index += 1
            row_for_student = list(rows_for_student)[0]
            coe_profile = {
                'advisorUid': row_for_student.get('advisor_ldap_uid'),
                'gender': row_for_student.get('gender'),
                'ethnicity': row_for_student.get('ethnicity'),
                'minority': row_for_student.get('minority'),
                'didPrep': row_for_student.get('did_prep'),
                'prepEligible': row_for_student.get('prep_eligible'),
                'didTprep': row_for_student.get('did_tprep'),
                'tprepEligible': row_for_student.get('tprep_eligible'),
                'sat1read': row_for_student.get('sat1read'),
                'sat1math': row_for_student.get('sat1math'),
                'sat2math': row_for_student.get('sat2math'),
                'inMet': row_for_student.get('in_met'),
                'gradTerm': row_for_student.get('grad_term'),
                'gradYear': row_for_student.get('grad_year'),
                'probation': row_for_student.get('probation'),
                'status': row_for_student.get('status'),
            }
            profile_rows.append(encoded_tsv_row([sid, json.dumps(coe_profile)]))

        s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(profile_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            COPY {redshift_schema_coe}.student_profiles
                FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(coe_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        return 'COE internal schema created.'
Example #14
    def run(self, term_id=None):
        if not term_id:
            term_id = current_term_id()
        canvas_course_ids = [
            row['canvas_course_id']
            for row in get_enrolled_canvas_sites_for_term(term_id)
        ]

        app.logger.info(
            f'Starting Canvas enrollments API import job for term {term_id}, {len(canvas_course_ids)} course sites...'
        )

        rows = []
        success_count = 0
        failure_count = 0
        index = 1
        for course_id in canvas_course_ids:
            app.logger.info(
                f'Fetching Canvas enrollments API for course id {course_id}, term {term_id} ({index} of {len(canvas_course_ids)})'
            )
            feed = canvas_api.get_course_enrollments(course_id)
            if feed:
                success_count += 1
                for enrollment in feed:
                    user_id = enrollment.get('user_id')
                    last_activity_at = enrollment.get('last_activity_at') or ''
                    rows.append(
                        encoded_tsv_row([
                            course_id, user_id, term_id, last_activity_at,
                            json.dumps(enrollment)
                        ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'Canvas enrollments API import failed for course id {course_id}.'
                )
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/canvas_api_enrollments/canvas_api_enrollments_{term_id}.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
                DATABASE '{redshift_schema_student}_staging_ext_tmp'
                IAM_ROLE '{redshift_iam_role}'
                CREATE EXTERNAL DATABASE IF NOT EXISTS;
            CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments (
                course_id VARCHAR,
                user_id VARCHAR,
                term_id VARCHAR,
                last_activity_at TIMESTAMP,
                feed VARCHAR
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY '\\t'
            STORED AS TEXTFILE
            LOCATION '{loch_s3_sis_api_data_path}/canvas_api_enrollments';

            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}';
            INSERT INTO {redshift_schema_student}_staging.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments);
            DROP TABLE {redshift_schema_student}_staging_ext_tmp.canvas_api_enrollments;
            DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;

            DELETE FROM {redshift_schema_student}.canvas_api_enrollments
                WHERE term_id = '{term_id}'
                AND course_id IN
                (SELECT course_id FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            INSERT INTO {redshift_schema_student}.canvas_api_enrollments
                (SELECT * FROM {redshift_schema_student}_staging.canvas_api_enrollments WHERE term_id = '{term_id}');
            DELETE FROM {redshift_schema_student}_staging.canvas_api_enrollments
                WHERE term_id = '{term_id}';
            """,
            term_id=term_id,
        )
        if not redshift.execute_ddl_script(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        return (
            f'Canvas enrollments API import completed for term {term_id}: {success_count} succeeded, '
            f'{failure_count} failed.')
Example #15
    def run(self, csids=None):
        if not csids:
            all_sids = get_all_student_ids()
            if all_sids:
                csids = [row['sid'] for row in all_sids]
        app.logger.info(
            f'Starting SIS degree progress API import job for {len(csids)} students...'
        )

        rows = []
        success_count = 0
        no_information_count = 0
        failure_count = 0
        index = 1

        # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
        # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
        # alternative way to filter out non-UGRD students?
        for csid in csids:
            app.logger.info(
                f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})'
            )
            feed = sis_degree_progress_api.parsed_degree_progress(csid)
            if feed:
                success_count += 1
                rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
            elif feed == {}:
                app.logger.info(
                    f'No degree progress information found for SID {csid}.')
                no_information_count += 1
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS get_degree_progress failed for SID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'
        ):
            raise BackgroundJobError(
                'Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(
                f'{self.redshift_schema}_staging.sis_api_degree_progress',
                s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema_student}.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
            """, )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        return (
            f'SIS degree progress API import job completed: {success_count} succeeded, '
            f'{no_information_count} returned no information, {failure_count} failed.'
        )
    def run(self, csids=None):
        if not csids:
            all_sids = get_all_student_ids()
            if all_sids:
                csids = [row['sid'] for row in all_sids]
        app.logger.info(
            f'Starting SIS degree progress API import job for {len(csids)} students...'
        )

        rows = []
        success_count = 0
        no_information_count = 0
        failure_count = 0
        index = 1

        # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
        # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
        # alternative way to filter out non-UGRD students?
        for csid in csids:
            app.logger.info(
                f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})'
            )
            feed = sis_degree_progress_api.parsed_degree_progress(csid)
            if feed:
                success_count += 1
                rows.append(encoded_tsv_row([csid, json.dumps(feed)]))
            elif feed == {}:
                app.logger.info(
                    f'No degree progress information found for SID {csid}.')
                no_information_count += 1
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS get_degree_progress failed for SID {csid}.')
            index += 1

        s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress/degree_progress.tsv'
        app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(
                f'TRUNCATE {self.redshift_schema}_staging.sis_api_degree_progress'
        ):
            raise BackgroundJobError(
                'Error truncating old staging rows: aborting job.')

        query = resolve_sql_template_string(
            """
            CREATE EXTERNAL SCHEMA {redshift_schema_student}_staging_ext_tmp FROM data catalog
                DATABASE '{redshift_schema_student}_staging_ext_tmp'
                IAM_ROLE '{redshift_iam_role}'
                CREATE EXTERNAL DATABASE IF NOT EXISTS;
            CREATE EXTERNAL TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress (
                sid VARCHAR,
                feed VARCHAR(MAX)
            )
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY '\\t'
            STORED AS TEXTFILE
            LOCATION '{loch_s3_sis_api_data_path}/degree_progress';

            DELETE FROM {redshift_schema_student}_staging.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}_staging.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress);
            DROP TABLE {redshift_schema_student}_staging_ext_tmp.sis_api_degree_progress;
            DROP SCHEMA {redshift_schema_student}_staging_ext_tmp;

            DELETE FROM {redshift_schema_student}.sis_api_degree_progress
                WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            INSERT INTO {redshift_schema_student}.sis_api_degree_progress
                (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
            TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
            """, )
        if not redshift.execute_ddl_script(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        redshift.execute('VACUUM; ANALYZE;')

        return (
            f'SIS degree progress API import job completed: {success_count} succeeded, '
            f'{no_information_count} returned no information, {failure_count} failed.'
        )
Example #17
    def run(self):
        app.logger.info(
            'ASC import: Fetch team and student athlete data from ASC API')
        api_results = get_asc_feed()
        if 'error' in api_results:
            raise BackgroundJobError(
                'ASC import: Error from external API: {}'.format(
                    api_results['error']))
        elif not api_results:
            raise BackgroundJobError('ASC import: API returned zero students')
        sync_date = api_results[0]['SyncDate']
        if sync_date != api_results[-1]['SyncDate']:
            raise BackgroundJobError(
                f'ASC import: SyncDate conflict in ASC API: {api_results[0]} vs. {api_results[-1]}'
            )
        rows = []
        for r in api_results:
            if r['AcadYr'] == app.config['ASC_THIS_ACAD_YR'] and r['SportCode']:
                asc_code = r['SportCodeCore']
                if asc_code in SPORT_TRANSLATIONS:
                    group_code = r['SportCode']
                    data = [
                        r['SID'],
                        str(r.get('ActiveYN', 'No') == 'Yes'),
                        str(r.get('IntensiveYN', 'No') == 'Yes'),
                        r.get('SportStatus', ''),
                        group_code,
                        _unambiguous_group_name(r['Sport'], group_code),
                        SPORT_TRANSLATIONS[asc_code],
                        r['SportCore'],
                    ]
                    rows.append(encoded_tsv_row(data))
                else:
                    sid = r['SID']
                    app.logger.error(
                        f'ASC import: Unmapped asc_code {asc_code} has ActiveYN for sid={sid}'
                    )

        s3_key = f'{get_s3_asc_daily_path()}/asc_api_raw_response_{sync_date}.tsv'
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Copy data in S3 file to Redshift...')
        query = resolve_sql_template_string(
            """
            TRUNCATE {redshift_schema_asc}.students;
            COPY {redshift_schema_asc}.students
                FROM 's3://{s3_bucket}/{s3_key}'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """,
            s3_bucket=app.config['LOCH_S3_BUCKET'],
            s3_key=s3_key,
        )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        status = {
            'this_sync_date': sync_date,
            'api_results_count': len(api_results),
        }
        app.logger.info(
            f'ASC import: Successfully completed import job: {str(status)}')
        return status