Example #1
def generate_student_profile_table(self, non_advisee_sids):
    profile_count = 0
    with tempfile.TemporaryFile() as feed_file, tempfile.TemporaryFile(
    ) as index_file, tempfile.TemporaryFile() as names_file:
        tables = {
            'student_profiles_hist_enr': feed_file,
            'student_profile_index_hist_enr': index_file,
            'student_names_hist_enr': names_file,
        }
        # Work in batches so as not to overload memory.
        for i in range(0, len(non_advisee_sids), BATCH_QUERY_MAXIMUM):
            sids = non_advisee_sids[i:i + BATCH_QUERY_MAXIMUM]
            profile_count += self.collect_merged_profiles(
                sids, feed_file, index_file, names_file)
        if profile_count:
            with redshift.transaction() as transaction:
                for table_name, data in tables.items():
                    student_schema.truncate_staging_table(table_name)
                    student_schema.write_file_to_staging(
                        table_name, data, profile_count)
                    student_schema.refresh_from_staging(
                        table_name,
                        None,
                        non_advisee_sids,
                        transaction,
                    )
                # Commit explicitly; as the other examples show, the
                # transaction helper does not commit on exit.
                if not transaction.commit():
                    raise BackgroundJobError(
                        'Final transaction commit failed for non-advisee profiles.')
    app.logger.info('Non-advisee profile generation complete.')
    return profile_count
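
The batch loop above bounds each query to BATCH_QUERY_MAXIMUM SIDs. A minimal sketch of that slicing idiom, with an assumed constant value (the real one is defined elsewhere in the source repo):

BATCH_QUERY_MAXIMUM = 10000  # assumed value, for illustration only

def in_batches(items, batch_size=BATCH_QUERY_MAXIMUM):
    # Yield successive slices of at most batch_size items, matching the
    # range/slice loop above.
    for i in range(0, len(items), batch_size):
        yield items[i:i + batch_size]
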
Example #2

def refresh_all_from_staging(tables):
    with redshift.transaction() as transaction:
        for table in tables:
            refresh_from_staging(table, None, None, transaction)
        if not transaction.commit():
            raise BackgroundJobError(
                f'Final transaction commit failed for {student_schema()}.')
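
Every example commits explicitly and checks commit() for a success flag, which implies the context manager neither commits on exit nor raises on failure. A hypothetical stand-in illustrating those assumed semantics (this is not the actual redshift module):

from contextlib import contextmanager

class _Transaction:
    # Hypothetical stand-in for the transaction object the examples use.
    def __init__(self, connection):
        self.connection = connection
        self.committed = False

    def commit(self):
        # Return a success flag rather than raising, matching the
        # `if not transaction.commit():` checks above.
        try:
            self.connection.commit()
            self.committed = True
            return True
        except Exception:
            return False

    def rollback(self):
        self.connection.rollback()

@contextmanager
def transaction(connection):
    # Anything left uncommitted when the block exits is rolled back.
    txn = _Transaction(connection)
    try:
        yield txn
    finally:
        if not txn.committed:
            connection.rollback()
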
Example #3
def generate_student_enrollments_table(self, non_advisee_sids):
    # Split all S3/Redshift operations by term in hope of not overloading
    # memory or other resources. (Using finer-grained batches of SIDs would
    # probably involve replacing the staging table by a Spectrum external
    # table.)
    total_count = 0
    table_name = 'student_enrollment_terms_hist_enr'
    student_schema.truncate_staging_table(table_name)
    for term_id in reverse_term_ids(include_future_terms=True,
                                    include_legacy_terms=True):
        with tempfile.TemporaryFile() as feed_file:
            term_count = self.collect_merged_enrollments(
                non_advisee_sids, term_id, feed_file)
            if term_count:
                student_schema.write_file_to_staging(
                    table_name,
                    feed_file,
                    term_count,
                    term_id,
                )
        if term_count:
            with redshift.transaction() as transaction:
                student_schema.refresh_from_staging(
                    table_name,
                    term_id,
                    non_advisee_sids,
                    transaction,
                )
                # Commit per term, matching the explicit commits in the
                # other examples.
                if not transaction.commit():
                    raise BackgroundJobError(
                        f'Transaction commit failed (term_id={term_id}).')
            total_count += term_count
    app.logger.info('Non-advisee term enrollment generation complete.')
    return total_count
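
These functions hand an open TemporaryFile to write_file_to_staging while still inside the with block; a file that has just been written must be rewound before it can be read back (here presumably inside the helper). A self-contained illustration of the pattern, with a hypothetical upload callable standing in for the staging helper:

import tempfile

def stage_rows(rows, upload):
    # rows: iterable of bytes; upload: hypothetical callable that reads
    # from the open file handle (standing in for write_file_to_staging).
    with tempfile.TemporaryFile() as feed_file:
        for row in rows:
            feed_file.write(row + b'\n')
        feed_file.seek(0)  # rewind so the reader starts at the beginning
        upload(feed_file)
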
Example #4

def generate_student_enrollments_table(self, non_advisee_sids):
    table_name = 'student_enrollment_terms_hist_enr'
    truncate_staging_table(table_name)
    with tempfile.TemporaryFile() as feed_file:
        row_count = self.generate_term_feeds(non_advisee_sids, feed_file)
        if row_count:
            write_file_to_staging(table_name, feed_file, row_count)
            with redshift.transaction() as transaction:
                refresh_from_staging(
                    table_name,
                    term_id=None,
                    sids=non_advisee_sids,
                    transaction=transaction,
                )
                # Commit explicitly, as in the other examples.
                if not transaction.commit():
                    raise BackgroundJobError(
                        'Transaction commit failed on enrollment term refresh.')
    app.logger.info('Non-advisee term enrollment generation complete.')
    return row_count
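
The keyword arguments in this variant pin down the apparent parameter names of refresh_from_staging. A stub inferred purely from the call sites in these examples (the real implementation lives in the source repo):

def refresh_from_staging(table, term_id, sids, transaction, truncate_staging=True):
    # Inferred signature only: moves staged rows into the destination
    # table, optionally scoped to one term_id and/or a set of sids.
    # truncate_staging=False (see the next example) skips the cleanup step.
    raise NotImplementedError('See the source repo for the real implementation.')
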
Example #5

    def refresh_student_enrollment_term(self, term_id, enrollment_term_map):
        with tempfile.TemporaryFile() as enrollment_term_file:
            for (sid, sid_term_feed) in enrollment_term_map.items():
                enrollment_term_file.write(
                    encoded_tsv_row([sid, term_id,
                                     json.dumps(sid_term_feed)]) + b'\n')

            drop_staged_enrollment_term(term_id)
            write_file_to_staging('student_enrollment_terms',
                                  enrollment_term_file,
                                  len(enrollment_term_map), term_id)

        with redshift.transaction() as transaction:
            refresh_from_staging('student_enrollment_terms',
                                 term_id,
                                 None,
                                 transaction,
                                 truncate_staging=False)
            if not transaction.commit():
                raise BackgroundJobError(
                    f'Final transaction commit failed on enrollment term refresh (term_id={term_id}).'
                )
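
encoded_tsv_row serializes one row as bytes for the binary temp file, and the caller appends b'\n' itself. A plausible sketch of such a helper, assuming tab-separated UTF-8 output with no trailing newline:

def encoded_tsv_row(values):
    # Sketch only; the real helper lives in the source repo. None becomes
    # an empty field, everything else is stringified.
    return '\t'.join('' if v is None else str(v) for v in values).encode()
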
Example #6
    def run(self, term_ids=None):
        if not term_ids:
            term_ids = reverse_term_ids()
        app.logger.info(
            f'Starting SIS terms API import job for {len(term_ids)} terms...')

        rows = []
        success_count = 0
        failure_count = 0
        for index, term_id in enumerate(term_ids, start=1):
            app.logger.info(
                f'Fetching SIS terms API for term id {term_id} ({index} of {len(term_ids)})'
            )
            feed = sis_terms_api.get_term(term_id)
            if feed:
                success_count += 1
                for academic_career_term in feed:
                    for session in academic_career_term.get('sessions', []):
                        rows.append(
                            '\t'.join([
                                academic_career_term.get('id', ''),
                                academic_career_term.get('name', ''),
                                academic_career_term.get('academicCareer',
                                                         {}).get('code', ''),
                                academic_career_term.get('beginDate', ''),
                                academic_career_term.get('endDate', ''),
                                session.get('id', ''),
                                session.get('name', ''),
                                session.get('beginDate', ''),
                                session.get('endDate', ''),
                            ]))
            else:
                failure_count += 1
                app.logger.error(
                    f'SIS terms API import failed for term id {term_id}.')

        s3_key = f'{get_s3_sis_api_daily_path()}/terms.tsv'
        app.logger.info(
            f'Will stash {len(rows)} rows from {success_count} feeds in S3: {s3_key}'
        )
        if not s3.upload_data('\n'.join(rows), s3_key):
            app.logger.error('Error on S3 upload: aborting job.')
            return False

        app.logger.info('Will copy S3 feeds into Redshift...')
        with redshift.transaction() as transaction:
            if self.update_redshift(term_ids, transaction):
                transaction.commit()
                app.logger.info('Updated Redshift.')
            else:
                transaction.rollback()
                app.logger.error('Failed to update Redshift.')
                return False

        with rds.transaction() as transaction:
            if self.update_rds(rows, term_ids, transaction):
                transaction.commit()
                app.logger.info('Updated RDS.')
            else:
                transaction.rollback()
                app.logger.error('Failed to update RDS.')
                return False

        return f'SIS terms API import job completed: {success_count} succeeded, {failure_count} failed.'
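
The nested loop in the middle of run() is a flattening step: one TSV row per (academic career term, session) pair, nine columns per row. The same logic reads more clearly as a standalone generator:

def tsv_rows_for_feed(feed):
    # One row per (career term, session) pair, mirroring the nine columns
    # written by the job above.
    for term in feed:
        for session in term.get('sessions', []):
            yield '\t'.join([
                term.get('id', ''),
                term.get('name', ''),
                term.get('academicCareer', {}).get('code', ''),
                term.get('beginDate', ''),
                term.get('endDate', ''),
                session.get('id', ''),
                session.get('name', ''),
                session.get('beginDate', ''),
                session.get('endDate', ''),
            ])
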
Example #7
    def generate_feeds(self, term_id=None, sids=None):
        """Loop through all records stored in the Calnet external schema and write merged student data to the internal student schema."""
        calnet_profiles = self.fetch_calnet_profiles(sids)

        # Jobs targeted toward a specific sid set (such as backfills) may return no CalNet profiles. Warn, don't error.
        if not calnet_profiles:
            app.logger.warning(
                f'No CalNet profiles returned, aborting job. (sids={sids})')
            return False

        # Jobs for non-current terms generate enrollment feeds only.
        if term_id and term_id != berkeley.current_term_id():
            tables = ['student_enrollment_terms']
        else:
            tables = [
                'student_profiles', 'student_academic_status',
                'student_majors', 'student_enrollment_terms', 'student_holds'
            ]

        # In-memory storage for generated feeds prior to TSV output.
        self.rows = {
            'student_profiles': [],
            'student_academic_status': [],
            'student_majors': [],
            'student_enrollment_terms': [],
            'student_holds': [],
        }

        # Track the results of course-level queries to avoid requerying.
        self.canvas_site_map = {}

        # Remove any old data from staging tables.
        for table in tables:
            redshift.execute(
                'TRUNCATE {schema}.{table}',
                schema=self.staging_schema_identifier,
                table=psycopg2.sql.Identifier(table),
            )

        app.logger.info(
            f'Will generate feeds for {len(calnet_profiles)} students (term_id={term_id}).'
        )
        successes = []
        failures = []
        for index, (sid, profile_group) in enumerate(
                groupby(calnet_profiles, operator.itemgetter('sid')),
                start=1):
            app.logger.info(
                f'Generating feeds for sid {sid} ({index} of {len(calnet_profiles)})'
            )
            merged_profile = self.generate_or_fetch_merged_profile(
                term_id, sid,
                list(profile_group)[0])
            if merged_profile:
                self.generate_merged_enrollment_terms(merged_profile, term_id)
                self.parse_holds(sid)
                successes.append(sid)
            else:
                failures.append(sid)

        for table in tables:
            if not self.rows[table]:
                continue
            self.upload_to_staging(table)
            if not self.verify_table(table):
                return False

        with redshift.transaction() as transaction:
            for table in tables:
                if not self.refresh_from_staging(table, term_id, sids,
                                                 transaction):
                    app.logger.error(
                        f'Failed to refresh {self.destination_schema}.{table} from staging.'
                    )
                    return False
            if not transaction.commit():
                app.logger.error(
                    f'Final transaction commit failed for {self.destination_schema}.'
                )
                return False

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(sids, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                app.logger.error('Failed to refresh RDS indexes.')
                return False

        update_merged_feed_status(term_id, successes, failures)
        app.logger.info('Updated merged feed status.')

        return f'Merged profile generation complete: {len(successes)} successes, {len(failures)} failures.'
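
One caveat in generate_feeds: itertools.groupby only merges adjacent records, so calnet_profiles must already be ordered by sid for the loop above to see one group per student. A small demonstration:

import operator
from itertools import groupby

profiles = [
    {'sid': '200', 'feed': 'c'},
    {'sid': '100', 'feed': 'a'},
    {'sid': '100', 'feed': 'b'},
]
# Without this sort, sid 100 would surface as two separate groups.
profiles.sort(key=operator.itemgetter('sid'))
for sid, group in groupby(profiles, operator.itemgetter('sid')):
    print(sid, [p['feed'] for p in group])
# 100 ['a', 'b']
# 200 ['c']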