Example 1
    def run(self, csids=None):
        if not csids:
            csids = [row['sid'] for row in get_all_student_ids()]
        app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')

        rows, failure_count = self.load(csids)
        if (len(rows) == 0) and (failure_count > 0):
            raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')

        s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
        app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')

        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.sis_api_profiles'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.sis_api_profiles', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {redshift_schema}.sis_api_profiles WHERE sid IN
                (SELECT sid FROM {redshift_schema}_staging.sis_api_profiles);
            INSERT INTO {redshift_schema}.sis_api_profiles
                (SELECT * FROM {redshift_schema}_staging.sis_api_profiles);
            TRUNCATE {redshift_schema}_staging.sis_api_profiles;
            """,
            redshift_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
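The closing delete/insert/truncate sequence above is a staging-merge pattern that recurs in Examples 3, 6 and 7. As a sketch only, it could be expressed once in a helper; the name merge_staged_rows is invented here, and the sketch assumes resolve_sql_template_string accepts arbitrary keyword substitutions (as Example 3 suggests):

def merge_staged_rows(table, redshift_schema):
    # Hypothetical helper (name invented for illustration): delete destination rows whose
    # sids appear in staging, copy the staged rows across, then clear the staging table.
    query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema}.{table} WHERE sid IN
            (SELECT sid FROM {redshift_schema}_staging.{table});
        INSERT INTO {redshift_schema}.{table}
            (SELECT * FROM {redshift_schema}_staging.{table});
        TRUNCATE {redshift_schema}_staging.{table};
        """,
        redshift_schema=redshift_schema,
        table=table,
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift staging merge: aborting job.')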
Example 2
    def refresh_rds_indexes(self, sids, transaction):
        if not (self._delete_rds_rows('student_academic_status', sids, transaction)
                and self._refresh_rds_academic_status(transaction)
                and self._delete_rds_rows('student_holds', sids, transaction)
                and self._refresh_rds_holds(transaction)
                and self._delete_rds_rows('student_names', sids, transaction)
                and self._refresh_rds_names(transaction)
                and self._delete_rds_rows('student_majors', sids, transaction)
                and self._refresh_rds_majors(transaction)
                and self._delete_rds_rows('student_profiles', sids, transaction)
                and self._refresh_rds_profiles(transaction)
                and self._delete_rds_rows('intended_majors', sids, transaction)
                and self._refresh_rds_intended_majors(transaction)
                and self._delete_rds_rows('academic_standing', sids, transaction)
                and self._refresh_rds_academic_standing(transaction)
                and self._delete_rds_rows('minors', sids, transaction)
                and self._refresh_rds_minors(transaction)
                and self._index_rds_email_address(transaction)
                and self._index_rds_entering_term(transaction)
                and refresh_rds_demographics(self.rds_schema,
                                             self.rds_dblink_to_redshift,
                                             student_schema(),
                                             transaction)):
            return False
        return True
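As a sketch only, the long boolean chain above could also be written data-driven, pairing each RDS table with the method that repopulates it; every name is taken from Example 2 and the behavior is intended to be equivalent:

    def refresh_rds_indexes(self, sids, transaction):
        # Sketch of a data-driven rewrite of Example 2: delete stale rows per table,
        # then call the matching refresh method, stopping at the first failure.
        steps = [
            ('student_academic_status', self._refresh_rds_academic_status),
            ('student_holds', self._refresh_rds_holds),
            ('student_names', self._refresh_rds_names),
            ('student_majors', self._refresh_rds_majors),
            ('student_profiles', self._refresh_rds_profiles),
            ('intended_majors', self._refresh_rds_intended_majors),
            ('academic_standing', self._refresh_rds_academic_standing),
            ('minors', self._refresh_rds_minors),
        ]
        for table, refresh in steps:
            if not (self._delete_rds_rows(table, sids, transaction) and refresh(transaction)):
                return False
        if not (
            self._index_rds_email_address(transaction)
            and self._index_rds_entering_term(transaction)
            and refresh_rds_demographics(
                self.rds_schema, self.rds_dblink_to_redshift, student_schema(), transaction)
        ):
            return False
        return True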
Example 3
    def run(self, sids=None):
        if not sids:
            sids = [row['sid'] for row in get_unfetched_non_advisees()]
        app.logger.info(
            f'Starting SIS student API import job for {len(sids)} non-advisees...'
        )

        with tempfile.TemporaryFile() as feed_file:
            saved_sids, failure_count = self.load_concurrently(sids, feed_file)
            if saved_sids:
                sis_profiles_hist_enr = student_schema_table(
                    'sis_profiles_hist_enr')
                truncate_staging_table(sis_profiles_hist_enr)
                write_file_to_staging(sis_profiles_hist_enr, feed_file,
                                      len(saved_sids))

        if saved_sids:
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema}.{sis_profiles_hist_enr} WHERE sid IN
                    (SELECT sid FROM {redshift_schema}_staging.{sis_profiles_hist_enr});
                INSERT INTO {redshift_schema}.{sis_profiles_hist_enr}
                    (SELECT * FROM {redshift_schema}_staging.{sis_profiles_hist_enr});
                TRUNCATE {redshift_schema}_staging.{sis_profiles_hist_enr};
                """,
                redshift_schema=student_schema(),
                sis_profiles_hist_enr=sis_profiles_hist_enr,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError(
                    'Error on Redshift copy: aborting job.')

        return f'SIS student API non-advisee import job completed: {len(saved_sids)} succeeded, {failure_count} failed.'
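A side note on the with-block in Example 3: the temporary feed file exists only inside the block, so the staging write has to happen there, while ordinary names bound in the block (such as saved_sids) remain available afterwards. A minimal, self-contained illustration of that Python behavior:

import tempfile

# Names assigned inside a `with` block survive it; the temporary file does not.
with tempfile.TemporaryFile() as feed_file:
    feed_file.write(b'row-1\n')
    feed_file.seek(0)
    saved = feed_file.read().splitlines()
print(saved)             # [b'row-1'] -- still usable here
print(feed_file.closed)  # True -- the file was closed (and deleted) on leaving the block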
Example 4
def unload_enrollment_terms(term_ids):
    query = resolve_sql_template_string(
        """
        UNLOAD ('SELECT *, GETDATE() AS analytics_generated_at
            FROM {schema}.student_enrollment_terms
            WHERE term_id=ANY(\''{{{term_ids}}}\'')')
            TO '{loch_s3_boac_analytics_incremental_path}/student_enrollment_terms'
            IAM_ROLE '{redshift_iam_role}'
            ENCRYPTED
            DELIMITER AS '\\t'
            ALLOWOVERWRITE
            GZIP;
        """,
        schema=student_schema(),
        term_ids=','.join(term_ids),
    )
    if not redshift.execute(query):
        raise BackgroundJobError('Error on Redshift unload: aborting job.')
Example 5
def refresh_from_staging(table,
                         term_id,
                         sids,
                         transaction,
                         truncate_staging=True):
    # If our job is restricted to a particular term id or set of sids, then drop rows from the destination table
    # matching those restrictions. If there are no restrictions, the entire destination table can be truncated.
    refresh_conditions = []
    refresh_params = []
    if term_id:
        refresh_conditions.append('term_id = %s')
        refresh_params.append(term_id)
    if sids:
        refresh_conditions.append('sid = ANY(%s)')
        refresh_params.append(sids)

    def _success():
        app.logger.info(
            f'Populated {student_schema()}.{table} from staging schema.')

    def _rollback():
        transaction.rollback()
        raise BackgroundJobError(
            f'Failed to populate table {student_schema()}.{table} from staging schema.'
        )

    if not refresh_conditions:
        transaction.execute(
            'TRUNCATE {schema}.{table}',
            schema=psycopg2.sql.Identifier(student_schema()),
            table=psycopg2.sql.Identifier(table),
        )
        app.logger.info(
            f'Truncated destination table {student_schema()}.{table}.')

        if transaction.execute(
            'INSERT INTO {schema}.{table} (SELECT * FROM {staging_schema}.{table})',
            schema=psycopg2.sql.Identifier(student_schema()),
            staging_schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
        ):
            _success()
        else:
            _rollback()

    else:
        delete_sql = 'DELETE FROM {schema}.{table} WHERE ' + ' AND '.join(
            refresh_conditions)
        transaction.execute(
            delete_sql,
            schema=psycopg2.sql.Identifier(student_schema()),
            table=psycopg2.sql.Identifier(table),
            params=tuple(refresh_params),
        )
        app.logger.info(
            f'Deleted existing rows from destination table {student_schema()}.{table} '
            f"(term_id={term_id or 'all'}, {len(sids) if sids else 'all'} sids).",
        )
        insert_sql = 'INSERT INTO {schema}.{table} (SELECT * FROM {staging_schema}.{table} WHERE ' + ' AND '.join(
            refresh_conditions) + ')'

        if transaction.execute(
            insert_sql,
            schema=psycopg2.sql.Identifier(student_schema()),
            staging_schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
            params=tuple(refresh_params),
        ):
            _success()
        else:
            _rollback()

    # The staging table can now be truncated, unless we're running a job distributed between workers.
    if truncate_staging:
        transaction.execute(
            'TRUNCATE {schema}.{table}',
            schema=psycopg2.sql.Identifier(staging_schema()),
            table=psycopg2.sql.Identifier(table),
        )
        app.logger.info(f'Truncated staging table {staging_schema()}.{table}.')
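To make the conditional branch of refresh_from_staging concrete, here is a standalone illustration of the SQL assembled when both a term_id and a sid list are supplied; it mirrors the string building above and nothing more:

# Standalone illustration of the statements built above when both restrictions apply.
refresh_conditions = ['term_id = %s', 'sid = ANY(%s)']
delete_sql = 'DELETE FROM {schema}.{table} WHERE ' + ' AND '.join(refresh_conditions)
insert_sql = (
    'INSERT INTO {schema}.{table} (SELECT * FROM {staging_schema}.{table} WHERE '
    + ' AND '.join(refresh_conditions) + ')'
)
# delete_sql: DELETE FROM {schema}.{table} WHERE term_id = %s AND sid = ANY(%s)
# insert_sql: INSERT INTO {schema}.{table}
#             (SELECT * FROM {staging_schema}.{table} WHERE term_id = %s AND sid = ANY(%s))
# The {schema}/{table} placeholders are filled with psycopg2.sql.Identifier values, and
# params=(term_id, sids) supplies the %s bindings at execute time.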
Example 6
    def run(self, load_mode='new'):
        all_sids = [row['sid'] for row in get_all_student_ids()]
        previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}

        if load_mode == 'new':
            sids = list(set(all_sids).difference(previous_backfills))
        elif load_mode == 'batch':
            new_sids = list(set(all_sids).difference(previous_backfills))
            limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
            if limit > 0:
                oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
                sids = new_sids + oldest_backfills
            else:
                sids = new_sids
        elif load_mode == 'all':
            sids = all_sids

        app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')

        rows = {
            'term_gpas': [],
            'last_registrations': [],
            'api_demographics': [],
        }
        successes, failures = self.get_registration_data_per_sids(rows, sids)
        if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
            raise BackgroundJobError('Failed to import registration histories: aborting job.')

        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.student_{table_key}
                    WHERE sid IN
                    (SELECT sid FROM {student_schema}_staging.student_{table_key});
                INSERT INTO {student_schema}.student_{table_key}
                    (SELECT * FROM {student_schema}_staging.student_{table_key});
                TRUNCATE TABLE {student_schema}_staging.student_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Failed to refresh RDS indexes.')

        update_registration_import_status(successes, failures)

        return (
            f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
        )
Example 7
    def run(self, load_mode='batch'):
        new_sids = [
            row['sid']
            for row in get_non_advisees_without_registration_imports()
        ]

        # Owing to the size of the non-advisee population, a one-shot load of all these slow feeds may not
        # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
        # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
        #
        # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
        # likely to be triggered.)
        if load_mode == 'new':
            sids = new_sids
        elif load_mode == 'batch':
            max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
            if max_batch >= len(new_sids):
                sids = new_sids
            else:
                sids = new_sids[:max_batch]

        app.logger.info(
            f'Starting import of historical registration data for {len(sids)} students...'
        )
        redshift.execute('VACUUM; ANALYZE;')

        rows = {
            'term_gpas': [],
            'last_registrations': [],
        }
        successes, failures = self.get_registration_data_per_sids(
            rows, sids, include_demographics=False)
        for key in rows.keys():
            if len(rows[key]) > 0:
                s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
                app.logger.info(
                    f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.'
                )
                if not s3.upload_tsv_rows(rows[key], s3_key):
                    raise BackgroundJobError(
                        f'Error during S3 upload: {s3_key}. Aborting job.')

                staging_table = f'{student_schema()}_staging.hist_enr_{key}'
                if not redshift.execute(f'TRUNCATE {staging_table}'):
                    raise BackgroundJobError(
                        'Error truncating old staging rows: aborting job.')

                app.logger.info(
                    f'Populate {staging_table} (Redshift table) with s3:{s3_key}'
                )
                if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                    raise BackgroundJobError(
                        'Error on Redshift copy: aborting job.')

                app.logger.info(
                    f'Insert student data into {student_schema()}.hist_enr_{key}'
                )
                staging_to_destination_query = resolve_sql_template_string(
                    """
                    DELETE FROM {student_schema}.hist_enr_{table_key}
                        WHERE sid IN
                        (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                    INSERT INTO {student_schema}.hist_enr_{table_key}
                        (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                    TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                    """,
                    table_key=key,
                    student_schema=student_schema(),
                )
                if not redshift.execute(staging_to_destination_query):
                    raise BackgroundJobError(
                        'Error inserting staging entries into destination: aborting job.'
                    )

        redshift.execute('VACUUM; ANALYZE;')
        return (
            f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'
        )
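Both Example 6 and Example 7 select sids through an if/elif dispatch on load_mode and leave sids unassigned when an unrecognized mode is passed, which only surfaces later as a NameError at the logging call. A defensive sketch of Example 7's dispatch, written as a drop-in for the start of its run() body (same config key, purely illustrative):

        # Sketch only: fail fast on an unknown load_mode instead of hitting a NameError later.
        if load_mode == 'new':
            sids = new_sids
        elif load_mode == 'batch':
            max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
            sids = new_sids if max_batch >= len(new_sids) else new_sids[:max_batch]
        else:
            raise BackgroundJobError(f'Unrecognized load_mode: {load_mode}')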