def run(self, csids=None):
    if app.config['STUDENT_V1_API_PREFERRED']:
        return self.run_v1(csids)
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows, failure_count = self.load_concurrently(csids)
    if (len(rows) == 0) and (failure_count > 0):
        raise BackgroundJobError('Failed to import SIS student API feeds: aborting job.')
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {len(rows)} feeds in S3: {s3_key}')
    if not s3.upload_tsv_rows(rows, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.sis_api_profiles'):
        raise BackgroundJobError('Error truncating old staging rows: aborting job.')
    if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.sis_api_profiles', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
    return f'SIS student API import job completed: {len(rows)} succeeded, {failure_count} failed.'
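
# `load_concurrently` is defined elsewhere in this class. A minimal sketch of
# what it might look like, assuming a thread pool wrapped around the same
# per-SID fetch that the sequential variant below performs; the pool size, and
# the use of concurrent.futures at all, are assumptions rather than the
# project's confirmed implementation.
def load_concurrently(self, csids):
    from concurrent.futures import ThreadPoolExecutor
    rows = []
    failure_count = 0

    def _fetch(csid):
        return csid, sis_student_api.get_student(csid)

    # Fetch feeds in parallel; collect TSV rows as results come back.
    with ThreadPoolExecutor(max_workers=8) as executor:
        for csid, feed in executor.map(_fetch, csids):
            if feed:
                rows.append('\t'.join([str(csid), json.dumps(feed)]))
            else:
                failure_count += 1
                app.logger.error(f'SIS student API import failed for CSID {csid}.')
    return rows, failure_count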
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS student API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS student API for SID {csid} ({index} of {len(csids)})')
        feed = sis_student_api.get_student(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        else:
            failure_count += 1
            app.logger.error(f'SIS student API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/profiles.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_profiles'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_profiles', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_profiles
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_profiles);
        INSERT INTO {redshift_schema_student}.sis_api_profiles
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_profiles);
        TRUNCATE {redshift_schema_student}_staging.sis_api_profiles;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return f'SIS student API import job completed: {success_count} succeeded, {failure_count} failed.'
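
# `resolve_sql_template_string` is assumed to interpolate the {tokens} in these
# SQL templates from app config plus any keyword overrides (table_key, term_id
# and so on in the jobs below). A sketch under that assumption; the config key
# name is illustrative.
def resolve_sql_template_string(template_string, **kwargs):
    interpolations = {
        'redshift_schema_student': app.config['REDSHIFT_SCHEMA_STUDENT'],
    }
    interpolations.update(kwargs)
    # str.format raises KeyError on any token left unresolved, which surfaces
    # template/config mismatches early.
    return template_string.format(**interpolations)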
def _upload_file_to_staging(self, table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f'{self.internal_schema}.{table}', s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
def _upload_file_to_staging(table, _file):
    tsv_filename = f'staging_{table}.tsv'
    s3_key = f'{get_s3_edl_daily_path()}/{tsv_filename}'
    app.logger.info(f'Will stash {table} feeds in S3: {s3_key}')
    if not s3.upload_file(_file, s3_key):
        raise BackgroundJobError('Error on S3 upload: aborting job.')
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.copy_tsv_from_s3(f"{app.config['REDSHIFT_SCHEMA_EDL']}.{table}", s3_key):
        raise BackgroundJobError('Error on Redshift copy: aborting job.')
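
# Hypothetical caller for either _upload_file_to_staging variant above: build
# the TSV in a temporary file, rewind, and hand over the open file object. This
# assumes s3.upload_file accepts a file-like object, as the _file parameter
# name suggests; the table name is illustrative.
import tempfile

def _stage_rows(rows):
    with tempfile.TemporaryFile() as feed_file:
        for row in rows:
            feed_file.write(row.encode() + b'\n')
        feed_file.seek(0)  # rewind so the upload reads from the start
        _upload_file_to_staging('student_demographics', feed_file)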
def run(self, load_mode='new'):
    all_sids = [row['sid'] for row in get_all_student_ids()]
    previous_backfills = {row['sid'] for row in get_sids_with_registration_imports()}
    if load_mode == 'new':
        sids = list(set(all_sids).difference(previous_backfills))
    elif load_mode == 'batch':
        new_sids = list(set(all_sids).difference(previous_backfills))
        limit = app.config['CYCLICAL_API_IMPORT_BATCH_SIZE'] - len(new_sids)
        if limit > 0:
            oldest_backfills = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=limit)]
            sids = new_sids + oldest_backfills
        else:
            sids = new_sids
    elif load_mode == 'all':
        sids = all_sids
    app.logger.info(f'Starting registrations/demographics import job for {len(sids)} students...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
        'api_demographics': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids)
    if load_mode != 'new' and (len(successes) == 0) and (len(failures) > 0):
        raise BackgroundJobError('Failed to import registration histories: aborting job.')
    for key in rows.keys():
        s3_key = f'{get_s3_sis_api_daily_path(use_edl_if_feature_flag=True)}/{key}.tsv'
        app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(rows[key], s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')
        app.logger.info('Will copy S3 feeds into Redshift...')
        if not redshift.execute(f'TRUNCATE {student_schema()}_staging.student_{key}'):
            raise BackgroundJobError('Error truncating old staging rows: aborting job.')
        if not redshift.copy_tsv_from_s3(f'{student_schema()}_staging.student_{key}', s3_key):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')
        staging_to_destination_query = resolve_sql_template_string(
            """
            DELETE FROM {student_schema}.student_{table_key}
                WHERE sid IN (SELECT sid FROM {student_schema}_staging.student_{table_key});
            INSERT INTO {student_schema}.student_{table_key}
                (SELECT * FROM {student_schema}_staging.student_{table_key});
            TRUNCATE TABLE {student_schema}_staging.student_{table_key};
            """,
            table_key=key,
            student_schema=student_schema(),
        )
        if not redshift.execute(staging_to_destination_query):
            raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(sids, rows['term_gpas'], transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            raise BackgroundJobError('Failed to refresh RDS indexes.')
    update_registration_import_status(successes, failures)
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting SIS degree progress API import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_information_count = 0
    failure_count = 0
    index = 1
    # TODO The SIS degree progress API will return useful data only for students with a UGRD current registration.
    # We get that registration from the SIS student API, which is imported concurrently with this job. Is there an
    # alternative way to filter out non-UGRD students?
    for csid in csids:
        app.logger.info(f'Fetching degree progress API for SID {csid} ({index} of {len(csids)})')
        feed = sis_degree_progress_api.parsed_degree_progress(csid)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), json.dumps(feed)]))
        elif feed == {}:
            app.logger.info(f'No degree progress information found for SID {csid}.')
            no_information_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS get_degree_progress failed for SID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/degree_progress.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.sis_api_degree_progress'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_degree_progress', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_degree_progress
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        INSERT INTO {redshift_schema_student}.sis_api_degree_progress
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_degree_progress);
        TRUNCATE {redshift_schema_student}_staging.sis_api_degree_progress;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS degree progress API import job completed: {success_count} succeeded, '
        f'{no_information_count} returned no information, {failure_count} failed.'
    )
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    #
    # (With the logic change in NS-1155 to pre-screen SIDs for student affiliation in CalNet, the cutoff is less
    # likely to be triggered.)
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[:max_batch]
    app.logger.info(f'Starting import of historical registration data for {len(sids)} students...')
    redshift.execute('VACUUM; ANALYZE;')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.get_registration_data_per_sids(rows, sids, include_demographics=False)
    for key in rows.keys():
        if len(rows[key]) > 0:
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Upload {key} data to s3:{s3_key}. The file represents {len(rows[key])} students.')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError(f'Error during S3 upload: {s3_key}. Aborting job.')
            staging_table = f'{student_schema()}_staging.hist_enr_{key}'
            if not redshift.execute(f'TRUNCATE {staging_table}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            app.logger.info(f'Populate {staging_table} (Redshift table) with s3:{s3_key}')
            if not redshift.copy_tsv_from_s3(staging_table, s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            app.logger.info(f'Insert student data into {student_schema()}.hist_enr_{key}')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {student_schema}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {student_schema}_staging.hist_enr_{table_key});
                INSERT INTO {student_schema}.hist_enr_{table_key}
                    (SELECT * FROM {student_schema}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {student_schema}_staging.hist_enr_{table_key};
                """,
                table_key=key,
                student_schema=student_schema(),
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    redshift.execute('VACUUM; ANALYZE;')
    return f'Finished import of historical registration data: {len(successes)} successes and {len(failures)} failures.'
def run(self, csids=None, term_id=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    if not term_id:
        term_id = current_term_id()
    app.logger.info(f'Starting SIS enrollments API import job for term {term_id}, {len(csids)} students...')
    rows = []
    success_count = 0
    no_enrollments_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching SIS enrollments API for SID {csid}, term {term_id} ({index} of {len(csids)})')
        feed = sis_enrollments_api.get_drops_and_midterms(csid, term_id)
        if feed:
            success_count += 1
            rows.append('\t'.join([str(csid), str(term_id), json.dumps(feed)]))
        elif feed is False:
            app.logger.info(f'SID {csid} returned no enrollments for term {term_id}.')
            no_enrollments_count += 1
        else:
            failure_count += 1
            app.logger.error(f'SIS enrollments API import failed for CSID {csid}.')
        index += 1
    s3_key = f'{get_s3_sis_api_daily_path()}/drops_and_midterms_{term_id}.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(
        f"DELETE FROM {self.destination_schema}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}'",
    ):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.sis_api_drops_and_midterms', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.sis_api_drops_and_midterms
            WHERE term_id = '{term_id}'
            AND sid IN
            (SELECT sid FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        INSERT INTO {redshift_schema_student}.sis_api_drops_and_midterms
            (SELECT * FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}');
        DELETE FROM {redshift_schema_student}_staging.sis_api_drops_and_midterms WHERE term_id = '{term_id}';
        """,
        term_id=term_id,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    return (
        f'SIS enrollments API import completed for term {term_id}: {success_count} succeeded, '
        f'{no_enrollments_count} returned no enrollments, {failure_count} failed.'
    )
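
# Design note on the job above: because drops-and-midterms rows are scoped by
# term, the job deletes only the matching term_id from staging and destination
# rather than truncating, so feeds already staged for other terms survive a
# rerun. The surrounding jobs, which stage one table per run, can simply
# TRUNCATE their staging tables.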
def run(self, csids=None):
    if not csids:
        csids = [row['sid'] for row in get_all_student_ids()]
    app.logger.info(f'Starting term GPA import job for {len(csids)} students...')
    rows = []
    success_count = 0
    no_registrations_count = 0
    failure_count = 0
    index = 1
    for csid in csids:
        app.logger.info(f'Fetching term GPAs for SID {csid}, ({index} of {len(csids)})')
        feed = sis_student_api.get_term_gpas(csid)
        if feed:
            success_count += 1
            for term_id, term_data in feed.items():
                rows.append('\t'.join([
                    str(csid),
                    str(term_id),
                    str(term_data.get('gpa') or '0'),
                    str(term_data.get('unitsTakenForGpa') or '0'),
                ]))
        elif feed == {}:
            app.logger.info(f'No registrations found for SID {csid}.')
            no_registrations_count += 1
        else:
            failure_count += 1
            app.logger.error(f'Term GPA import failed for SID {csid}.')
        index += 1
    if success_count == 0:
        app.logger.error('Failed to import term GPAs: aborting job.')
        return False
    s3_key = f'{get_s3_sis_api_daily_path()}/term_gpas.tsv'
    app.logger.info(f'Will stash {success_count} feeds in S3: {s3_key}')
    if not s3.upload_data('\n'.join(rows), s3_key):
        app.logger.error('Error on S3 upload: aborting job.')
        return False
    app.logger.info('Will copy S3 feeds into Redshift...')
    if not redshift.execute(f'TRUNCATE {self.destination_schema}_staging.student_term_gpas'):
        app.logger.error('Error truncating old staging rows: aborting job.')
        return False
    if not redshift.copy_tsv_from_s3(f'{self.destination_schema}_staging.student_term_gpas', s3_key):
        app.logger.error('Error on Redshift copy: aborting job.')
        return False
    staging_to_destination_query = resolve_sql_template_string(
        """
        DELETE FROM {redshift_schema_student}.student_term_gpas
            WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.student_term_gpas);
        INSERT INTO {redshift_schema_student}.student_term_gpas
            (SELECT * FROM {redshift_schema_student}_staging.student_term_gpas);
        TRUNCATE TABLE {redshift_schema_student}_staging.student_term_gpas;
        """,
    )
    if not redshift.execute(staging_to_destination_query):
        app.logger.error('Error inserting staging entries into destination: aborting job.')
        return False
    with rds.transaction() as transaction:
        if self.refresh_rds_indexes(csids, rows, transaction):
            transaction.commit()
            app.logger.info('Refreshed RDS indexes.')
        else:
            transaction.rollback()
            app.logger.error('Failed to refresh RDS indexes.')
            return False
    return (
        f'Term GPA import completed: {success_count} succeeded, '
        f'{no_registrations_count} returned no registrations, {failure_count} failed.'
    )
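
# `refresh_rds_indexes` is defined elsewhere. A minimal sketch, assuming the
# transaction object exposes an execute() that returns falsy on failure, and
# that the self.rds_schema attribute and RDS table below exist; rows arrive as
# the tab-delimited strings built in the job above.
def refresh_rds_indexes(self, csids, rows, transaction):
    if csids:
        sid_list = ','.join([f"'{sid}'" for sid in csids])
        # Replace any existing term GPA rows for the imported students.
        if not transaction.execute(
            f'DELETE FROM {self.rds_schema}.student_term_gpas WHERE sid IN ({sid_list})',
        ):
            return False
        for row in rows:
            sid, term_id, gpa, units = row.split('\t')
            if not transaction.execute(
                f"""INSERT INTO {self.rds_schema}.student_term_gpas (sid, term_id, gpa, units_taken_for_gpa)
                    VALUES ('{sid}', '{term_id}', {gpa}, {units})""",
            ):
                return False
    return True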
def run(self, load_mode='batch'):
    new_sids = [row['sid'] for row in get_non_advisees_without_registration_imports()]
    # The size of the non-advisee population makes it unlikely that a one-shot load of all these slow feeds will
    # finish successfully without interfering with other work. Therefore the default approach is to apply a strict
    # upper limit on the number of feeds loaded in any one job run, no matter how many SIDs remain to be processed.
    if load_mode == 'new':
        sids = new_sids
    elif load_mode == 'batch':
        max_batch = app.config['HIST_ENR_REGISTRATIONS_IMPORT_BATCH_SIZE']
        if max_batch >= len(new_sids):
            sids = new_sids
        else:
            sids = new_sids[:max_batch]
    app.logger.info(f'Starting registrations import job for {len(sids)} non-advisees...')
    rows = {
        'term_gpas': [],
        'last_registrations': [],
    }
    successes, failures = self.load_concurrently(rows, sids)
    if len(successes) > 0:
        for key in rows.keys():
            s3_key = f'{get_s3_sis_api_daily_path()}/{key}.tsv'
            app.logger.info(f'Will stash {len(successes)} feeds in S3: {s3_key}')
            if not s3.upload_tsv_rows(rows[key], s3_key):
                raise BackgroundJobError('Error on S3 upload: aborting job.')
            app.logger.info('Will copy S3 feeds into Redshift...')
            if not redshift.execute(f'TRUNCATE {self.redshift_schema}_staging.hist_enr_{key}'):
                raise BackgroundJobError('Error truncating old staging rows: aborting job.')
            if not redshift.copy_tsv_from_s3(f'{self.redshift_schema}_staging.hist_enr_{key}', s3_key):
                raise BackgroundJobError('Error on Redshift copy: aborting job.')
            staging_to_destination_query = resolve_sql_template_string(
                """
                DELETE FROM {redshift_schema_student}.hist_enr_{table_key}
                    WHERE sid IN (SELECT sid FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                INSERT INTO {redshift_schema_student}.hist_enr_{table_key}
                    (SELECT * FROM {redshift_schema_student}_staging.hist_enr_{table_key});
                TRUNCATE TABLE {redshift_schema_student}_staging.hist_enr_{table_key};
                """,
                table_key=key,
            )
            if not redshift.execute(staging_to_destination_query):
                raise BackgroundJobError('Error inserting staging entries into destination: aborting job.')
    return f'Registrations import completed: {len(successes)} succeeded, {len(failures)} failed.'