Code Example #1
    def verify_migration(self, incremental_url, incremental_schema):
        redshift.drop_external_schema(incremental_schema)
        resolved_ddl_transient = resolve_sql_template(
            'create_lrs_statements_table.template.sql',
            redshift_schema_lrs_external=incremental_schema,
            loch_s3_lrs_statements_path=incremental_url,
        )
        if redshift.execute_ddl_script(resolved_ddl_transient):
            app.logger.info(
                f"LRS incremental schema '{incremental_schema}' created.")
        else:
            raise BackgroundJobError(
                f"LRS incremental schema '{incremental_schema}' creation failed."
            )

        redshift_response = redshift.fetch(
            f'select count(*) from {incremental_schema}.statements')
        if redshift_response:
            redshift_statement_count = redshift_response[0].get('count')
        else:
            raise BackgroundJobError(
                f"Failed to verify LRS incremental schema '{incremental_schema}'."
            )

        if redshift_statement_count == self.lrs_statement_count:
            app.logger.info(
                f'Verified {redshift_statement_count} rows migrated from LRS to {incremental_url}.'
            )
        else:
            raise BackgroundJobError(
                f'Discrepancy between LRS ({self.lrs_statement_count} statements)'
                f' and {incremental_url} ({redshift_statement_count} statements).'
            )
Code Example #2
    def run(self):
        app.logger.info('Starting Canvas schema creation job...')

        canvas_path = get_s3_canvas_daily_path()
        if not s3.get_keys_with_prefix(canvas_path):
            canvas_path = get_s3_canvas_daily_path(datetime.now() -
                                                   timedelta(days=1))
            if not s3.get_keys_with_prefix(canvas_path):
                raise BackgroundJobError(
                    'No timely Canvas data found, aborting')
            else:
                app.logger.info('Falling back to yesterday\'s Canvas data')

        external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
        s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
        s3_canvas_data_url = s3_prefix + canvas_path
        s3_canvas_data_path_current_term = s3_prefix + berkeley.s3_canvas_data_path_current_term()

        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template(
            'create_canvas_schema.template.sql',
            loch_s3_canvas_data_path_today=s3_canvas_data_url,
            loch_s3_canvas_data_path_current_term=s3_canvas_data_path_current_term,
        )
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'Canvas schema creation job completed.'
        else:
            raise BackgroundJobError('Canvas schema creation job failed.')
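
resolve_sql_template is used throughout these examples to expand a DDL template with S3 paths and schema names before execution. Its real implementation is not reproduced on this page; a rough stand-in (assuming a template directory and simple placeholder substitution merged from app config and keyword arguments, reusing the same app object the examples import) could look like this:

    # Illustrative stand-in; nessie's actual template resolution may differ.
    from string import Template

    def resolve_sql_template_sketch(template_name, **kwargs):
        with open(f'sql_templates/{template_name}') as f:  # hypothetical template location
            template = Template(f.read())
        # Explicit keyword arguments win over values pulled from app config.
        values = {**{k.lower(): str(v) for k, v in app.config.items()}, **kwargs}
        return template.safe_substitute(values)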
Code Example #3
    def migrate_transient_to_destination(self,
                                         keys,
                                         destination_bucket,
                                         destination_path,
                                         unload_to_etl=False):
        destination_url = 's3://' + destination_bucket + '/' + destination_path
        destination_schema = app.config['REDSHIFT_SCHEMA_LRS']

        for transient_key in keys:
            destination_key = transient_key.replace(self.transient_path,
                                                    destination_path)
            if not s3.copy(self.transient_bucket, transient_key,
                           destination_bucket, destination_key):
                app.logger.error(
                    f'Copy from transient bucket to destination bucket {destination_bucket} failed.'
                )
                return False
        if not self.verify_migration(destination_url, destination_schema):
            return False
        if unload_to_etl:
            if not self.unload_to_etl(destination_schema, destination_bucket):
                app.logger.error(
                    f'Redshift statements unload from {destination_schema} to {destination_bucket} failed.'
                )
                return False
        redshift.drop_external_schema(destination_schema)
        return True
Code Example #4
 def verify_and_unload_transient(self):
     transient_url = f's3://{self.transient_bucket}/{self.transient_path}'
     transient_schema = app.config['REDSHIFT_SCHEMA_LRS'] + '_transient'
     self.verify_migration(transient_url, transient_schema)
     self.delete_old_unloads()
     self.unload_to_etl(transient_schema, self.transient_bucket, timestamped=False)
     redshift.drop_external_schema(transient_schema)
Code Example #5
    def run(self):
        app.logger.info('Starting SIS Advising Notes schema creation job...')

        daily_path = get_s3_sis_sysadm_daily_path()
        bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
        if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes',
                                       bucket=bucket):
            daily_path = get_s3_sis_sysadm_daily_path(datetime.now() -
                                                      timedelta(days=1))
            if not s3.get_keys_with_prefix(f'{daily_path}/advising-notes',
                                           bucket=bucket):
                raise BackgroundJobError(
                    'No timely SIS advising notes data found, aborting')
            else:
                app.logger.info(
                    'Falling back to yesterday\'s SIS advising notes data')

        app.logger.info('Executing SQL...')
        external_schema = app.config['REDSHIFT_SCHEMA_SIS_ADVISING_NOTES']
        redshift.drop_external_schema(external_schema)
        self.create_historical_tables(external_schema)
        self.create_internal_schema(external_schema, daily_path)
        app.logger.info('Redshift schema created. Creating RDS indexes...')
        self.create_indexes()
        app.logger.info('RDS indexes created.')

        return 'SIS Advising Notes schema creation job completed.'
Code Example #6
    def create_schema(self):
        base_s3_key = app.config['LOCH_S3_E_I_DATA_PATH']
        external_schema = app.config['REDSHIFT_SCHEMA_E_I_ADVISING_NOTES']
        redshift.drop_external_schema(external_schema)
        # Flatten E&I-sourced JSON files into two schema-friendly JSON files.
        notes = []
        topics = []
        for key in s3.get_keys_with_prefix(base_s3_key):
            if key.endswith('.json'):
                notes_json = s3.get_object_json(key)
                if notes_json and 'notes' in notes_json:
                    notes += notes_json['notes']
                    # Iterate over this file's notes only, so topics aren't re-extracted
                    # for notes accumulated from previously processed files.
                    for note in notes_json['notes']:
                        topics += _extract_topics(note)

        if s3.upload_json(obj=notes, s3_key=f'{base_s3_key}/aggregated_notes/data.json') \
                and s3.upload_json(obj=topics, s3_key=f'{base_s3_key}/aggregated_topics/data.json'):
            # Create schema
            app.logger.info('Executing SQL...')
            resolved_ddl = resolve_sql_template('create_e_i_advising_notes_schema.template.sql')
            if redshift.execute_ddl_script(resolved_ddl):
                verify_external_schema(external_schema, resolved_ddl)
            else:
                raise BackgroundJobError('E&I Advising Notes schema creation job failed.')
        else:
            raise BackgroundJobError('Failed to upload aggregated E&I advising notes and topics.')
Code Example #7
    def run(self):
        app.logger.info('Start generating canvas caliper analytics')
        redshift_schema_caliper_analytics = app.config['REDSHIFT_SCHEMA_CALIPER']
        redshift_schema_lrs_external = app.config['REDSHIFT_SCHEMA_LRS']
        canvas_caliper_explode_table = 'canvas_caliper_explode'

        # Because the Caliper incrementals are provided by a Glue job running on a different schedule, the latest batch
        # may have been delivered before last midnight UTC.
        s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path()
        if not s3.get_keys_with_prefix(s3_caliper_daily_path):
            s3_caliper_daily_path = get_s3_daily_canvas_caliper_explode_path(datetime.now() - timedelta(days=1))
            if not s3.get_keys_with_prefix(s3_caliper_daily_path):
                raise BackgroundJobError('No timely S3 Caliper extracts found')
            else:
                app.logger.info('Falling back to S3 Caliper extracts for yesterday')
        s3_caliper_daily_url = s3.build_s3_url(s3_caliper_daily_path)

        resolved_ddl_caliper_explode = resolve_sql_template(
            'create_lrs_canvas_explode_table.template.sql',
            canvas_caliper_explode_table=canvas_caliper_explode_table,
            loch_s3_caliper_explode_url=s3_caliper_daily_url,
        )
        redshift.drop_external_schema(redshift_schema_lrs_external)
        if redshift.execute_ddl_script(resolved_ddl_caliper_explode):
            app.logger.info('Caliper explode schema and table successfully created.')
        else:
            raise BackgroundJobError('Caliper explode schema and table creation failed.')

        # Sanity-check event times from the latest Caliper batch against previously transformed event times.
        def datetime_from_query(query):
            response = redshift.fetch(query)
            timestamp = response and response[0] and response[0].get('timestamp')
            if not timestamp:
                raise BackgroundJobError(f'Timestamp query failed to return data for comparison; aborting job: {query}')
            if isinstance(timestamp, str):
                timestamp = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ')
            return timestamp

        earliest_untransformed = datetime_from_query(
            f'SELECT MIN(timestamp) AS timestamp FROM {redshift_schema_lrs_external}.{canvas_caliper_explode_table}',
        )
        latest_transformed = datetime_from_query(
            f'SELECT MAX(timestamp) AS timestamp FROM {redshift_schema_caliper_analytics}.canvas_caliper_user_requests',
        )
        if not earliest_untransformed or not latest_transformed:
            return False
        timestamp_diff = (earliest_untransformed - latest_transformed).total_seconds()
        lower_bound_tolerance, upper_bound_tolerance = app.config['LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE']
        if timestamp_diff < lower_bound_tolerance or timestamp_diff > upper_bound_tolerance:
            raise BackgroundJobError(
                f'Unexpected difference between Caliper timestamps: latest transformed {latest_transformed}, '
                f'earliest untransformed {earliest_untransformed}',
            )

        resolved_ddl_caliper_analytics = resolve_sql_template('generate_caliper_analytics.template.sql')
        if redshift.execute_ddl_script(resolved_ddl_caliper_analytics):
            return 'Caliper analytics tables successfully created.'
        else:
            raise BackgroundJobError('Caliper analytics tables creation failed.')
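
The tolerance check above unpacks a two-element config value and compares it against a timestamp difference expressed in seconds. A plausible configuration entry (the numbers below are illustrative assumptions, not the project's defaults) would be:

    # Allow the earliest untransformed event to lag the latest transformed event by up
    # to an hour (negative bound) or lead it by up to a day (positive bound), in seconds.
    LOCH_CANVAS_CALIPER_TIMESTAMP_DISCREPANCY_TOLERANCE = [-3600, 86400]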
Code Example #8
 def create_schema(self):
     app.logger.info('Executing SQL...')
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template(
         'create_sis_terms_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS terms schema creation job failed.')
Code Example #9
    def migrate_transient_to_destination(self, keys, destination_bucket, destination_path):
        destination_url = 's3://' + destination_bucket + '/' + destination_path
        redshift_schema = app.config['REDSHIFT_SCHEMA_LRS']

        for transient_key in keys:
            destination_key = transient_key.replace(self.transient_path, destination_path)
            if not s3.copy(self.transient_bucket, transient_key, destination_bucket, destination_key):
                raise BackgroundJobError(f'Copy from transient bucket to destination bucket {destination_bucket} failed.')
        self.verify_migration(destination_url, redshift_schema)
        redshift.drop_external_schema(redshift_schema)
Code Example #10
File: create_edw_schema.py  Project: dkawase/nessie
 def run(self):
     app.logger.info('Starting EDW schema creation job...')
     app.logger.info('Executing SQL...')
     external_schema = app.config['REDSHIFT_SCHEMA_EDW']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_edw_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         return 'EDW schema creation job completed.'
     else:
         raise BackgroundJobError('EDW schema creation job failed.')
Code Example #11
 def run(self):
     app.logger.info(f'Starting Canvas schema creation job...')
     external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_canvas_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         app.logger.info(f'Canvas schema creation job completed.')
         return verify_external_schema(external_schema, resolved_ddl)
     else:
         app.logger.error(f'Canvas schema creation job failed.')
         return False
Code Example #12
    def create_schema(self):
        external_schema = app.config['REDSHIFT_SCHEMA_DATA_SCIENCE_ADVISING']
        redshift.drop_external_schema(external_schema)

        resolved_ddl = resolve_sql_template(
            'create_data_science_advising_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError(
                'Data Science Advising schema creation job failed.')
Code Example #13
    def run(self):
        success = 0
        failure = 0
        berkeleyx_tenants = {
            'edx': {
                'database': 'berkeleyx_prod_ext',
                'instance': 'prod-analytics',
            },
            'edge': {
                'database': 'berkeleyx_prod_edge_ext',
                'instance': 'prod-edge-analytics',
            },
        }

        for tenant, value in berkeleyx_tenants.items():
            app.logger.info(
                f'Starting Berkeleyx schema creation job for {tenant}...')
            berkeleyx_data_path = 's3://{}/{}/{}'.format(
                app.config['LOCH_EDX_S3_BUCKET'],
                app.config['LOCH_EDX_S3_WEEKLY_DATA_PATH'],
                value['instance'],
            )
            berkeleyx_transaction_log_path = 's3://{}/{}/{}'.format(
                app.config['LOCH_EDX_S3_BUCKET'],
                app.config['LOCH_EDX_S3_TRANSACTION_LOG_PATH'],
                tenant,
            )
            external_schema = value['database'] + '_' + app.config['LOCH_EDX_NESSIE_ENV']
            redshift.drop_external_schema(external_schema)
            resolved_ddl = resolve_sql_template(
                'create_berkeleyx_schema.template.sql',
                loch_s3_berkeleyx_data_path=berkeleyx_data_path,
                loch_s3_berkeleyx_transaction_log_path=berkeleyx_transaction_log_path,
                redshift_berkeleyx_ext_schema=external_schema,
            )

            if redshift.execute_ddl_script(resolved_ddl):
                app.logger.info(
                    f'BerkeleyX schema {external_schema} creation completed.')
                success += 1
            else:
                app.logger.error(
                    f'BerkeleyX schema creation {external_schema} failed.')
                failure += 1

        if failure > 0:
            raise BackgroundJobError('Berkeleyx Schema creation jobs failed')
        else:
            app.logger.info(
                'Berkeleyx schema creation jobs completed successfully')
            return True
Code Example #14
    def create_schema(self):
        external_schema = app.config['REDSHIFT_SCHEMA_ADVISOR']
        redshift.drop_external_schema(external_schema)

        resolved_ddl = resolve_sql_template(
            'create_advisor_schema.template.sql',
            advisor_data_path=self.s3_path(),
        )
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('Advisor schema creation job failed.')
Code Example #15
 def create_external_schema(self):
     if not self.feature_flag_edl:
         # External schema is necessary when pulling term definitions from S3.
         redshift.drop_external_schema(self.redshift_schema)
         sql_template = 'create_terms_schema.template.sql'
         app.logger.info(f'Executing {sql_template}...')
         resolved_ddl = resolve_sql_template(sql_template)
         if redshift.execute_ddl_script(resolved_ddl):
             verify_external_schema(self.redshift_schema, resolved_ddl)
         else:
             raise BackgroundJobError(
                 'SIS terms schema creation job failed.')
Code Example #16
 def run(self):
     app.logger.info('Starting Gradescope schema creation job...')
     app.logger.info('Executing SQL...')
     # Add frequency of ingest and dynamic path creation logic once ingest mechanisms are confirmed
     # For now adding a top level hierarchy to catalog bulk snapshot.
     external_schema = app.config['REDSHIFT_SCHEMA_GRADESCOPE']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_gradescope_schema_template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         return 'Gradescope schema creation job completed.'
     else:
         raise BackgroundJobError('Gradescope schema creation job failed.')
Code Example #17
 def run(self):
     app.logger.info('Starting SIS schema creation job...')
     if not self.update_manifests():
         app.logger.info('Error updating manifests, will not execute schema creation SQL')
         return False
     app.logger.info('Executing SQL...')
     external_schema = app.config['REDSHIFT_SCHEMA_SIS']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('SIS schema creation job failed.')
     return 'SIS schema creation job completed.'
Code Example #18
 def run(self):
     app.logger.info('Starting OUA Slate schema creation job...')
     app.logger.info('Executing SQL...')
     self.migrate_oua_sftp_data()
     external_schema = app.config['REDSHIFT_SCHEMA_OUA']
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_oua_schema_template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
         self.create_rds_tables_and_indexes()
         app.logger.info('OUA Slate RDS indexes created.')
         return 'OUA schema creation job completed.'
     else:
         raise BackgroundJobError('OUA Slate schema creation job failed.')
Code Example #19
 def create_schema(self):
     external_schema = app.config['REDSHIFT_SCHEMA_ASC_ADVISING_NOTES']
     redshift.drop_external_schema(external_schema)
     asc_data_sftp_path = '/'.join([
         f"s3://{app.config['LOCH_S3_BUCKET']}",
         app.config['LOCH_S3_ASC_DATA_SFTP_PATH'],
     ])
     resolved_ddl = resolve_sql_template(
         'create_asc_advising_notes_schema.template.sql',
         asc_data_sftp_path=asc_data_sftp_path,
     )
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema, resolved_ddl)
     else:
         raise BackgroundJobError('ASC Advising Notes schema creation job failed.')
Code Example #20
    def create_schema(self):
        app.logger.info('Executing SQL...')
        redshift.drop_external_schema(self.external_schema)

        s3_sis_daily = get_s3_sis_sysadm_daily_path()
        if not s3.get_keys_with_prefix(s3_sis_daily):
            s3_sis_daily = _get_yesterdays_advisor_data()
        s3_path = '/'.join([f"s3://{app.config['LOCH_S3_BUCKET']}", s3_sis_daily, 'advisors'])

        sql_filename = 'edl_create_advisor_schema.template.sql' if self.feature_flag_edl else 'create_advisor_schema.template.sql'
        resolved_ddl = resolve_sql_template(sql_filename, advisor_data_path=s3_path)
        if not redshift.execute_ddl_script(resolved_ddl):
            raise BackgroundJobError(f'Redshift execute_ddl_script failed on {sql_filename}')

        verify_external_schema(self.external_schema, resolved_ddl)
        app.logger.info('Redshift schema created.')
Code Example #21
 def run(self):
     app.logger.info('Starting SIS schema creation job...')
     if not self.update_manifests():
         app.logger.info(
             'Error updating manifests, will not execute schema creation SQL'
         )
         return False
     app.logger.info('Executing SQL...')
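     # Note: external_schema and feature_flag_edl are defined or imported elsewhere in the source module (not shown in this excerpt).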
     redshift.drop_external_schema(external_schema)
     resolved_ddl = resolve_sql_template('create_sis_schema.template.sql')
     if redshift.execute_ddl_script(resolved_ddl):
         verify_external_schema(external_schema,
                                resolved_ddl,
                                is_zero_count_acceptable=feature_flag_edl())
     else:
         raise BackgroundJobError('SIS schema creation job failed.')
     return 'SIS schema creation job completed.'
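
verify_external_schema appears in most of these examples, sometimes with an is_zero_count_acceptable flag as in the example above. Its implementation is not shown on this page; one way such a check could work, sketched purely as an assumption and reusing the redshift, app, and BackgroundJobError names from the examples, is to pull the external table names out of the resolved DDL and confirm that each one returns rows:

    import re

    # Hypothetical sketch; the project's verify_external_schema may behave differently.
    def verify_external_schema_sketch(schema, resolved_ddl, is_zero_count_acceptable=False):
        tables = re.findall(r'CREATE EXTERNAL TABLE\s+\S+\.(\w+)', resolved_ddl, re.IGNORECASE)
        for table in tables:
            result = redshift.fetch(f'SELECT COUNT(*) AS count FROM {schema}.{table}')
            count = result[0].get('count') if result else None
            if not count and not is_zero_count_acceptable:
                raise BackgroundJobError(f'External table {schema}.{table} returned no rows.')
            app.logger.info(f'Verified external table {schema}.{table} ({count} rows).')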
Code Example #22
    def run(self):
        app.logger.info('Starting Canvas API schema creation job...')
        external_schema = app.config['REDSHIFT_SCHEMA_CANVAS_API']
        s3_prefix = 's3://' + app.config['LOCH_S3_BUCKET'] + '/'
        s3_canvas_api_data_url = s3_prefix + get_s3_canvas_api_path(
            transformed=True)

        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template(
            'create_canvas_api_schema.template.sql',
            loch_s3_canvas_api_data_path=s3_canvas_api_data_url,
        )
        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'Canvas API schema creation job completed.'
        else:
            raise BackgroundJobError('Canvas API schema creation job failed.')
Code Example #23
    def run(self):
        app.logger.info('Starting YCBM schema creation job...')

        external_schema = app.config['REDSHIFT_SCHEMA_YCBM']
        redshift.drop_external_schema(external_schema)

        sis_source_schema = app.config['REDSHIFT_SCHEMA_EDL'] if app.config['FEATURE_FLAG_EDL_SIS_VIEWS'] else app.config['REDSHIFT_SCHEMA_SIS']
        resolved_ddl = resolve_sql_template(
            'create_ycbm_schema.template.sql',
            redshift_schema_sis=sis_source_schema,
        )

        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'YCBM schema creation job completed.'
        else:
            raise BackgroundJobError('YCBM schema creation job failed.')
Code Example #24
    def run(self):
        app.logger.info('Starting CalNet schema creation job...')
        external_schema = app.config['REDSHIFT_SCHEMA_CALNET']
        redshift.drop_external_schema(external_schema)
        sid_snapshot_path = '/'.join([
            f"s3://{app.config['LOCH_S3_BUCKET']}",
            app.config['LOCH_S3_CALNET_DATA_PATH'],
            'sids',
        ])
        resolved_ddl = resolve_sql_template(
            'create_calnet_schema.template.sql',
            sid_snapshot_path=sid_snapshot_path,
        )

        if redshift.execute_ddl_script(resolved_ddl):
            verify_external_schema(external_schema, resolved_ddl)
            return 'CalNet schema creation job completed.'
        else:
            raise BackgroundJobError('CalNet schema creation job failed.')
Code Example #25
    def run(self):
        app.logger.info('Starting Undergrads schema creation job...')
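        # Note: external_schema is defined at module level in the source file (not shown in this excerpt).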
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_undergrads_schema.template.sql')
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info('Undergrads external schema created.')
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('Undergrads external schema creation failed.')
        undergrads_rows = redshift.fetch(f'SELECT * FROM {external_schema}.students ORDER by sid')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(undergrads_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        return 'Undergrads internal schema created.'
Code Example #26
    def run(self):
        app.logger.info('Starting OUA Slate schema creation job...')
        app.logger.info('Executing SQL...')

        s3_protected_bucket = app.config['LOCH_S3_PROTECTED_BUCKET']
        oua_slate_sftp_path = app.config['LOCH_S3_SLATE_DATA_SFTP_PATH'] + '/' + self.get_sftp_date_offset() + '/'
        oua_daily_dest_path = get_s3_oua_daily_path() + '/admissions/'

        # Gets list of keys under SFTP prefix and looks for csv files to migrate to OUA daily location
        keys = s3.get_keys_with_prefix(oua_slate_sftp_path,
                                       full_objects=False,
                                       bucket=s3_protected_bucket)

        if len(keys) > 0:
            for source_key in keys:
                if source_key.endswith('.csv'):
                    destination_key = source_key.replace(
                        oua_slate_sftp_path, oua_daily_dest_path)
                    if not s3.copy(s3_protected_bucket, source_key,
                                   s3_protected_bucket, destination_key):
                        raise BackgroundJobError(
                            f'Copy from SFTP location {source_key} to daily OUA destination {destination_key} failed.'
                        )
            external_schema = app.config['REDSHIFT_SCHEMA_OUA']
            redshift.drop_external_schema(external_schema)
            resolved_ddl = resolve_sql_template(
                'create_oua_schema_template.sql')
            if redshift.execute_ddl_script(resolved_ddl):
                verify_external_schema(external_schema, resolved_ddl)
                self.create_rds_tables_and_indexes()
                app.logger.info('OUA Slate RDS indexes created.')
                return 'OUA schema creation job completed.'

            else:
                raise BackgroundJobError(
                    'OUA Slate schema creation job failed.')

        else:
            return 'No OUA files found in SFTP location today. Skipping OUA data refresh'
Code Example #27
File: create_coe_schema.py  Project: dkawase/nessie
    def run(self):
        app.logger.info('Starting COE schema creation job...')
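        # Note: external_schema and internal_schema_identifier are defined elsewhere in the source module (not shown in this excerpt).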
        redshift.drop_external_schema(external_schema)
        resolved_ddl = resolve_sql_template('create_coe_schema.template.sql')
        # TODO This DDL drops and recreates the internal schema before the external schema is verified. We
        # ought to set up proper staging in conjunction with verification. It's also possible that a persistent
        # external schema isn't needed.
        if redshift.execute_ddl_script(resolved_ddl):
            app.logger.info('COE external schema created.')
            verify_external_schema(external_schema, resolved_ddl)
        else:
            raise BackgroundJobError('COE external schema creation failed.')
        coe_rows = redshift.fetch(
            'SELECT * FROM {schema}.students ORDER by sid',
            schema=internal_schema_identifier,
        )

        profile_rows = []
        index = 1
        for sid, rows_for_student in groupby(coe_rows,
                                             operator.itemgetter('sid')):
            app.logger.info(
                f'Generating COE profile for SID {sid} ({index} of {len(coe_rows)})'
            )
            index += 1
            row_for_student = list(rows_for_student)[0]
            coe_profile = {
                'advisorUid': row_for_student.get('advisor_ldap_uid'),
                'gender': row_for_student.get('gender'),
                'ethnicity': row_for_student.get('ethnicity'),
                'minority': row_for_student.get('minority'),
                'didPrep': row_for_student.get('did_prep'),
                'prepEligible': row_for_student.get('prep_eligible'),
                'didTprep': row_for_student.get('did_tprep'),
                'tprepEligible': row_for_student.get('tprep_eligible'),
                'sat1read': row_for_student.get('sat1read'),
                'sat1math': row_for_student.get('sat1math'),
                'sat2math': row_for_student.get('sat2math'),
                'inMet': row_for_student.get('in_met'),
                'gradTerm': row_for_student.get('grad_term'),
                'gradYear': row_for_student.get('grad_year'),
                'probation': row_for_student.get('probation'),
                'status': row_for_student.get('status'),
            }
            profile_rows.append(encoded_tsv_row([sid,
                                                 json.dumps(coe_profile)]))

        s3_key = f'{get_s3_coe_daily_path()}/coe_profiles.tsv'
        app.logger.info(
            f'Will stash {len(profile_rows)} feeds in S3: {s3_key}')
        if not s3.upload_tsv_rows(profile_rows, s3_key):
            raise BackgroundJobError('Error on S3 upload: aborting job.')

        app.logger.info('Will copy S3 feeds into Redshift...')
        query = resolve_sql_template_string(
            """
            COPY {redshift_schema_coe}.student_profiles
                FROM '{loch_s3_coe_data_path}/coe_profiles.tsv'
                IAM_ROLE '{redshift_iam_role}'
                DELIMITER '\\t';
            """, )
        if not redshift.execute(query):
            raise BackgroundJobError('Error on Redshift copy: aborting job.')

        with rds.transaction() as transaction:
            if self.refresh_rds_indexes(coe_rows, transaction):
                transaction.commit()
                app.logger.info('Refreshed RDS indexes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Error refreshing RDS indexes.')

        return 'COE internal schema created.'
Code Example #28
    def run(self, truncate_lrs=False):
        app.logger.info('Starting DMS replication task...')
        task_id = app.config['LRS_CANVAS_INCREMENTAL_REPLICATION_TASK_ID']

        self.transient_bucket = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_BUCKET']
        self.transient_path = app.config['LRS_CANVAS_INCREMENTAL_TRANSIENT_PATH']

        if not self.delete_old_incrementals():
            return False

        response = dms.start_replication_task(task_id)
        if not response:
            app.logger.error(
                f'Failed to start DMS replication task (response={response}).')
            return False

        while True:
            response = dms.get_replication_task(task_id)
            if response.get('Status') == 'stopped':
                if response.get('StopReason') == 'Stop Reason FULL_LOAD_ONLY_FINISHED':
                    app.logger.info('DMS replication task completed')
                    break
                else:
                    app.logger.error(
                        f'Replication task stopped for unexpected reason: {response}'
                    )
                    return False
            sleep(10)

        lrs_response = lrs.fetch('select count(*) from statements')
        if lrs_response:
            self.lrs_statement_count = lrs_response[0][0]
        else:
            app.logger.error(
                f'Failed to retrieve LRS statements for comparison.')
            return False

        transient_keys = s3.get_keys_with_prefix(self.transient_path,
                                                 bucket=self.transient_bucket)
        if not transient_keys:
            app.logger.error(
                'Could not retrieve S3 keys from transient bucket.')
            return False

        transient_url = f's3://{self.transient_bucket}/{self.transient_path}'
        transient_schema = app.config['REDSHIFT_SCHEMA_LRS'] + '_transient'
        if not self.verify_migration(transient_url, transient_schema):
            return False
        redshift.drop_external_schema(transient_schema)

        timestamp_path = localize_datetime(
            datetime.now()).strftime('%Y/%m/%d/%H%M%S')
        destination_path = app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_PATH'] + '/' + timestamp_path
        for destination_bucket in app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']:
            if not self.migrate_transient_to_destination(
                    transient_keys,
                    destination_bucket,
                    destination_path,
                    unload_to_etl=True,
            ):
                return False

        if truncate_lrs:
            if lrs.execute('TRUNCATE statements'):
                app.logger.info('Truncated incremental LRS table.')
            else:
                app.logger.error('Failed to truncate incremental LRS table.')
                return False

        return (
            f'Migrated {self.lrs_statement_count} statements to S3 '
            f"(buckets={app.config['LRS_CANVAS_INCREMENTAL_DESTINATION_BUCKETS']}, path={destination_path})"
        )
Code Example #29
    def run(self):
        # Retrieve latest schema definitions from Canvas data API
        response = canvas_data.get_canvas_data_schema()
        external_schema = app.config['REDSHIFT_SCHEMA_CANVAS']
        redshift_iam_role = app.config['REDSHIFT_IAM_ROLE']
        canvas_schema = []

        # Parse and isolate table and column details
        for key, value in response['schema'].items():
            for column in value['columns']:
                # Not every column has description and length.
                description = None
                if 'description' in column:
                    description = column['description']

                length = None
                if 'length' in column:
                    length = column['length']

                canvas_schema.append([
                    value['tableName'],
                    column['name'],
                    column['type'],
                    description,
                    length,
                ])
        # Create a dataframe
        schema_df = pd.DataFrame(canvas_schema)
        schema_df.columns = [
            'table_name',
            'column_name',
            'column_type',
            'column_description',
            'column_length',
        ]

        # The schema definitions received from Canvas are Redshift compliant. We update
        # certain column types to match Glue and Spectrum data types.
        schema_df['glue_type'] = schema_df['column_type'].replace({
            'enum': 'varchar',
            'guid': 'varchar',
            'text': 'varchar(max)',
            'date': 'timestamp',
            'datetime': 'timestamp',
        })

        schema_df['transformed_column_name'] = schema_df['column_name'].replace({
            'default': '"default"',
            'percent': '"percent"',
        })
        # Create Hive compliant storage descriptors
        canvas_external_catalog_ddl = self.generate_external_catalog(
            external_schema, schema_df)

        # Clean up and recreate refreshed tables on Glue using Spectrum
        redshift.drop_external_schema(external_schema)
        redshift.create_external_schema(external_schema, redshift_iam_role)

        if redshift.execute_ddl_script(canvas_external_catalog_ddl):
            app.logger.info('Canvas schema creation job completed.')
        else:
            app.logger.error('Canvas schema creation job failed.')
            raise BackgroundJobError('Canvas schema creation job failed.')

        self.verify_external_data_catalog()
        return 'Canvas external schema created and verified.'
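
Example #29 calls redshift.create_external_schema to register the external schema against the Glue Data Catalog before applying the generated DDL. That helper is not shown on this page; a minimal sketch using standard Redshift Spectrum syntax (and assuming, for illustration only, that the Glue database shares the schema's name and that execute_ddl_script is available in the same module) might be:

    # Hypothetical sketch; the Glue database name and helper behavior are assumptions.
    def create_external_schema_sketch(schema_name, iam_role):
        ddl = f"""
            CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name}
            FROM DATA CATALOG
            DATABASE '{schema_name}'
            IAM_ROLE '{iam_role}'
            CREATE EXTERNAL DATABASE IF NOT EXISTS
        """
        return execute_ddl_script(ddl)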