Example #1
    def test_sync_canvas_snapshots(self, app, metadata_db, caplog):
        """Dispatches a complete sync job against fixtures."""
        caplog.set_level(logging.INFO)
        with capture_app_logs(app):
            with mock_s3(app):
                result = SyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot sync job dispatched to workers' in result
            assert_background_job_status('sync')
            assert 'Dispatched S3 sync of snapshot quiz_dim-00000-0ab80c7c.gz' in caplog.text
            assert 'Dispatched S3 sync of snapshot requests-00098-b14782f5.gz' in caplog.text
            assert '311 successful dispatches, 0 failures' in caplog.text

            schema = app.config['RDS_SCHEMA_METADATA']

            count_results = rds.fetch(f'SELECT count(*) FROM {schema}.canvas_sync_job_status')
            assert count_results[0]['count'] == 311

            canvas_status_results = rds.fetch(f'SELECT DISTINCT status FROM {schema}.canvas_sync_job_status')
            assert len(canvas_status_results) == 1
            assert canvas_status_results[0]['status'] == 'created'

            sync_results = rds.fetch(f'SELECT * FROM {schema}.canvas_sync_job_status LIMIT 1')
            assert sync_results[0]['job_id'].startswith('sync_')
            assert sync_results[0]['filename'] == 'account_dim-00000-5eb7ee9e.gz'
            assert sync_results[0]['canvas_table'] == 'account_dim'
            assert 'account_dim/part-00505-5c40f1f3-b611-4f64-a007-67b775e984fe.c000.txt.gz' in sync_results[0]['source_url']
            assert sync_results[0]['destination_url'] is None
            assert sync_results[0]['details'] is None
            assert sync_results[0]['created_at']
            assert sync_results[0]['updated_at']
Example #2
    def test_canvas_sync_metadata(self, app, metadata_db):
        """When given a job id, updates metadata on file sync."""
        url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key = 'canvas/sonnet_submission_dim/sonnet-xlv.txt'

        with mock_s3(app):
            with open(_get_fixtures_path() + '/sonnet_xlv.html', 'r') as file:
                responses.add(responses.GET,
                              url,
                              body=file.read(),
                              headers={'Content-Length': '767'})

            # Run two successive sync jobs on the same file. The first succeeds, the second is skipped as
            # a duplicate.
            metadata.create_canvas_sync_status('job_1', 'sonnet-xlv.txt',
                                               'sonnet_submission_dim', url)
            result = SyncFileToS3().run(url=url,
                                        key=key,
                                        canvas_sync_job_id='job_1')
            assert result is True
            metadata.create_canvas_sync_status('job_2', 'sonnet-xlv.txt',
                                               'sonnet_submission_dim', url)
            result = SyncFileToS3().run(url=url,
                                        key=key,
                                        canvas_sync_job_id='job_2')
            assert result is False

            schema = app.config['RDS_SCHEMA_METADATA']
            sync_metadata = rds.fetch(
                f'SELECT * FROM {schema}.canvas_sync_job_status')
            snapshot_metadata = rds.fetch(
                f'SELECT * FROM {schema}.canvas_synced_snapshots')

            assert len(sync_metadata) == 2
            assert sync_metadata[0]['job_id'] == 'job_1'
            assert sync_metadata[0]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert sync_metadata[0]['status'] == 'complete'
            assert sync_metadata[0]['source_size'] == 767
            assert sync_metadata[0]['destination_size'] == 767
            assert sync_metadata[0]['updated_at'] > sync_metadata[0]['created_at']
            assert sync_metadata[1]['job_id'] == 'job_2'
            assert sync_metadata[1]['destination_url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert sync_metadata[1]['status'] == 'duplicate'
            assert sync_metadata[1]['source_size'] is None
            assert sync_metadata[1]['destination_size'] is None
            assert sync_metadata[1]['updated_at'] > sync_metadata[1]['created_at']

            assert len(snapshot_metadata) == 1
            assert snapshot_metadata[0]['filename'] == 'sonnet-xlv.txt'
            assert snapshot_metadata[0]['canvas_table'] == 'sonnet_submission_dim'
            assert snapshot_metadata[0]['url'] == 's3://mock-bucket/canvas/sonnet_submission_dim/sonnet-xlv.txt'
            assert snapshot_metadata[0]['size'] == 767
            assert snapshot_metadata[0]['created_at']
            assert snapshot_metadata[0]['deleted_at'] is None
Example #3
    def test_import_student_photos(self, app, metadata_db, student_tables,
                                   caplog):
        from nessie.jobs.import_student_photos import ImportStudentPhotos
        caplog.set_level(logging.DEBUG)
        with capture_app_logs(app):
            with mock_s3(app):
                result = ImportStudentPhotos().run_wrapped()
                assert result == 'Student photo import completed: 1 succeeded, 9 had no photo available, 0 failed.'
                response = s3.get_keys_with_prefix('cal1card-data/photos')
                assert len(response) == 1
                assert response[0] == 'cal1card-data/photos/61889.jpg'

            success_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'success'"
            )
            assert len(success_rows) == 1
            assert success_rows[0]['sid'] == '11667051'

            failure_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'failure'"
            )
            assert len(failure_rows) == 0

            not_found_rows = rds.fetch(
                f"SELECT * FROM {app.config['RDS_SCHEMA_METADATA']}.photo_import_status WHERE status = 'photo_not_found'"
            )
            assert len(not_found_rows) == 9
Example #4
    def test_run(self, app, metadata_db):
        """Uploads Canvas grade change logs to S3, then stores feeds in Redshift."""
        with mock_s3(app):
            with override_config(app, 'TEST_CANVAS_COURSE_IDS',
                                 [1492459, 1488704, 1491827]):
                result = ImportCanvasGradeChangeLog().run_wrapped()
                assert result
                assert 'Canvas grade change log import completed for term 2178: 3 succeeded, ' in result
                assert '0 failed.' in result

        assert_background_job_status('ImportCanvasGradeChangeLog')
        schema = app.config['RDS_SCHEMA_METADATA']
        count_results = rds.fetch(
            f'SELECT count(*) FROM {schema}.canvas_api_import_job_status')
        assert count_results[0]['count'] == 3

        canvas_status_results = rds.fetch(
            f'SELECT DISTINCT status FROM {schema}.canvas_api_import_job_status'
        )
        assert len(canvas_status_results) == 1
        assert canvas_status_results[0]['status'] == 'created'

        sync_results = rds.fetch(
            f'SELECT * FROM {schema}.canvas_api_import_job_status LIMIT 1')
        assert sync_results[0]['job_id'].startswith('ImportCanvasGradeChangeLog_')
        assert sync_results[0]['course_id'] == '1492459'
        assert sync_results[0]['table_name'] == 'grade_change_log'
        assert sync_results[0]['details'] is None
        assert sync_results[0]['created_at']
        assert sync_results[0]['updated_at']
Example #5
def get_failures_from_last_sync():
    last_job_id = None
    failures = []

    job_id_result = rds.fetch(
        f"""SELECT MAX(job_id) AS last_job_id FROM {_rds_schema()}.canvas_sync_job_status WHERE job_id LIKE %s""",
        params=['sync%%'],
    )
    if not job_id_result:
        app.logger.error('Failed to retrieve id for last sync job')
    else:
        last_job_id = job_id_result[0]['last_job_id']
        failures_query = f"""SELECT * FROM {_rds_schema()}.canvas_sync_job_status WHERE job_id = %s
            AND (status NOT IN ('complete', 'duplicate') OR destination_size != source_size)"""
        failures = rds.fetch(failures_query, params=[last_job_id])
    return {'job_id': last_job_id, 'failures': failures}
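A minimal caller sketch (not part of the source) showing how the returned dict might be consumed; the logging call and message are assumptions.

# Hypothetical caller sketch: surface any failures from the most recent sync job.
report = get_failures_from_last_sync()
if report['failures']:
    app.logger.warning(f"Sync job {report['job_id']} had {len(report['failures'])} failed or mismatched files")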
Example #6
 def _success_history_after_batch_import():
     result = ImportRegistrations().run_wrapped(load_mode='batch')
     assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
     rows = rds.fetch("SELECT * FROM nessie_metadata_test.registration_import_status WHERE status = 'success' ORDER BY updated_at")
     assert len(rows) == 2
     assert rows[0]['updated_at'] < rows[1]['updated_at']
     return (rows[0]['sid'], rows[1]['sid'])
Example #7
def get_active_sids_with_oldest_registration_imports(limit):
    active_sids = [r['sid'] for r in get_all_student_ids()]
    sql = f"""SELECT sid FROM {metadata_schema()}.registration_import_status
        WHERE sid = ANY(%s)
        AND status = 'success'
        ORDER BY updated_at LIMIT %s"""
    return rds.fetch(sql, params=(active_sids, limit))
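A hedged usage sketch, assuming the caller wants to re-import the stalest successful registrations first; the variable name and the limit value are hypothetical.

# Hypothetical usage sketch: pick the ten active students whose successful
# registration imports are oldest, so they can be refreshed first.
stale_sids = [row['sid'] for row in get_active_sids_with_oldest_registration_imports(limit=10)]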
Example #8
 def refresh_term_index(self, app):
     from nessie.jobs.create_sis_terms_schema import CreateSisTermsSchema
     CreateSisTermsSchema().refresh_current_term_index()
     rds_schema = app.config['RDS_SCHEMA_SIS_TERMS']
     rows = rds.fetch(f'SELECT * FROM {rds_schema}.current_term_index')
     assert len(rows) == 1
     return rows[0]
Example #9
    def import_appointment_advisors(self):
        sis_notes_schema = app.config['RDS_SCHEMA_SIS_ADVISING_NOTES']
        advisor_schema_redshift = app.config['REDSHIFT_SCHEMA_ADVISOR_INTERNAL']

        advisor_sids_from_sis_appointments = set(
            [r['advisor_sid'] for r in rds.fetch(f'SELECT DISTINCT advisor_sid FROM {sis_notes_schema}.advising_appointments')],
        )
        advisor_sids_from_advisors = set(
            [r['sid'] for r in redshift.fetch(f'SELECT DISTINCT sid FROM {advisor_schema_redshift}.advisor_departments')],
        )
        advisor_sids = list(advisor_sids_from_sis_appointments | advisor_sids_from_advisors)

        advisor_attributes = calnet.client(app).search_csids(advisor_sids)
        if not advisor_attributes:
            raise BackgroundJobError('Failed to fetch note author attributes.')

        unique_advisor_attributes = list({adv['uid']: adv for adv in advisor_attributes}.values())

        with rds.transaction() as transaction:
            insertable_rows = []
            for entry in unique_advisor_attributes:
                first_name, last_name = calnet.split_sortable_name(entry)
                insertable_rows.append(tuple((entry.get('uid'), entry.get('csid'), first_name, last_name)))

            result = transaction.insert_bulk(
                f'INSERT INTO {sis_notes_schema}.advising_appointment_advisors (uid, sid, first_name, last_name) VALUES %s',
                insertable_rows,
            )
            if result:
                transaction.commit()
                app.logger.info('Imported appointment advisor attributes.')
            else:
                transaction.rollback()
                raise BackgroundJobError('Failed to import appointment advisor attributes.')
Example #10
    def _advisor_attributes_by_uid(self):
        asc_schema = app.config['RDS_SCHEMA_ASC']
        e_i_schema = app.config['RDS_SCHEMA_E_I']

        advisor_uids_from_asc_notes = set(
            [r['advisor_uid'] for r in rds.fetch(f'SELECT DISTINCT advisor_uid FROM {asc_schema}.advising_notes')],
        )
        advisor_uids_from_e_i_notes = set(
            [r['advisor_uid'] for r in rds.fetch(f'SELECT DISTINCT advisor_uid FROM {e_i_schema}.advising_notes')],
        )
        advisor_uids = list(advisor_uids_from_asc_notes | advisor_uids_from_e_i_notes)
        return calnet.client(app).search_uids(advisor_uids)
Example #11
 def _advisor_attributes_by_email(self):
     data_science_schema = app.config['RDS_SCHEMA_DATA_SCIENCE']
     sql = f"""
         SELECT DISTINCT advisor_email FROM {data_science_schema}.advising_notes
         WHERE advisor_email IS NOT NULL
     """
     advisor_emails = set([r['advisor_email'] for r in rds.fetch(sql)])
     return calnet.client(app).search_emails(list(advisor_emails))
Example #12
def assert_background_job_status(prefix):
    from flask import current_app as app
    schema = app.config['RDS_SCHEMA_METADATA']
    background_job_status_results = rds.fetch(f'SELECT * FROM {schema}.background_job_status')
    assert len(background_job_status_results) == 1
    assert background_job_status_results[0]['job_id'].startswith(f'{prefix}_')
    assert background_job_status_results[0]['status'] == 'succeeded'
    assert background_job_status_results[0]['created_at']
    assert background_job_status_results[0]['updated_at'] > background_job_status_results[0]['created_at']
Example #13
 def get_sis_current_term(self, for_date):
     rows = rds.fetch(
         f"""SELECT *, DATE(term_ends + INTERVAL '10 DAYS') AS grace_period_ends
             FROM {self.rds_schema}.term_definitions
             WHERE DATE(term_ends + INTERVAL '10 DAYS') >= '{for_date}'
             ORDER BY term_id ASC LIMIT 2""", )
     if rows:
         return rows[1] if (
             for_date >= rows[1]['term_begins']
             or for_date > rows[0]['grace_period_ends']) else rows[0]
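To make the selection rule concrete, here is a self-contained illustration with hypothetical dates (assume Spring 2018 classes end 2018-05-11, so its 10-day grace period ends 2018-05-21, and Summer begins 2018-05-21); pick_current simply mirrors the conditional above and is not part of the source.

from datetime import date

spring = {'term_begins': date(2018, 1, 16), 'grace_period_ends': date(2018, 5, 21)}
summer = {'term_begins': date(2018, 5, 21), 'grace_period_ends': date(2018, 8, 20)}

def pick_current(for_date, rows):
    # Prefer the later term once it has begun, or once the earlier term's grace period has passed.
    return rows[1] if (for_date >= rows[1]['term_begins'] or for_date > rows[0]['grace_period_ends']) else rows[0]

assert pick_current(date(2018, 5, 15), [spring, summer]) is spring
assert pick_current(date(2018, 5, 22), [spring, summer]) is summer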
Example #14
def get_merged_enrollment_term_job_status(master_job_id):
    return rds.fetch(
        f"""SELECT *
        FROM {_rds_schema()}.merged_enrollment_term_job_queue
        WHERE master_job_id=%s
        ORDER BY term_id
        """,
        params=(master_job_id, ),
        log_query=False,
    )
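A hedged sketch of how a master job might use this query; master_job_id is assumed to be in scope, and the 'created' status value comes from the queue query in Example #18.

# Hypothetical sketch: see which queued terms have not yet been picked up by a worker.
rows = get_merged_enrollment_term_job_status(master_job_id)
pending_term_ids = [r['term_id'] for r in rows if r['status'] == 'created']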
Example #15
def most_recent_background_job_status(job_id_prefix, status=None):
    sql = f'SELECT * FROM {_rds_schema()}.background_job_status WHERE job_id LIKE %s'
    params = [f'{job_id_prefix}%']
    if status:
        sql += ' AND status = %s'
        params += [status]
    sql += ' ORDER BY updated_at DESC LIMIT 1'
    result = rds.fetch(
        sql,
        params,
    )
    if result and result[0]:
        return result[0]
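A hedged usage sketch; the 'ImportRegistrations' prefix and 'succeeded' status appear elsewhere in these examples, but the logging call is an assumption.

# Hypothetical usage sketch: find when the last successful registrations import finished.
last_success = most_recent_background_job_status('ImportRegistrations', status='succeeded')
if last_success:
    app.logger.info(f"Last successful run updated at {last_success['updated_at']}")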
Example #16
 def test_metadata_tracked(self, app, metadata_db, student_tables, caplog):
     from nessie.jobs.import_registrations import ImportRegistrations
     rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
     assert len(rows) == 0
     caplog.set_level(logging.DEBUG)
     with capture_app_logs(app):
         with mock_s3(app):
             ImportRegistrations().run_wrapped()
             rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
             assert len(rows) == 10
             assert len([r for r in rows if r['status'] == 'failure']) == 8
             assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'
             result = ImportRegistrations().run_wrapped()
             assert result == 'Registrations import completed: 0 succeeded, 8 failed.'
             result = ImportRegistrations().run_wrapped(load_mode='all')
             assert result == 'Registrations import completed: 2 succeeded, 8 failed.'
             rds.execute("DELETE FROM nessie_metadata_test.registration_import_status WHERE sid = '11667051'")
             result = ImportRegistrations().run_wrapped()
             assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
             assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'
             rds.execute("UPDATE nessie_metadata_test.registration_import_status SET status='failure' WHERE sid = '11667051'")
             result = ImportRegistrations().run_wrapped()
             assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
             assert next(r['status'] for r in rows if r['sid'] == '11667051') == 'success'
Example #17
    def test_refresh_current_term_index(self, mock_datetime, app,
                                        term_definitions):
        mock_datetime.now.return_value = datetime(year=2018,
                                                  month=5,
                                                  day=1,
                                                  hour=5,
                                                  minute=21)
        from nessie.jobs.create_sis_terms_schema import CreateSisTermsSchema
        CreateSisTermsSchema().refresh_current_term_index()

        rds_schema = app.config['RDS_SCHEMA_SIS_TERMS']
        rows = rds.fetch(f'SELECT * FROM {rds_schema}.current_term_index')
        assert len(rows) == 1
        assert rows[0]['current_term_name'] == 'Spring 2018'
        assert rows[0]['future_term_name'] == 'Fall 2018'
Example #18
def poll_merged_enrollment_term_job_queue():
    result = rds.fetch(
        f"""UPDATE {_rds_schema()}.merged_enrollment_term_job_queue
        SET status='started', instance_id=%s
        WHERE id = (
            SELECT id
            FROM {_rds_schema()}.merged_enrollment_term_job_queue
            WHERE status = 'created'
            LIMIT 1
            FOR UPDATE SKIP LOCKED
        )
        RETURNING id, master_job_id, term_id
        """,
        params=(_instance_id(), ),
        log_query=False,
    )
    if result and result[0]:
        return result[0]
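The UPDATE ... RETURNING combined with FOR UPDATE SKIP LOCKED lets multiple workers claim queue rows without blocking one another. A hedged worker-loop sketch follows; handle_term and the pacing interval are assumptions, not part of the source.

import time

# Hypothetical worker loop: keep claiming queued terms until nothing is left to claim.
def drain_merged_enrollment_term_queue(handle_term, idle_seconds=1):
    while True:
        job = poll_merged_enrollment_term_job_queue()
        if not job:
            break  # no rows left in 'created' status (or all are locked by other workers)
        handle_term(term_id=job['term_id'], master_job_id=job['master_job_id'])
        time.sleep(idle_seconds)  # assumed pacing between claims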
Example #19
    def _advisor_attributes_by_sid(self):
        sis_notes_schema = app.config['RDS_SCHEMA_SIS_ADVISING_NOTES']
        advisor_schema_redshift = app.config['REDSHIFT_SCHEMA_ADVISOR_INTERNAL']

        advisor_sids_from_sis_notes = set(
            [r['advisor_sid'] for r in rds.fetch(f'SELECT DISTINCT advisor_sid FROM {sis_notes_schema}.advising_notes')],
        )
        advisor_sids_from_advisors = set(
            [r['sid'] for r in redshift.fetch(f'SELECT DISTINCT sid FROM {advisor_schema_redshift}.advisor_departments')],
        )
        advisor_sids = list(advisor_sids_from_sis_notes | advisor_sids_from_advisors)
        return calnet.client(app).search_csids(advisor_sids)
Example #20
    def test_import_registrations_batch_mode(self, app, metadata_db, student_tables, caplog):
        from nessie.jobs.import_registrations import ImportRegistrations
        with mock_s3(app):
            ImportRegistrations().run_wrapped()
            rows = rds.fetch('SELECT * FROM nessie_metadata_test.registration_import_status')
            assert len(rows) == 10

            with override_config(app, 'CYCLICAL_API_IMPORT_BATCH_SIZE', 9):

                def _success_history_after_batch_import():
                    result = ImportRegistrations().run_wrapped(load_mode='batch')
                    assert result == 'Registrations import completed: 1 succeeded, 8 failed.'
                    rows = rds.fetch("SELECT * FROM nessie_metadata_test.registration_import_status WHERE status = 'success' ORDER BY updated_at")
                    assert len(rows) == 2
                    assert rows[0]['updated_at'] < rows[1]['updated_at']
                    return (rows[0]['sid'], rows[1]['sid'])

                sid_1, sid_2 = _success_history_after_batch_import()
                assert _success_history_after_batch_import() == (sid_2, sid_1)
                assert _success_history_after_batch_import() == (sid_1, sid_2)
Example #21
def get_advisee_sids_with_photos():
    sql = f"""SELECT sid
        FROM {metadata_schema()}.photo_import_status
        WHERE status = 'success'"""
    return rds.fetch(sql)
Example #22
 def get_sis_term_for_id(self, term_id):
     sql = f"SELECT * FROM {rds_schema}.term_definitions WHERE term_id = '{term_id}' LIMIT 1"
     rows = rds.fetch(sql)
     return rows and rows[0]
Example #23
 def get_sis_current_term(self, for_date):
     sql = f"SELECT * FROM {rds_schema}.term_definitions WHERE term_ends > '{for_date}' ORDER BY term_id ASC LIMIT 1"
     rows = rds.fetch(sql)
     return rows and rows[0]
Example #24
    def test_resync_canvas_snapshots(self, app, metadata_db, caplog):
        """Dispatches a complete resync job against fixtures."""
        caplog.set_level(logging.INFO)
        snapshots = canvas_data.get_snapshots()['files']

        def mock_metadata(job_id, snapshot, status, destination_size):
            metadata.create_canvas_sync_status(job_id, snapshot['filename'],
                                               snapshot['table'],
                                               snapshot['url'])
            key = '/'.join([
                get_s3_canvas_daily_path(), snapshot['table'],
                snapshot['filename']
            ])
            metadata.update_canvas_sync_status(
                job_id,
                key,
                status,
                source_size=1048576,
                destination_size=destination_size)

        old_sync_job = 'sync_152550000'
        latest_sync_job = 'sync_152560000'

        # The older job should be ignored by the resync.
        for snapshot in snapshots[0:5]:
            mock_metadata(old_sync_job, snapshot, 'complete', 1048576)
        for snapshot in snapshots[5:10]:
            mock_metadata(old_sync_job, snapshot, 'error', None)

        # The latest job synced five files successfully and ran into three problems.
        for snapshot in snapshots[10:15]:
            mock_metadata(latest_sync_job, snapshot, 'complete', 1048576)
        stalled = snapshots[15]
        errored = snapshots[16]
        size_discrepancy = snapshots[17]
        mock_metadata(latest_sync_job, stalled, 'streaming', None)
        mock_metadata(latest_sync_job, errored, 'error', None)
        mock_metadata(latest_sync_job, size_discrepancy, 'complete', 65536)

        schema = app.config['RDS_SCHEMA_METADATA']

        with capture_app_logs(app):
            assert rds.fetch(
                f'SELECT count(*) FROM {schema}.canvas_sync_job_status'
            )[0]['count'] == 18
            with mock_s3(app):
                result = ResyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot resync job dispatched to workers' in result
            assert_background_job_status('resync')
            assert f"Dispatched S3 resync of snapshot {stalled['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {errored['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {size_discrepancy['filename']}" in caplog.text
            assert '3 successful dispatches, 0 failures' in caplog.text

        assert rds.fetch(
            f'SELECT count(*) FROM {schema}.canvas_sync_job_status'
        )[0]['count'] == 21
        resync_results = rds.fetch(
            f"SELECT * FROM {schema}.canvas_sync_job_status WHERE job_id LIKE 'resync%'"
        )
        assert len(resync_results) == 3

        urls = []
        for r in resync_results:
            assert r['job_id'].startswith('resync_')
            assert r['filename']
            assert r['canvas_table']
            assert r['created_at']
            assert r['updated_at']
            urls.append(r['source_url'])
        assert stalled['url'] in urls
        assert errored['url'] in urls
        assert size_discrepancy['url'] in urls
Example #25
def get_current_term_index():
    rds_terms_schema = app.config['RDS_SCHEMA_TERMS']
    sql = f'SELECT * FROM {rds_terms_schema}.current_term_index LIMIT 1'
    rows = rds.fetch(sql)
    return rows and rows[0]
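A minimal usage sketch, assuming the single index row carries the column names shown in Example #17.

# Hypothetical usage sketch: read the cached term names from the index row.
index_row = get_current_term_index()
if index_row:
    current_term_name = index_row['current_term_name']
    future_term_name = index_row['future_term_name']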
Example #26
def get_sids_with_registration_imports():
    sql = f"""SELECT sid
        FROM {metadata_schema()}.registration_import_status
        WHERE status = 'success'"""
    return rds.fetch(sql)
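A hedged sketch of how a batch import might use this query; all_advisee_sids is an assumed input, not part of the source.

# Hypothetical sketch: decide which advisees still need a registration import by
# subtracting already-imported SIDs from the full population.
imported_sids = {row['sid'] for row in get_sids_with_registration_imports()}
sids_to_import = [sid for sid in all_advisee_sids if sid not in imported_sids]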
Example #27
def background_job_status_by_date(created_date):
    sql = f'SELECT * FROM {_rds_schema()}.background_job_status WHERE cast(created_at as date) = %s'
    return rds.fetch(
        sql,
        params=[created_date.strftime('%Y-%m-%d')],
    )
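A minimal usage sketch; any date-like object with strftime works for the parameter, and the printed columns are those asserted elsewhere in these examples.

from datetime import date

# Hypothetical usage sketch: list all background jobs created today.
for job in background_job_status_by_date(date.today()):
    print(job['job_id'], job['status'])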