Example #1
    def test_server_error(self, app, caplog):
        """Logs unexpected errors from the Canvas Data API."""
        with capture_app_logs(app):
            canvas_error = MockResponse(429, {}, '{"message": "Easy, tiger."}')
            with register_mock(canvas_data.get_snapshots, canvas_error):
                response = canvas_data.get_snapshots()
                assert '429 Client Error: Too Many Requests' in caplog.text
                assert not response
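The client function under test is not shown on this page. Below is a minimal sketch, assuming a requests-based client, of the behavior the assertions exercise: a non-2xx status is logged using the HTTPError message raised by raise_for_status() (the source of the '429 Client Error: Too Many Requests' text) and the function returns None. The function name, URL parameter, and logger setup are illustrative assumptions, not the project's code.

    # Hypothetical sketch; not the actual canvas_data.get_snapshots implementation.
    import logging
    import requests

    logger = logging.getLogger(__name__)

    def get_snapshots_sketch(api_url):
        """Return parsed snapshot JSON, or None after logging an HTTP error."""
        response = requests.get(api_url, timeout=30)
        try:
            # raise_for_status() raises HTTPError with text like
            # '429 Client Error: Too Many Requests for url: ...'
            response.raise_for_status()
        except requests.HTTPError as e:
            logger.error(str(e))
            return None
        return response.json()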
Example #2
    def test_get_snapshots(self, app):
        """Returns fixture data."""
        response = canvas_data.get_snapshots()
        assert response['incomplete'] is False
        assert response['schemaVersion'] == '2.0.0'
        assert len(response['files']) == 348
        assert response['files'][0]['filename'] == 'account_dim-00000-5eb7ee9e.gz'
        assert response['files'][0]['table'] == 'account_dim'
        assert response['files'][0]['partial'] is False
        assert response['files'][0]['url'].startswith(
            'https://hosted-data-work.s3.amazonaws.com/20180320T160000.415/dw_split/12345600000054321')
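The assertions above pin down the shape of each entry in the manifest's files list. The record below restates that shape with illustrative values (the URL is shortened to the asserted prefix), followed by a common way to group such a manifest by table:

    from collections import defaultdict

    # Illustrative snapshot entry; the field names come from the assertions above.
    snapshot = {
        'filename': 'account_dim-00000-5eb7ee9e.gz',
        'table': 'account_dim',
        'partial': False,
        'url': 'https://hosted-data-work.s3.amazonaws.com/20180320T160000.415/dw_split/12345600000054321',
    }

    def filenames_by_table(snapshot_response):
        """Group manifest filenames by their Canvas Data table."""
        grouped = defaultdict(list)
        for f in snapshot_response.get('files', []):
            grouped[f['table']].append(f['filename'])
        return grouped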
Example #3
    def test_resync_canvas_snapshots(self, app, metadata_db, caplog):
        """Dispatches a complete resync job against fixtures."""
        caplog.set_level(logging.INFO)
        snapshots = canvas_data.get_snapshots()['files']

        def mock_metadata(job_id, snapshot, status, destination_size):
            metadata.create_canvas_sync_status(
                job_id, snapshot['filename'], snapshot['table'], snapshot['url'])
            key = '/'.join([get_s3_canvas_daily_path(), snapshot['table'], snapshot['filename']])
            metadata.update_canvas_sync_status(
                job_id, key, status,
                source_size=1048576, destination_size=destination_size)

        old_sync_job = 'sync_152550000'
        latest_sync_job = 'sync_152560000'

        # The older job should be ignored by the resync.
        for snapshot in snapshots[0:5]:
            mock_metadata(old_sync_job, snapshot, 'complete', 1048576)
        for snapshot in snapshots[5:10]:
            mock_metadata(old_sync_job, snapshot, 'error', None)

        # The latest job synced five files successfully and ran into three problems.
        for snapshot in snapshots[10:15]:
            mock_metadata(latest_sync_job, snapshot, 'complete', 1048576)
        stalled = snapshots[15]
        errored = snapshots[16]
        size_discrepancy = snapshots[17]
        mock_metadata(latest_sync_job, stalled, 'streaming', None)
        mock_metadata(latest_sync_job, errored, 'error', None)
        mock_metadata(latest_sync_job, size_discrepancy, 'complete', 65536)

        schema = app.config['RDS_SCHEMA_METADATA']

        with capture_app_logs(app):
            assert rds.fetch(
                f'SELECT count(*) FROM {schema}.canvas_sync_job_status'
            )[0]['count'] == 18
            with mock_s3(app):
                result = ResyncCanvasSnapshots().run_wrapped()
            assert 'Canvas snapshot resync job dispatched to workers' in result
            assert_background_job_status('resync')
            assert f"Dispatched S3 resync of snapshot {stalled['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {errored['filename']}" in caplog.text
            assert f"Dispatched S3 resync of snapshot {size_discrepancy['filename']}" in caplog.text
            assert '3 successful dispatches, 0 failures' in caplog.text

        assert rds.fetch(
            f'SELECT count(*) FROM {schema}.canvas_sync_job_status'
        )[0]['count'] == 21
        resync_results = rds.fetch(
            f"SELECT * FROM {schema}.canvas_sync_job_status WHERE job_id LIKE 'resync%'"
        )
        assert len(resync_results) == 3

        urls = []
        for r in resync_results:
            assert r['job_id'].startswith('resync_')
            assert r['filename']
            assert r['canvas_table']
            assert r['created_at']
            assert r['updated_at']
            urls.append(r['source_url'])
        assert stalled['url'] in urls
        assert errored['url'] in urls
        assert size_discrepancy['url'] in urls
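Reading the fixture setup against the assertions: 18 status rows exist before the run (ten from the old job, eight from the latest), only the three problem snapshots from the latest job are re-dispatched, and the total lands at 21. A hedged sketch of the selection rule this implies; the idea of comparing the size columns follows the test data, but the function and field names are assumptions rather than the project's code:

    def needs_resync(row):
        """Given a canvas_sync_job_status row from the most recent sync job,
        decide whether its snapshot should be re-dispatched."""
        if row['status'] != 'complete':
            return True  # covers the stalled ('streaming') and 'error' rows
        # A nominally complete transfer still qualifies if the sizes disagree.
        return row['destination_size'] != row['source_size']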
Example #4
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            # Sync only complete (non-partial) snapshots of the requests table.
            return snapshot['table'] == 'requests' and snapshot['partial'] is False

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )

            key_components = [
                app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'],
                snapshot['table'], snapshot['filename']
            ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error(f"Failed to dispatch S3 sync of snapshot {snapshot['filename']}")
                metadata.update_canvas_sync_status(
                    job_id, key, 'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info(f"Dispatched S3 sync of snapshot {snapshot['filename']}")
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
            requests_prefix = app.config['LOCH_S3_CANVAS_DATA_PATH_HISTORICAL'] + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
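The cleanup step relies on delete_objects_with_prefix removing everything under the requests prefix except objects whose filenames are in the whitelist. That behavior is inferred from the call site, not shown on this page; a hedged sketch of the filtering it implies:

    # Assumed semantics of s3.delete_objects_with_prefix, inferred from the
    # call above: keep any key that ends with a whitelisted filename.
    def keys_to_delete(keys_under_prefix, whitelist):
        return [
            key for key in keys_under_prefix
            if not any(key.endswith(filename) for filename in whitelist)
        ]

    assert keys_to_delete(
        ['historical/requests/requests-00001.gz', 'historical/requests/requests-00000.gz'],
        whitelist=['requests-00001.gz'],
    ) == ['historical/requests/requests-00000.gz']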
Example #5
    def run(self, cleanup=True):
        job_id = self.generate_job_id()
        app.logger.info(f'Starting Canvas snapshot sync job... (id={job_id})')

        snapshot_response = canvas_data.get_snapshots()
        if not snapshot_response:
            raise BackgroundJobError(
                'Error retrieving Canvas data snapshots, aborting job.')
        snapshots = snapshot_response.get('files', [])

        def should_sync(snapshot):
            # For tables other than requests, sync all snapshots.
            # For the requests table, sync snapshots that are partial or later
            # than the configured cutoff date.
            def after_cutoff_date(url):
                match = re.search(r'requests/(20\d{6})', url)
                return match is not None and (
                    match[1] >= app.config['LOCH_CANVAS_DATA_REQUESTS_CUTOFF_DATE'])

            return (snapshot['table'] != 'requests'
                    or snapshot['partial'] is True
                    or after_cutoff_date(snapshot['url']))

        snapshots_to_sync = [s for s in snapshots if should_sync(s)]
        app.logger.info(
            f'Will sync {len(snapshots_to_sync)} of {len(snapshots)} available files from Canvas Data.'
        )

        success = 0
        failure = 0

        for snapshot in snapshots_to_sync:
            metadata.create_canvas_sync_status(
                job_id=job_id,
                filename=snapshot['filename'],
                canvas_table=snapshot['table'],
                source_url=snapshot['url'],
            )
            if snapshot['table'] == 'requests':
                key_components = [
                    berkeley.s3_canvas_data_path_current_term(),
                    snapshot['table'], snapshot['filename']
                ]
            else:
                key_components = [
                    get_s3_canvas_daily_path(), snapshot['table'],
                    snapshot['filename']
                ]

            key = '/'.join(key_components)
            response = dispatch('sync_file_to_s3',
                                data={
                                    'canvas_sync_job_id': job_id,
                                    'url': snapshot['url'],
                                    'key': key
                                })

            if not response:
                app.logger.error(f"Failed to dispatch S3 sync of snapshot {snapshot['filename']}")
                metadata.update_canvas_sync_status(
                    job_id, key, 'error',
                    details=f'Failed to dispatch: {response}')
                failure += 1
            else:
                app.logger.info(f"Dispatched S3 sync of snapshot {snapshot['filename']}")
                success += 1

        if cleanup:
            app.logger.info('Will remove obsolete snapshots from S3.')
            current_snapshot_filenames = [s['filename'] for s in snapshots_to_sync]
            requests_prefix = berkeley.s3_canvas_data_path_current_term() + '/requests'
            delete_result = s3.delete_objects_with_prefix(
                requests_prefix, whitelist=current_snapshot_filenames)
            if not delete_result:
                app.logger.error('Cleanup of obsolete snapshots failed.')
        return f'Canvas snapshot sync job dispatched to workers ({success} successful dispatches, {failure} failures).'
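A quick, self-contained check of the cutoff-date test inside should_sync above: the regex captures an eight-digit YYYYMMDD stamp embedded in the snapshot URL, and plain string comparison is safe because zero-padded dates sort lexicographically. The cutoff value and URLs here are illustrative:

    import re

    def after_cutoff_date(url, cutoff='20180101'):  # illustrative cutoff
        match = re.search(r'requests/(20\d{6})', url)
        return match is not None and match[1] >= cutoff

    assert after_cutoff_date('https://example.com/requests/20180320T160000/part-0.gz')
    assert not after_cutoff_date('https://example.com/requests/20171231T000000/part-0.gz')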