Example #1
    def test_remove_obsolete_files(self, app, caplog, cleanup_s3):
        """Removes files from S3 following prefix and whitelist rules."""
        caplog.set_level(logging.INFO)
        with capture_app_logs(app):
            prefix1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/001'
            prefix2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/002'

            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XX.html',
                prefix1 + '/xx/sonnet-xx.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XXI.html',
                prefix1 + '/xxi/sonnet-xxi.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XXII.html',
                prefix1 + '/xxii/sonnet-xxii.html')
            assert s3.upload_from_url(
                'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html',
                prefix2 + '/xlv/sonnet-xlv.html')

            whitelist = ['sonnet-xxi.html', 'sonnet-xxii.html']
            assert s3.delete_objects_with_prefix(prefix1, whitelist) is True

            assert f'3 key(s) matching prefix "{prefix1}"' in caplog.text
            assert '2 key(s) in whitelist' in caplog.text
            assert 'will delete 1 object(s)' in caplog.text

            assert s3.object_exists(prefix1 + '/xx/sonnet-xx.html') is False
            assert s3.object_exists(prefix1 + '/xxi/sonnet-xxi.html') is True
            assert s3.object_exists(prefix1 + '/xxii/sonnet-xxii.html') is True
            assert s3.object_exists(prefix2 + '/xlv/sonnet-xlv.html') is True
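The prefix/whitelist delete helper exercised above isn't reproduced on this page. A minimal sketch of what s3.delete_objects_with_prefix could look like written directly against boto3 follows; the explicit bucket argument and the single delete_objects call are assumptions for illustration, not the project's actual API:

import boto3


def delete_objects_with_prefix(bucket, prefix, whitelist, client=None):
    # List every key under the prefix, keep whitelisted basenames, delete the rest.
    # For simplicity this assumes fewer than 1000 keys per delete_objects call.
    client = client or boto3.client('s3')
    keys = []
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        keys.extend(obj['Key'] for obj in page.get('Contents', []))
    to_delete = [k for k in keys if k.split('/')[-1] not in whitelist]
    if to_delete:
        client.delete_objects(
            Bucket=bucket,
            Delete={'Objects': [{'Key': k} for k in to_delete]},
        )
    return True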
Example #2
    def test_file_upload_and_delete(self, app, cleanup_s3):
        """Can upload and delete files in S3."""
        url1 = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
        key1 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'

        url2 = 'http://shakespeare.mit.edu/Poetry/sonnet.LXII.html'
        key2 = app.config['LOCH_S3_PREFIX_TESTEXT'] + '/00002/sonnet-xlii.html'

        assert s3.object_exists(key1) is False
        assert s3.upload_from_url(url1, key1)['ContentLength'] == 767
        assert s3.object_exists(key1) is True
        assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] +
                                       '/00001') == [key1]

        assert s3.object_exists(key2) is False
        assert s3.upload_from_url(url2, key2)['ContentLength'] == 743
        assert s3.object_exists(key2) is True
        assert s3.get_keys_with_prefix(app.config['LOCH_S3_PREFIX_TESTEXT'] +
                                       '/00002') == [key2]

        client = s3.get_client()
        contents1 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'],
                                      Key=key1)['Body'].read().decode('utf-8')
        assert 'These present-absent with swift motion slide' in contents1
        contents2 = client.get_object(Bucket=app.config['LOCH_S3_BUCKET'],
                                      Key=key2)['Body'].read().decode('utf-8')
        assert 'Beated and chopp\'d with tann\'d antiquity' in contents2
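For reference, the object_exists and get_keys_with_prefix helpers used in this test can be approximated in a few lines of boto3. This is a sketch under assumptions (the real helpers presumably read the bucket from app config rather than taking it as a parameter):

import boto3
from botocore.exceptions import ClientError


def object_exists(bucket, key, client=None):
    # head_object raises ClientError (404) when the key is absent.
    client = client or boto3.client('s3')
    try:
        client.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError:
        return False


def get_keys_with_prefix(bucket, prefix, client=None):
    # Returns the full keys of all objects stored under the prefix.
    client = client or boto3.client('s3')
    response = client.list_objects_v2(Bucket=bucket, Prefix=prefix)
    return [obj['Key'] for obj in response.get('Contents', [])]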
Example #3
    def test_s3_upload_error_handling(self, app, caplog, bad_bucket):
        """Handles and logs connection errors on S3 upload."""
        with capture_app_logs(app):
            url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
            key = app.config[
                'LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
            with pytest.raises(ValueError):
                s3.upload_from_url(url, key)
            # Assertions inside the pytest.raises block would never execute once
            # the call raises, so check the captured logs after it exits.
            assert 'Error on S3 upload' in caplog.text
            assert "the bucket 'not-a-bucket-nohow' does not exist, or is forbidden for access" in caplog.text
Example #4
    def test_source_url_error_handling(self, app, caplog):
        """Handles and logs connection errors to source URL."""
        with capture_app_logs(app):
            url = 'http://shakespeare.mit.edu/Poetry/sonnet.XLV.html'
            key = app.config[
                'LOCH_S3_PREFIX_TESTEXT'] + '/00001/sonnet-xlv.html'
            responses.add(responses.GET,
                          url,
                          status=500,
                          body='{"message": "Internal server error."}')
            with pytest.raises(ConnectionError):
                s3.upload_from_url(url, key)
            assert 'Received unexpected status code, aborting S3 upload' in caplog.text
            assert 'status=500' in caplog.text
            assert 'body={"message": "Internal server error."}' in caplog.text
            assert f'url={url}' in caplog.text
            assert f'key={key}' in caplog.text
Example #5
    def run(self, url, key, canvas_sync_job_id=None):
        if canvas_sync_job_id:
            update_canvas_sync_status(canvas_sync_job_id, key, 'started')
        if s3.object_exists(key):
            app.logger.info(f'Key {key} exists, skipping upload')
            if canvas_sync_job_id:
                update_canvas_sync_status(canvas_sync_job_id, key, 'duplicate')
            return False
        else:
            app.logger.info(f'Key {key} does not exist, starting upload')
            try:

                def update_streaming_status(headers):
                    update_canvas_sync_status(
                        canvas_sync_job_id,
                        key,
                        'streaming',
                        source_size=headers.get('Content-Length'))

                response = s3.upload_from_url(
                    url, key, on_stream_opened=update_streaming_status)
                if response and canvas_sync_job_id:
                    destination_size = response.get('ContentLength')
                    update_canvas_sync_status(
                        canvas_sync_job_id,
                        key,
                        'complete',
                        destination_size=destination_size)
                    create_canvas_snapshot(key, size=destination_size)
                return True
            except (ClientError, ConnectionError, ValueError) as e:
                if canvas_sync_job_id:
                    update_canvas_sync_status(canvas_sync_job_id,
                                              key,
                                              'error',
                                              details=str(e))
                return False
Example #6
    def process_archives(self, frequency, datestamp, job_id):
        s3_key = app.config['LOCH_S3_PIAZZA_DATA_PATH']
        self.sid = app.config['PIAZZA_API_SID']  # Piazza school ID for Berkeley
        self.session_id = app.config['PIAZZA_API_SESSIONID']  # a 'random string', but still read from config
        self.headers = {
            'Content-Type': 'application/json',
            'CSRF-Token': self.session_id,
        }
        try:
            list_of_archives = self.get_list_of_archives(self.headers)
            archives_to_process = self.select_archives_by_type_and_date(list_of_archives, frequency, datestamp)
            if not archives_to_process:
                app.logger.debug(f'{frequency}/{datestamp}: no archives found for these criteria')
                return f'{frequency}/{datestamp}: no archives found for these criteria'
            for file_number, archive_file in enumerate(archives_to_process):
                download_url = self.piazza_api('school.generate_url', self.headers, {'sid': self.sid, 'name': archive_file['name']})
                download_url = download_url.text
                download_url = json.loads(download_url)['result']
                app.logger.debug('Download URL: ' + download_url)
                piazza_file_name = archive_file['name']
                # piazza_file_name is like 'daily_2020-08-14.zip' or 'full_2020-08-14.zip'
                # in s3 it will end up in e.g. .../piazza-data/daily/2020/08/14/daily_2020-08-14.zip
                parts = '/'.join(split(r'[._\-]', piazza_file_name)[0:4])
                s3_file = f'{s3_key}/{parts}/{piazza_file_name}.zip'

                def update_streaming_status(headers):
                    update_background_job_status(job_id, 'streaming', details=f"{s3_file}, size={headers.get('Content-Length')}")

                response = s3.upload_from_url(download_url, s3_file, on_stream_opened=update_streaming_status)
                if response and job_id:
                    destination_size = response.get('ContentLength')
                    update_background_job_status(job_id, 'stream complete', details=f'{s3_file}, stream complete, size={destination_size}')
        except Exception:
            # let the people upstairs know, they're in charge
            raise
        return ', '.join(f"{a['name']}: {a['size']} bytes" for a in archives_to_process)
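All six examples lean on s3.upload_from_url, whose implementation isn't reproduced here. A minimal sketch with requests and boto3 that matches the behaviors the tests and callers above rely on (a status-code check raising ConnectionError, ClientError surfaced as ValueError, an on_stream_opened callback, and a returned ContentLength) might look like the following; the bucket argument and the exact error wording are assumptions:

import boto3
import requests
from botocore.exceptions import ClientError


def upload_from_url(url, bucket, key, on_stream_opened=None, client=None):
    # Stream the source URL straight into S3 without buffering it on disk.
    client = client or boto3.client('s3')
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            raise ConnectionError(
                'Received unexpected status code, aborting S3 upload '
                f'(status={response.status_code}, body={response.text}, '
                f'url={url}, key={key})')
        if on_stream_opened:
            # Gives callers a chance to record the source Content-Length.
            on_stream_opened(response.headers)
        try:
            client.upload_fileobj(response.raw, bucket, key)
        except ClientError as e:
            raise ValueError(f'Error on S3 upload: {e}')
    # Return the stored object's metadata so callers can read ContentLength.
    return client.head_object(Bucket=bucket, Key=key)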