def test_call_s3select_incomplete_response(self):
    """
    Verify that an incomplete S3 Select response is detected and
    IncompleteResultException is raised, while the select call itself
    still receives the expected arguments.
    """
    bucket = "bucket"
    key = ".quilt/packages/manifest_hash"
    expected_sql = "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s"
    expected_args = {
        'Bucket': bucket,
        'Key': key,
        'Expression': expected_sql,
        'ExpressionType': 'SQL',
        'InputSerialization': {
            'CompressionType': 'NONE',
            'JSON': {'Type': 'LINES'}
        },
        'OutputSerialization': {'JSON': {'RecordDelimiter': '\n'}},
    }
    mock_s3 = boto3.client('s3')
    # Stub select_object_content to return a response missing its End event.
    with patch.object(
        mock_s3,
        'select_object_content',
        return_value=self.s3response_incomplete
    ) as select_mock:
        with self.assertRaises(IncompleteResultException):
            query_manifest_content(
                mock_s3,
                bucket=bucket,
                key=key,
                sql_stmt=expected_sql,
            )
        select_mock.assert_called_once_with(**expected_args)
def test_call_s3select(self):
    """
    Verify that parameters are passed through to S3 Select unchanged
    when no prefix is involved.
    """
    bucket = "bucket"
    key = ".quilt/packages/manifest_hash"
    expected_sql = "SELECT SUBSTRING(s.logical_key, 1) AS logical_key FROM s3object s"
    expected_args = {
        'Bucket': bucket,
        'Key': key,
        'Expression': expected_sql,
        'ExpressionType': 'SQL',
        'InputSerialization': {
            'CompressionType': 'NONE',
            'JSON': {'Type': 'LINES'}
        },
        'OutputSerialization': {'JSON': {'RecordDelimiter': '\n'}},
    }
    mock_s3 = boto3.client('s3')
    # Stub select_object_content with a canned, complete response.
    with patch.object(
        mock_s3,
        'select_object_content',
        return_value=self.s3response
    ) as select_mock:
        query_manifest_content(
            mock_s3,
            bucket=bucket,
            key=key,
            sql_stmt=expected_sql,
        )
        select_mock.assert_called_once_with(**expected_args)
def select_package_stats(s3_client, bucket, manifest_key):
    """
    Use S3 Select to generate file stats for a package manifest.

    Args:
        s3_client: boto3 S3 client used to run the select query
        bucket: bucket containing the manifest
        manifest_key: object key of the manifest

    Returns:
        dict with integer 'total_bytes' and 'total_files' entries,
        or None if the stats could not be computed.
        (The original annotation said ``-> str``; the code has always
        returned a dict or None, so the annotation was removed.)
    """
    logger_ = get_quilt_logger()
    try:
        raw_stats = query_manifest_content(
            s3_client,
            bucket=bucket,
            key=manifest_key,
            sql_stmt=SELECT_PACKAGE_STATS
        ).read()
        if raw_stats:
            stats = json.loads(raw_stats)
            # Validate explicitly rather than with `assert`, which is
            # stripped under `python -O` and would silently disable
            # this check. A malformed payload is routed through the
            # same logging/None path as before.
            if not isinstance(stats.get('total_bytes'), int) \
                    or not isinstance(stats.get('total_files'), int):
                raise ValueError(f"Malformed package stats: {stats!r}")
            return stats
        # Empty select result: fall through to the explicit None below
        # (previously an implicit fall-off-the-end return).
    except (
        # ValueError covers json.JSONDecodeError and the shape check above.
        ValueError,
        botocore.exceptions.ClientError,
        KeyError,
    ):
        logger_.exception("Unable to compute package stats via S3 select")
    return None
def select_manifest_meta(s3_client, bucket: str, key: str):
    """
    Wrapper for retry and returning a string.

    Runs the package-metadata select against the manifest object.

    Args:
        s3_client: boto3 S3 client used to run the select query
        bucket: bucket containing the manifest
        key: object key of the manifest

    Returns:
        The raw payload read from the S3 Select response, or None if the
        select call failed with a ClientError.
    """
    logger_ = get_quilt_logger()
    try:
        raw = query_manifest_content(
            s3_client,
            bucket=bucket,
            key=key,
            sql_stmt=SELECT_PACKAGE_META
        )
        return raw.read()
    except botocore.exceptions.ClientError as cle:
        # Use the shared quilt logger (consistent with select_package_stats)
        # instead of a bare print(); .exception also records the traceback.
        logger_.exception("Unable to S3 select manifest: %s", cle)
        return None
def lambda_handler(request):
    """
    Parse a manifest to return a folder-like view of its contents (logical keys).

    Returns:
        JSON response
    """
    bucket = request.args['bucket']
    key = request.args['manifest']
    prefix = request.args.get('prefix')
    logical_key = request.args.get('logical_key')
    access_key = request.args.get('access_key')
    secret_key = request.args.get('secret_key')
    session_token = request.args.get('session_token')
    allow_anonymous_access = bool(os.getenv('ALLOW_ANONYMOUS_ACCESS'))

    # If credentials are passed in, use them for the client. If no
    # credentials are supplied, test that the manifest object is publicly
    # accessible. If so, create an s3 client using the underlying IAM
    # role's permissions.
    if access_key and secret_key and session_token:
        s3_client = create_s3_client(
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            aws_session_token=session_token,
        )
    elif (allow_anonymous_access and access_key is None
          and secret_key is None and session_token is None):
        # Probe the target key anonymously; if it is not publicly
        # readable, the call below raises an exception and we return a
        # 403 response.
        anonymous_client = boto3.client(
            's3', config=Config(signature_version=UNSIGNED))
        try:
            anonymous_client.head_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError as error:
            if error.response.get('Error'):
                if error.response['Error']['Code'] == '403':
                    return make_json_response(
                        403,
                        {
                            'title': 'Access Denied',
                            'detail': f"Access denied reading manifest: {key}"
                        })
            raise error
        # Object is public: use the default S3 client configuration
        s3_client = boto3.client('s3')
    else:
        return make_json_response(
            401,
            {
                'title': 'Incomplete credentials',
                'detail': "access_key, secret_key and session_token are required"
            })
    assert s3_client

    if logical_key is not None:
        # Get details of a single file in the package
        sql_stmt = f"SELECT s.* FROM s3object s WHERE s.logical_key = '{sql_escape(logical_key)}' LIMIT 1"
        response_data = json.load(
            query_manifest_content(
                s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt))
    else:
        # Call s3 select to fetch only logical keys matching the desired
        # prefix (folder path)
        prefix_length = len(prefix) if prefix is not None else 0
        sql_stmt = (
            f"SELECT SUBSTRING(s.logical_key, {prefix_length + 1}) AS logical_key"
            ", s.\"size\", s.physical_keys[0] as physical_key FROM s3object s"
        )
        if prefix:
            sql_stmt += f" WHERE SUBSTRING(s.logical_key, 1, {prefix_length}) = '{sql_escape(prefix)}'"
        result = query_manifest_content(
            s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt)

        # Parse the response into a logical folder view
        df = pd.read_json(result, lines=True)
        response_data = file_list_to_folder(df)

        # Fetch package-level or directory-level metadata
        if prefix:
            sql_stmt = f"SELECT s.meta FROM s3object s WHERE s.logical_key = '{sql_escape(prefix)}'"
        else:
            sql_stmt = "SELECT s.* FROM s3object s WHERE s.logical_key is NULL"
        result = query_manifest_content(
            s3_client, bucket=bucket, key=key, sql_stmt=sql_stmt)
        meta = json.load(result) if result else {}
        response_data.update(dict(meta=meta))

    return make_json_response(200, {'contents': response_data})