def test_s3_uri_ordering():
    s3_uri_a = S3URI.parse_s3_uri("s3://s3-bucket/path/a")
    s3_uri_b = S3URI.parse_s3_uri("s3://s3-bucket/path/b")
    s3_uri_1 = S3URI.parse_s3_uri("s3://s3-bucket/path/1")
    s3_uri_9 = S3URI.parse_s3_uri("s3://s3-bucket/path/9")
    sorted_uris = sorted([s3_uri_9, s3_uri_b, s3_uri_1, s3_uri_a])
    assert sorted_uris == [s3_uri_1, s3_uri_9, s3_uri_a, s3_uri_b]

def test_s3_objects_glob(s3_bucket_name, s3_temp_objects):
    s3_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    s3_objects = list(s3_uri.s3_objects(glob_pattern="**/*.tif"))
    s3_keys = sorted([obj.key for obj in s3_objects])
    s3_temp_keys = sorted([obj.key for obj in s3_temp_objects])
    s3_temp_tifs = [key for key in s3_temp_keys if key.endswith(".tif")]
    assert s3_keys == s3_temp_tifs

def s3_file_info(s3_uri: Union[S3URI, str], s3_client: BaseClient = None) -> S3Info:
    """
    Collect data from an S3 HEAD request for an S3URI

    :param s3_uri: a fully qualified S3 URI for the s3 object to read
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: an S3Info object with HEAD data on success;
        on failure the S3Info object has no HEAD data
    """
    if s3_client is None:
        s3_client = s3_io_client()
    if isinstance(s3_uri, str):
        s3_uri = S3URI(s3_uri)
    s3_info = S3Info(s3_uri=s3_uri)
    try:
        s3_head = s3_client.head_object(Bucket=s3_uri.bucket, Key=s3_uri.key)
        if response_success(s3_head):
            # LastModified is a datetime.datetime
            s3_info.last_modified = s3_head["LastModified"]
            s3_info.s3_size = int(s3_head["ContentLength"])
            LOGGER.debug("Success S3URI info: %s", s3_uri)
    except ClientError as err:
        LOGGER.debug("Failed S3URI info: %s", s3_uri)
        LOGGER.debug(err)
    return s3_info

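# A minimal usage sketch for s3_file_info; the bucket and key below are
# hypothetical placeholders, and it assumes ambient AWS credentials so that
# the default client from s3_io_client() can be created.
def example_s3_file_info() -> None:
    s3_info = s3_file_info("s3://hypothetical-bucket/data/scene.tif")
    # on a failed HEAD request, the S3Info fields remain unset
    if s3_info.last_modified is not None:
        print(s3_info.s3_size, s3_info.last_modified.isoformat())
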
async def put_s3_content(
    data_file: str, s3_uri: str, s3_client: AioBaseClient
) -> Optional[str]:
    """
    Write a file to an s3 object

    :param data_file: a data file
    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the s3 URI on success
    """
    s3_uri = S3URI(s3_uri)
    try:
        async with aiofiles.open(data_file, "rb") as fd:
            file_bytes = await fd.read()
        response = await s3_client.put_object(
            Bucket=s3_uri.bucket, Key=s3_uri.key, Body=file_bytes
        )
        success = response_success(response)
        if success:
            exists_waiter = s3_client.get_waiter("object_exists")
            await exists_waiter.wait(Bucket=s3_uri.bucket, Key=s3_uri.key)
            return str(s3_uri)
    except ClientError as err:
        LOGGER.error("Failed S3 PUT to: %s", s3_uri)
        LOGGER.error(err)

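# A hedged usage sketch for the async put_s3_content; it assumes
# aiobotocore >= 2.x (where get_session lives in aiobotocore.session), and
# the file path and bucket name are hypothetical placeholders.
async def example_put_s3_content_async() -> None:
    from aiobotocore.session import get_session

    session = get_session()
    async with session.create_client("s3") as s3_client:
        s3_uri = await put_s3_content(
            data_file="/tmp/data.bin",
            s3_uri="s3://hypothetical-bucket/data.bin",
            s3_client=s3_client,
        )
        # put_s3_content returns the s3 URI string on success, else None
        print(s3_uri)
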
def put_s3_content(
    data_file: str, s3_uri: str, s3_client: BaseClient = None
) -> Optional[str]:
    """
    Write a file to an s3 URI

    :param data_file: a data file
    :param s3_uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the s3 URI on success
    """
    if s3_client is None:
        s3_client = s3_io_client()
    s3_uri = S3URI(s3_uri)
    try:
        with open(data_file, "rb") as fd:
            response = s3_client.put_object(Bucket=s3_uri.bucket, Key=s3_uri.key, Body=fd)
            success = response_success(response)
            if success:
                # Use a boto3 waiter to confirm it worked
                exists_waiter = s3_client.get_waiter("object_exists")
                exists_waiter.wait(Bucket=s3_uri.bucket, Key=s3_uri.key)
                return str(s3_uri)
    except ClientError as err:
        LOGGER.error("Failed S3 PUT to: %s", s3_uri)
        LOGGER.error(err)

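# A minimal usage sketch for the sync put_s3_content; when s3_client is
# omitted, a default client is built via s3_io_client(). The local file and
# bucket are hypothetical placeholders.
def example_put_s3_content() -> None:
    s3_uri = put_s3_content("/tmp/data.bin", "s3://hypothetical-bucket/data.bin")
    if s3_uri is None:
        print("upload failed; see the error log")
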
def test_s3_objects_prefix(s3_bucket_name, s3_temp_objects, s3_temp_dir):
    # create a mock s3 bucket with a couple of files in it
    s3_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    s3_prefix = s3_temp_dir.split("/")[0]
    s3_objects = list(s3_uri.s3_objects(prefix=s3_prefix))
    s3_keys = sorted([obj.key for obj in s3_objects])
    s3_temp_keys = sorted([obj.key for obj in s3_temp_objects])
    assert s3_keys == s3_temp_keys

def test_s3_uri_has_no_key(s3_bucket_name):
    uri = f"s3://{s3_bucket_name}"
    s3_uri = S3URI.parse_s3_uri(uri)
    assert isinstance(s3_uri, S3URI)
    assert s3_uri.bucket == s3_bucket_name
    assert s3_uri.key == ""
    assert s3_uri.key_path == ""
    assert s3_uri.key_file == ""

def geojsons_s3_dump(
    geojson_features: List[Dict], s3uri: str, s3_client: BaseClient = None
) -> Optional[str]:
    """
    Write GeoJSON Text Sequence files to an s3 URI

    [GeoJSON Text Sequences](https://tools.ietf.org/html/rfc8142) are lines
    of geojson features that are designed for streaming operations on large
    datasets. These files can be loaded by geopandas, using fiona
    `driver="GeoJSONSeq"`, which can be auto-detected. For example:

    .. code-block::

        import geopandas as gpd
        s3_uri = "s3://your-bucket/prefix/input.geojsons"
        gdf = gpd.read_file(s3_uri)

    :param geojson_features: a list of geojson features; from any feature
        collection, this is geojson_collection["features"]
    :param s3uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the s3 URI on success
    """
    if s3_client is None:
        s3_client = s3_io_client()
    s3_uri = S3URI(s3uri)
    success = False
    tmp_file = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as o_file:
            tmp_file = o_file.name
            with open(o_file.name, "w") as fd:
                for feature in geojson_features:
                    json.dump(feature, fd)
                    fd.write("\n")
            o_file.flush()
        s3_obj = put_s3_content(data_file=tmp_file, s3_uri=str(s3_uri), s3_client=s3_client)
        if s3_obj:
            success = True
    finally:
        if tmp_file:
            os.unlink(tmp_file)
    if success:
        LOGGER.info("Saved GeoJSONSeq to %s", str(s3_uri))
        return str(s3_uri)
    else:
        LOGGER.error("Failed to save GeoJSONSeq to %s", s3_uri)

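# A hedged round-trip sketch for geojsons_s3_dump; the feature and bucket are
# hypothetical, and geojsons_s3_load is the matching reader exercised by the
# tests below.
def example_geojsons_s3_dump() -> None:
    features = [
        {
            "type": "Feature",
            "geometry": {"type": "Point", "coordinates": [0.0, 0.0]},
            "properties": {},
        }
    ]
    s3_uri = geojsons_s3_dump(features, "s3://hypothetical-bucket/tmp.geojsons")
    if s3_uri:
        assert geojsons_s3_load(s3_uri) == features
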
def test_s3_objects_glob_hundreds(s3_bucket_name, s3_temp_1000s_objects):
    # create a mock s3 bucket with enough files to exceed the default
    # MaxKeys limit (1000) on a list filter; S3URI.s3_objects should
    # paginate through all of them without any key limit.
    s3_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    s3_objects = list(s3_uri.s3_objects(glob_pattern="**/*.tif"))
    s3_keys = sorted([obj.key for obj in s3_objects])
    s3_temp_keys = sorted([obj.key for obj in s3_temp_1000s_objects])
    s3_temp_tifs = [key for key in s3_temp_keys if key.endswith(".tif")]
    assert s3_keys == s3_temp_tifs

def test_s3_exists_for_success(s3_uri_object, mocker):
    # s3_uri_object fixture exists
    s3_uri = S3URI(s3_uri_object.s3_uri)
    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    s3_uri_exists = s3_uri.s3_exists()
    # the s3 resource is used once to check the s3 object exists
    assert boto3.client.call_count == 0
    assert boto3.resource.call_count == 1
    assert s3_uri_exists is True

async def test_s3_aio_files_info(
    aio_aws_s3_client, aio_s3_object_uri, aio_s3_object_text
):
    s3_uri: str = aio_s3_object_uri
    s3_text: str = aio_s3_object_text
    s3_files = await s3_files_info([s3_uri], s3_client=aio_aws_s3_client)
    for s3_info in s3_files:
        assert isinstance(s3_info, S3Info)
        assert s3_info.s3_uri == S3URI(s3_uri)
        assert s3_info.s3_size == len(s3_text)
        assert isinstance(s3_info.last_modified, datetime)

async def test_s3_aio_geojsons(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    s3_uri = S3URI(f"s3://{aio_s3_bucket}/tmp.geojsons")
    result = await geojsons_s3_dump(
        geojson_features, str(s3_uri), s3_client=aio_aws_s3_client
    )
    assert result == s3_uri.s3_uri
    data = await geojsons_s3_load(str(s3_uri), s3_client=aio_aws_s3_client)
    assert data == geojson_features

def test_s3_file_derivatives(s3_temp_file, s3_temp_objects):
    s3_uri = S3URI.parse_s3_uri(s3_temp_file.s3_uri)
    file_path = Path(s3_uri.key_file)
    s3_objects = list(s3_uri.s3_derivatives())
    s3_derivative_keys = set([obj.key for obj in s3_objects])
    assert s3_derivative_keys
    for key in s3_derivative_keys:
        assert key != s3_uri.key
        assert file_path.stem in Path(key).stem
    # derivative keys match anything in key-path/**/stem*.*
    s3_temp_keys = [obj.key for obj in s3_temp_objects]
    key_intersect = set(s3_temp_keys).intersection(s3_derivative_keys)
    assert s3_derivative_keys == key_intersect

def test_s3_object_summary_for_success(s3_uri_object, mocker):
    # s3_uri_object fixture exists
    s3_uri = S3URI(s3_uri_object.s3_uri)
    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    summary = s3_uri.s3_object_summary()
    # the s3 resource is used once to check the s3 object exists
    assert boto3.client.call_count == 0
    assert boto3.resource.call_count == 1
    assert summary.__class__.__name__ == "s3.ObjectSummary"
    assert summary.bucket_name == s3_uri.bucket
    assert summary.key == s3_uri.key
    assert isinstance(summary.last_modified, datetime.datetime)

def test_geojsons_io(geojson_features, aws_s3_client, s3_bucket, mocker):
    assert_bucket_200(s3_bucket, aws_s3_client)
    spy_client = mocker.spy(boto3, "client")
    spy_resource = mocker.spy(boto3, "resource")
    s3_uri = S3URI(f"s3://{s3_bucket}/tmp.geojsons")
    result = geojsons_s3_dump(geojson_features, s3_uri.s3_uri)
    assert result == s3_uri.s3_uri
    # the s3 client is used once to upload the s3 object data
    assert spy_client.call_count == 1
    assert spy_resource.call_count == 0
    assert_object_200(bucket=s3_bucket, key=s3_uri.key, s3_client=aws_s3_client)
    data = geojsons_s3_load(s3_uri.s3_uri)
    assert data == geojson_features
    # the s3 client is used to read the s3 object data
    assert spy_client.call_count == 2
    assert spy_resource.call_count == 0

def test_s3_head_request_for_success(s3_uri_object, mocker):
    # s3_uri_object fixture exists
    s3_uri = S3URI(s3_uri_object.s3_uri)
    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    s3_head = s3_uri.s3_head_request()
    # the s3 client is used once to check the s3 object exists
    assert boto3.client.call_count == 1
    assert boto3.resource.call_count == 0
    assert isinstance(s3_head, dict)
    assert list(s3_head.keys()) == [
        "ResponseMetadata",
        "LastModified",
        "ContentLength",
        "ETag",
        "Metadata",
    ]

async def test_s3_aio_geojsons_files(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    s3_uris = [
        S3URI(f"s3://{aio_s3_bucket}/tmp_{i:03d}.geojsons").s3_uri for i in range(10)
    ]
    for s3_uri in s3_uris:
        result = await geojsons_s3_dump(
            geojson_features, s3_uri, s3_client=aio_aws_s3_client
        )
        assert result == s3_uri
    data = await s3_load_files(s3_uris, s3_client=aio_aws_s3_client)
    assert sorted(data.keys()) == s3_uris
    for s3_uri, s3_data in data.items():
        assert s3_data == geojson_features

async def test_s3_aio_file_info(
    aio_aws_s3_client, aio_s3_object_uri, aio_s3_object_text
):
    s3_uri: str = aio_s3_object_uri
    s3_text: str = aio_s3_object_text
    s3_info = await s3_file_info(s3_uri, s3_client=aio_aws_s3_client)
    assert isinstance(s3_info, S3Info)
    assert s3_info.s3_uri == S3URI(s3_uri)
    assert s3_info.s3_size == len(s3_text)
    assert isinstance(s3_info.last_modified, datetime)
    s3_dict = s3_info.dict
    assert isinstance(s3_dict, Dict)
    assert s3_dict["s3_uri"] == s3_uri
    assert s3_dict["s3_size"] == len(s3_text)
    # last-modified is an iso8601 string
    assert isinstance(s3_dict["last_modified"], str)
    last_modified = datetime.fromisoformat(s3_dict["last_modified"])
    assert isinstance(last_modified, datetime)

async def get_s3_content(s3_uri: str, s3_client: AioBaseClient):
    """
    Read an s3 object

    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the data from the s3 object
    """
    try:
        s3_uri = S3URI(s3_uri)
        LOGGER.info("Read S3URI: %s", s3_uri.s3_uri)
        content_object = await s3_client.get_object(
            Bucket=s3_uri.bucket, Key=s3_uri.key
        )
        file_content = await content_object["Body"].read()
        return file_content.decode("utf-8")
    except ClientError as err:
        LOGGER.error("Failed S3 GET for: %s", s3_uri)
        LOGGER.error(err)

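# A hedged usage sketch for the async get_s3_content; it assumes
# aiobotocore >= 2.x for the session import, and the s3 URI is a
# hypothetical placeholder.
async def example_get_s3_content_async() -> None:
    from aiobotocore.session import get_session

    session = get_session()
    async with session.create_client("s3") as s3_client:
        text = await get_s3_content(
            "s3://hypothetical-bucket/notes.txt", s3_client=s3_client
        )
        # get_s3_content returns decoded utf-8 text on success, else None
        print(text)
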
def test_yaml_io(geojson_feature_collection, aws_s3_client, s3_bucket, mocker):
    # Since JSON is a subset of YAML, this should work for GeoJSON data
    assert_bucket_200(s3_bucket, aws_s3_client)
    spy_client = mocker.spy(boto3, "client")
    spy_resource = mocker.spy(boto3, "resource")
    s3_uri = S3URI(f"s3://{s3_bucket}/tmp.yaml")
    result = yaml_s3_dump(geojson_feature_collection, s3_uri.s3_uri)
    assert result == s3_uri.s3_uri
    # the s3 client is used once to upload the s3 object data
    assert spy_client.call_count == 1
    assert spy_resource.call_count == 0
    assert_object_200(bucket=s3_bucket, key=s3_uri.key, s3_client=aws_s3_client)
    data = yaml_s3_load(s3_uri.s3_uri)
    assert data == geojson_feature_collection
    # the s3 client is used to read the s3 object data
    assert spy_client.call_count == 2
    assert spy_resource.call_count == 0

async def test_s3_aio_yaml_files(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    # Since JSON is a subset of YAML, using GeoJSON features should work
    s3_uris = [
        S3URI(f"s3://{aio_s3_bucket}/tmp_{i:03d}.yaml").s3_uri for i in range(10)
    ]
    for s3_uri in s3_uris:
        result = await yaml_s3_dump(
            geojson_features, s3_uri, s3_client=aio_aws_s3_client
        )
        assert result == s3_uri
    data = await s3_load_files(s3_uris, s3_client=aio_aws_s3_client)
    assert sorted(data.keys()) == s3_uris
    for s3_uri, s3_data in data.items():
        assert s3_data == geojson_features

def get_s3_content(s3_uri: str, s3_client: BaseClient = None):
    """
    Read an s3 URI

    :param s3_uri: a fully qualified S3 URI for the s3 object to read
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the data from the s3 object
    """
    if s3_client is None:
        s3_client = s3_io_client()
    try:
        s3_uri = S3URI(s3_uri)
        LOGGER.info("Read s3-uri: %s", s3_uri.s3_uri)
        content_object = s3_client.get_object(Bucket=s3_uri.bucket, Key=s3_uri.key)
        file_content = content_object["Body"].read().decode("utf-8")
        return file_content
    except ClientError as err:
        LOGGER.error("Failed S3 GET for: %s", s3_uri)
        LOGGER.error(err)

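# A minimal usage sketch for the sync get_s3_content; the s3 URI is a
# hypothetical placeholder, and the default client comes from s3_io_client().
def example_get_s3_content() -> None:
    text = get_s3_content("s3://hypothetical-bucket/notes.txt")
    if text is not None:
        print(len(text))
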
def yaml_s3_dump(
    yaml_data: Any, s3_uri: str, s3_client: BaseClient = None
) -> Optional[str]:
    """
    Write YAML to an s3 URI

    :param yaml_data: an object to yaml.dump
    :param s3_uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the s3 URI on success
    """
    s3_uri = S3URI(s3_uri)
    success = False
    tmp_file = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as o_file:
            tmp_file = o_file.name
            with open(o_file.name, "w") as fd:
                yaml.safe_dump(yaml_data, fd)
            o_file.flush()
        s3_obj = put_s3_content(data_file=tmp_file, s3_uri=str(s3_uri), s3_client=s3_client)
        if s3_obj:
            success = True
    finally:
        if tmp_file:
            os.unlink(tmp_file)
    if success:
        LOGGER.info("Saved S3URI: %s", str(s3_uri))
        return str(s3_uri)
    else:
        LOGGER.error("Failed to save S3URI: %s", str(s3_uri))

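# A hedged round-trip sketch for the sync yaml_s3_dump; the bucket is
# hypothetical, and yaml_s3_load is the matching reader exercised in
# test_yaml_io above.
def example_yaml_s3_dump() -> None:
    data = {"name": "example", "values": [1, 2, 3]}
    s3_uri = yaml_s3_dump(data, "s3://hypothetical-bucket/tmp.yaml")
    if s3_uri:
        assert yaml_s3_load(s3_uri) == data
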
async def yaml_s3_dump(
    yaml_data: Any, s3_uri: str, s3_client: AioBaseClient
) -> Optional[str]:
    """
    Write YAML to an s3 URI

    :param yaml_data: an object to yaml.dump
    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the s3 URI on success
    """
    s3_uri = S3URI(s3_uri)
    success = False
    tmp_file = None
    try:
        async with NamedTemporaryFile(delete=False) as o_file:
            tmp_file = o_file.name
            async with aiofiles.open(o_file.name, "w") as fd:
                dumps = yaml.dump(yaml_data)
                await fd.write(dumps)
        s3_obj = await put_s3_content(
            data_file=tmp_file, s3_uri=str(s3_uri), s3_client=s3_client
        )
        if s3_obj:
            success = True
    finally:
        if tmp_file:
            os.unlink(tmp_file)
    if success:
        LOGGER.info("Saved S3URI: %s", str(s3_uri))
        return str(s3_uri)
    else:
        LOGGER.error("Failed to save S3URI: %s", str(s3_uri))

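# A hedged usage sketch for the async yaml_s3_dump; it assumes
# aiobotocore >= 2.x for the session import, and the bucket is hypothetical.
async def example_yaml_s3_dump_async() -> None:
    from aiobotocore.session import get_session

    session = get_session()
    async with session.create_client("s3") as s3_client:
        result = await yaml_s3_dump(
            {"name": "example"},
            "s3://hypothetical-bucket/tmp.yaml",
            s3_client=s3_client,
        )
        print(result)
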
def test_s3_file_init_fails_with_invalid_bucket():
    with pytest.raises(ValueError) as err:
        # the '_' character is not allowed
        S3URI.parse_s3_uri("s3://money_buckets/more_money")
    assert "The s3_uri is invalid" in err.value.args[0]

def s3_uri(s3_uri_str) -> S3URI:
    return S3URI.parse_s3_uri(s3_uri_str)

def test_s3_objects(s3_bucket_name, s3_temp_objects):
    s3_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    s3_objects = list(s3_uri.s3_objects())
    s3_keys = sorted([obj.key for obj in s3_objects])
    s3_temp_keys = sorted([obj.key for obj in s3_temp_objects])
    assert s3_keys == s3_temp_keys

def test_s3_uri_has_no_schema():
    with pytest.raises(ValueError) as err:
        S3URI.parse_s3_uri("bucket/key")
    assert "The s3_uri is invalid" in err.value.args[0]

def test_s3_uri_has_no_paths():
    with pytest.raises(ValueError) as err:
        S3URI.parse_s3_uri("s3://")
    assert "The s3_uri is invalid" in err.value.args[0]

def test_s3_uri_as_str(s3_uri_str):
    s3_uri = S3URI.parse_s3_uri(s3_uri_str)
    assert isinstance(s3_uri, S3URI)
    assert str(s3_uri) == s3_uri_str