Ejemplo n.º 1
0
def test_s3_uri_ordering():
    """Sorting S3URI objects orders digit keys before letter keys."""
    uri_1, uri_9, uri_a, uri_b = (
        S3URI.parse_s3_uri(f"s3://s3-bucket/path/{c}") for c in "19ab"
    )
    shuffled = [uri_9, uri_b, uri_1, uri_a]
    assert sorted(shuffled) == [uri_1, uri_9, uri_a, uri_b]
Ejemplo n.º 2
0
def test_s3_objects_glob(s3_bucket_name, s3_temp_objects):
    """Globbing for **/*.tif returns exactly the temp objects with .tif keys."""
    bucket_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    matched = sorted(
        obj.key for obj in bucket_uri.s3_objects(glob_pattern="**/*.tif")
    )
    expected = sorted(
        obj.key for obj in s3_temp_objects if obj.key.endswith(".tif")
    )
    assert matched == expected
Ejemplo n.º 3
0
def s3_file_info(s3_uri: Union[S3URI, str],
                 s3_client: BaseClient = None) -> S3Info:
    """
    Issue an S3 HEAD request and collect the object metadata.

    :param s3_uri: a fully qualified S3 URI (S3URI or str) for the object
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: an S3Info with last-modified and size populated on success;
        on failure the S3Info carries no HEAD data
    """
    if s3_client is None:
        s3_client = s3_io_client()

    if isinstance(s3_uri, str):
        s3_uri = S3URI(s3_uri)

    info = S3Info(s3_uri=s3_uri)
    try:
        head = s3_client.head_object(Bucket=s3_uri.bucket, Key=s3_uri.key)
    except ClientError as err:
        LOGGER.debug("Failed S3URI info: %s", s3_uri)
        LOGGER.debug(err)
        return info
    if response_success(head):
        # LastModified arrives from botocore as a datetime.datetime
        info.last_modified = head["LastModified"]
        info.s3_size = int(head["ContentLength"])
        LOGGER.debug("Success S3URI info: %s", s3_uri)
    return info
Ejemplo n.º 4
0
async def put_s3_content(
    data_file: str, s3_uri: str, s3_client: AioBaseClient
) -> Optional[str]:
    """
    Upload a local file to an s3 object.

    :param data_file: path of the local data file to upload
    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the s3 URI on success, None otherwise
    """
    target = S3URI(s3_uri)
    try:
        async with aiofiles.open(data_file, "rb") as source:
            body = await source.read()
            reply = await s3_client.put_object(
                Bucket=target.bucket, Key=target.key, Body=body
            )
            if response_success(reply):
                # confirm the object landed before reporting success
                waiter = s3_client.get_waiter("object_exists")
                await waiter.wait(Bucket=target.bucket, Key=target.key)
                return str(target)
    except ClientError as err:
        LOGGER.error("Failed S3 PUT to: %s", target)
        LOGGER.error(err)
Ejemplo n.º 5
0
def put_s3_content(data_file: str,
                   s3_uri: str,
                   s3_client: BaseClient = None) -> Optional[str]:
    """
    Upload a local file to an s3 URI.

    :param data_file: path of the local data file to upload
    :param s3_uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the s3 URI on success, None otherwise
    """
    if s3_client is None:
        s3_client = s3_io_client()

    target = S3URI(s3_uri)
    try:
        with open(data_file, "rb") as stream:
            reply = s3_client.put_object(Bucket=target.bucket,
                                         Key=target.key,
                                         Body=stream)
            if response_success(reply):
                # boto3 waiter confirms the object is visible before returning
                s3_client.get_waiter("object_exists").wait(
                    Bucket=target.bucket, Key=target.key)
                return str(target)
    except ClientError as err:
        LOGGER.error("Failed S3 PUT to: %s", target)
        LOGGER.error(err)
Ejemplo n.º 6
0
def test_s3_objects_prefix(s3_bucket_name, s3_temp_objects, s3_temp_dir):
    """Listing by the temp-dir top-level prefix yields every temp object key."""
    bucket_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    prefix = s3_temp_dir.split("/")[0]
    listed = sorted(obj.key for obj in bucket_uri.s3_objects(prefix=prefix))
    expected = sorted(obj.key for obj in s3_temp_objects)
    assert listed == expected
Ejemplo n.º 7
0
def test_s3_uri_has_no_key(s3_bucket_name):
    """A bucket-only URI parses with empty key, key_path and key_file."""
    s3_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    assert isinstance(s3_uri, S3URI)
    assert s3_uri.bucket == s3_bucket_name
    for attr in ("key", "key_path", "key_file"):
        assert getattr(s3_uri, attr) == ""
Ejemplo n.º 8
0
def geojsons_s3_dump(geojson_features: List[Dict],
                     s3uri: str,
                     s3_client: BaseClient = None) -> Optional[str]:
    """
    Write GeoJSON Text Sequence files to an s3 URI

    [GeoJSON Text Sequences](https://tools.ietf.org/html/rfc8142) are
    lines of geojson features that are designed for streaming
    operations on large datasets. These files can be loaded by
    geopandas, using fiona `driver="GeoJSONSeq"`, which
    can be auto-detected.  For example:

    .. code-block::

        import geopandas as gpd

        s3_uri = "s3://your-bucket/prefix/input.geojsons"
        gdf = gpd.read_file(s3_uri)

    :param geojson_features: a list of geojson features; from any
        feature collection, this is geojson_collection["features"]
    :param s3uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the s3 URI on success; None on failure
    """
    if s3_client is None:
        s3_client = s3_io_client()

    s3_uri = S3URI(s3uri)

    success = False
    tmp_file = None
    try:
        # Write one JSON document per line (GeoJSONSeq) to a temp file.
        # Open it once in text mode: the previous version opened the same
        # path twice (an unused binary handle plus a second text handle)
        # and then flushed the unused binary handle.
        with tempfile.NamedTemporaryFile("w", delete=False) as o_file:
            tmp_file = o_file.name
            for feature in geojson_features:
                json.dump(feature, o_file)
                o_file.write("\n")

        s3_obj = put_s3_content(data_file=tmp_file,
                                s3_uri=str(s3_uri),
                                s3_client=s3_client)
        success = bool(s3_obj)

    finally:
        if tmp_file:
            os.unlink(tmp_file)

    if success:
        LOGGER.info("Saved GeoJSONSeq to %s", str(s3_uri))
        return str(s3_uri)
    LOGGER.error("Failed to save GeoJSONSeq to %s", s3_uri)
Ejemplo n.º 9
0
def test_s3_objects_glob_hundreds(s3_bucket_name, s3_temp_1000s_objects):
    """Glob listing is not capped by the default MaxKeys page size."""
    # hundreds of keys should exceed one list-objects page, exercising
    # the unlimited pagination in S3URI.s3_objects
    bucket_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    matched = sorted(
        obj.key for obj in bucket_uri.s3_objects(glob_pattern="**/*.tif")
    )
    expected = sorted(
        obj.key for obj in s3_temp_1000s_objects if obj.key.endswith(".tif")
    )
    assert matched == expected
Ejemplo n.º 10
0
def test_s3_exists_for_success(s3_uri_object, mocker):
    """s3_exists reports True via one boto3.resource call and no client calls."""
    # the s3_uri_object fixture refers to an object that exists
    s3_uri = S3URI(s3_uri_object.s3_uri)
    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    assert s3_uri.s3_exists() is True
    # existence is checked through the s3 resource, not the s3 client
    assert boto3.client.call_count == 0
    assert boto3.resource.call_count == 1
Ejemplo n.º 11
0
async def test_s3_aio_files_info(
    aio_aws_s3_client, aio_s3_object_uri, aio_s3_object_text
):
    """s3_files_info returns S3Info records with size and mtime populated."""
    uri: str = aio_s3_object_uri
    text: str = aio_s3_object_text
    infos = await s3_files_info([uri], s3_client=aio_aws_s3_client)
    for info in infos:
        assert isinstance(info, S3Info)
        assert info.s3_uri == S3URI(uri)
        assert info.s3_size == len(text)
        assert isinstance(info.last_modified, datetime)
Ejemplo n.º 12
0
async def test_s3_aio_geojsons(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    """GeoJSONSeq data round-trips through an async dump and load."""
    target = S3URI(f"s3://{aio_s3_bucket}/tmp.geojsons")
    dumped = await geojsons_s3_dump(
        geojson_features, str(target), s3_client=aio_aws_s3_client
    )
    assert dumped == target.s3_uri
    loaded = await geojsons_s3_load(str(target), s3_client=aio_aws_s3_client)
    assert loaded == geojson_features
Ejemplo n.º 13
0
def test_s3_file_derivatives(s3_temp_file, s3_temp_objects):
    """Derivatives share the file stem but never include the file itself."""
    s3_uri = S3URI.parse_s3_uri(s3_temp_file.s3_uri)
    stem = Path(s3_uri.key_file).stem
    derivative_keys = {obj.key for obj in s3_uri.s3_derivatives()}
    assert derivative_keys
    for key in derivative_keys:
        assert key != s3_uri.key
        assert stem in Path(key).stem
    # every derivative key must come from the temp objects
    # (derivatives live in key-path/**/stem*.*)
    temp_keys = [obj.key for obj in s3_temp_objects]
    assert derivative_keys == set(temp_keys).intersection(derivative_keys)
Ejemplo n.º 14
0
def test_s3_object_summary_for_success(s3_uri_object, mocker):
    """s3_object_summary uses one boto3.resource call and returns an ObjectSummary."""
    # the s3_uri_object fixture refers to an object that exists
    s3_uri = S3URI(s3_uri_object.s3_uri)

    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    summary = s3_uri.s3_object_summary()
    # the lookup goes through the s3 resource, not the s3 client
    assert boto3.client.call_count == 0
    assert boto3.resource.call_count == 1

    assert summary.__class__.__name__ == "s3.ObjectSummary"
    assert summary.bucket_name == s3_uri.bucket
    assert summary.key == s3_uri.key
    assert isinstance(summary.last_modified, datetime.datetime)
Ejemplo n.º 15
0
def test_geojsons_io(geojson_features, aws_s3_client, s3_bucket, mocker):
    """GeoJSONSeq dump and load each use exactly one boto3 client call."""
    assert_bucket_200(s3_bucket, aws_s3_client)
    spy_client = mocker.spy(boto3, "client")
    spy_resource = mocker.spy(boto3, "resource")
    target = S3URI(f"s3://{s3_bucket}/tmp.geojsons")

    dumped = geojsons_s3_dump(geojson_features, target.s3_uri)
    assert dumped == target.s3_uri
    # the upload used the s3 client exactly once, never the resource
    assert (spy_client.call_count, spy_resource.call_count) == (1, 0)
    assert_object_200(bucket=s3_bucket,
                      key=target.key,
                      s3_client=aws_s3_client)

    loaded = geojsons_s3_load(target.s3_uri)
    assert loaded == geojson_features
    # the download added one more client call
    assert (spy_client.call_count, spy_resource.call_count) == (2, 0)
Ejemplo n.º 16
0
def test_s3_head_request_for_success(s3_uri_object, mocker):
    """s3_head_request issues one client call and returns HEAD metadata."""
    # the s3_uri_object fixture refers to an object that exists
    s3_uri = S3URI(s3_uri_object.s3_uri)

    mocker.spy(boto3, "client")
    mocker.spy(boto3, "resource")
    s3_head = s3_uri.s3_head_request()
    # the HEAD request goes through one boto3.client call, no resource calls
    assert boto3.client.call_count == 1
    assert boto3.resource.call_count == 0
    assert isinstance(s3_head, dict)
    expected_keys = [
        "ResponseMetadata",
        "LastModified",
        "ContentLength",
        "ETag",
        "Metadata",
    ]
    assert list(s3_head.keys()) == expected_keys
Ejemplo n.º 17
0
async def test_s3_aio_geojsons_files(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    """Multiple GeoJSONSeq dumps round-trip through s3_load_files."""
    uris = [
        S3URI(f"s3://{aio_s3_bucket}/tmp_{i:03d}.geojsons").s3_uri
        for i in range(10)
    ]
    for uri in uris:
        dumped = await geojsons_s3_dump(
            geojson_features, uri, s3_client=aio_aws_s3_client
        )
        assert dumped == uri

    loaded = await s3_load_files(uris, s3_client=aio_aws_s3_client)
    assert sorted(loaded.keys()) == uris
    for payload in loaded.values():
        assert payload == geojson_features
Ejemplo n.º 18
0
async def test_s3_aio_file_info(
    aio_aws_s3_client, aio_s3_object_uri, aio_s3_object_text
):
    """s3_file_info fills an S3Info and its dict serialization."""
    uri: str = aio_s3_object_uri
    text: str = aio_s3_object_text
    info = await s3_file_info(uri, s3_client=aio_aws_s3_client)
    assert isinstance(info, S3Info)
    assert info.s3_uri == S3URI(uri)
    assert info.s3_size == len(text)
    assert isinstance(info.last_modified, datetime)

    info_dict = info.dict
    assert isinstance(info_dict, Dict)
    assert info_dict["s3_uri"] == uri
    assert info_dict["s3_size"] == len(text)
    # the dict form serializes last_modified as an iso8601 string
    assert isinstance(info_dict["last_modified"], str)
    parsed = datetime.fromisoformat(info_dict["last_modified"])
    assert isinstance(parsed, datetime)
Ejemplo n.º 19
0
async def get_s3_content(s3_uri: str, s3_client: AioBaseClient):
    """
    Read an s3 object and decode its body as UTF-8 text.

    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the decoded object data, or None on a client error
    """
    try:
        s3_uri = S3URI(s3_uri)
        LOGGER.info("Read S3URI: %s", s3_uri.s3_uri)
        response = await s3_client.get_object(
            Bucket=s3_uri.bucket, Key=s3_uri.key
        )
        payload = await response["Body"].read()
        return payload.decode("utf-8")
    except ClientError as err:
        LOGGER.error("Failed S3 GET for: %s", s3_uri)
        LOGGER.error(err)
Ejemplo n.º 20
0
def test_yaml_io(geojson_feature_collection, aws_s3_client, s3_bucket, mocker):
    """YAML dump and load each use exactly one boto3 client call."""
    # JSON is a subset of YAML, so GeoJSON data round-trips through YAML
    assert_bucket_200(s3_bucket, aws_s3_client)
    spy_client = mocker.spy(boto3, "client")
    spy_resource = mocker.spy(boto3, "resource")
    target = S3URI(f"s3://{s3_bucket}/tmp.yaml")

    dumped = yaml_s3_dump(geojson_feature_collection, target.s3_uri)
    assert dumped == target.s3_uri
    # the upload used the s3 client exactly once, never the resource
    assert (spy_client.call_count, spy_resource.call_count) == (1, 0)
    assert_object_200(bucket=s3_bucket,
                      key=target.key,
                      s3_client=aws_s3_client)

    loaded = yaml_s3_load(target.s3_uri)
    assert loaded == geojson_feature_collection
    # the download added one more client call
    assert (spy_client.call_count, spy_resource.call_count) == (2, 0)
Ejemplo n.º 21
0
async def test_s3_aio_yaml_files(
    geojson_features,
    aio_aws_s3_client,
    aio_s3_bucket,
):
    """Multiple YAML dumps round-trip through s3_load_files."""
    # JSON is a subset of YAML, so GeoJSON features work as YAML payloads
    uris = [
        S3URI(f"s3://{aio_s3_bucket}/tmp_{i:03d}.yaml").s3_uri
        for i in range(10)
    ]
    for uri in uris:
        dumped = await yaml_s3_dump(
            geojson_features, uri, s3_client=aio_aws_s3_client
        )
        assert dumped == uri

    loaded = await s3_load_files(uris, s3_client=aio_aws_s3_client)
    assert sorted(loaded.keys()) == uris
    for payload in loaded.values():
        assert payload == geojson_features
Ejemplo n.º 22
0
def get_s3_content(s3_uri: str, s3_client: BaseClient = None):
    """
    Read an s3 URI and decode the object body as UTF-8 text.

    :param s3_uri: a fully qualified S3 URI for the s3 object to read
    :param s3_client: an optional botocore.client.BaseClient for s3
    :return: the decoded object data, or None on a client error
    """
    if s3_client is None:
        s3_client = s3_io_client()
    try:
        s3_uri = S3URI(s3_uri)
        LOGGER.info("Read s3-uri: %s", s3_uri.s3_uri)
        content_object = s3_client.get_object(Bucket=s3_uri.bucket,
                                              Key=s3_uri.key)
        file_content = content_object["Body"].read().decode("utf-8")
        return file_content
    except ClientError as err:
        # this is a GET; the message previously said "Failed S3 PUT to",
        # matching the async sibling's correct "Failed S3 GET for" wording
        LOGGER.error("Failed S3 GET for: %s", s3_uri)
        LOGGER.error(err)
Ejemplo n.º 23
0
def yaml_s3_dump(yaml_data: Any,
                 s3_uri: str,
                 s3_client: BaseClient = None) -> Optional[str]:
    """
    Write YAML to an s3 URI

    :param yaml_data: an object to yaml.dump
    :param s3_uri: a fully qualified S3 URI for the s3 object to write
    :param s3_client: an optional botocore.client.BaseClient for s3;
        passed through to put_s3_content
    :return: the s3 URI on success; None on failure
    """
    s3_uri = S3URI(s3_uri)

    success = False
    tmp_file = None
    try:
        # Dump the YAML straight to the temp file opened once in text
        # mode: the previous version opened the same path twice (an
        # unused binary handle plus a second text handle) and then
        # flushed the unused binary handle.
        with tempfile.NamedTemporaryFile("w", delete=False) as o_file:
            tmp_file = o_file.name
            yaml.safe_dump(yaml_data, o_file)

        s3_obj = put_s3_content(data_file=tmp_file,
                                s3_uri=str(s3_uri),
                                s3_client=s3_client)
        success = bool(s3_obj)

    finally:
        if tmp_file:
            os.unlink(tmp_file)

    if success:
        LOGGER.info("Saved S3URI: %s", str(s3_uri))
        return str(s3_uri)
    LOGGER.error("Failed to save S3URI: %s", str(s3_uri))
Ejemplo n.º 24
0
async def yaml_s3_dump(
    yaml_data: Any, s3_uri: str, s3_client: AioBaseClient
) -> Optional[str]:
    """
    Write YAML to an s3 URI

    :param yaml_data: an object to yaml.dump
    :param s3_uri: a fully qualified S3 URI for an s3 object
    :param s3_client: a required aiobotocore.client.AioBaseClient for s3
    :return: the s3 URI on success
    """
    target = S3URI(s3_uri)
    uploaded = False
    tmp_path = None
    try:
        # serialize to a named temp file, then upload that file
        async with NamedTemporaryFile(delete=False) as tmp:
            tmp_path = tmp.name
            async with aiofiles.open(tmp.name, "w") as writer:
                await writer.write(yaml.dump(yaml_data))

        s3_obj = await put_s3_content(
            data_file=tmp_path, s3_uri=str(target), s3_client=s3_client
        )
        uploaded = bool(s3_obj)

    finally:
        if tmp_path:
            os.unlink(tmp_path)

    if uploaded:
        LOGGER.info("Saved S3URI: %s", str(target))
        return str(target)
    else:
        LOGGER.error("Failed to save S3URI: %s", str(target))
Ejemplo n.º 25
0
def test_s3_file_init_fails_with_invalid_bucket():
    """Bucket names containing '_' are rejected by parse_s3_uri."""
    bad_uri = "s3://money_buckets/more_money"
    with pytest.raises(ValueError, match="The s3_uri is invalid"):
        S3URI.parse_s3_uri(bad_uri)
Ejemplo n.º 26
0
def s3_uri(s3_uri_str) -> S3URI:
    """Parse the s3_uri_str fixture into an S3URI."""
    parsed = S3URI.parse_s3_uri(s3_uri_str)
    return parsed
Ejemplo n.º 27
0
def test_s3_objects(s3_bucket_name, s3_temp_objects):
    """An unfiltered listing returns every temp object key."""
    bucket_uri = S3URI.parse_s3_uri(f"s3://{s3_bucket_name}")
    listed = sorted(obj.key for obj in bucket_uri.s3_objects())
    expected = sorted(obj.key for obj in s3_temp_objects)
    assert listed == expected
Ejemplo n.º 28
0
def test_s3_uri_has_no_schema():
    """A URI without the s3:// scheme must be rejected."""
    with pytest.raises(ValueError, match="The s3_uri is invalid"):
        S3URI.parse_s3_uri("bucket/key")
Ejemplo n.º 29
0
def test_s3_uri_has_no_paths():
    """A bare scheme with no bucket or key must be rejected."""
    with pytest.raises(ValueError, match="The s3_uri is invalid"):
        S3URI.parse_s3_uri("s3://")
Ejemplo n.º 30
0
def test_s3_uri_as_str(s3_uri_str):
    """str() round-trips a parsed S3URI back to its original string."""
    parsed = S3URI.parse_s3_uri(s3_uri_str)
    assert isinstance(parsed, S3URI)
    assert str(parsed) == s3_uri_str