def test_load_dict(self):
    """Loading a JSON dict resource via a transform func and via a BaseModel."""
    payload = {"key1": "foo", "key2": "bar"}
    resource = S3Resource(
        "filename.ext",
        content_type="application/json",
        prefix="prefix/",
        bucketname="bucketname",
        protocol="protocol://",
        stream=S3Stream(io.StringIO(json.dumps(payload))),
    )

    # transform func without unpacking: receives the whole dict at once
    augmented = resource.load(
        lambda content: dict(content, extra=True), unpack=False)
    self.assertDictEqual(
        augmented, {"key1": "foo", "key2": "bar", "extra": True})

    # a pydantic BaseModel: the dict content is unpacked into the constructor
    class DummyClass(BaseModel):
        """dummy class"""
        key1: str
        key2: str

    dummy = resource.load(DummyClass)
    self.assertIsInstance(dummy, DummyClass)
    self.assertDictEqual(dummy.dict(), payload)
def test_save_str_stream(self):
    """Saving a text-backed stream uploads the content to s3 as bytes."""
    payload = {"key1": "foo", "key2": "bar"}
    stream = S3Stream(io.StringIO(json.dumps(payload)))

    # stub out the boto3 upload call
    client = boto3.client("s3")
    client.upload_fileobj = MagicMock(return_value={"msg": "boto3 response"})

    resource = S3Resource(
        "filename.ext",
        content_type="application/json",
        prefix="prefix/",
        bucketname="bucketname",
        protocol="protocol://",
        stream=stream,
        s3client=client,
        Metadata={"tag": "metadata"},
    )
    resource.save()

    # the stream handed to boto3 must yield bytes that decode to the payload
    (uploaded, _, _), _ = client.upload_fileobj.call_args
    raw = uploaded.read()
    self.assertTrue(isinstance(raw, bytes))
    self.assertDictEqual(payload, json.loads(raw.decode("utf-8")))
def test_basic_ok(self):
    """S3Resource exposes key, uri, content type, stream and extra args."""
    payload = {"key1": "foo", "key2": "bar"}
    serialized = json.dumps(payload)
    stream = S3Stream(io.StringIO(serialized))
    resource = S3Resource(
        "filename.ext",
        content_type="application/json",
        prefix="prefix/",
        bucketname="bucketname",
        protocol="protocol://",
        stream=stream,
        Metadata={"tag": "metadata"},
    )

    # derived attributes are assembled from prefix/bucket/protocol
    self.assertEqual(resource.key, "prefix/filename.ext")
    self.assertEqual(resource.uri,
                     "protocol://bucketname/prefix/filename.ext")
    self.assertEqual(resource.content_type, "application/json")
    self.assertEqual(resource.stream, stream)
    self.assertDictEqual(resource.extra_args,
                         {"Metadata": {"tag": "metadata"}})
    # read returns the raw string; load parses the json
    self.assertEqual(resource.read(), serialized)
    self.assertDictEqual(resource.load(), payload)
def test_save_bin_stream(self):
    """Saving a bytes-backed stream calls boto3 with the expected arguments."""
    payload = {"key1": "foo", "key2": "bar"}
    stream = S3Stream(io.BytesIO(json.dumps(payload).encode("utf-8")))

    # stub out the boto3 upload call
    client = boto3.client("s3")
    client.upload_fileobj = MagicMock(return_value={"msg": "boto3 response"})

    resource = S3Resource(
        "filename.ext",
        content_type="application/json",
        prefix="prefix/",
        bucketname="bucketname",
        protocol="protocol://",
        stream=stream,
        s3client=client,
        Metadata={"tag": "metadata"},
    )
    resource.save()

    # upload must target the right bucket/key and forward the extra args
    client.upload_fileobj.assert_called_with(
        stream,
        "bucketname",
        "prefix/filename.ext",
        ExtraArgs={
            "ContentType": "application/json",
            "Metadata": {"tag": "metadata"},
        },
    )
    # the boto3 response is stashed on the resource
    self.assertDictEqual(resource.last_resp, {"msg": "boto3 response"})
def __init__(self, filename: str, content_type: str = "", bucketname: str = "", prefix: str = "", protocol: str = "s3a://", stream: S3Stream[StringOrBytes] = None, s3client: boto3.client = None, stats: dict = None, **kwargs): """ Creates a new instance of S3Resource, which will use `boto3.s3.transfer.S3Transfer` under the hood to download/upload the s3 resource. See https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.S3Transfer Args: filename (str): filename of the object. content_type (str, optional): mime type of the object. Defaults to "". bucketname (str, optional): name of the bucket the obj is or should be. Defaults to "". prefix (str, optional): prefix to be added to the filename to get the s3 object key. Defaults to "application/octet-stream". protocol (str, optional): s3 client protocol. Defaults to "s3a://". stream (S3Stream[StringOrBytes], optional): data stream. Defaults to None. s3_client (boto3.client, optional): s3 client to use to retrieve resource. Defaults to None. Metadata (dict, optional): metadata for the object. Defaults to None. **kwargs: Any additional args to pass to `boto3.s3.transfer.S3Transfer` function. """ # random name if filename is not provided filename = filename or uuid4().hex dirname = os.path.dirname(filename) if dirname: filename = filename[len(dirname) + 1:] prefix = os.path.join(prefix, dirname) + os.path.sep if stream: if not isinstance(stream, S3Stream): stream = S3Stream.from_any(stream, content_type) if content_type: stream.content_type = content_type self.filename = filename self._content_type = content_type self.bucketname = bucketname self.prefix = prefix self.protocol = protocol self._stream: Optional[S3Stream[StringOrBytes]] = stream self.extra_args = kwargs self.s3client = s3client self.last_resp = None self.stats = stats
def test_pandas_dict(self):
    """DataFrames serialize to json using the requested orient."""
    frame = pd.DataFrame(data=[{"name": "a"}, {"name": "b"}])
    stream = S3Stream.from_any(frame, output_as="json", orient="records")
    self.assertEqual(stream.content_type, "application/json")
    # records orient yields one object per row
    self.assertEqual(
        json.loads(stream.read()),
        [{"name": "a"}, {"name": "b"}],
    )
def create_resource(self,
                    filename: str,
                    content_type: str = "",
                    obj: Any = None,
                    protocol: str = "s3a://",
                    metadata: Dict[str, str] = None,
                    pandas_kwargs: dict = None,
                    **kwargs) -> S3Resource:
    """
    Creates a new instance of S3Resource binds to the current bucket.

    Example::

        # create S3Resource in bucket to read in
        foo = prj_bucket.create_resource("foo.json", "application/json")
        # read "s3a://some_bucket/prj-a/foo.json" and load as a dict (or list)
        foo_dict = foo.load()

        # create S3Resource in bucket and save to
        # "s3a://some_bucket/prj-a/foo.json"
        prj_bucket.create_resource("foo.json", obj={"foo": "bar"}).save()

    Args:
        filename (str): name of the resource.
        content_type (str, optional): mime type. Defaults to
            "application/octet-stream" when it cannot be inferred.
        obj (Any, optional): python object to convert into a resource.
            Defaults to None.
        protocol (str, optional): protocol. Defaults to "s3a://".
        metadata (dict, optional): metadata for the object. Defaults to
            None.
        pandas_kwargs: Any additional args to pass to `pandas`.
        **kwargs: Any additional args to pass to `S3Resource`.

    Returns:
        S3Resource: a S3Resource related to the active S3Bucket.
    """
    # convert the provided python object (if any) into a data stream
    stream = (S3Stream.from_any(obj, content_type, **(pandas_kwargs or {}))
              if obj is not None else None)
    # fall back to the content type inferred while building the stream
    if not content_type and stream:
        content_type = stream.content_type
    return S3Resource(filename=filename,
                      prefix=self.prefix,
                      bucketname=self.name,
                      protocol=protocol,
                      content_type=content_type or "application/octet-stream",
                      stream=stream,
                      s3client=self._s3client,
                      Metadata=metadata or {},
                      **kwargs)
def test_model(self):
    """Pydantic models serialize to json regardless of the hinted mime type."""

    class Model(BaseModel):
        """dummy model"""
        name: str
        value: int

    payload = Model(name="foo", value=10)
    stream = S3Stream.from_any(payload, content_type="text/plain")
    # pydantic models are always written out as json
    self.assertEqual(stream.content_type, "application/json")
    self.assertDictEqual(json.loads(stream.read()), payload.dict())
def test_pickle(self):
    """Objects with no dedicated serializer fall back to joblib pickling."""

    class Model:
        """dummy class"""

        def __init__(self, name: str, value: int):
            self.name = name
            self.value = value

    # Patch joblib.dump and restore it afterwards: the original code left
    # the MagicMock in place, leaking the mock into every later test that
    # runs in the same process.
    original_dump = joblib.dump
    joblib.dump = MagicMock()
    try:
        data = Model(name="foo", value=10)
        stream = S3Stream.from_any(data,
                                   content_type="application/octet-stream")
        self.assertEqual(stream.content_type, "application/octet-stream")
        joblib.dump.assert_called_once()
    finally:
        joblib.dump = original_dump
def test_load_list(self):
    """Loading a JSON list resource with and without argument unpacking."""
    items = ["a", "b", "c"]
    resource = S3Resource(
        "filename.ext",
        content_type="application/json",
        stream=S3Stream(io.StringIO(json.dumps(items))),
    )

    # without unpacking, the transform func receives the whole list
    appended = resource.load(lambda content: content + ["d"], unpack=False)
    self.assertListEqual(appended, ["a", "b", "c", "d"])

    # with unpacking, each element becomes a positional argument
    joined = resource.load(lambda a, b, c: "%s:%s:%s" % (a, b, c),
                           unpack=True)
    self.assertEqual(joined, "a:b:c")
def stream(self) -> S3Stream[StringOrBytes]:
    """data stream for the resource."""
    # return the cached/provided stream if one exists
    if self._stream:
        return self._stream

    if not self.bucketname:
        raise RuntimeError("S3Resource does not have a stream.")

    # lazily download the object from s3 into an in-memory buffer
    buffer = io.BytesIO()
    client = self.s3client or boto3.client("s3")
    self.last_resp = client.download_fileobj(self.bucketname,
                                             self.key,
                                             buffer,
                                             ExtraArgs=self.extra_args)
    buffer.seek(0)  # rewind so reads start from the beginning
    self._stream = S3Stream(buffer, self._content_type)
    # an explicit content type overrides the inferred mime type
    if self._content_type:
        self._stream.content_type = self._content_type
    return self._stream
def test_full_path(self):
    """A filename with a directory part folds that directory into the prefix."""
    payload = {"key1": "foo", "key2": "bar"}
    resource = S3Resource(
        "subprefix/filename.ext",
        content_type="application/json",
        prefix="prefix/",
        bucketname="bucketname",
        protocol="protocol://",
        stream=S3Stream(io.StringIO(json.dumps(payload))),
    )
    # the directory portion migrates from filename to prefix
    self.assertEqual(resource.filename, "filename.ext")
    self.assertEqual(resource.prefix, "prefix/subprefix/")
    self.assertEqual(resource.key, "prefix/subprefix/filename.ext")
    self.assertEqual(resource.uri,
                     "protocol://bucketname/prefix/subprefix/filename.ext")
def test_pandas_csv(self):
    """DataFrames serialize to csv when output_as requests it."""
    frame = pd.DataFrame(data=[{"name": "a"}, {"name": "b"}])
    stream = S3Stream.from_any(frame, output_as="csv", index=False)
    self.assertEqual(stream.content_type, "application/csv")
    # index=False drops the row index from the output
    self.assertEqual(stream.read(), "name\na\nb\n")
def test_dict(self):
    """Dicts serialize to json even when another content type is hinted."""
    payload = {"foo": "bar"}
    stream = S3Stream.from_any(payload, content_type="text/plain")
    # dict input forces a json content type
    self.assertEqual(stream.content_type, "application/json")
    self.assertDictEqual(json.loads(stream.read()), payload)
def test_str(self):
    """Plain strings keep the provided content type and round-trip as-is."""
    text = "foo bar"
    stream = S3Stream.from_any(text, content_type="text/plain")
    self.assertEqual(stream.content_type, "text/plain")
    self.assertEqual(stream.read(), text)