def test_basics(path, glue_database, glue_table):
    args = {"table": glue_table, "path": "", "columns_types": {"col0": "bigint"}}

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.create_parquet_table(**args)

    # Configuring default database value
    wr.config.database = glue_database

    # Testing configured database
    wr.catalog.create_parquet_table(**args)

    # Testing configured s3 block size
    size = 1 * 2**20  # 1 MB
    wr.config.s3_block_size = size
    with open_s3_object(path, mode="wb") as s3obj:
        s3obj.write(b"foo")
    with open_s3_object(path, mode="rb") as s3obj:
        assert s3obj._s3_block_size == size

    # Resetting all configs
    wr.config.reset()

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Configuring default database value again
    wr.config.database = glue_database

    # Testing configured database again
    assert wr.catalog.does_table_exist(table=glue_table) is True

    # Resetting this specific config
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Exporting environment variable
    os.environ["WR_DATABASE"] = glue_database
    wr.config.reset("database")
    assert wr.catalog.does_table_exist(table=glue_table) is True
    del os.environ["WR_DATABASE"]
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    assert wr.config.to_pandas().shape == (len(wr._config._CONFIG_ARGS), 7)

def test_additional_kwargs(path, kms_key_id, s3_additional_kwargs, use_threads):
    if s3_additional_kwargs is not None and "SSEKMSKeyId" in s3_additional_kwargs:
        s3_additional_kwargs["SSEKMSKeyId"] = kms_key_id
    path = f"{path}0.txt"
    with open_s3_object(path, mode="w", s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads) as s3obj:
        s3obj.write("foo")
    with open_s3_object(
        path,
        mode="r",
        s3_block_size=10_000_000,
        s3_additional_kwargs=s3_additional_kwargs,
        use_threads=use_threads,
    ) as s3obj:
        assert s3obj.read() == "foo"

def test_botocore_config(path):
    original = botocore.client.ClientCreator.create_client

    # Default values for botocore.config.Config
    expected_max_retries_attempt = 5
    expected_connect_timeout = 10
    expected_max_pool_connections = 10
    expected_retry_mode = None

    def wrapper(self, **kwarg):
        assert kwarg["client_config"].retries["max_attempts"] == expected_max_retries_attempt
        assert kwarg["client_config"].connect_timeout == expected_connect_timeout
        assert kwarg["client_config"].max_pool_connections == expected_max_pool_connections
        assert kwarg["client_config"].retries.get("mode") == expected_retry_mode
        return original(self, **kwarg)

    # Check for default values
    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    # Update default config with environment variables
    expected_max_retries_attempt = 20
    expected_connect_timeout = 10
    expected_max_pool_connections = 10
    expected_retry_mode = "adaptive"
    os.environ["AWS_MAX_ATTEMPTS"] = str(expected_max_retries_attempt)
    os.environ["AWS_RETRY_MODE"] = expected_retry_mode

    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    del os.environ["AWS_MAX_ATTEMPTS"]
    del os.environ["AWS_RETRY_MODE"]

    # Update botocore.config.Config
    expected_max_retries_attempt = 30
    expected_connect_timeout = 40
    expected_max_pool_connections = 50
    expected_retry_mode = "legacy"
    botocore_config = botocore.config.Config(
        retries={"max_attempts": expected_max_retries_attempt, "mode": expected_retry_mode},
        connect_timeout=expected_connect_timeout,
        max_pool_connections=expected_max_pool_connections,
    )
    wr.config.botocore_config = botocore_config

    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    wr.config.reset()

def test_io_intense(path, use_threads):
    path = f"{path}0.txt"
    data = b"0" * 10_000_000 + b"1" * 10_000_000 + b"2" * 10_000_000

    with open_s3_object(path, mode="wb", use_threads=use_threads) as s3obj:
        s3obj.write(data)

    with open_s3_object(path, mode="rb", use_threads=use_threads) as s3obj:
        assert s3obj.read() == data

    bucket, key = wr._utils.parse_path(path)
    assert boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read() == data

def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> pa.Table:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    s3_block_size: int = 20_971_520 if columns else -1  # One shot for a full read, otherwise 20 MB (20 * 2**20)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=s3_block_size,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f,
            read_dictionary=categories,
            coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"],
        )
        if pq_file is None:
            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    boto3_session: boto3.Session = _utils.boto3_from_primitives(primitives=boto3_primitives)
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        num_row_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]", row_group + 1, num_row_groups)
        return pq_file.read_row_group(i=row_group, columns=columns, use_threads=False, use_pandas_metadata=False)

def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, str]]:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f, coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"]
        )
        if pq_file is None:
            return None
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null
        )[0]

def test_read_line(path, mode, block_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0\n11\n22222\n33333333333333\n44444444444444444444444444444444444444444444\n55555"
    expected = [
        "0\n",
        "11\n",
        "22222\n",
        "33333333333333\n",
        "44444444444444444444444444444444444444444444\n",
        "55555",
    ]
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode=mode, s3_block_size=block_size, newline="\n", use_threads=use_threads) as s3obj:
        for i, line in enumerate(s3obj):
            if mode == "r":
                assert line == expected[i]
            else:
                assert line == expected[i].encode("utf-8")
        s3obj.seek(0)
        lines = s3obj.readlines()
        if mode == "r":
            assert lines == expected
        else:
            assert [line.decode("utf-8") for line in lines] == expected
    if "b" in mode:
        assert s3obj._cache == b""

def _read_text_file(
    path: str,
    version_id: Optional[str],
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, _utils.Boto3PrimitivesType],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: Union[bool, int],
) -> pd.DataFrame:
    boto3_session = _utils.ensure_session(boto3_session)
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    try:
        with open_s3_object(
            path=path,
            version_id=version_id,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=-1,  # One shot download
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
        ) as f:
            df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            raise exceptions.NoFilesFound(f"No files Found on: {path}.")
        raise e
    return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)

def _new_writer(
    file_path: str,
    compression: Optional[str],
    schema: pa.Schema,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pyarrow.parquet.ParquetWriter]:
    writer: Optional[pyarrow.parquet.ParquetWriter] = None
    with open_s3_object(
        path=file_path,
        mode="wb",
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        try:
            writer = pyarrow.parquet.ParquetWriter(
                where=f,
                write_statistics=True,
                use_dictionary=True,
                coerce_timestamps="ms",
                compression="NONE" if compression is None else compression,
                flavor="spark",
                schema=schema,
            )
            yield writer
        finally:
            if writer is not None and writer.is_open is True:
                writer.close()

def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    s3_block_size: int = 20_971_520 if columns else -1  # One shot for a full read, otherwise 20 MB (20 * 2**20)
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=s3_block_size,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f, read_dictionary=categories
        )
        if pq_file is None:
            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
        with open_s3_object(
            path=path,
            mode=mode,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            encoding=encoding,
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
        ) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)

def test_read(path, use_threads, block_size, seq, length):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0123456789"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    fs = s3fs.S3FileSystem()
    with fs.open(path, "rb") as f:
        with open_s3_object(path, mode="rb", s3_block_size=block_size, use_threads=use_threads) as s3obj:
            for i in seq:
                s3obj.seek(i)
                f.seek(i)
                data = s3obj.read(length)
                assert data[0:1] == text[i].encode("utf-8")
                assert data == f.read(length)
                logger.debug(s3obj._cache)
                if block_size < 1:
                    assert len(s3obj._cache) == s3obj._size
                elif length > block_size:
                    assert block_size <= len(s3obj._cache) <= length
                else:
                    assert len(s3obj._cache) == block_size
        assert s3obj._cache == b""

def _to_text(
    file_format: str,
    df: pd.DataFrame,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    **pandas_kwargs: Any,
) -> List[str]:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}.{file_format}"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    encoding: Optional[str] = pandas_kwargs.get("encoding", None)
    with open_s3_object(
        path=file_path,
        mode="w",
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
        encoding=encoding,
        newline=None,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return [file_path]

def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    with open_s3_object(
        path=path,
        mode=mode,
        use_threads=use_threads,
        s3_block_size=-1,  # One shot download
        encoding=encoding,
        s3_additional_kwargs=s3_additional_kwargs,
        newline=newline,
        boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)

def test_cache_seek(path):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0" * 1_000_000 + "1" * 4
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode="rb", s3_block_size=1_000) as s3obj:
        s3obj.seek(1_000_000)
        assert s3obj.read(100).decode("utf-8") == "1" * 4
    assert s3obj._cache == b""

def test_cache(path, use_threads, block_size, text):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode="rb", s3_block_size=block_size, use_threads=use_threads) as s3obj:
        for i in range(len(text)):
            value = s3obj.read(1)
            assert value == text[i].encode("utf-8")
            assert len(s3obj._cache) in (block_size, block_size - 1, len(text))
    assert s3obj._cache == b""

def test_write_full(path, mode, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbewkjfbkjwebf"
    with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "wb":
            s3obj.write(text.encode("utf-8"))
        else:
            s3obj.write(text)
    assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == text.encode("utf-8")

def _read_parquet_metadata_file(
    path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
) -> Dict[str, str]:
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None
        )[0]

def test_write_chunked(path, mode, data_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    chunks = ["a", "jdae", "bdiebdkibaekdbekfbksbfk", "sebkf", "jebkfjbekjfbkjebfkebwkfbe", "f", "0" * data_size]
    expected = b"ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbef" + (b"0" * data_size)
    with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj:
        for chunk in chunks:
            if mode == "wb":
                s3obj.write(chunk.encode("utf-8"))
            else:
                s3obj.write(chunk)
    assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == expected

def test_read_full(path, mode, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "AHDG*AWY&GD*A&WGd*AWgd87AGWD*GA*G*g*AGˆˆ&ÂDTW&ˆˆD&ÂTW7ˆˆTAWˆˆDAW&ˆˆAWGDIUHWOD#N"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode=mode, s3_block_size=100, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "r":
            assert s3obj.read() == text
        else:
            assert s3obj.read() == text.encode("utf-8")
    if "b" in mode:
        assert s3obj._cache == b""

def test_read_chunked(path, mode, block_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0123456789"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode=mode, s3_block_size=block_size, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "r":
            for i in range(3):
                assert s3obj.read(1) == text[i]
        else:
            for i in range(3):
                assert s3obj.read(1) == text[i].encode("utf-8")
        assert len(s3obj._cache) <= block_size
    if "b" in mode:
        assert s3obj._cache == b""

def _count_row_groups(
    path: str,
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> int:
    _logger.debug("Counting row groups...")
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        n: int = cast(int, pq_file.num_row_groups)
        _logger.debug("Row groups count: %d", n)
        return n

def _to_text(
    file_format: str,
    df: pd.DataFrame,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    filename: Optional[str] = None,
    **pandas_kwargs: Any,
) -> List[str]:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        if filename is None:
            filename = uuid.uuid4().hex
        file_path: str = (
            f"{path_root}{filename}.{file_format}{_COMPRESSION_2_EXT.get(pandas_kwargs.get('compression'))}"
        )
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    mode, encoding, newline = _get_write_details(path=file_path, pandas_kwargs=pandas_kwargs)
    with open_s3_object(
        path=file_path,
        mode=mode,
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
        encoding=encoding,
        newline=newline,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, mode=mode, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return [file_path]

def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=134_217_728,  # 128 MB (128 * 2**20)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)

def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path, pandas_kwargs=pandas_kwargs)
    with open_s3_object(
        path=path,
        mode=mode,
        use_threads=use_threads,
        s3_block_size=134_217_728,  # 128 MB (128 * 2**20)
        encoding=encoding,
        s3_additional_kwargs=s3_additional_kwargs,
        newline=newline,
        boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    # Return the parsed DataFrame with any dataset partitions applied (matches the other _read_text_file variants)
    return _apply_partitions(df=df, dataset=dataset, path=path, path_root=path_root)

def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Detected at least 2 different schemas:\n"
                        f" - {last_path} -> {last_schema}\n"
                        f" - {path} -> {schema}"
                    )
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(
                        i=i, columns=columns, use_threads=use_threads, use_pandas_metadata=False
                    ),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # Flush any rows still accumulated in the last partial slice
    if next_slice is not None:
        yield next_slice

def download(
    path: str,
    local_file: Union[str, Any],
    version_id: Optional[str] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """Download a file from a received S3 path to a local file.

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    path : str
        S3 path (e.g. ``s3://bucket/key0``).
    local_file : Union[str, Any]
        A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``).
    version_id : Optional[str]
        Version id of the object.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm", "SSECustomerKey" and "RequestPayer"
        arguments will be considered.

    Returns
    -------
    None

    Examples
    --------
    Downloading a file using a path to local file

    >>> import awswrangler as wr
    >>> wr.s3.download(path='s3://bucket/key', local_file='./key')

    Downloading a file using a file-like object

    >>> import awswrangler as wr
    >>> with open(file='./key', mode='wb') as local_f:
    >>>     wr.s3.download(path='s3://bucket/key', local_file=local_f)

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    _logger.debug("path: %s", path)
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        version_id=version_id,
        s3_block_size=-1,  # One shot download
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=session,
    ) as s3_f:
        if isinstance(local_file, str):
            _logger.debug("Downloading local_file: %s", local_file)
            with open(file=local_file, mode="wb") as local_f:
                local_f.write(s3_f.read())
        else:
            _logger.debug("Downloading file-like object.")
            local_file.write(s3_f.read())

def to_excel(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: Union[bool, int] = True,
    **pandas_kwargs: Any,
) -> str:
    """Write EXCEL file on Amazon S3.

    Note
    ----
    This function accepts any Pandas's to_excel() argument.
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Note
    ----
    Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library
    might have to be installed first (e.g. xlrd).

    Note
    ----
    In case of `use_threads=True` the number of threads that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df : pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.xlsx).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receive None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_excel().
        You can NOT pass `pandas_kwargs` explicit, just add valid Pandas arguments
        in the function call and Wrangler will accept it.
        e.g. wr.s3.to_excel(df, path, na_rep="", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Returns
    -------
    str
        Written S3 path.

    Examples
    --------
    Writing EXCEL file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx')

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicit, just add valid "
            "Pandas arguments in the function call and Wrangler will accept it. "
            "e.g. wr.s3.to_excel(df, path, na_rep='', index=False)"
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    with open_s3_object(
        path=path,
        mode="wb",
        use_threads=use_threads,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=session,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        df.to_excel(f, **pandas_kwargs)
    return path

def test_basics(path, glue_database, glue_table, workgroup0, workgroup1):
    args = {"table": glue_table, "path": "", "columns_types": {"col0": "bigint"}}

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.create_parquet_table(**args)

    # Configuring default database value
    wr.config.database = glue_database

    # Testing configured database
    wr.catalog.create_parquet_table(**args)

    # Configuring default database with wrong value
    wr.config.database = "missing_database"
    with pytest.raises(boto3.client("glue").exceptions.EntityNotFoundException):
        wr.catalog.create_parquet_table(**args)

    # Overwriting configured database
    wr.catalog.create_parquet_table(database=glue_database, **args)

    # Testing configured s3 block size
    size = 1 * 2**20  # 1 MB
    wr.config.s3_block_size = size
    with open_s3_object(path, mode="wb") as s3obj:
        s3obj.write(b"foo")
    with open_s3_object(path, mode="rb") as s3obj:
        assert s3obj._s3_block_size == size

    # Resetting all configs
    wr.config.reset()

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Configuring default database value again
    wr.config.database = glue_database

    # Testing configured database again
    assert wr.catalog.does_table_exist(table=glue_table) is True

    # Resetting this specific config
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Exporting environment variable
    os.environ["WR_DATABASE"] = glue_database
    wr.config.reset("database")
    assert wr.catalog.does_table_exist(table=glue_table) is True
    del os.environ["WR_DATABASE"]
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    assert wr.config.to_pandas().shape == (len(wr._config._CONFIG_ARGS), 7)

    # Workgroup
    wr.config.workgroup = workgroup0
    df = wr.athena.read_sql_query(sql="SELECT 1 as col0", database=glue_database)
    assert df.query_metadata["WorkGroup"] == workgroup0

    os.environ["WR_WORKGROUP"] = workgroup1
    wr.config.reset()
    df = wr.athena.read_sql_query(sql="SELECT 1 as col0", database=glue_database)
    assert df.query_metadata["WorkGroup"] == workgroup1

    # Endpoints URLs
    region = boto3.Session().region_name
    wr.config.sts_endpoint_url = f"https://sts.{region}.amazonaws.com"
    wr.config.s3_endpoint_url = f"https://s3.{region}.amazonaws.com"
    wr.config.athena_endpoint_url = f"https://athena.{region}.amazonaws.com"
    wr.config.glue_endpoint_url = f"https://glue.{region}.amazonaws.com"
    _urls_test(glue_database)

    os.environ["WR_STS_ENDPOINT_URL"] = f"https://sts.{region}.amazonaws.com"
    os.environ["WR_S3_ENDPOINT_URL"] = f"https://s3.{region}.amazonaws.com"
    os.environ["WR_ATHENA_ENDPOINT_URL"] = f"https://athena.{region}.amazonaws.com"
    os.environ["WR_GLUE_ENDPOINT_URL"] = f"https://glue.{region}.amazonaws.com"
    wr.config.reset()
    _urls_test(glue_database)