def test_basics(path, glue_database, glue_table):
    args = {
        "table": glue_table,
        "path": "",
        "columns_types": {
            "col0": "bigint"
        }
    }

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.create_parquet_table(**args)

    # Configuring default database value
    wr.config.database = glue_database

    # Testing configured database
    wr.catalog.create_parquet_table(**args)

    # Testing configured s3 block size
    size = 1 * 2**20  # 1 MB
    wr.config.s3_block_size = size
    with open_s3_object(path, mode="wb") as s3obj:
        s3obj.write(b"foo")
    with open_s3_object(path, mode="rb") as s3obj:
        assert s3obj._s3_block_size == size

    # Resetting all configs
    wr.config.reset()

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Configuring default database value again
    wr.config.database = glue_database

    # Testing configured database again
    assert wr.catalog.does_table_exist(table=glue_table) is True

    # Resetting this specific config
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # exporting environment variable
    os.environ["WR_DATABASE"] = glue_database
    wr.config.reset("database")
    assert wr.catalog.does_table_exist(table=glue_table) is True
    del os.environ["WR_DATABASE"]
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    assert wr.config.to_pandas().shape == (len(wr._config._CONFIG_ARGS), 7)
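The test above exercises how a default Glue database reaches catalog calls: through wr.config, through the WR_DATABASE environment variable after a reset, or not at all (raising TypeError). A minimal sketch of that precedence, with placeholder database and table names, assuming explicit keyword arguments always override the configured default (as the fuller test near the end of this listing also shows):

# Minimal sketch of the configuration precedence exercised above (names are
# placeholders): an explicit argument overrides wr.config, and wr.config picks
# the value back up from WR_DATABASE when the "database" config is reset.
import os
import awswrangler as wr

wr.config.database = "analytics_db"                 # session-wide default
wr.catalog.does_table_exist(table="my_table")       # resolves against "analytics_db"

os.environ["WR_DATABASE"] = "analytics_db"
wr.config.reset("database")                         # reloads the default from WR_DATABASE
wr.catalog.does_table_exist(table="my_table")       # still resolves against "analytics_db"

wr.catalog.does_table_exist(database="other_db", table="my_table")  # explicit argument wins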
Example #2
def test_additional_kwargs(path, kms_key_id, s3_additional_kwargs, use_threads):
    if s3_additional_kwargs is not None and "SSEKMSKeyId" in s3_additional_kwargs:
        s3_additional_kwargs["SSEKMSKeyId"] = kms_key_id
    path = f"{path}0.txt"
    with open_s3_object(path, mode="w", s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads) as s3obj:
        s3obj.write("foo")
    with open_s3_object(
        path, mode="r", s3_block_size=10_000_000, s3_additional_kwargs=s3_additional_kwargs, use_threads=use_threads,
    ) as s3obj:
        assert s3obj.read() == "foo"
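The s3_additional_kwargs fixture here is either None or a server-side-encryption dictionary; a hedged example of the SSE-KMS shape, using the same placeholder key id that appears in the to_excel docstring further below:

# Hedged example of the SSE-KMS form of s3_additional_kwargs; the test above
# swaps the placeholder key id for the real kms_key_id fixture value.
s3_additional_kwargs = {
    "ServerSideEncryption": "aws:kms",
    "SSEKMSKeyId": "YOUR_KMS_KEY_ARN",
}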
Example #3
def test_botocore_config(path):
    original = botocore.client.ClientCreator.create_client

    # Default values for botocore.config.Config
    expected_max_retries_attempt = 5
    expected_connect_timeout = 10
    expected_max_pool_connections = 10
    expected_retry_mode = None

    def wrapper(self, **kwarg):
        assert kwarg["client_config"].retries["max_attempts"] == expected_max_retries_attempt
        assert kwarg["client_config"].connect_timeout == expected_connect_timeout
        assert kwarg["client_config"].max_pool_connections == expected_max_pool_connections
        assert kwarg["client_config"].retries.get("mode") == expected_retry_mode
        return original(self, **kwarg)

    # Check for default values
    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    # Update default config with environment variables
    expected_max_retries_attempt = 20
    expected_connect_timeout = 10
    expected_max_pool_connections = 10
    expected_retry_mode = "adaptive"

    os.environ["AWS_MAX_ATTEMPTS"] = str(expected_max_retries_attempt)
    os.environ["AWS_RETRY_MODE"] = expected_retry_mode

    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    del os.environ["AWS_MAX_ATTEMPTS"]
    del os.environ["AWS_RETRY_MODE"]

    # Update botocore.config.Config
    expected_max_retries_attempt = 30
    expected_connect_timeout = 40
    expected_max_pool_connections = 50
    expected_retry_mode = "legacy"

    botocore_config = botocore.config.Config(
        retries={"max_attempts": expected_max_retries_attempt, "mode": expected_retry_mode},
        connect_timeout=expected_connect_timeout,
        max_pool_connections=expected_max_pool_connections,
    )
    wr.config.botocore_config = botocore_config

    with patch("botocore.client.ClientCreator.create_client", new=wrapper):
        with open_s3_object(path, mode="wb") as s3obj:
            s3obj.write(b"foo")

    wr.config.reset()
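Outside the test harness, the same override boils down to a few lines; a hedged sketch mirroring the last block of the test above:

# Hedged sketch of overriding the botocore client configuration that awswrangler
# uses for its AWS calls, mirroring the last block of the test above.
import botocore.config
import awswrangler as wr

wr.config.botocore_config = botocore.config.Config(
    retries={"max_attempts": 30, "mode": "legacy"},
    connect_timeout=40,
    max_pool_connections=50,
)
# ... awswrangler calls made here create clients with the config above ...
wr.config.reset()  # back to the defaults (or to AWS_MAX_ATTEMPTS / AWS_RETRY_MODE, if set)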
Example #4
def test_io_intense(path, use_threads):
    path = f"{path}0.txt"
    data = b"0" * 10_000_000 + b"1" * 10_000_000 + b"2" * 10_000_000

    with open_s3_object(path, mode="wb", use_threads=use_threads) as s3obj:
        s3obj.write(data)

    with open_s3_object(path, mode="rb", use_threads=use_threads) as s3obj:
        assert s3obj.read() == data

    bucket, key = wr._utils.parse_path(path)
    assert boto3.client("s3").get_object(Bucket=bucket, Key=key)["Body"].read() == data
Example #5
def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> pa.Table:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    s3_block_size: int = 20_971_520 if columns else -1  # One shot for a full read otherwise 20 MB (20 * 2**20)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=s3_block_size,
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f,
            read_dictionary=categories,
            coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"],
        )
        if pq_file is None:
            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
        return pq_file.read(columns=columns, use_threads=False, use_pandas_metadata=False)
def _read_parquet_row_group(
    row_group: int,
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_primitives: _utils.Boto3PrimitivesType,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    boto3_session: boto3.Session = _utils.boto3_from_primitives(
        primitives=boto3_primitives)
    with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f, read_dictionary=categories)
        num_row_groups: int = pq_file.num_row_groups
        _logger.debug("Reading Row Group %s/%s [multi-threaded]",
                      row_group + 1, num_row_groups)
        return pq_file.read_row_group(i=row_group,
                                      columns=columns,
                                      use_threads=False,
                                      use_pandas_metadata=False)
Example #7
def _read_parquet_metadata_file(
    path: str,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
    version_id: Optional[str] = None,
    ignore_null: bool = False,
    pyarrow_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> Optional[Dict[str, str]]:
    pyarrow_args = _set_default_pyarrow_additional_kwargs(pyarrow_additional_kwargs)
    with open_s3_object(
        path=path,
        mode="rb",
        version_id=version_id,
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
            source=f, coerce_int96_timestamp_unit=pyarrow_args["coerce_int96_timestamp_unit"]
        )
        if pq_file is None:
            return None
        return _data_types.athena_types_from_pyarrow_schema(
            schema=pq_file.schema.to_arrow_schema(), partitions=None, ignore_null=ignore_null
        )[0]
Example #8
def test_read_line(path, mode, block_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0\n11\n22222\n33333333333333\n44444444444444444444444444444444444444444444\n55555"
    expected = [
        "0\n", "11\n", "22222\n", "33333333333333\n",
        "44444444444444444444444444444444444444444444\n", "55555"
    ]
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path,
                        mode=mode,
                        s3_block_size=block_size,
                        newline="\n",
                        use_threads=use_threads) as s3obj:
        for i, line in enumerate(s3obj):
            if mode == "r":
                assert line == expected[i]
            else:
                assert line == expected[i].encode("utf-8")
        s3obj.seek(0)
        lines = s3obj.readlines()
        if mode == "r":
            assert lines == expected
        else:
            assert [line.decode("utf-8") for line in lines] == expected
    if "b" in mode:
        assert s3obj._cache == b""
Example #9
def _read_text_file(
    path: str,
    version_id: Optional[str],
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, _utils.Boto3PrimitivesType],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: Union[bool, int],
) -> pd.DataFrame:
    boto3_session = _utils.ensure_session(boto3_session)
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    try:
        with open_s3_object(
                path=path,
                version_id=version_id,
                mode=mode,
                use_threads=use_threads,
                s3_block_size=-1,  # One shot download
                encoding=encoding,
                s3_additional_kwargs=s3_additional_kwargs,
                newline=newline,
                boto3_session=boto3_session,
        ) as f:
            df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "404":
            raise exceptions.NoFilesFound(f"No files found on: {path}.")
        raise e
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
Example #10
def _new_writer(
    file_path: str,
    compression: Optional[str],
    schema: pa.Schema,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pyarrow.parquet.ParquetWriter]:
    writer: Optional[pyarrow.parquet.ParquetWriter] = None
    with open_s3_object(
            path=file_path,
            mode="wb",
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
    ) as f:
        try:
            writer = pyarrow.parquet.ParquetWriter(
                where=f,
                write_statistics=True,
                use_dictionary=True,
                coerce_timestamps="ms",
                compression="NONE" if compression is None else compression,
                flavor="spark",
                schema=schema,
            )
            yield writer
        finally:
            if writer is not None and writer.is_open is True:
                writer.close()
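_new_writer yields the writer and is annotated as returning Iterator[pyarrow.parquet.ParquetWriter], which suggests it is wrapped with contextlib.contextmanager in the original module; the decorator is not part of this excerpt. A hedged usage sketch under that assumption:

# Hedged usage sketch; assumes _new_writer is decorated with
# @contextlib.contextmanager in the original module (not shown above) and that
# `schema` and `table` are a pyarrow.Schema and pyarrow.Table built elsewhere.
import boto3

with _new_writer(
    file_path="s3://bucket/key.parquet",
    compression="snappy",
    schema=schema,
    boto3_session=boto3.Session(),
    s3_additional_kwargs=None,
    use_threads=True,
) as writer:
    writer.write_table(table)  # ParquetWriter.write_table appends a pyarrow.Table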
Example #11
def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    s3_block_size: int = 20_971_520 if columns else -1  # One shot for a full read otherwise 20 MB (20 * 2**20)
    with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=s3_block_size,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
    ) as f:
        pq_file: Optional[
            pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories)
        if pq_file is None:
            raise exceptions.InvalidFile(f"Invalid Parquet file: {path}")
        return pq_file.read(columns=columns,
                            use_threads=False,
                            use_pandas_metadata=False)
Example #12
def _read_text_chunked(
    paths: List[str],
    chunksize: int,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: boto3.Session,
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    for path in paths:
        _logger.debug("path: %s", path)
        mode, encoding, newline = _get_read_details(
            path=path, pandas_kwargs=pandas_kwargs)
        with open_s3_object(
                path=path,
                mode=mode,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                encoding=encoding,
                use_threads=use_threads,
                s3_additional_kwargs=s3_additional_kwargs,
                newline=newline,
                boto3_session=boto3_session,
        ) as f:
            reader: pandas.io.parsers.TextFileReader = parser_func(
                f, chunksize=chunksize, **pandas_kwargs)
            for df in reader:
                yield _apply_partitions(df=df,
                                        dataset=dataset,
                                        path=path,
                                        path_root=path_root)
Example #13
def test_read(path, use_threads, block_size, seq, length):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0123456789"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    fs = s3fs.S3FileSystem()
    with fs.open(path, "rb") as f:
        with open_s3_object(path,
                            mode="rb",
                            s3_block_size=block_size,
                            use_threads=use_threads) as s3obj:
            for i in seq:
                s3obj.seek(i)
                f.seek(i)
                data = s3obj.read(length)
                assert data[0:1] == text[i].encode("utf-8")
                assert data == f.read(length)
                logger.debug(s3obj._cache)
                if block_size < 1:
                    assert len(s3obj._cache) == s3obj._size
                elif length > block_size:
                    assert block_size <= len(s3obj._cache) <= length
                else:
                    assert len(s3obj._cache) == block_size
    assert s3obj._cache == b""
Example #14
def _to_text(
    file_format: str,
    df: pd.DataFrame,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    **pandas_kwargs: Any,
) -> List[str]:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        file_path: str = f"{path_root}{uuid.uuid4().hex}.{file_format}"
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")
    encoding: Optional[str] = pandas_kwargs.get("encoding", None)
    with open_s3_object(
            path=file_path,
            mode="w",
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
            encoding=encoding,
            newline=None,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return [file_path]
Example #15
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with open_s3_object(
            path=path,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=-1,  # One shot download
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
Example #16
def test_cache_seek(path):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0" * 1_000_000 + "1" * 4
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode="rb", s3_block_size=1_000) as s3obj:
        s3obj.seek(1_000_000)
        assert s3obj.read(100).decode("utf-8") == "1" * 4
    assert s3obj._cache == b""
Example #17
def test_cache(path, use_threads, block_size, text):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode="rb", s3_block_size=block_size, use_threads=use_threads) as s3obj:
        for i in range(len(text)):
            value = s3obj.read(1)
            assert value == text[i].encode("utf-8")
            assert len(s3obj._cache) in (block_size, block_size - 1, len(text))
    assert s3obj._cache == b""
Example #18
def test_write_full(path, mode, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbewkjfbkjwebf"
    with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "wb":
            s3obj.write(text.encode("utf-8"))
        else:
            s3obj.write(text)
    assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == text.encode("utf-8")
Example #19
def _read_parquet_metadata_file(
    path: str, boto3_session: boto3.Session, s3_additional_kwargs: Optional[Dict[str, str]], use_threads: bool
) -> Dict[str, str]:
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f)
        return _data_types.athena_types_from_pyarrow_schema(schema=pq_file.schema.to_arrow_schema(), partitions=None)[0]
Example #20
def test_write_chunked(path, mode, data_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    chunks = ["a", "jdae", "bdiebdkibaekdbekfbksbfk", "sebkf", "jebkfjbekjfbkjebfkebwkfbe", "f", "0" * data_size]
    expected = b"ajdaebdiebdkibaekdbekfbksbfksebkfjebkfjbekjfbkjebfkebwkfbef" + (b"0" * data_size)
    with open_s3_object(path, mode=mode, newline="\n", use_threads=use_threads) as s3obj:
        for chunk in chunks:
            if mode == "wb":
                s3obj.write(chunk.encode("utf-8"))
            else:
                s3obj.write(chunk)
    assert client_s3.get_object(Bucket=bucket, Key=key)["Body"].read() == expected
Example #21
def test_read_full(path, mode, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "AHDG*AWY&GD*A&WGd*AWgd87AGWD*GA*G*g*AGˆˆ&ÂDTW&ˆˆD&ÂTW7ˆˆTAWˆˆDAW&ˆˆAWGDIUHWOD#N"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode=mode, s3_block_size=100, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "r":
            assert s3obj.read() == text
        else:
            assert s3obj.read() == text.encode("utf-8")
    if "b" in mode:
        assert s3obj._cache == b""
Example #22
def test_read_chunked(path, mode, block_size, use_threads):
    client_s3 = boto3.client("s3")
    path = f"{path}0.txt"
    bucket, key = wr._utils.parse_path(path)
    text = "0123456789"
    client_s3.put_object(Body=text, Bucket=bucket, Key=key)
    with open_s3_object(path, mode=mode, s3_block_size=block_size, newline="\n", use_threads=use_threads) as s3obj:
        if mode == "r":
            for i in range(3):
                assert s3obj.read(1) == text[i]
        else:
            for i in range(3):
                assert s3obj.read(1) == text[i].encode("utf-8")
                assert len(s3obj._cache) <= block_size
    if "b" in mode:
        assert s3obj._cache == b""
Example #23
def _count_row_groups(
    path: str,
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> int:
    _logger.debug("Counting row groups...")
    with open_s3_object(
        path=path,
        mode="rb",
        use_threads=use_threads,
        s3_block_size=131_072,  # 128 KB (128 * 2**10)
        s3_additional_kwargs=s3_additional_kwargs,
        boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(source=f, read_dictionary=categories)
        n: int = cast(int, pq_file.num_row_groups)
        _logger.debug("Row groups count: %d", n)
        return n
Example #24
def _to_text(
    file_format: str,
    df: pd.DataFrame,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
    s3_additional_kwargs: Optional[Dict[str, str]],
    path: Optional[str] = None,
    path_root: Optional[str] = None,
    filename: Optional[str] = None,
    **pandas_kwargs: Any,
) -> List[str]:
    if df.empty is True:
        raise exceptions.EmptyDataFrame()
    if path is None and path_root is not None:
        if filename is None:
            filename = uuid.uuid4().hex
        file_path: str = (
            f"{path_root}{filename}.{file_format}{_COMPRESSION_2_EXT.get(pandas_kwargs.get('compression'))}"
        )
    elif path is not None and path_root is None:
        file_path = path
    else:
        raise RuntimeError("path and path_root received at the same time.")

    mode, encoding, newline = _get_write_details(path=file_path,
                                                 pandas_kwargs=pandas_kwargs)
    with open_s3_object(
            path=file_path,
            mode=mode,
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
            encoding=encoding,
            newline=newline,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        if file_format == "csv":
            df.to_csv(f, mode=mode, **pandas_kwargs)
        elif file_format == "json":
            df.to_json(f, **pandas_kwargs)
    return [file_path]
def _read_parquet_file(
    path: str,
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> pa.Table:
    with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            s3_block_size=134_217_728,  # 128 MB (128 * 2**20)
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=boto3_session,
    ) as f:
        pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
            source=f, read_dictionary=categories)
        return pq_file.read(columns=columns,
                            use_threads=False,
                            use_pandas_metadata=False)
Example #26
def _read_text_file(
    path: str,
    parser_func: Callable[..., pd.DataFrame],
    path_root: Optional[str],
    boto3_session: Union[boto3.Session, Dict[str, Optional[str]]],
    pandas_kwargs: Dict[str, Any],
    s3_additional_kwargs: Optional[Dict[str, str]],
    dataset: bool,
    use_threads: bool,
) -> pd.DataFrame:
    mode, encoding, newline = _get_read_details(path=path,
                                                pandas_kwargs=pandas_kwargs)
    with open_s3_object(
            path=path,
            mode=mode,
            use_threads=use_threads,
            s3_block_size=134_217_728,  # 128 MB (128 * 2**20)
            encoding=encoding,
            s3_additional_kwargs=s3_additional_kwargs,
            newline=newline,
            boto3_session=boto3_session,
    ) as f:
        df: pd.DataFrame = parser_func(f, **pandas_kwargs)
    return _apply_partitions(df=df,
                             dataset=dataset,
                             path=path,
                             path_root=path_root)
Example #27
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str,
                         str] = _data_types.athena_types_from_pyarrow_schema(
                             schema=pq_file.schema.to_arrow_schema(),
                             partitions=None)[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Was detect at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i,
                                                 columns=columns,
                                                 use_threads=use_threads,
                                                 use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
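The integer `chunked` branch above carries a remainder frame (next_slice) across row groups and yields fixed-size slices as they fill; the original presumably flushes any final remainder after the loop, which this excerpt does not show. A standalone sketch of just that slicing logic, using pandas.concat in place of the internal _union helper:

# Standalone sketch of the integer-chunking logic above: prepend the remainder
# from the previous frame, yield full-size slices, and carry the rest forward.
from typing import Iterable, Iterator, Optional
import pandas as pd

def _yield_chunks(frames: Iterable[pd.DataFrame], chunked: int) -> Iterator[pd.DataFrame]:
    remainder: Optional[pd.DataFrame] = None
    for df in frames:
        if remainder is not None:
            df = pd.concat([remainder, df], ignore_index=True)
        while len(df.index) >= chunked:
            yield df.iloc[:chunked]
            df = df.iloc[chunked:]
        remainder = None if df.empty else df
    if remainder is not None:  # flush the tail (the excerpt above ends before this step)
        yield remainder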
Example #28
def download(
    path: str,
    local_file: Union[str, Any],
    version_id: Optional[str] = None,
    use_threads: Union[bool, int] = True,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
    """Download file from from a received S3 path to local file.

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    path : str
        S3 path (e.g. ``s3://bucket/key0``).
    local_file : Union[str, Any]
        A file-like object in binary mode or a path to local file (e.g. ``./local/path/to/key0``).
    version_id: Optional[str]
        Version id of the object.
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forward to botocore requests, only "SSECustomerAlgorithm", "SSECustomerKey" and "RequestPayer"
        arguments will be considered.

    Returns
    -------
    None

    Examples
    --------
    Downloading a file using a path to local file

    >>> import awswrangler as wr
    >>> wr.s3.download(path='s3://bucket/key', local_file='./key')

    Downloading a file using a file-like object

    >>> import awswrangler as wr
    >>> with open(file='./key', mode='wb') as local_f:
    >>>     wr.s3.download(path='s3://bucket/key', local_file=local_f)

    """
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    _logger.debug("path: %s", path)
    with open_s3_object(
            path=path,
            mode="rb",
            use_threads=use_threads,
            version_id=version_id,
            s3_block_size=-1,  # One shot download
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=session,
    ) as s3_f:
        if isinstance(local_file, str):
            _logger.debug("Downloading local_file: %s", local_file)
            with open(file=local_file, mode="wb") as local_f:
                local_f.write(s3_f.read())
        else:
            _logger.debug("Downloading file-like object.")
            local_file.write(s3_f.read())
def to_excel(
    df: pd.DataFrame,
    path: str,
    boto3_session: Optional[boto3.Session] = None,
    s3_additional_kwargs: Optional[Dict[str, Any]] = None,
    use_threads: Union[bool, int] = True,
    **pandas_kwargs: Any,
) -> str:
    """Write EXCEL file on Amazon S3.

    Note
    ----
    This function accepts any pandas.DataFrame.to_excel() argument.
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Note
    ----
    Depending on the file extension ('xlsx', 'xls', 'odf'...), an additional library
    might have to be installed first (e.g. xlrd).

    Note
    ----
    In case of `use_threads=True` the number of threads
    that will be spawned will be gotten from os.cpu_count().

    Parameters
    ----------
    df: pandas.DataFrame
        Pandas DataFrame https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
    path : str
        Amazon S3 path (e.g. s3://bucket/filename.xlsx).
    boto3_session : boto3.Session(), optional
        Boto3 Session. The default boto3 Session will be used if boto3_session receives None.
    s3_additional_kwargs : Optional[Dict[str, Any]]
        Forwarded to botocore requests.
        e.g. s3_additional_kwargs={'ServerSideEncryption': 'aws:kms', 'SSEKMSKeyId': 'YOUR_KMS_KEY_ARN'}
    use_threads : bool, int
        True to enable concurrent requests, False to disable multiple threads.
        If enabled os.cpu_count() will be used as the max number of threads.
        If integer is provided, specified number is used.
    pandas_kwargs:
        KEYWORD arguments forwarded to pandas.DataFrame.to_excel(). You can NOT pass `pandas_kwargs` explicitly, just add
        valid Pandas arguments in the function call and Wrangler will accept them.
        e.g. wr.s3.to_excel(df, path, na_rep="", index=False)
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_excel.html

    Returns
    -------
    str
        Written S3 path.

    Examples
    --------
    Writing EXCEL file

    >>> import awswrangler as wr
    >>> import pandas as pd
    >>> wr.s3.to_excel(df, 's3://bucket/filename.xlsx')

    """
    if "pandas_kwargs" in pandas_kwargs:
        raise exceptions.InvalidArgument(
            "You can NOT pass `pandas_kwargs` explicitly, just add valid "
            "Pandas arguments in the function call and Wrangler will accept them. "
            'e.g. wr.s3.to_excel(df, path, na_rep="", index=False)'
        )
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    with open_s3_object(
            path=path,
            mode="wb",
            use_threads=use_threads,
            s3_additional_kwargs=s3_additional_kwargs,
            boto3_session=session,
    ) as f:
        _logger.debug("pandas_kwargs: %s", pandas_kwargs)
        df.to_excel(f, **pandas_kwargs)
    return path
Example #30
def test_basics(path, glue_database, glue_table, workgroup0, workgroup1):
    args = {"table": glue_table, "path": "", "columns_types": {"col0": "bigint"}}

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.create_parquet_table(**args)

    # Configuring default database value
    wr.config.database = glue_database

    # Testing configured database
    wr.catalog.create_parquet_table(**args)

    # Configuring default database with wrong value
    wr.config.database = "missing_database"
    with pytest.raises(boto3.client("glue").exceptions.EntityNotFoundException):
        wr.catalog.create_parquet_table(**args)

    # Overwriting configured database
    wr.catalog.create_parquet_table(database=glue_database, **args)

    # Testing configured s3 block size
    size = 1 * 2 ** 20  # 1 MB
    wr.config.s3_block_size = size
    with open_s3_object(path, mode="wb") as s3obj:
        s3obj.write(b"foo")
    with open_s3_object(path, mode="rb") as s3obj:
        assert s3obj._s3_block_size == size

    # Resetting all configs
    wr.config.reset()

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # Configuring default database value again
    wr.config.database = glue_database

    # Testing configured database again
    assert wr.catalog.does_table_exist(table=glue_table) is True

    # Resetting this specific config
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    # exporting environment variable
    os.environ["WR_DATABASE"] = glue_database
    wr.config.reset("database")
    assert wr.catalog.does_table_exist(table=glue_table) is True
    del os.environ["WR_DATABASE"]
    wr.config.reset("database")

    # Missing database argument
    with pytest.raises(TypeError):
        wr.catalog.does_table_exist(table=glue_table)

    assert wr.config.to_pandas().shape == (len(wr._config._CONFIG_ARGS), 7)

    # Workgroup
    wr.config.workgroup = workgroup0
    df = wr.athena.read_sql_query(sql="SELECT 1 as col0", database=glue_database)
    assert df.query_metadata["WorkGroup"] == workgroup0
    os.environ["WR_WORKGROUP"] = workgroup1
    wr.config.reset()
    df = wr.athena.read_sql_query(sql="SELECT 1 as col0", database=glue_database)
    assert df.query_metadata["WorkGroup"] == workgroup1

    # Endpoints URLs
    region = boto3.Session().region_name
    wr.config.sts_endpoint_url = f"https://sts.{region}.amazonaws.com"
    wr.config.s3_endpoint_url = f"https://s3.{region}.amazonaws.com"
    wr.config.athena_endpoint_url = f"https://athena.{region}.amazonaws.com"
    wr.config.glue_endpoint_url = f"https://glue.{region}.amazonaws.com"
    _urls_test(glue_database)
    os.environ["WR_STS_ENDPOINT_URL"] = f"https://sts.{region}.amazonaws.com"
    os.environ["WR_S3_ENDPOINT_URL"] = f"https://s3.{region}.amazonaws.com"
    os.environ["WR_ATHENA_ENDPOINT_URL"] = f"https://athena.{region}.amazonaws.com"
    os.environ["WR_GLUE_ENDPOINT_URL"] = f"https://glue.{region}.amazonaws.com"
    wr.config.reset()
    _urls_test(glue_database)