Example #1
def extract_partitions_metadata_from_paths(
    path: str, paths: List[str]
) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Extract partitions metadata from Amazon S3 paths."""
    path = path if path.endswith("/") else f"{path}/"
    partitions_types: Dict[str, str] = {}
    partitions_values: Dict[str, List[str]] = {}
    for p in paths:
        if path not in p:
            raise exceptions.InvalidArgumentValue(
                f"Object {p} is not under the root path ({path})."
            )  # pragma: no cover
        path_wo_filename: str = p.rpartition("/")[0] + "/"
        if path_wo_filename not in partitions_values:
            path_wo_prefix: str = path_wo_filename.replace(f"{path}/", "")
            dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)]
            if dirs:
                values_tups: List[Tuple[str, str]] = [tuple(x.split("=")[:2]) for x in dirs]  # type: ignore
                values_dics: Dict[str, str] = dict(values_tups)
                p_values: List[str] = list(values_dics.values())
                p_types: Dict[str, str] = {x: "string" for x in values_dics.keys()}
                if not partitions_types:
                    partitions_types = p_types
                if p_values:
                    partitions_types = p_types
                    partitions_values[path_wo_filename] = p_values
                elif p_types != partitions_types:  # pragma: no cover
                    raise exceptions.InvalidSchemaConvergence(
                        f"At least two different partitions schema detected: {partitions_types} and {p_types}"
                    )
    if not partitions_types:
        return None, None
    return partitions_types, partitions_values
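
The parsing above relies only on Hive-style "key=value/" directory names. As a point of reference, here is a self-contained sketch of the same idea without the awswrangler-specific exception handling; the bucket, prefix, and object keys are made up.

from typing import Dict, List, Tuple

def partitions_from_paths(root: str, paths: List[str]) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
    """Toy version: map each parent prefix to its partition values, typing every key as 'string'."""
    root = root if root.endswith("/") else f"{root}/"
    types: Dict[str, str] = {}
    values: Dict[str, List[str]] = {}
    for p in paths:
        prefix = p.rpartition("/")[0] + "/"
        pairs = dict(x.split("=", 1) for x in prefix[len(root):].split("/") if "=" in x)
        if pairs:
            types = {k: "string" for k in pairs}
            values[prefix] = list(pairs.values())
    return types, values

print(partitions_from_paths(
    "s3://my-bucket/my-dataset",
    ["s3://my-bucket/my-dataset/year=2020/month=01/part0.parquet",
     "s3://my-bucket/my-dataset/year=2020/month=02/part0.parquet"],
))
# ({'year': 'string', 'month': 'string'},
#  {'s3://my-bucket/my-dataset/year=2020/month=01/': ['2020', '01'],
#   's3://my-bucket/my-dataset/year=2020/month=02/': ['2020', '02']})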
Example #2
def read_parquet_metadata_internal(
    path: Union[str, List[str]],
    dtype: Optional[Dict[str, str]],
    sampling: float,
    dataset: bool,
    use_threads: bool,
    boto3_session: Optional[boto3.Session],
) -> Tuple[Dict[str, str], Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Handle wr.s3.read_parquet_metadata internally."""
    session: boto3.Session = _utils.ensure_session(session=boto3_session)
    if dataset is True:
        if isinstance(path, str):
            _path: Optional[str] = path if path.endswith("/") else f"{path}/"
            paths: List[str] = path2list(path=_path, boto3_session=session)
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                "Argument <path> must be str if dataset=True.")
    else:
        if isinstance(path, str):
            _path = None
            paths = path2list(path=path, boto3_session=session)
        elif isinstance(path, list):
            _path = None
            paths = path
        else:  # pragma: no cover
            raise exceptions.InvalidArgumentType(
                f"Argument path must be str or List[str] instead of {type(path)}."
            )
    schemas: List[Dict[str, str]] = [
        _read_parquet_metadata_file(path=x,
                                    use_threads=use_threads,
                                    boto3_session=session)
        for x in _utils.list_sampling(lst=paths, sampling=sampling)
    ]
    _logger.debug("schemas: %s", schemas)
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, _dtype in schema.items():
            if (column in columns_types) and (columns_types[column] !=
                                              _dtype):  # pragma: no cover
                raise exceptions.InvalidSchemaConvergence(
                    f"Detected at least 2 different types in column {column} "
                    f"({columns_types[column]} and {_dtype})."
                )
            columns_types[column] = _dtype
    partitions_types: Optional[Dict[str, str]] = None
    partitions_values: Optional[Dict[str, List[str]]] = None
    if (dataset is True) and (_path is not None):
        partitions_types, partitions_values = _utils.extract_partitions_metadata_from_paths(
            path=_path, paths=paths)
    if dtype:
        for k, v in dtype.items():
            if columns_types and k in columns_types:
                columns_types[k] = v
            if partitions_types and k in partitions_types:
                partitions_types[k] = v
    _logger.debug("columns_types: %s", columns_types)
    return columns_types, partitions_types, partitions_values
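
Per its docstring, this function backs wr.s3.read_parquet_metadata. A minimal sketch of calling that public wrapper on a partitioned dataset is shown below; the bucket and prefix are placeholders and the printed outputs are illustrative only.

import awswrangler as wr

# Placeholder location; assumes a Hive-partitioned Parquet dataset lives under it.
columns_types, partitions_types = wr.s3.read_parquet_metadata(
    path="s3://my-bucket/my-dataset/",
    dataset=True,   # treat the prefix as a partitioned dataset
    sampling=1.0,   # inspect every file's footer
)
print(columns_types)     # e.g. {"id": "bigint", "name": "string"}
print(partitions_types)  # e.g. {"year": "string", "month": "string"}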
Example #3
def _validate_schemas(schemas: Tuple[Dict[str, str], ...]) -> None:
    if len(schemas) < 2:
        return None
    first: Dict[str, str] = schemas[0]
    for schema in schemas[1:]:
        if first != schema:
            raise exceptions.InvalidSchemaConvergence(
                f"Detected at least 2 different schemas:\n    1 - {first}\n    2 - {schema}."
            )
    return None
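
What the check enforces: every schema in the tuple must be identical to the first, key for key and type for type. The standalone helper below mirrors the comparison without the awswrangler exception type; the schemas are made-up examples.

from typing import Dict, Tuple

def schemas_identical(schemas: Tuple[Dict[str, str], ...]) -> bool:
    """True exactly when _validate_schemas would pass without raising."""
    return all(schema == schemas[0] for schema in schemas[1:])

print(schemas_identical(({"id": "bigint", "name": "string"},
                         {"id": "bigint", "name": "string"})))  # True
print(schemas_identical(({"id": "bigint"}, {"id": "string"})))  # False -> would raise InvalidSchemaConvergence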
Example #4
def _merge_schemas(schemas: Tuple[Dict[str, str], ...]) -> Dict[str, str]:
    columns_types: Dict[str, str] = {}
    for schema in schemas:
        for column, dtype in schema.items():
            if (column in columns_types) and (columns_types[column] != dtype):
                raise exceptions.InvalidSchemaConvergence(
                    f"Was detect at least 2 different types in column {column} ({columns_types[column]} and {dtype})."
                )
            columns_types[column] = dtype
    return columns_types
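
Unlike the strict check above, the merge tolerates schemas with different column sets and only rejects conflicting types for the same column. A standalone sketch of that behavior (made-up schemas, a plain ValueError instead of the library exception):

from typing import Dict, Tuple

def merge(schemas: Tuple[Dict[str, str], ...]) -> Dict[str, str]:
    """Union of columns; raises on a type conflict, mirroring _merge_schemas."""
    merged: Dict[str, str] = {}
    for schema in schemas:
        for column, dtype in schema.items():
            if merged.get(column, dtype) != dtype:
                raise ValueError(f"Conflicting types for {column}: {merged[column]} vs {dtype}")
            merged[column] = dtype
    return merged

print(merge(({"id": "bigint"}, {"id": "bigint", "name": "string"})))
# {'id': 'bigint', 'name': 'string'}
# merge(({"id": "bigint"}, {"id": "string"})) would raise ValueError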
Example #5
def _read_parquet_chunked(
    paths: List[str],
    chunked: Union[bool, int],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    validate_schema: bool,
    safe: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: bool,
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: pyarrow.parquet.ParquetFile = pyarrow.parquet.ParquetFile(
                source=f, read_dictionary=categories)
            schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                schema=pq_file.schema.to_arrow_schema(), partitions=None
            )[0]
            if validate_schema is True and last_schema is not None:
                if schema != last_schema:
                    raise exceptions.InvalidSchemaConvergence(
                        f"Was detect at least 2 different schemas:\n"
                        f"    - {last_path} -> {last_schema}\n"
                        f"    - {path} -> {schema}")
            last_schema = schema
            last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            for i in range(num_row_groups):
                _logger.debug("Reading Row Group %s...", i)
                df: pd.DataFrame = _arrowtable2df(
                    table=pq_file.read_row_group(i=i,
                                                 columns=columns,
                                                 use_threads=use_threads,
                                                 use_pandas_metadata=False),
                    categories=categories,
                    safe=safe,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df], ignore_index=None)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked]
                        df = df.iloc[chunked:]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # Yield any rows left over after the last file that did not fill a full chunk.
    if next_slice is not None:
        yield next_slice
Example #6
def _read_parquet_chunked(  # pylint: disable=too-many-branches
    paths: List[str],
    chunked: Union[bool, int],
    validate_schema: bool,
    ignore_index: Optional[bool],
    columns: Optional[List[str]],
    categories: Optional[List[str]],
    safe: bool,
    map_types: bool,
    boto3_session: boto3.Session,
    dataset: bool,
    path_root: Optional[str],
    s3_additional_kwargs: Optional[Dict[str, str]],
    use_threads: Union[bool, int],
) -> Iterator[pd.DataFrame]:
    next_slice: Optional[pd.DataFrame] = None
    last_schema: Optional[Dict[str, str]] = None
    last_path: str = ""
    for path in paths:
        with open_s3_object(
                path=path,
                mode="rb",
                use_threads=use_threads,
                s3_block_size=10_485_760,  # 10 MB (10 * 2**20)
                s3_additional_kwargs=s3_additional_kwargs,
                boto3_session=boto3_session,
        ) as f:
            pq_file: Optional[pyarrow.parquet.ParquetFile] = _pyarrow_parquet_file_wrapper(
                source=f, read_dictionary=categories
            )
            if pq_file is None:
                continue
            if validate_schema is True:
                schema: Dict[str, str] = _data_types.athena_types_from_pyarrow_schema(
                    schema=pq_file.schema.to_arrow_schema(), partitions=None
                )[0]
                if last_schema is not None:
                    if schema != last_schema:
                        raise exceptions.InvalidSchemaConvergence(
                            f"Was detect at least 2 different schemas:\n"
                            f"    - {last_path} -> {last_schema}\n"
                            f"    - {path} -> {schema}")
                last_schema = schema
                last_path = path
            num_row_groups: int = pq_file.num_row_groups
            _logger.debug("num_row_groups: %s", num_row_groups)
            use_threads_flag: bool = (
                use_threads if isinstance(use_threads, bool) else bool(use_threads > 1)
            )
            # iter_batches is only available for pyarrow >= 3.0.0
            if callable(getattr(pq_file, "iter_batches", None)):
                chunk_generator = _pyarrow_chunk_generator(
                    pq_file=pq_file,
                    chunked=chunked,
                    columns=columns,
                    use_threads_flag=use_threads_flag)
            else:
                chunk_generator = _row_group_chunk_generator(
                    pq_file=pq_file,
                    columns=columns,
                    use_threads_flag=use_threads_flag,
                    num_row_groups=num_row_groups)

            for chunk in chunk_generator:
                df: pd.DataFrame = _arrowtable2df(
                    table=chunk,
                    categories=categories,
                    safe=safe,
                    map_types=map_types,
                    use_threads=use_threads,
                    dataset=dataset,
                    path=path,
                    path_root=path_root,
                )
                if chunked is True:
                    yield df
                elif isinstance(chunked, int) and chunked > 0:
                    if next_slice is not None:
                        df = _union(dfs=[next_slice, df],
                                    ignore_index=ignore_index)
                    while len(df.index) >= chunked:
                        yield df.iloc[:chunked, :].copy()
                        df = df.iloc[chunked:, :]
                    if df.empty:
                        next_slice = None
                    else:
                        next_slice = df
                else:
                    raise exceptions.InvalidArgument(f"chunked: {chunked}")
    # Yield any rows left over after the last file that did not fill a full chunk.
    if next_slice is not None:
        yield next_slice