Code Example #1
File: _read.py  Project: romankoles/aws-data-wrangler
def _add_query_metadata_generator(
        dfs: Iterator[pd.DataFrame],
        query_metadata: _QueryMetadata) -> Iterator[pd.DataFrame]:
    """Add Query Execution metadata to every DF in iterator."""
    for df in dfs:
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        yield df
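For illustration, here is a minimal standalone sketch of the same wrapping-generator pattern. The helper name and the use of DataFrame.attrs are assumptions for this sketch, not part of aws-data-wrangler; the point is that each chunk is tagged lazily, so the full result set is never held in memory at once.

from typing import Dict, Iterator

import pandas as pd


def attach_metadata(dfs: Iterator[pd.DataFrame], metadata: Dict[str, str]) -> Iterator[pd.DataFrame]:
    # Hypothetical helper: tag each chunk as it streams through.
    for df in dfs:
        df.attrs.update(metadata)  # DataFrame.attrs is available in pandas >= 1.0
        yield df


chunks = (pd.DataFrame({"x": [i]}) for i in range(3))
for chunk in attach_metadata(chunks, {"query_execution_id": "example-id"}):
    print(chunk.attrs, len(chunk))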
Code Example #2
File: _read.py  Project: romankoles/aws-data-wrangler
def _fetch_csv_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    _chunksize: Optional[int] = chunksize if isinstance(chunksize, int) else None
    _logger.debug("_chunksize: %s", _chunksize)
    if query_metadata.output_location is None or query_metadata.output_location.endswith(".csv") is False:
        chunked = _chunksize is not None
        return _empty_dataframe_response(chunked, query_metadata)
    path: str = query_metadata.output_location
    _logger.debug("Start CSV reading from %s", path)
    ret = s3.read_csv(
        path=[path],
        dtype=query_metadata.dtype,
        parse_dates=query_metadata.parse_timestamps,
        converters=query_metadata.converters,
        quoting=csv.QUOTE_ALL,
        keep_default_na=False,
        na_values=["", "NaN"],
        chunksize=_chunksize,
        skip_blank_lines=False,
        use_threads=False,
        boto3_session=boto3_session,
    )
    _logger.debug("Start type casting...")
    _logger.debug(type(ret))
    if _chunksize is None:
        df = _fix_csv_types(df=ret,
                            parse_dates=query_metadata.parse_dates,
                            binaries=query_metadata.binaries)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        if keep_files is False:
            s3.delete_objects(
                path=[path, f"{path}.metadata"],
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return df
    dfs = _fix_csv_types_generator(dfs=ret,
                                   parse_dates=query_metadata.parse_dates,
                                   binaries=query_metadata.binaries)
    dfs = _add_query_metadata_generator(dfs=dfs, query_metadata=query_metadata)
    if keep_files is False:
        return _delete_after_iterate(
            dfs=dfs,
            paths=[path, f"{path}.metadata"],
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return dfs
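Because the return type is Union[pd.DataFrame, Iterator[pd.DataFrame]], callers must branch on whether chunking was requested. The snippet below is a rough, self-contained sketch of that caller-side handling; fetch is a stand-in for _fetch_csv_result, not the library API.

from typing import Iterator, Optional, Union

import pandas as pd


def fetch(chunksize: Optional[int]) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    # Stand-in: one DataFrame when chunksize is None, otherwise a generator of row slices.
    data = pd.DataFrame({"col": range(10)})
    if chunksize is None:
        return data
    return (data.iloc[i:i + chunksize] for i in range(0, len(data), chunksize))


result = fetch(chunksize=4)
chunks = [result] if isinstance(result, pd.DataFrame) else list(result)
print([len(c) for c in chunks])  # [4, 4, 2]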
Code Example #3
File: _read.py  Project: romankoles/aws-data-wrangler
def _fetch_parquet_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    categories: Optional[List[str]],
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    chunked: Union[bool, int] = False if chunksize is None else chunksize
    _logger.debug("chunked: %s", chunked)
    if query_metadata.manifest_location is None:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    manifest_path: str = query_metadata.manifest_location
    metadata_path: str = manifest_path.replace("-manifest.csv", ".metadata")
    _logger.debug("manifest_path: %s", manifest_path)
    _logger.debug("metadata_path: %s", metadata_path)
    paths: List[str] = _extract_ctas_manifest_paths(
        path=manifest_path, boto3_session=boto3_session)
    if not paths:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    ret = s3.read_parquet(
        path=paths,
        use_threads=use_threads,
        boto3_session=boto3_session,
        chunked=chunked,
        categories=categories,
        ignore_index=True,
    )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    else:
        ret = _add_query_metadata_generator(dfs=ret,
                                            query_metadata=query_metadata)
    paths_delete: List[str] = paths + [manifest_path, metadata_path]
    _logger.debug("type(ret): %s", type(ret))
    if chunked is False:
        if keep_files is False:
            s3.delete_objects(
                path=paths_delete,
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return ret
    if keep_files is False:
        return _delete_after_iterate(
            dfs=ret,
            paths=paths_delete,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return ret
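In the chunked branch the S3 result files cannot be deleted immediately, because no chunk has been read yet; _delete_after_iterate defers the cleanup until the consumer exhausts the iterator. Below is a hypothetical sketch of that deferred-cleanup pattern, not the library's actual implementation.

from typing import Callable, Iterator

import pandas as pd


def delete_after_iterate(dfs: Iterator[pd.DataFrame], cleanup: Callable[[], None]) -> Iterator[pd.DataFrame]:
    # Yield every chunk first, then run the cleanup callback exactly once,
    # after the consumer has finished iterating.
    for df in dfs:
        yield df
    cleanup()


chunks = (pd.DataFrame({"x": [i]}) for i in range(2))
for chunk in delete_after_iterate(chunks, lambda: print("deleting S3 query results...")):
    print(len(chunk))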
Code Example #4
File: _read.py  Project: rjEks/aws-data-wrangler
def _fetch_parquet_result(
    query_metadata: _QueryMetadata,
    keep_files: bool,
    categories: Optional[List[str]],
    chunksize: Optional[int],
    use_threads: bool,
    boto3_session: boto3.Session,
    s3_additional_kwargs: Optional[Dict[str, Any]],
    temp_table_fqn: Optional[str] = None,
) -> Union[pd.DataFrame, Iterator[pd.DataFrame]]:
    ret: Union[pd.DataFrame, Iterator[pd.DataFrame]]
    chunked: Union[bool, int] = False if chunksize is None else chunksize
    _logger.debug("chunked: %s", chunked)
    if query_metadata.manifest_location is None:
        return _empty_dataframe_response(bool(chunked), query_metadata)
    manifest_path: str = query_metadata.manifest_location
    metadata_path: str = manifest_path.replace("-manifest.csv", ".metadata")
    _logger.debug("manifest_path: %s", manifest_path)
    _logger.debug("metadata_path: %s", metadata_path)
    paths: List[str] = _extract_ctas_manifest_paths(path=manifest_path, boto3_session=boto3_session)
    if not paths:
        if not temp_table_fqn:
            raise exceptions.EmptyDataFrame("Query would return untyped, empty dataframe.")
        database, temp_table_name = map(lambda x: x.replace('"', ""), temp_table_fqn.split("."))
        dtype_dict = catalog.get_table_types(database=database, table=temp_table_name)
        df = pd.DataFrame(columns=list(dtype_dict.keys()))
        df = cast_pandas_with_athena_types(df=df, dtype=dtype_dict)
        df = _apply_query_metadata(df=df, query_metadata=query_metadata)
        return df
    ret = s3.read_parquet(
        path=paths,
        use_threads=use_threads,
        boto3_session=boto3_session,
        chunked=chunked,
        categories=categories,
        ignore_index=True,
    )
    if chunked is False:
        ret = _apply_query_metadata(df=ret, query_metadata=query_metadata)
    else:
        ret = _add_query_metadata_generator(dfs=ret, query_metadata=query_metadata)
    paths_delete: List[str] = paths + [manifest_path, metadata_path]
    _logger.debug("type(ret): %s", type(ret))
    if chunked is False:
        if keep_files is False:
            s3.delete_objects(
                path=paths_delete,
                use_threads=use_threads,
                boto3_session=boto3_session,
                s3_additional_kwargs=s3_additional_kwargs,
            )
        return ret
    if keep_files is False:
        return _delete_after_iterate(
            dfs=ret,
            paths=paths_delete,
            use_threads=use_threads,
            boto3_session=boto3_session,
            s3_additional_kwargs=s3_additional_kwargs,
        )
    return ret
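This variant adds a temp_table_fqn fallback: when the CTAS manifest lists no data files, it still returns an empty DataFrame whose columns carry the types registered for the temporary table, instead of raising EmptyDataFrame. A minimal sketch of that empty-but-typed construction, assuming a plain pandas dtype mapping in place of the Glue catalog lookup and the Athena-specific casting done by cast_pandas_with_athena_types:

import pandas as pd

# Hypothetical dtype mapping standing in for catalog.get_table_types(...).
dtype_dict = {"id": "Int64", "name": "string", "created": "datetime64[ns]"}

# Empty frame with the right columns, cast so downstream code sees typed columns.
df = pd.DataFrame(columns=list(dtype_dict.keys())).astype(dtype_dict)
print(df.dtypes)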