Example #1
def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
    scan_ranges = _gen_scan_range(obj_size=obj_size)

    if use_threads is False:
        stream_records = list(
            _select_object_content(
                args=args,
                boto3_session=boto3_session,
                scan_range=scan_range,
            )
            for scan_range in scan_ranges
        )
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
            stream_records = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(boto3_session),
                    scan_ranges,
                )
            )
    return pd.DataFrame([item for sublist in stream_records for item in sublist])  # Flatten list of lists
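Both examples delegate range splitting to _gen_scan_range, which is not shown here. The snippet below is only a hypothetical sketch of such a helper, assuming it yields non-overlapping Start/End byte ranges of a fixed chunk size; the chunk size and the exact dictionary shape are assumptions, not the library's actual implementation.

from typing import Dict, Iterator

_CHUNK_SIZE: int = 1_048_576  # assumed 1 MiB scan window (illustrative only)


def _gen_scan_range(obj_size: int) -> Iterator[Dict[str, int]]:
    # Yield consecutive byte ranges covering the whole object, suitable for
    # passing as ScanRange={"Start": ..., "End": ...} to S3 Select requests.
    for start in range(0, obj_size, _CHUNK_SIZE):
        yield {"Start": start, "End": min(start + _CHUNK_SIZE, obj_size)}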
Example #2
def _paginate_stream(
    args: Dict[str, Any], path: str, use_threads: Union[bool, int], boto3_session: Optional[boto3.Session]
) -> pd.DataFrame:
    obj_size: int = size_objects(  # type: ignore
        path=[path],
        use_threads=False,
        boto3_session=boto3_session,
    ).get(path)
    if obj_size is None:
        raise exceptions.InvalidArgumentValue(
            f"S3 object w/o defined size: {path}")

    dfs: List[pd.DataFrame] = []
    client_s3: boto3.client = _utils.client(service_name="s3",
                                            session=boto3_session)

    if use_threads is False:
        dfs = list(
            _select_object_content(
                args=args,
                client_s3=client_s3,
                scan_range=scan_range,
            ) for scan_range in _gen_scan_range(obj_size=obj_size))
    else:
        cpus: int = _utils.ensure_cpu_count(use_threads=use_threads)
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=cpus) as executor:
            dfs = list(
                executor.map(
                    _select_object_content,
                    itertools.repeat(args),
                    itertools.repeat(client_s3),
                    _gen_scan_range(obj_size=obj_size),
                ))
    return pd.concat(dfs, ignore_index=True)
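In the threaded branch, executor.map pairs each repeated argument (args and the S3 client) with one scan range, so every worker call receives the same shared arguments plus its own byte range. A minimal, self-contained sketch of that fan-out pattern, with purely illustrative names and data:

import concurrent.futures
import itertools


def _work(shared: str, chunk: range) -> int:
    # Stand-in for _select_object_content: `shared` mirrors the repeated
    # arguments, `chunk` is this worker's own slice of the work.
    return sum(chunk)


chunks = [range(0, 10), range(10, 20), range(20, 30)]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(_work, itertools.repeat("shared-arg"), chunks))
print(results)  # [45, 145, 245]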
Example #3
    def __init__(
        self,
        path: str,
        s3_block_size: int,
        mode: str,
        use_threads: Union[bool, int],
        s3_additional_kwargs: Optional[Dict[str, str]],
        boto3_session: Optional[boto3.Session],
        newline: Optional[str],
        encoding: Optional[str],
    ) -> None:
        super().__init__()
        self._use_threads = use_threads
        self._newline: str = "\n" if newline is None else newline
        self._encoding: str = "utf-8" if encoding is None else encoding
        self._bucket, self._key = _utils.parse_path(path=path)
        self._boto3_session: boto3.Session = _utils.ensure_session(session=boto3_session)
        if mode not in {"rb", "wb", "r", "w"}:
            raise NotImplementedError("File mode must be {'rb', 'wb', 'r', 'w'}, not %s" % mode)
        self._mode: str = "rb" if mode is None else mode
        self._one_shot_download: bool = False
        if 0 < s3_block_size < 3:
            raise exceptions.InvalidArgumentValue(
                "s3_block_size MUST be > 2 to define a valid size or "
                "< 1 to avoid blocks and always execute one-shot downloads."
            )
        if s3_block_size <= 0:
            _logger.debug("s3_block_size of %d, enabling one_shot_download.", s3_block_size)
            self._one_shot_download = True
        self._s3_block_size: int = s3_block_size
        self._s3_half_block_size: int = s3_block_size // 2
        self._s3_additional_kwargs: Dict[str, str] = {} if s3_additional_kwargs is None else s3_additional_kwargs
        self._client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
        self._loc: int = 0

        if self.readable() is True:
            self._cache: bytes = b""
            self._start: int = 0
            self._end: int = 0
            size: Optional[int] = size_objects(
                path=[path],
                use_threads=False,
                boto3_session=self._boto3_session,
                s3_additional_kwargs=self._s3_additional_kwargs,
            )[path]
            if size is None:
                raise exceptions.InvalidArgumentValue(f"S3 object w/o defined size: {path}")
            self._size: int = size
            _logger.debug("self._size: %s", self._size)
            _logger.debug("self._s3_block_size: %s", self._s3_block_size)
        elif self.writable() is True:
            self._mpu: Dict[str, Any] = {}
            self._buffer: io.BytesIO = io.BytesIO()
            self._parts_count: int = 0
            self._size = 0
            self._upload_proxy: _UploadProxy = _UploadProxy(use_threads=self._use_threads)
        else:
            raise RuntimeError(f"Invalid mode: {self._mode}")
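The constructor enforces three regimes for s3_block_size: values of 1 or 2 are rejected, values of 0 or below switch to one-shot downloads, and values of 3 or more enable block-wise reads (half the block size is also stored for the read cache). A small standalone illustration of that decision logic, using a hypothetical helper name:

def classify_block_size(s3_block_size: int) -> str:
    # Mirrors the validation in __init__ above; rejects the invalid 1-2 range.
    if 0 < s3_block_size < 3:
        raise ValueError("s3_block_size must be > 2 or < 1")
    if s3_block_size <= 0:
        return "one-shot download (no blocks)"
    return f"blocked reads: {s3_block_size} bytes, half block {s3_block_size // 2}"


print(classify_block_size(-1))         # one-shot download (no blocks)
print(classify_block_size(8_388_608))  # blocked reads: 8388608 bytes, half block 4194304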