def flush(self, force: bool = False) -> None:
    """Write buffered data to S3."""
    if self.closed:  # pylint: disable=using-constant-test
        raise RuntimeError("I/O operation on closed file.")
    if self.writable() and self._buffer.closed is False:
        total_size: int = self._buffer.tell()
        # S3 requires every multipart part except the last to be at least
        # 5 MiB, so small buffers are held back unless the flush is forced.
        if total_size < _MIN_WRITE_BLOCK and force is False:
            return None
        if total_size == 0:
            return None
        _logger.debug("Flushing: %s bytes", total_size)
        # Lazily create the multipart upload on the first real flush,
        # retrying with backoff on transient S3 errors.
        self._mpu = self._mpu or _utils.try_it(
            f=self._client.create_multipart_upload,
            ex=_S3_RETRYABLE_ERRORS,
            base=0.5,
            max_num_tries=6,
            Bucket=self._bucket,
            Key=self._key,
            **get_botocore_valid_kwargs(
                function_name="create_multipart_upload",
                s3_additional_kwargs=self._s3_additional_kwargs,
            ),
        )
        self._buffer.seek(0)
        # Split the buffer into even chunks of at least _MIN_WRITE_BLOCK bytes
        # and upload each one as a numbered part.
        for chunk_size in _utils.get_even_chunks_sizes(
            total_size=total_size, chunk_size=_MIN_WRITE_BLOCK, upper_bound=False
        ):
            _logger.debug("chunk_size: %s bytes", chunk_size)
            self._parts_count += 1
            self._upload_proxy.upload(
                bucket=self._bucket,
                key=self._key,
                part=self._parts_count,
                upload_id=self._mpu["UploadId"],
                data=self._buffer.read(chunk_size),
                boto3_session=self._boto3_session,
                boto3_kwargs=get_botocore_valid_kwargs(
                    function_name="upload_part",
                    s3_additional_kwargs=self._s3_additional_kwargs,
                ),
            )
        # Reset the in-memory buffer now that its contents are on S3.
        self._buffer.seek(0)
        self._buffer.truncate(0)
        self._buffer.close()
        self._buffer = io.BytesIO()
    return None
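# The retry helper `_utils.try_it` used above is defined elsewhere; a minimal
# sketch consistent with how it is called here (callable `f`, retryable
# exception types `ex`, a `base` delay, `max_num_tries`, and **kwargs forwarded
# to `f`) could look like the following. The jittered-backoff details are an
# assumption for illustration, not the library's actual implementation.
import random
import time
from typing import Any, Callable

def try_it_sketch(f: Callable[..., Any], ex: Any, base: float = 1.0, max_num_tries: int = 3, **kwargs: Any) -> Any:
    """Call f(**kwargs), retrying on `ex` with randomized backoff."""
    delay: float = base
    for i in range(max_num_tries):
        try:
            return f(**kwargs)
        except ex:
            if i == max_num_tries - 1:
                raise  # out of retries: propagate the last error
            delay = random.uniform(base, delay * 3.0)  # grow the delay with jitter
            time.sleep(delay)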
def _fetch_range_proxy(self, start: int, end: int) -> bytes:
    _logger.debug("Fetching: s3://%s/%s - Range: %s-%s", self._bucket, self._key, start, end)
    s3_client: boto3.client = _utils.client(service_name="s3", session=self._boto3_session)
    boto3_kwargs: Dict[str, Any] = get_botocore_valid_kwargs(
        function_name="get_object", s3_additional_kwargs=self._s3_additional_kwargs
    )
    cpus: int = _utils.ensure_cpu_count(use_threads=self._use_threads)
    range_size: int = end - start
    # Small ranges (or a single CPU) are not worth parallelizing:
    # fetch the whole range with one GET and return its payload.
    if cpus < 2 or range_size < (2 * _MIN_PARALLEL_READ_BLOCK):
        return _fetch_range(
            range_values=(start, end),
            bucket=self._bucket,
            key=self._key,
            s3_client=s3_client,
            boto3_kwargs=boto3_kwargs,
            version_id=self._version_id,
        )[1]
    # Split the byte range into even chunks of at least _MIN_PARALLEL_READ_BLOCK
    # bytes and turn the chunk sizes into absolute (start, end) offsets.
    sizes: Tuple[int, ...] = _utils.get_even_chunks_sizes(
        total_size=range_size, chunk_size=_MIN_PARALLEL_READ_BLOCK, upper_bound=False
    )
    ranges: List[Tuple[int, int]] = []
    chunk_start: int = start
    for size in sizes:
        ranges.append((chunk_start, chunk_start + size))
        chunk_start += size
    # Fetch all chunks concurrently, then stitch the results back together.
    with concurrent.futures.ThreadPoolExecutor(max_workers=cpus) as executor:
        return self._merge_range(
            ranges=list(
                executor.map(
                    _fetch_range,
                    ranges,
                    itertools.repeat(self._bucket),
                    itertools.repeat(self._key),
                    itertools.repeat(s3_client),
                    itertools.repeat(boto3_kwargs),
                    itertools.repeat(self._version_id),
                )
            ),
        )
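# `_merge_range` is not shown in this section. Given that `_fetch_range`
# evidently returns a `(start_offset, payload)` tuple (see the `[1]` indexing
# on the sequential path above), a minimal sketch would sort the fetched
# chunks by starting offset and concatenate the payloads; treat this as an
# illustration, not the library's actual code.
from typing import List, Tuple

def merge_range_sketch(ranges: List[Tuple[int, bytes]]) -> bytes:
    """Concatenate (start_offset, data) chunks in offset order."""
    return b"".join(data for _start, data in sorted(ranges, key=lambda x: x[0]))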
def test_get_even_chunks_sizes(total_size, chunk_size, upper_bound, result):
    assert get_even_chunks_sizes(total_size, chunk_size, upper_bound) == result
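# The helper under test splits `total_size` bytes into near-even chunk sizes.
# A sketch matching its observed usage above (chunks of roughly `chunk_size`
# bytes, with `upper_bound` choosing whether `chunk_size` caps each chunk from
# above or below) follows; it is an assumption about the behavior, not the
# actual implementation.
import math
from typing import List, Tuple

def get_even_chunks_sizes_sketch(total_size: int, chunk_size: int, upper_bound: bool) -> Tuple[int, ...]:
    """Split total_size into even chunk sizes of roughly chunk_size bytes each."""
    round_func = math.ceil if upper_bound else math.floor
    num_chunks: int = max(int(round_func(total_size / chunk_size)), 1)
    base_size, rest = divmod(total_size, num_chunks)
    sizes: List[int] = [base_size] * num_chunks
    for i in range(rest):  # spread the remainder across the first chunks
        sizes[i] += 1
    return tuple(sizes)

# Example: 10 bytes with a 4-byte target.
# upper_bound=False rounds the chunk count down, giving fewer, larger chunks:
assert get_even_chunks_sizes_sketch(10, 4, upper_bound=False) == (5, 5)
# upper_bound=True rounds the chunk count up, giving more, smaller chunks:
assert get_even_chunks_sizes_sketch(10, 4, upper_bound=True) == (4, 3, 3)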