def delete_listed_objects(self, objects_paths, procs_io_bound=None):
    """
    Delete a list of S3 objects, grouping keys by bucket and fanning the
    deletions out across processes.

    :param objects_paths: List of S3 paths (e.g. ["s3://bucket/key", ...])
    :param procs_io_bound: Number of processes to be used for I/O bound
                           operations (falls back to the session default)
    :return: None
    """
    # Explicit None check so a deliberately falsy caller value is not
    # silently replaced by the session default.
    if procs_io_bound is None:
        procs_io_bound = self._session.procs_io_bound
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    # Group object keys by bucket: deletions are issued per bucket.
    buckets = {}
    for path in objects_paths:
        path_cleaned = path.replace("s3://", "")
        bucket_name, key = path_cleaned.split("/", 1)
        if bucket_name not in buckets:
            buckets[bucket_name] = []
        buckets[bucket_name].append({"Key": key})
    for bucket, batch in buckets.items():
        logger.debug(f"bucket: {bucket}")
        if procs_io_bound > 1:
            logger.debug(f"len(batch): {len(batch)}")
            bounders = calculate_bounders(len(batch), procs_io_bound)
            logger.debug(f"bounders: {bounders}")
            if len(bounders) == 1:
                # Single slice: run in-process, no fork overhead.
                self.delete_objects_batch(
                    session_primitives=self._session.primitives,
                    bucket=bucket,
                    batch=batch)
            else:
                procs = []
                for bounder in bounders:
                    proc = mp.Process(
                        target=self.delete_objects_batch,
                        args=(
                            self._session.primitives,
                            bucket,
                            batch[bounder[0]:bounder[1]],
                        ),
                    )
                    proc.daemon = False
                    proc.start()
                    procs.append(proc)
                for proc in procs:
                    proc.join()
        else:
            self.delete_objects_batch(
                session_primitives=self._session.primitives,
                bucket=bucket,
                batch=batch)
def get_objects_sizes(self, objects_paths, procs_io_bound=None):
    """
    Get the sizes (bytes) of a list of S3 objects in parallel.

    :param objects_paths: List of S3 object paths
    :param procs_io_bound: Number of processes to be used for I/O bound
                           operations (falls back to the session default)
    :return: Dictionary mapping object path -> size in bytes
    """
    # Explicit None check so a deliberately falsy caller value is not
    # silently replaced by the session default.
    if procs_io_bound is None:
        procs_io_bound = self._session.procs_io_bound
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    objects_sizes = {}
    procs = []
    receive_pipes = []
    bounders = calculate_bounders(len(objects_paths), procs_io_bound)
    logger.debug(f"len(bounders): {len(bounders)}")
    for bounder in bounders:
        receive_pipe, send_pipe = mp.Pipe()
        logger.debug(f"bounder: {bounder}")
        proc = mp.Process(
            target=self._get_objects_head_remote,
            args=(
                send_pipe,
                self._session.primitives,
                objects_paths[bounder[0]:bounder[1]],
            ),
        )
        proc.daemon = False
        proc.start()
        procs.append(proc)
        receive_pipes.append(receive_pipe)
    # Log the actual process count (was mistakenly logging len(bounders)).
    logger.debug(f"len(procs): {len(procs)}")
    # Drain each child's pipe before joining it, so a child blocked on a
    # full pipe buffer cannot deadlock the join.
    for i in range(len(procs)):
        logger.debug(f"Waiting pipe number: {i}")
        received = receive_pipes[i].recv()  # fixed typo: "receved"
        objects_sizes.update(received)
        logger.debug(f"Waiting proc number: {i}")
        procs[i].join()
        logger.debug(f"Closing proc number: {i}")
        receive_pipes[i].close()
    return objects_sizes
def copy_listed_objects(self, objects_paths, source_path, target_path, mode="append", procs_io_bound=None):
    """
    Copy a list of S3 objects from a source to a target directory.

    :param objects_paths: List of objects paths
    :param source_path: Source directory
    :param target_path: Target directory
    :param mode: "append" | "overwrite" | "overwrite_partitions"
    :param procs_io_bound: Number of processes to be used for I/O bound
                           operations (falls back to the session default)
    :return: None
    """
    # Explicit None check so a deliberately falsy caller value is not
    # silently replaced by the session default.
    if procs_io_bound is None:
        procs_io_bound = self._session.procs_io_bound
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    logger.debug(f"len(objects_paths): {len(objects_paths)}")
    # endswith() guards avoid IndexError when a path is an empty string.
    if source_path.endswith("/"):
        source_path = source_path[:-1]
    if target_path.endswith("/"):
        target_path = target_path[:-1]
    if mode == "overwrite":
        logger.debug(f"Deleting to overwrite: {target_path}")
        self._session.s3.delete_objects(path=target_path)
    elif mode == "overwrite_partitions":
        # Derive the set of partition prefixes present in the source
        # objects and delete only those prefixes under the target.
        objects_wo_prefix = [
            o.replace(f"{source_path}/", "") for o in objects_paths
        ]
        objects_wo_filename = [
            f"{o.rpartition('/')[0]}/" for o in objects_wo_prefix
        ]
        partitions_paths = list(set(objects_wo_filename))
        target_partitions_paths = [
            f"{target_path}/{p}" for p in partitions_paths
        ]
        for path in target_partitions_paths:
            logger.debug(f"Deleting to overwrite_partitions: {path}")
            self._session.s3.delete_objects(path=path)
    # Build (source, target) pairs preserving each object's relative path.
    batch = []
    for obj in objects_paths:
        object_wo_prefix = obj.replace(f"{source_path}/", "")
        target_object = f"{target_path}/{object_wo_prefix}"
        batch.append((obj, target_object))
    if procs_io_bound > 1:
        bounders = calculate_bounders(len(objects_paths), procs_io_bound)
        logger.debug(f"bounders: {bounders}")
        procs = []
        for bounder in bounders:
            proc = mp.Process(
                target=self.copy_objects_batch,
                args=(
                    self._session.primitives,
                    batch[bounder[0]:bounder[1]],
                ),
            )
            proc.daemon = False
            proc.start()
            procs.append(proc)
        for proc in procs:
            proc.join()
    else:
        self.copy_objects_batch(
            session_primitives=self._session.primitives, batch=batch)
def delete_listed_objects(self, objects_paths: List[str], procs_io_bound: Optional[int] = None) -> None:
    """
    Delete a list of S3 objects (Parallel).

    :param objects_paths: List of S3 paths (e.g. ["s3://...", "s3://..."])
    :param procs_io_bound: Number of processes to be used for I/O bound operations
    :return: None
    """
    # Resolve the effective parallelism: explicit arg, then session value, then 1.
    if procs_io_bound is None:
        procs_io_bound = self._session.procs_io_bound
    if procs_io_bound is None:
        procs_io_bound = 1
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    # Group the keys by bucket, since deletions are issued per bucket.
    buckets: Dict[str, List[Dict[str, str]]] = {}
    for path in objects_paths:
        bucket_name, key = path.replace("s3://", "").split("/", 1)
        buckets.setdefault(bucket_name, []).append({"Key": key})
    for bucket, batch in buckets.items():
        logger.debug(f"bucket: {bucket}")
        if procs_io_bound <= 1:
            self._delete_objects_batch(
                session_primitives=self._session.primitives,
                bucket=bucket,
                batch=batch)
            continue
        logger.debug(f"len(batch): {len(batch)}")
        # Requests are chunked in groups of up to 1000 keys; never spawn
        # more workers than there are requests worth of work.
        num_requests: int = int(ceil(float(len(batch)) / 1000.0))
        num_requests = min(num_requests, procs_io_bound)
        bounders: List[Tuple[int, int]] = calculate_bounders(
            len(batch), num_requests)
        logger.debug(f"bounders: {bounders}")
        if len(bounders) == 1:
            # Single slice: run in-process, no fork overhead.
            start, end = bounders[0]
            self._delete_objects_batch(
                session_primitives=self._session.primitives,
                bucket=bucket,
                batch=batch[start:end])
        else:
            workers: List[mp.Process] = []
            for start, end in bounders:
                worker: mp.Process = mp.Process(
                    target=self._delete_objects_batch,
                    args=(
                        self._session.primitives,
                        bucket,
                        batch[start:end],
                    ),
                )
                worker.daemon = False
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
def delete_objects_batch(session_primitives, bucket, batch):
    """
    Delete a batch of object keys from a single bucket.

    The batch is split into chunks of at most 1000 keys, one delete
    request per chunk (the per-request key limit of the API).
    """
    session = session_primitives.session
    client = session.boto3_session.client(
        service_name="s3", config=session.botocore_config)
    chunk_count = int(ceil(float(len(batch)) / 1000.0))
    bounders = calculate_bounders(len(batch), chunk_count)
    logger.debug(f"Bounders: {bounders}")
    for start, end in bounders:
        client.delete_objects(
            Bucket=bucket, Delete={"Objects": batch[start:end]})
def _delete_objects_batch(session_primitives: "SessionPrimitives", bucket: str, batch: List[Dict[str, str]]) -> None:
    """
    Delete a batch of object keys from a single bucket, issuing one
    delete request per chunk of at most 1000 keys.
    """
    session: "Session" = session_primitives.session
    client_s3: client = session.boto3_session.client(
        service_name="s3", use_ssl=True, config=session.botocore_config)
    total: int = len(batch)
    num_requests: int = int(ceil(float(total) / 1000.0))
    bounders: List[Tuple[int, int]] = calculate_bounders(total, num_requests)
    logger.debug(f"Bounders: {bounders}")
    for start, end in bounders:
        client_s3.delete_objects(
            Bucket=bucket, Delete={"Objects": batch[start:end]})
def get_objects_sizes(
        self,
        objects_paths: List[str],
        procs_io_bound: Optional[int] = None) -> Dict[str, int]:
    """
    Get sizes of a list of S3 objects.

    :param objects_paths: List of objects paths to be held.
    :param procs_io_bound: Number of processes to be used for I/O bound operations
    :return: Dictionary of paths and sizes (bytes)
    """
    procs_io_bound = procs_io_bound if procs_io_bound is not None else self._session.procs_io_bound if self._session.procs_io_bound is not None else 1
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    objects_sizes: Dict[str, int] = {}
    procs = []
    receive_pipes = []
    bounders = calculate_bounders(len(objects_paths), procs_io_bound)
    logger.debug(f"len(bounders): {len(bounders)}")
    for bounder in bounders:
        receive_pipe, send_pipe = mp.Pipe()
        logger.debug(f"bounder: {bounder}")
        proc = mp.Process(
            target=self._get_objects_head_remote,
            args=(
                send_pipe,
                self._session.primitives,
                objects_paths[bounder[0]:bounder[1]],
            ),
        )
        proc.daemon = False
        proc.start()
        procs.append(proc)
        receive_pipes.append(receive_pipe)
    # Log the actual process count (was mistakenly logging len(bounders)).
    logger.debug(f"len(procs): {len(procs)}")
    # Drain each child's pipe before joining it, so a child blocked on a
    # full pipe buffer cannot deadlock the join.
    for i in range(len(procs)):
        logger.debug(f"Waiting pipe number: {i}")
        received = receive_pipes[i].recv()
        objects_sizes.update(received)
        logger.debug(f"Waiting proc number: {i}")
        procs[i].join()
        logger.debug(f"Closing proc number: {i}")
        receive_pipes[i].close()
    return objects_sizes
def _get_bounders(dataframe, num_partitions):
    """Split the dataframe's row range into num_partitions slice bounds."""
    return calculate_bounders(
        num_items=len(dataframe.index), num_groups=num_partitions)
names=names, usecols=usecols, dtype=dtype, sep=sep, thousands=thousands, decimal=decimal, lineterminator=lineterminator, quotechar=quotechar, quoting=quoting, escapechar=escapechar, parse_dates=parse_dates, infer_datetime_format=infer_datetime_format, encoding=encoding, converters=converters) else: bounders = calculate_bounders(num_items=total_size, max_size=max_result_size) logger.debug(f"bounders: {bounders}") bounders_len = len(bounders) count = 0 forgotten_bytes = 0 for ini, end in bounders: count += 1 ini -= forgotten_bytes end -= 1 # Range is inclusive, contrary from Python's List bytes_range = "bytes={}-{}".format(ini, end) logger.debug(f"bytes_range: {bytes_range}") body = client_s3.get_object(Bucket=bucket_name, Key=key_path, Range=bytes_range)["Body"].read() chunk_size = len(body) logger.debug(f"chunk_size (bytes): {chunk_size}")
def copy_listed_objects(self,
                        objects_paths: List[str],
                        source_path: str,
                        target_path: str,
                        mode: str = "append",
                        procs_io_bound: Optional[int] = None) -> None:
    """
    Copy a list of S3 objects.

    :param objects_paths: List of objects paths
    :param source_path: Source directory
    :param target_path: Target directory
    :param mode: "append" | "overwrite" | "overwrite_partitions"
    :param procs_io_bound: Number of processes to be used for I/O bound operations
    :return: None
    """
    procs_io_bound = procs_io_bound if procs_io_bound is not None else self._session.procs_io_bound if self._session.procs_io_bound is not None else 1
    logger.debug(f"procs_io_bound: {procs_io_bound}")
    logger.debug(f"len(objects_paths): {len(objects_paths)}")
    # endswith() guards avoid IndexError when a path is an empty string.
    if source_path.endswith("/"):
        source_path = source_path[:-1]
    if target_path.endswith("/"):
        target_path = target_path[:-1]
    if mode == "overwrite":
        logger.debug(f"Deleting to overwrite: {target_path}")
        self._session.s3.delete_objects(path=target_path)
    elif mode == "overwrite_partitions":
        # Derive the set of partition prefixes present in the source
        # objects and delete only those prefixes under the target.
        objects_wo_prefix = [
            o.replace(f"{source_path}/", "") for o in objects_paths
        ]
        objects_wo_filename = [
            f"{o.rpartition('/')[0]}/" for o in objects_wo_prefix
        ]
        partitions_paths = list(set(objects_wo_filename))
        target_partitions_paths = [
            f"{target_path}/{p}" for p in partitions_paths
        ]
        for path in target_partitions_paths:
            logger.debug(f"Deleting to overwrite_partitions: {path}")
            self._session.s3.delete_objects(path=path)
    # Build (source, target) pairs preserving each object's relative path.
    batch = []
    for obj in objects_paths:
        object_wo_prefix = obj.replace(f"{source_path}/", "")
        target_object = f"{target_path}/{object_wo_prefix}"
        batch.append((obj, target_object))
    if procs_io_bound > 1:
        bounders = calculate_bounders(len(objects_paths), procs_io_bound)
        logger.debug(f"bounders: {bounders}")
        procs = []
        for bounder in bounders:
            proc = mp.Process(
                target=self._copy_objects_batch,
                args=(
                    self._session.primitives,
                    batch[bounder[0]:bounder[1]],
                ),
            )
            proc.daemon = False
            proc.start()
            procs.append(proc)
        for proc in procs:
            proc.join()
    else:
        self._copy_objects_batch(
            session_primitives=self._session.primitives, batch=batch)