コード例 #1
0
ファイル: s3.py プロジェクト: afonsomy/One_project
    def delete_listed_objects(self, objects_paths, procs_io_bound=None):
        """Delete the given S3 object paths, fanning the work out per bucket.

        Paths are grouped by bucket (delete_objects operates on one bucket at a
        time); each bucket's batch is split across worker processes when
        procs_io_bound allows more than one.
        """
        procs_io_bound = procs_io_bound or self._session.procs_io_bound
        logger.debug(f"procs_io_bound: {procs_io_bound}")
        # Map bucket name -> list of {"Key": ...} entries for that bucket.
        buckets = {}
        for path in objects_paths:
            parts = path.replace("s3://", "").split("/", 1)
            buckets.setdefault(parts[0], []).append({"Key": parts[1]})

        for bucket, batch in buckets.items():
            logger.debug(f"bucket: {bucket}")
            if procs_io_bound <= 1:
                # Single-process mode: delete this bucket's batch inline.
                self.delete_objects_batch(session_primitives=self._session.primitives, bucket=bucket, batch=batch)
                continue
            logger.debug(f"len(batch): {len(batch)}")
            bounders = calculate_bounders(len(batch), procs_io_bound)
            logger.debug(f"bounders: {bounders}")
            workers = []
            for first, last in bounders:
                worker = mp.Process(
                    target=self.delete_objects_batch,
                    args=(
                        self._session.primitives,
                        bucket,
                        batch[first:last],
                    ),
                )
                worker.daemon = False
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
コード例 #2
0
ファイル: s3.py プロジェクト: mauriciofm/aws-data-wrangler
 def get_objects_sizes(self, objects_paths, procs_io_bound=None):
     """Return a dict mapping each S3 path to its size, fetched in parallel.

     Spawns one worker process per bounder slice; each worker sends its
     partial result back through a dedicated pipe.
     """
     procs_io_bound = procs_io_bound or self._session.procs_io_bound
     logger.debug(f"procs_io_bound: {procs_io_bound}")
     bounders = calculate_bounders(len(objects_paths), procs_io_bound)
     logger.debug(f"len(bounders): {len(bounders)}")
     objects_sizes = {}
     workers = []
     readers = []
     for bounder in bounders:
         reader, writer = mp.Pipe()
         logger.debug(f"bounder: {bounder}")
         worker = mp.Process(
             target=self._get_objects_head_remote,
             args=(
                 writer,
                 self._session.primitives,
                 objects_paths[bounder[0]:bounder[1]],
             ),
         )
         worker.daemon = False
         worker.start()
         workers.append(worker)
         readers.append(reader)
     logger.debug(f"len(procs): {len(bounders)}")
     # Drain each pipe before joining its worker so senders never block.
     for index, (worker, reader) in enumerate(zip(workers, readers)):
         logger.debug(f"Waiting pipe number: {index}")
         objects_sizes.update(reader.recv())
         logger.debug(f"Waiting proc number: {index}")
         worker.join()
         logger.debug(f"Closing proc number: {index}")
         reader.close()
     return objects_sizes
コード例 #3
0
ファイル: s3.py プロジェクト: mauriciofm/aws-data-wrangler
    def copy_listed_objects(self,
                            objects_paths,
                            source_path,
                            target_path,
                            mode="append",
                            procs_io_bound=None):
        """Copy the listed objects from source_path into target_path.

        mode "overwrite" wipes the whole target first; "overwrite_partitions"
        wipes only the target partitions touched by the listed objects;
        "append" copies without deleting anything.
        """
        procs_io_bound = procs_io_bound or self._session.procs_io_bound
        logger.debug(f"procs_io_bound: {procs_io_bound}")
        logger.debug(f"len(objects_paths): {len(objects_paths)}")
        # Normalize: drop a single trailing slash from both directories.
        source_path = source_path[:-1] if source_path[-1] == "/" else source_path
        target_path = target_path[:-1] if target_path[-1] == "/" else target_path

        prefix = f"{source_path}/"
        if mode == "overwrite":
            logger.debug(f"Deleting to overwrite: {target_path}")
            self._session.s3.delete_objects(path=target_path)
        elif mode == "overwrite_partitions":
            # Derive the distinct partition directories from the object keys.
            trimmed = [obj.replace(prefix, "") for obj in objects_paths]
            partitions = {f"{t.rpartition('/')[0]}/" for t in trimmed}
            for partition in partitions:
                path = f"{target_path}/{partition}"
                logger.debug(f"Deleting to overwrite_partitions: {path}")
                self._session.s3.delete_objects(path=path)

        # (source, target) pairs for every object to be copied.
        batch = [(obj, f"{target_path}/{obj.replace(prefix, '')}")
                 for obj in objects_paths]

        if procs_io_bound <= 1:
            self.copy_objects_batch(
                session_primitives=self._session.primitives, batch=batch)
            return
        bounders = calculate_bounders(len(objects_paths), procs_io_bound)
        logger.debug(f"bounders: {bounders}")
        workers = []
        for first, last in bounders:
            worker = mp.Process(
                target=self.copy_objects_batch,
                args=(
                    self._session.primitives,
                    batch[first:last],
                ),
            )
            worker.daemon = False
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
コード例 #4
0
    def delete_listed_objects(self,
                              objects_paths: List[str],
                              procs_io_bound: Optional[int] = None) -> None:
        """
        Delete a list of S3 objects (Parallel).

        :param objects_paths: List of S3 paths (e.g. ["s3://...", "s3://..."])
        :param procs_io_bound: Number of processes to be used for I/O bound operations
        :return: None
        """
        if procs_io_bound is None:
            procs_io_bound = self._session.procs_io_bound
        if procs_io_bound is None:
            procs_io_bound = 1
        logger.debug(f"procs_io_bound: {procs_io_bound}")
        # Group keys by bucket: delete_objects operates on one bucket at a time.
        buckets: Dict[str, List[Dict[str, str]]] = {}
        for path in objects_paths:
            parts = path.replace("s3://", "").split("/", 1)
            buckets.setdefault(parts[0], []).append({"Key": parts[1]})

        bucket: str
        batch: List[Dict[str, str]]
        for bucket, batch in buckets.items():
            logger.debug(f"bucket: {bucket}")
            if procs_io_bound <= 1:
                self._delete_objects_batch(
                    session_primitives=self._session.primitives,
                    bucket=bucket,
                    batch=batch)
                continue
            logger.debug(f"len(batch): {len(batch)}")
            # The API takes at most 1000 keys per request, so never spawn
            # more workers than there are requests to issue.
            num_requests: int = min(procs_io_bound,
                                    int(ceil((float(len(batch)) / 1000.0))))
            bounders: List[Tuple[int, int]] = calculate_bounders(
                len(batch), num_requests)
            logger.debug(f"bounders: {bounders}")
            if len(bounders) == 1:
                # Only one slice: no point forking a single worker.
                first, last = bounders[0]
                self._delete_objects_batch(
                    session_primitives=self._session.primitives,
                    bucket=bucket,
                    batch=batch[first:last])
                continue
            workers: List[mp.Process] = []
            for first, last in bounders:
                worker: mp.Process = mp.Process(
                    target=self._delete_objects_batch,
                    args=(
                        self._session.primitives,
                        bucket,
                        batch[first:last],
                    ),
                )
                worker.daemon = False
                worker.start()
                workers.append(worker)
            for worker in workers:
                worker.join()
コード例 #5
0
ファイル: s3.py プロジェクト: mangaonk/aws-data-wrangler
 def delete_objects_batch(session_primitives, bucket, batch):
     """Delete *batch* keys from *bucket*, chunked to the 1000-key API limit."""
     session = session_primitives.session
     client = session.boto3_session.client(service_name="s3", config=session.botocore_config)
     # delete_objects rejects more than 1000 keys per call, so chunk the batch.
     num_requests = int(ceil((float(len(batch)) / 1000.0)))
     bounders = calculate_bounders(len(batch), num_requests)
     logger.debug(f"Bounders: {bounders}")
     for first, last in bounders:
         client.delete_objects(Bucket=bucket, Delete={"Objects": batch[first:last]})
コード例 #6
0
 def _delete_objects_batch(session_primitives: "SessionPrimitives",
                           bucket: str, batch: List[Dict[str,
                                                         str]]) -> None:
     """Delete *batch* keys from *bucket*, chunked to the 1000-key API limit."""
     session: "Session" = session_primitives.session
     client_s3: client = session.boto3_session.client(
         service_name="s3", use_ssl=True, config=session.botocore_config)
     # delete_objects rejects more than 1000 keys per call, so chunk the batch.
     num_requests: int = int(ceil((float(len(batch)) / 1000.0)))
     bounders: List[Tuple[int, int]] = calculate_bounders(
         len(batch), num_requests)
     logger.debug(f"Bounders: {bounders}")
     for first, last in bounders:
         client_s3.delete_objects(
             Bucket=bucket,
             Delete={"Objects": batch[first:last]})
コード例 #7
0
    def get_objects_sizes(
            self,
            objects_paths: List[str],
            procs_io_bound: Optional[int] = None) -> Dict[str, int]:
        """
        Get sizes of a list of S3 objects.

        :param objects_paths: List of objects paths to be held.
        :param procs_io_bound: Number of processes to be used for I/O bound operations
        :return: Dictionary of paths and sizes (bytes)
        """
        if procs_io_bound is None:
            procs_io_bound = self._session.procs_io_bound
        if procs_io_bound is None:
            procs_io_bound = 1
        logger.debug(f"procs_io_bound: {procs_io_bound}")
        bounders = calculate_bounders(len(objects_paths), procs_io_bound)
        logger.debug(f"len(bounders): {len(bounders)}")
        objects_sizes: Dict[str, int] = {}
        workers = []
        readers = []
        for bounder in bounders:
            reader, writer = mp.Pipe()
            logger.debug(f"bounder: {bounder}")
            worker = mp.Process(
                target=self._get_objects_head_remote,
                args=(
                    writer,
                    self._session.primitives,
                    objects_paths[bounder[0]:bounder[1]],
                ),
            )
            worker.daemon = False
            worker.start()
            workers.append(worker)
            readers.append(reader)
        logger.debug(f"len(procs): {len(bounders)}")
        # Drain each pipe before joining its worker so senders never block.
        for index, (worker, reader) in enumerate(zip(workers, readers)):
            logger.debug(f"Waiting pipe number: {index}")
            objects_sizes.update(reader.recv())
            logger.debug(f"Waiting proc number: {index}")
            worker.join()
            logger.debug(f"Closing proc number: {index}")
            reader.close()
        return objects_sizes
コード例 #8
0
ファイル: pandas.py プロジェクト: noverde/aws-data-wrangler
def _get_bounders(dataframe, num_partitions):
    """Split the dataframe's row indices into *num_partitions* contiguous ranges."""
    return calculate_bounders(num_items=len(dataframe.index),
                              num_groups=num_partitions)
コード例 #9
0
ファイル: pandas.py プロジェクト: noverde/aws-data-wrangler
                                        names=names,
                                        usecols=usecols,
                                        dtype=dtype,
                                        sep=sep,
                                        thousands=thousands,
                                        decimal=decimal,
                                        lineterminator=lineterminator,
                                        quotechar=quotechar,
                                        quoting=quoting,
                                        escapechar=escapechar,
                                        parse_dates=parse_dates,
                                        infer_datetime_format=infer_datetime_format,
                                        encoding=encoding,
                                        converters=converters)
        else:
            bounders = calculate_bounders(num_items=total_size, max_size=max_result_size)
            logger.debug(f"bounders: {bounders}")
            bounders_len = len(bounders)
            count = 0
            forgotten_bytes = 0
            for ini, end in bounders:
                count += 1

                ini -= forgotten_bytes
                end -= 1  # Range is inclusive, contrary from Python's List
                bytes_range = "bytes={}-{}".format(ini, end)
                logger.debug(f"bytes_range: {bytes_range}")
                body = client_s3.get_object(Bucket=bucket_name, Key=key_path, Range=bytes_range)["Body"].read()
                chunk_size = len(body)
                logger.debug(f"chunk_size (bytes): {chunk_size}")
コード例 #10
0
    def copy_listed_objects(self,
                            objects_paths: List[str],
                            source_path: str,
                            target_path: str,
                            mode: str = "append",
                            procs_io_bound: Optional[int] = None) -> None:
        """
        Copy a list of S3 objects.

        :param objects_paths: List of objects paths
        :param source_path: Source directory
        :param target_path: Target directory
        :param mode: "append" | "overwrite" | "overwrite_partitions"
        :param procs_io_bound: Number of processes to be used for I/O bound operations
        :return: None
        """
        if procs_io_bound is None:
            procs_io_bound = self._session.procs_io_bound
        if procs_io_bound is None:
            procs_io_bound = 1
        logger.debug(f"procs_io_bound: {procs_io_bound}")
        logger.debug(f"len(objects_paths): {len(objects_paths)}")
        # Normalize: drop a single trailing slash from both directories.
        source_path = source_path[:-1] if source_path[-1] == "/" else source_path
        target_path = target_path[:-1] if target_path[-1] == "/" else target_path

        prefix = f"{source_path}/"
        if mode == "overwrite":
            logger.debug(f"Deleting to overwrite: {target_path}")
            self._session.s3.delete_objects(path=target_path)
        elif mode == "overwrite_partitions":
            # Derive the distinct partition directories from the object keys.
            trimmed = [obj.replace(prefix, "") for obj in objects_paths]
            partitions = {f"{t.rpartition('/')[0]}/" for t in trimmed}
            for partition in partitions:
                path = f"{target_path}/{partition}"
                logger.debug(f"Deleting to overwrite_partitions: {path}")
                self._session.s3.delete_objects(path=path)

        # (source, target) pairs for every object to be copied.
        batch = [(obj, f"{target_path}/{obj.replace(prefix, '')}")
                 for obj in objects_paths]

        if procs_io_bound <= 1:
            self._copy_objects_batch(
                session_primitives=self._session.primitives, batch=batch)
            return
        bounders = calculate_bounders(len(objects_paths), procs_io_bound)
        logger.debug(f"bounders: {bounders}")
        workers = []
        for first, last in bounders:
            worker = mp.Process(
                target=self._copy_objects_batch,
                args=(
                    self._session.primitives,
                    batch[first:last],
                ),
            )
            worker.daemon = False
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()