Example #1
    def put_multipart(self,
                      local_path,
                      destination_s3_path,
                      part_size=67108864,
                      **kwargs):
        """
        Put an object stored locally to an S3 path
        using S3 multi-part upload (for files > 5GB).

        :param local_path: Path to source local file
        :param destination_s3_path: URL for target S3 location
        :param part_size: Part size in bytes. Default: 67108864 (64 MB); must be >= 5 MB and <= 5 GB.
        :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
        """
        # calculate number of parts to upload
        # based on the size of the file
        source_size = os.stat(local_path).st_size

        if source_size <= part_size:
            # fallback to standard, non-multipart strategy
            return self.put(local_path, destination_s3_path, **kwargs)

        (bucket, key) = self._path_to_bucket_and_key(destination_s3_path)

        # grab and validate the bucket
        s3_bucket = self.s3.get_bucket(bucket, validate=True)

        # calculate the number of parts (int division).
        # use modulo to avoid float precision issues
        # for exactly-sized fits
        num_parts = \
            (source_size // part_size) \
            if source_size % part_size == 0 \
            else (source_size // part_size) + 1

        mp = None
        try:
            mp = s3_bucket.initiate_multipart_upload(key, **kwargs)

            for i in range(num_parts):
                # upload a part at a time to S3
                offset = part_size * i
                bytes = min(part_size, source_size - offset)
                with open(local_path, 'rb') as fp:
                    part_num = i + 1
                    logger.info('Uploading part %s/%s to %s', part_num,
                                num_parts, destination_s3_path)
                    fp.seek(offset)
                    mp.upload_part_from_file(fp, part_num=part_num, size=bytes)

            # finish the upload, making the file available in S3
            mp.complete_upload()
        except BaseException:
            if mp:
                logger.info('Canceling multipart s3 upload for %s',
                            destination_s3_path)
                # cancel the upload so we don't get charged for
                # storage consumed by uploaded parts
                mp.cancel_upload()
            raise
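
A minimal usage sketch for the upload above. This assumes the boto-era `S3Client` exposed as `luigi.s3.S3Client` (as in the Luigi versions these examples come from); the bucket and file names are placeholders:

    from luigi.s3 import S3Client

    client = S3Client()  # credentials resolved from Luigi config or the environment
    # Files larger than part_size (default 64 MB) are uploaded in parts;
    # anything at or below part_size falls back to a plain put().
    client.put_multipart('/tmp/big-file.bin', 's3://my-bucket/data/big-file.bin')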
Example #2
File: s3.py Project: palicand/luigi
    def put_multipart(self, local_path, destination_s3_path, part_size=67108864, **kwargs):
        """
        Put an object stored locally to an S3 path
        using S3 multi-part upload (for files > 5GB).

        :param local_path: Path to source local file
        :param destination_s3_path: URL for target S3 location
        :param part_size: Part size in bytes. Default: 67108864 (64 MB); must be >= 5 MB and <= 5 GB.
        :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
        """
        # calculate number of parts to upload
        # based on the size of the file
        source_size = os.stat(local_path).st_size

        if source_size <= part_size:
            # fallback to standard, non-multipart strategy
            return self.put(local_path, destination_s3_path, **kwargs)

        (bucket, key) = self._path_to_bucket_and_key(destination_s3_path)

        # grab and validate the bucket
        s3_bucket = self.s3.get_bucket(bucket, validate=True)

        # calculate the number of parts (int division).
        # use modulo to avoid float precision issues
        # for exactly-sized fits
        num_parts = \
            (source_size // part_size) \
            if source_size % part_size == 0 \
            else (source_size // part_size) + 1

        mp = None
        try:
            mp = s3_bucket.initiate_multipart_upload(key, **kwargs)

            for i in range(num_parts):
                # upload a part at a time to S3
                offset = part_size * i
                bytes = min(part_size, source_size - offset)
                with open(local_path, 'rb') as fp:
                    part_num = i + 1
                    logger.info('Uploading part %s/%s to %s', part_num, num_parts, destination_s3_path)
                    fp.seek(offset)
                    mp.upload_part_from_file(fp, part_num=part_num, size=bytes)

            # finish the upload, making the file available in S3
            mp.complete_upload()
        except BaseException:
            if mp:
                logger.info('Canceling multipart s3 upload for %s', destination_s3_path)
                # cancel the upload so we don't get charged for
                # storage consumed by uploaded parts
                mp.cancel_upload()
            raise
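
The part count in the two uploads above (integer division plus a modulo check for exact fits) is just ceiling division written out longhand; the copy examples further down use the equivalent one-liner. A quick check that the two forms agree:

    def num_parts_modulo(source_size, part_size):
        # form used in put_multipart above
        if source_size % part_size == 0:
            return source_size // part_size
        return (source_size // part_size) + 1

    def num_parts_ceiling(source_size, part_size):
        # ceiling-division form used in the multipart copy examples below
        return (source_size + part_size - 1) // part_size

    part = 67108864  # 64 MB
    for size in (part + 1, 10 * part - 1, 10 * part, 10 * part + 1):
        assert num_parts_modulo(size, part) == num_parts_ceiling(size, part)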
Example #3
 def is_writable(self):
     """
     Currently only works with hadoopcli
     """
     if "/" in self.path:
         # example path: /log/ap/2013-01-17/00
         parts = self.path.split("/")
         # start with the full path, then walk up the tree until we find a path we can check
         length = len(parts)
         for part in range(length):
             path = "/".join(parts[0:length - part]) + "/"
             if self.fs.exists(path):
                 # if the path exists and we can write there, great!
                 if self._is_writable(path):
                     return True
                 # if it exists and we can't =( sad panda
                 else:
                     return False
         # We went through all parts of the path and we still couldn't find
         # one that exists.
         return False
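
To make the walk-up concrete, these are the candidate paths the loop above builds for the example path from the comment (a standalone illustration only; no HDFS calls are made):

    parts = "/log/ap/2013-01-17/00".split("/")
    length = len(parts)
    for part in range(length):
        # same prefix construction as in is_writable()
        print("/".join(parts[0:length - part]) + "/")
    # prints, from deepest to shallowest:
    #   /log/ap/2013-01-17/00/
    #   /log/ap/2013-01-17/
    #   /log/ap/
    #   /log/
    #   /

The first prefix that exists decides the outcome, so a missing leaf directory does not by itself make the check fail.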
Example #4
    def __copy_multipart(self, pool, src_bucket, src_key, dst_bucket, dst_key, part_size, **kwargs):
        """
        Copy a single S3 object to another S3 object, falling back to multipart copy where necessary

        NOTE: This is a private method and should only be called from within the `s3.copy` method

        :param pool: The threadpool to put the s3 copy processes onto
        :param src_bucket: source bucket name
        :param src_key: source key name
        :param dst_bucket: destination bucket name
        :param dst_key: destination key name
        :param part_size: Part size in bytes. Must be >= 5 MB and <= 5 GB.
        :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
        """

        source_bucket = self.s3.get_bucket(src_bucket, validate=True)
        dest_bucket = self.s3.get_bucket(dst_bucket, validate=True)

        key_size = source_bucket.lookup(src_key).size

        # We can't do a multipart copy on an empty Key, so handle this specially.
        # Also, don't bother using the multipart machinery if we're only dealing with a small non-multipart file
        if key_size == 0 or key_size <= part_size:
            result = pool.apply_async(dest_bucket.copy_key, args=(dst_key, src_bucket, src_key), kwds=kwargs)
            # Bubble up any errors we may encounter
            return result.get()

        mp = None

        try:
            mp = dest_bucket.initiate_multipart_upload(dst_key, **kwargs)
            cur_pos = 0

            # Store the results from the apply_async in a list so we can check for failures
            results = []

            # Calculate the number of chunks the file will be
            num_parts = (key_size + part_size - 1) // part_size

            for i in range(num_parts):
                # Issue an S3 copy request, one part at a time, from one S3 object to another
                part_start = cur_pos
                cur_pos += part_size
                part_end = min(cur_pos - 1, key_size - 1)
                part_num = i + 1
                results.append(pool.apply_async(mp.copy_part_from_key, args=(src_bucket, src_key, part_num, part_start, part_end)))
                logger.info('Requesting copy of %s/%s to %s/%s', part_num, num_parts, dst_bucket, dst_key)

            logger.info('Waiting for multipart copy of %s/%s to finish', dst_bucket, dst_key)

            # This will raise any exceptions in any of the copy threads
            for result in results:
                result.get()

            # finish the copy, making the file available in S3
            mp.complete_upload()
            return mp.key_name

        except BaseException:
            logger.info('Error during multipart s3 copy for %s/%s to %s/%s...', src_bucket, src_key, dst_bucket, dst_key)
            # cancel the copy so we don't get charged for storage consumed by copied parts
            if mp:
                mp.cancel_upload()
            raise
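
The byte ranges requested per part follow directly from the loop above. For a hypothetical 150 MB key at the 64 MB default part size, the three copy requests would cover these inclusive ranges:

    MB = 1024 * 1024
    key_size, part_size = 150 * MB, 64 * MB
    num_parts = (key_size + part_size - 1) // part_size  # 3
    cur_pos = 0
    for i in range(num_parts):
        part_start = cur_pos
        cur_pos += part_size
        part_end = min(cur_pos - 1, key_size - 1)
        print(i + 1, part_start, part_end)
    # 1         0  67108863
    # 2  67108864 134217727
    # 3 134217728 157286399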
Example #5
File: s3.py Project: URXtech/luigi
    def copy_multipart(self, source_path, destination_path, part_size=67108864, **kwargs):
        """
        Copy a single S3 object to another S3 object using S3 multi-part copy (for files > 5GB).
        It uses a single thread per part so that all parts are requested simultaneously,
        for maximum speed.

        :param source_path: URL for S3 Source
        :param destination_path: URL for target S3 location
        :param part_size: Part size in bytes. Default: 67108864 (64 MB); must be >= 5 MB and <= 5 GB.
        :param kwargs: Keyword arguments are passed to the boto function `initiate_multipart_upload`
        """
        (src_bucket, src_key) = self._path_to_bucket_and_key(source_path)
        (dst_bucket, dst_key) = self._path_to_bucket_and_key(destination_path)

        dest_bucket = self.s3.get_bucket(dst_bucket, validate=True)
        source_bucket = self.s3.get_bucket(src_bucket, validate=True)

        source_size = source_bucket.lookup(src_key).size

        num_parts = (source_size + part_size - 1) // part_size

        # As the S3 copy command is entirely server side, there is no issue with issuing one
        # API call per part. However, this may in theory cause problems on systems with low
        # ulimits for the number of threads when copying really large files: a ~100GB file
        # would open ~1500 threads. We take the max of num_parts and 1 because copying an
        # empty file gives `num_parts == 0`.
        pool = ThreadPool(processes=max(1, num_parts))

        mp = None
        try:
            mp = dest_bucket.initiate_multipart_upload(dst_key, **kwargs)
            cur_pos = 0

            # Store the results from the apply_async in a list so we can check for failures
            results = []

            for i in range(num_parts):
                # Issue an S3 copy request, one part at a time, from one S3 object to another
                part_start = cur_pos
                cur_pos += part_size
                part_end = min(cur_pos - 1, source_size - 1)
                part_num = i + 1
                results.append(pool.apply_async(mp.copy_part_from_key, args=(src_bucket, src_key, part_num, part_start, part_end)))
                logger.info('Requesting copy of %s/%s to %s', part_num, num_parts, destination_path)

            logger.info('Waiting for multipart copy of %s to finish', destination_path)
            pool.close()
            pool.join()

            # This will raise any exceptions in any of the copy threads
            for result in results:
                result.get()

            # finish the copy, making the file available in S3
            mp.complete_upload()
        except BaseException:
            if mp:
                logger.info('Canceling multipart s3 copy for %s to %s', source_path, destination_path)
                # cancel the copy so we don't get charged for
                # storage consumed by copied parts
                mp.cancel_upload()
            raise
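
A hedged usage sketch for the copy above, again assuming the boto-era `luigi.s3.S3Client`; bucket names and keys are placeholders:

    from luigi.s3 import S3Client

    client = S3Client()
    # Server-side multipart copy between two S3 locations; one thread per
    # 64 MB part is started so all parts are requested at once.
    client.copy_multipart('s3://source-bucket/big-object.bin',
                          's3://dest-bucket/big-object.bin')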