    def test_get_all_keys_tree(self):
        """
        test storing and retrieving a directory tree
        """
        # 2011-12-04 -- s3 clips leading slash
        key_names = [
            "aaa/b/cccc/1",
            "aaa/b/ccccccccc/1",
            "aaa/b/ccccccccc/2",
            "aaa/b/ccccccccc/3",
            "aaa/b/dddd/1",
            "aaa/b/dddd/2",
            "aaa/e/ccccccccc/1",
            "fff/e/ccccccccc/1",
        ]

        # create the bucket
        bucket = self._s3_connection.create_unique_bucket()
        self.assertTrue(bucket is not None)
        for key in bucket.list():
            key.delete()

        # create some keys
        keys = list()
        for key_name in key_names:
            key = Key(bucket)

            # set the name
            key.name = key_name

            # upload some data
            test_string = os.urandom(1024)
            key.set_contents_from_string(test_string)
            self.assertTrue(key.exists())

            keys.append(key)

        result_set = BucketListResultSet(bucket, prefix="aaa")
        self.assertEqual(len(list(result_set)), 7)

        result_set = BucketListResultSet(bucket, prefix="aaa/b")
        self.assertEqual(len(list(result_set)), 6)

        result_set = BucketListResultSet(bucket, prefix="aaa/b/ccccccccc/")
        self.assertEqual(len(list(result_set)), 3)

        result_set = BucketListResultSet(bucket, prefix="aaa/b/dddd")
        self.assertEqual(len(list(result_set)), 2)

        result_set = BucketListResultSet(bucket, prefix="aaa/e")
        self.assertEqual(len(list(result_set)), 1)

        # delete the keys
        for key in bucket.list():
            key.delete()

        # delete the bucket
        self._s3_connection.delete_bucket(bucket.name)
    def list(self, prefix='', delimiter='', marker='', headers=None):
        """
        List key objects within a bucket.  This returns an instance of a
        BucketListResultSet that automatically handles all of the result
        paging, etc. from S3.  You just need to keep iterating until
        there are no more results.
        Called with no arguments, this will return an iterator object across
        all keys within the bucket.

        :type prefix: string
        :param prefix: allows you to limit the listing to a particular
                        prefix.  For example, if you call the method with
                        prefix='/foo/' then the iterator will only cycle
                        through the keys that begin with the string '/foo/'.

        :type delimiter: string
        :param delimiter: can be used in conjunction with the prefix
                        to allow you to organize and browse your keys
                        hierarchically. See:
                        http://docs.amazonwebservices.com/AmazonS3/2006-03-01/
                        for more details.

        :type marker: string
        :param marker: The "marker" of where you are in the result set

        :rtype: :class:`boto.s3.bucketlistresultset.BucketListResultSet`
        :return: an instance of a BucketListResultSet that handles paging, etc.
        """
        return BucketListResultSet(self, prefix, delimiter, marker, headers)
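A minimal usage sketch for the method above, assuming a boto 2 S3 connection; the bucket name 'my-bucket' and the prefix 'photos/' are placeholders.

# Usage sketch for Bucket.list(), assuming boto 2.
# 'my-bucket' and 'photos/' are placeholder names.
import boto
from boto.s3.prefix import Prefix

conn = boto.connect_s3()
bucket = conn.get_bucket('my-bucket')

# Iterate over every key under 'photos/'; result paging is handled for us.
for key in bucket.list(prefix='photos/'):
    print(key.name)

# With a delimiter, the iterator yields Prefix objects for the
# "subdirectories" directly under the prefix, alongside Key objects.
for item in bucket.list(prefix='photos/', delimiter='/'):
    if isinstance(item, Prefix):
        print('dir:  %s' % item.name)
    else:
        print('file: %s' % item.name)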
Example #3
    def list_bucket_contents(self, bucket, subdir=None):
        """Returns files in the Google Storage bucket as a (dirs, files) tuple.

    TODO(epoger): This should raise an exception if subdir does not exist in
    Google Storage; right now, it just returns empty contents.

    Args:
      bucket: name of the Google Storage bucket
      subdir: directory within the bucket to list, or None for root directory
    """
        # The GS command relies on the prefix (if any) ending with a slash.
        prefix = subdir or ''
        if prefix and not prefix.endswith('/'):
            prefix += '/'
        prefix_length = len(prefix) if prefix else 0

        b = self._connect_to_bucket(bucket=bucket)
        items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
        dirs = []
        files = []
        for item in items:
            t = type(item)
            if t is Key:
                files.append(item.name[prefix_length:])
            elif t is Prefix:
                dirs.append(item.name[prefix_length:-1])
        return (dirs, files)
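A hedged usage sketch for list_bucket_contents(); 'gs' stands for an instance of whatever helper class defines the method, and the bucket and subdir names are placeholders.

# Usage sketch: 'gs' is assumed to be an instance of the class that
# defines list_bucket_contents(); bucket and subdir names are placeholders.
dirs, files = gs.list_bucket_contents(bucket='my-gs-bucket', subdir='images')
for d in dirs:
    print('subdirectory: %s' % d)
for f in files:
    print('file: %s' % f)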
Example #4
def reap(s3_bucket, capacity, dry=False):
    keys = [key for key in BucketListResultSet(s3_bucket)]
    if len(keys) <= capacity:
        return 0
    keys = sorted(keys, key=lambda x: iso8601.parse_date(x.last_modified))
    keys.reverse()
    for key in keys[capacity:]:
        logger.debug("deleting key %s last modified @ %s from s3 bucket %s",
                     key.name, key.last_modified, s3_bucket.name)
        if not dry:
            key.delete()
    return len(keys) - capacity
Example #5
def info():
    if connected == 0:
        print 'Not connected!'
    elif connected == 1:
        bucket = raw_input('Bucket name:').strip()
        filename = raw_input('Filename:').strip()
        from boto.s3.bucketlistresultset import BucketListResultSet
        b = conn.get_bucket(bucket)
        brs = BucketListResultSet(bucket=b)
        for f in brs:
            key = b.lookup(f.name)
            print 'File: ' + f.name
            print 'size: ' + str(key.size)
            print 'last modified: ' + str(key.last_modified)
            print 'etag (md5): ' + str(key.etag)
    def test_get_all_keys_empty_bucket(self):
        """
        test get_all_keys() on an empty bucket
        """
        log = logging.getLogger("empty")

        # create the bucket
        bucket = self._s3_connection.create_unique_bucket()
        self.assertTrue(bucket is not None)
        for key in bucket.list():
            key.delete()

        # try a simple get_all_keys()
        result_set = BucketListResultSet(bucket)
        self.assertEqual(list(result_set), [])

        # delete the bucket
        self._s3_connection.delete_bucket(bucket.name)
Example #7
    def list(self, prefix='', delimiter='', marker='', headers=None):
        """
        List key objects within a bucket.  This returns an instance of a
        BucketListResultSet that automatically handles all of the result
        paging, etc. from S3.  You just need to keep iterating until
        there are no more results.
        
        Called with no arguments, this will return an iterator object across
        all keys within the bucket.

        The Key objects returned by the iterator are obtained by parsing
        the results of a GET on the bucket, also known as the List Objects
        request.  The XML returned by this request contains only a subset
        of the information about each key.  Certain metadata fields such
        as Content-Type and user metadata are not available in the XML.
        Therefore, if you want these additional metadata fields you will
        have to do a HEAD request on the Key in the bucket.
        
        :type prefix: string
        :param prefix: allows you to limit the listing to a particular
                        prefix.  For example, if you call the method with
                        prefix='/foo/' then the iterator will only cycle
                        through the keys that begin with the string '/foo/'.
                        
        :type delimiter: string
        :param delimiter: can be used in conjunction with the prefix
                        to allow you to organize and browse your keys
                        hierarchically. See:
                        http://docs.amazonwebservices.com/AmazonS3/2006-03-01/
                        for more details.
                        
        :type marker: string
        :param marker: The "marker" of where you are in the result set
        
        :rtype: :class:`boto.s3.bucketlistresultset.BucketListResultSet`
        :return: an instance of a BucketListResultSet that handles paging, etc
        """
        return BucketListResultSet(self, prefix, delimiter, marker, headers)
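The docstring above notes that keys returned from a listing carry only partial metadata; the sketch below fetches the full metadata with a HEAD request. The bucket name and prefix are placeholders.

# Sketch: fetching full metadata for listed keys, assuming boto 2.
# 'my-bucket' and 'reports/' are placeholder names.
import boto

conn = boto.connect_s3()
bucket = conn.get_bucket('my-bucket')

for listed_key in bucket.list(prefix='reports/'):
    # name, size, etag and last_modified come from the List Objects XML;
    # Content-Type and user metadata do not, so issue a HEAD via get_key().
    full_key = bucket.get_key(listed_key.name)
    print('%s -> %s' % (full_key.name, full_key.content_type))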
Example #8
    def __iter__(self):
        return iter(BucketListResultSet(self))
Example #9
    def upload_dir_contents(self,
                            source_dir,
                            dest_bucket,
                            dest_dir,
                            num_threads=DEFAULT_UPLOAD_THREADS,
                            upload_if=UploadIf.ALWAYS,
                            **kwargs):
        """Recursively upload contents of a local directory to Google Storage.

    params:
      source_dir: full path (local-OS-style) on local disk of directory to copy
          contents of
      dest_bucket: GS bucket to copy the files into
      dest_dir: full path (Posix-style) within that bucket; write the files into
          this directory.  If None, write into the root directory of the bucket.
      num_threads: how many files to upload at once
      upload_if: one of the UploadIf values, describing in which cases we should
          upload the file
      kwargs: any additional keyword arguments "inherited" from upload_file()

    The copy operates as a merge: any files in source_dir will be "overlaid" on
    top of the existing content in dest_dir.  Existing files with the same names
    may or may not be overwritten, depending on the value of upload_if.

    TODO(epoger): Upload multiple files simultaneously to reduce latency.
    """
        b = self._connect_to_bucket(bucket=dest_bucket)
        if not dest_dir:
            dest_dir = ''

        # Create a set of all files within source_dir.
        source_fileset = set()
        prefix_length = len(source_dir) + 1
        for dirpath, _, filenames in os.walk(source_dir):
            relative_dirpath = dirpath[prefix_length:]
            for filename in filenames:
                source_fileset.add(os.path.join(relative_dirpath, filename))
        num_files_total = len(source_fileset)

        # If we are only uploading files conditionally, remove any unnecessary
        # files from source_fileset.
        if upload_if == self.UploadIf.ALWAYS:
            pass  # there are no shortcuts... upload them all
        else:
            # Create a mapping of filename to Key for existing files within dest_dir
            existing_dest_filemap = {}
            prefix = dest_dir
            if prefix and not prefix.endswith('/'):
                prefix += '/'
            prefix_length = len(prefix)
            items = BucketListResultSet(bucket=b, prefix=prefix)
            for item in items:
                if type(item) is Key:
                    existing_dest_filemap[item.name[prefix_length:]] = item

            # Now, depending on upload_if, trim files we should skip uploading.
            files_in_common = source_fileset.intersection(
                existing_dest_filemap.keys())
            if upload_if == self.UploadIf.IF_NEW:
                source_fileset -= files_in_common
            elif upload_if == self.UploadIf.IF_MODIFIED:
                for rel_path in files_in_common:
                    local_md5 = '"%s"' % _get_local_md5(
                        path=os.path.join(source_dir, rel_path))
                    key = existing_dest_filemap[rel_path]
                    if local_md5 == key.etag:
                        source_fileset.remove(rel_path)
            else:
                raise Exception('unknown value of upload_if: %s' % upload_if)

        # Upload any files still in source_fileset.
        num_files_to_upload = len(source_fileset)
        print('Uploading %d files, skipping %d ...' %
              (num_files_to_upload, num_files_total - num_files_to_upload))
        if num_files_to_upload == 0:
            return
        if num_threads > num_files_to_upload:
            num_threads = num_files_to_upload

        # Create a work queue with all files that need to be uploaded.
        q = Queue.Queue(maxsize=num_files_to_upload)
        for rel_path in source_fileset:
            q.put(rel_path)

        err = {}

        # Spin up worker threads to read from the task queue.
        def worker():
            while True:
                try:
                    rel_path = q.get(block=False)
                except Queue.Empty:
                    return  # no more tasks in the queue, so exit
                print(' Uploading file %d/%d: %s' %
                      (num_files_to_upload - q.qsize(), num_files_to_upload,
                       rel_path))

                retries = 5
                for retry in range(retries):
                    try:
                        self.upload_file(
                            source_path=os.path.join(source_dir, rel_path),
                            dest_bucket=b,
                            dest_path=posixpath.join(dest_dir, rel_path),
                            upload_if=self.UploadIf.ALWAYS,
                            **kwargs)
                        q.task_done()
                        break
                    except Exception as error:
                        if retry < retries - 1:
                            print('  Retrying upload, attempt #%d' % (retry + 1))
                            time.sleep(2**retry)
                        else:
                            err[rel_path] = error

        for _ in range(num_threads):
            t = threading.Thread(target=worker)
            t.daemon = True
            t.start()

        # Block until all files have been uploaded and all workers have exited.
        q.join()

        if err:
            errMsg = 'Failed to upload the following: \n\n'
            for rel_path, e in err.iteritems():
                errMsg += '%s: %s\n' % (rel_path, e)
            raise Exception(errMsg)
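A hedged call sketch for upload_dir_contents(); 'gs' is assumed to be an instance of the class that defines it, and the local path, bucket, and destination directory are placeholders.

# Call sketch: 'gs' is assumed to be an instance of the class that defines
# upload_dir_contents(); the path, bucket, and dest_dir are placeholders.
gs.upload_dir_contents(
    source_dir='/tmp/render-output',
    dest_bucket='my-gs-bucket',
    dest_dir='renders/latest',
    num_threads=4,
    # Skip files whose local MD5 already matches the ETag in the bucket.
    upload_if=gs.UploadIf.IF_MODIFIED)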