def du(summarize, human_readable, s3_paths):
    """ display disk usage statistics """
    # Format byte counts for humans when asked; otherwise pass them through raw.
    fmt = magic.human_bytes if human_readable else lambda n: n
    for s3_path in s3_paths:
        sizes = {}
        # Recursive tallying walks parent directories upward; this key is the
        # stop condition for that walk.
        top = '/' + s3_util.bucket_and_key(s3_path)[2 - 1].rstrip('/')

        def accumulate(segment, nbytes):
            # With the summarize option only the grand total (the top path)
            # is recorded; intermediate directories are skipped but the walk
            # still continues up to the top so the grand total is correct.
            if segment == top or not summarize:
                sizes[segment] = sizes.get(segment, 0) + nbytes
            if segment != top:
                # Empty parent means we totalled past the bucket root.
                parent = segment.rsplit('/', 1)[0] or '/'
                accumulate(parent, nbytes)

        # Usage for all given paths, and recursively for directories
        # (excludes individual files).
        for obj in s3_util.keys(s3_path):
            dbg('adding {}', obj)
            accumulate('/' + obj['key'].rsplit('/', 1)[0], obj['len'])

        for path in sorted(sizes):
            out('{}\t{}', fmt(sizes[path]), path)
def scanb(start="", end=None):
    """
    Scan keys across all buckets, from `start` (a "bucket/key" bound,
    inclusive) up to `end` (exclusive).  Yields the same {'bucket', 'key'}
    dicts as scank(), flattened into one stream.
    """
    bucket_start, first_bucket_key_prefix = bucket_and_key(start)
    dbg("start b:{} k:{}", bucket_start, first_bucket_key_prefix if first_bucket_key_prefix else "(unbounded)")
    bucket_end = None
    last_bucket_key_end = None
    if end:
        bucket_end, last_bucket_key_end = bucket_and_key(end)
        if not start <= end:
            err("start must be lexically before end")
        dbg("end b:{} k:{}", bucket_end, last_bucket_key_end if last_bucket_key_end else "(unbounded)")
    s3 = boto3.resource("s3")

    def bucket_gen():
        for _bucket in s3.buckets.all():
            if _bucket.name < bucket_start:
                continue
            if bucket_end is not None and _bucket.name >= bucket_end:
                # BUGFIX: when `end` carries a key component, the end bucket
                # itself must still be scanned (bounded above by that key).
                # Previously it was excluded entirely, dropping its keys.
                if _bucket.name > bucket_end or not last_bucket_key_end:
                    break
            yield _bucket

    def returned_generator():
        for bucket in bucket_gen():
            # BUGFIX: apply the key bounds to the buckets they belong to,
            # matched by name.  The old positional (first/last index)
            # matching misapplied bounds to an unrelated bucket whenever
            # the boundary bucket did not exist in this account.
            key_start = first_bucket_key_prefix if bucket.name == bucket_start else ""
            key_end = last_bucket_key_end if (last_bucket_key_end and bucket.name == bucket_end) else None
            yield scank(bucket.name, key_start, key_end)

    return magic.flatten(returned_generator())
def returned_gen():
    """Yield object entries for key_prefix, retrying with a trailing '/' if
    the prefix turns out to name an unslashed directory."""
    list_objects = s3.get_paginator("list_objects")
    pages = list_objects.paginate(Bucket=bucket_name, Prefix=key_prefix, Delimiter="/")
    try:
        yield from magic.flatten(map(generate_objs, pages))
    except SignalUnslashedDirectory:
        dbg("Saw unslashed directory at {}. Restarting paginator.", key_prefix)
        # Step into that dir! The user left off the trailing '/'.
        pages = list_objects.paginate(
            Bucket=bucket_name,
            Prefix=key_prefix + "/",
            Delimiter="/",
        )
        yield from magic.flatten(map(generate_objs, pages))
def scank(bucket_name, key_start="", key_end=None):
    """Return a generator of {'bucket', 'key'} dicts for the keys of
    bucket_name lying in [key_start, key_end)."""
    dbg("scanning b:{} s:{} e:{}", bucket_name, key_start, str(key_end))
    # Skip some unnecessary scanning: when both bounds are given, every
    # qualifying key shares their common prefix, so list only that prefix.
    common = os.path.commonprefix([key_start, key_end]) if key_start and key_end else ""
    bucket = boto3.resource("s3").Bucket(bucket_name)

    def bounded_objects():
        for candidate in bucket.objects.filter(Prefix=common):
            if candidate.key < key_start:
                continue
            if key_end is not None and candidate.key >= key_end:
                break
            yield candidate

    return ({"bucket": obj.bucket_name, "key": obj.key} for obj in bounded_objects())